-- Module:Unicode
-- Documentation for this module may be created at Module:Unicode/doc
---A collection of functions for working with Unicode character data.
-- This module is not intended to provide general string processing
-- functionality.
require "bit"
require "hex"
-- Export table: the entry points defined further down (unicode.tohex and
-- unicode.codepoint) are attached to it, and it is returned at the end of
-- the file.
local unicode = {}
---Formats the given number as a hexadecimal Unicode codepoint.
-- The hex digits are uppercase and zero-padded to at least four places;
-- larger values simply use as many digits as they need.
-- @param decNum a non-negative integer codepoint, given either as a number
-- or as a string that tonumber can parse
-- @returns a codepoint reference in the form “U+xxxx”
-- Raises an error if decNum is not a non-negative integer.
-- NOTE(review): this is deliberately left as a global so that any existing
-- caller keeps working; consider making it local if nothing outside this
-- file relies on it.
function tohex(decNum)
  -- Accept numeric text as well as numbers, so values that arrive as
  -- strings (e.g. template arguments) do not have to be converted by every
  -- caller.
  local n = tonumber(decNum)
  if n == nil or n < 0 or n ~= math.floor(n) then
    error("tohex: expected a non-negative integer codepoint, got "
      .. tostring(decNum), 2)
  end
  -- %04X pads with zeros up to four digits and never truncates longer values.
  return string.format("U+%04X", n)
end
---Template entry point that formats a codepoint number as “U+xxxx”.
-- The codepoint is read from the named argument `num`, falling back to the
-- first positional argument.
-- @param frame the invocation frame; its `args` table holds the arguments
-- @returns whatever tohex returns for that argument
-- @usage {{#invoke: Unicode |tohex|119070}}
function unicode.tohex(frame)
  local raw = frame.args.num or frame.args[1]
  -- Template arguments typically arrive as text, while tohex is documented
  -- as taking a number, so convert here. If the text is not numeric (or the
  -- argument is missing) the raw value is passed through unchanged and tohex
  -- reports the problem.
  return tohex(tonumber(raw) or raw)
end
---Returns the given character’s Unicode codepoint. If more than one
-- character is given, only the first character is considered.
-- @param char a UTF-8–encoded character (a non-empty string)
-- @param formatted if truthy, the result is passed through tohex and
-- returned as a “U+xxxx” string instead of a number
-- @returns the Unicode codepoint as a number, or as a “U+xxxx” string when
-- formatted is truthy
-- Raises an error if char is empty, starts with a byte that cannot begin a
-- one- to four-byte UTF-8 sequence, or has a missing or malformed
-- continuation byte.
function codepoint(char, formatted)
  local lead = string.byte(char, 1)
  if lead == nil then
    error("codepoint: expected a non-empty string", 2)
  end

  -- The leading byte tells us how long the sequence is and contributes the
  -- highest-order payload bits:
  --   0xxxxxxx -> 1 byte,  7 payload bits
  --   110xxxxx -> 2 bytes, 5 payload bits
  --   1110xxxx -> 3 bytes, 4 payload bits
  --   11110xxx -> 4 bytes, 3 payload bits
  local numBytes, value
  if lead < 0x80 then
    numBytes, value = 1, lead
  elseif lead >= 0xC0 and lead < 0xE0 then
    numBytes, value = 2, lead - 0xC0
  elseif lead >= 0xE0 and lead < 0xF0 then
    numBytes, value = 3, lead - 0xE0
  elseif lead >= 0xF0 and lead < 0xF8 then
    numBytes, value = 4, lead - 0xF0
  else
    -- 0x80-0xBF is a continuation byte on its own; 0xF8 and above would
    -- start a sequence longer than four bytes.
    error(string.format("codepoint: invalid UTF-8 leading byte 0x%02X", lead), 2)
  end

  -- Each continuation byte has the form 10xxxxxx and appends six more bits.
  for i = 2, numBytes do
    local cont = string.byte(char, i)
    if cont == nil or cont < 0x80 or cont > 0xBF then
      error(string.format(
        "codepoint: missing or malformed UTF-8 continuation byte at position %d",
        i), 2)
    end
    value = value * 0x40 + (cont - 0x80)
  end

  if formatted then
    return tohex(value)
  end
  return value
end
---Template entry point that returns the codepoint of a character.
-- The character is read from the named argument `char`, falling back to the
-- first positional argument. The optional named argument `formatted` asks
-- for a “U+xxxx” string instead of a plain number.
-- @param frame the invocation frame; its `args` table holds the arguments
-- @returns whatever codepoint returns for those arguments
-- @usage {{#invoke: Unicode |codepoint|€}}
function unicode.codepoint(frame)
  local args = frame.args
  local formatted = args.formatted
  -- Any string is truthy in Lua, including "" and "no", so an argument such
  -- as `formatted=` or `formatted=no` would otherwise switch formatting on.
  -- Treat the usual "off" spellings as false.
  if type(formatted) == "string" then
    local flag = formatted:lower()
    if flag == "" or flag == "0" or flag == "no" or flag == "false" then
      formatted = false
    end
  end
  return codepoint(args.char or args[1], formatted)
end
-- Hand the export table (holding unicode.tohex and unicode.codepoint) back
-- to whatever loaded this module.
return unicode