--
--  File ctex-zhconv.lua
--
--     Copyright (C) 2020 by Qing Lee
--------------------------------------------------------------------------
--
--     This work may be distributed and/or modified under the
--     conditions of the LaTeX Project Public License, either
--     version 1.3c of this license or (at your option) any later
--     version. This version of this license is in
--        http://www.latex-project.org/lppl/lppl-1-3c.txt
--     and the latest version of this license is in
--        http://www.latex-project.org/lppl.txt
--     and version 1.3 or later is part of all distributions of
--     LaTeX version 2005/12/01 or later.
--
--     This work has the LPPL maintenance status "maintained".
--
--     The Current Maintainer of this work is Qing Lee.
--
--     This work consists of the files ctex-zhconv.lua, ctex-zhconv-make.lua
--               and the derived file ctex-zhconv-index.lua.
--
--------------------------------------------------------------------------
--

-- Initialise kpathsea unless status.kpse_used already reports 1.
if status.kpse_used ~= 1 then
  kpse.set_program_name("luatex")
end

-- Module table returned at the end of the file.
local zhconv = {
  module = {
    name = "ctex-zhconv",
    version = "0.2",
    date = "2020/05/02",
    description = "GBK/GB18030/Big5 encoder",
    author = "Qing Lee",
    copyright = "Qing Lee",
    license = "LPPL v1.3c",
  },
}

-- Local aliases for the library functions used throughout the file.
local utf = require("unicode").utf8
local ubyte = utf.byte
local ugsub = utf.gsub
local floor = math.floor
local unpack = table.unpack
local insert = table.insert
local sort = table.sort
local char = string.char
local format = string.format

-- Index tables loaded from the derived file ctex-zhconv-index.lua.
local index = require("ctex-zhconv-index")
zhconv.index = index

-- One (initially empty) code point -> byte string table per encoding.
local gbk, gb18030, big5 = { }, { }, { }
local mapping = { big5 = big5, gbk = gbk, gb18030 = gb18030 }
zhconv.mapping = mapping

-- Pointer -> byte string functions, one per encoding.
local bytes = { }
zhconv.bytes = bytes

-- Let lead be pointer / 190 + 0x81.
-- Let trail be pointer % 190.
-- Let offset be 0x40 if trail is less than 0x3F and 0x41 otherwise.
-- Return two bytes whose values are lead and trail + offset.
-- GBK: turn an index pointer into its two-byte (lead, trail) sequence.
function bytes.gbk (pointer)
  local trail = pointer % 190
  -- Trail bytes skip 0x7F, hence the larger offset from 0x3F upwards.
  if trail < 0x3F then
    trail = trail + 0x40
  else
    trail = trail + 0x41
  end
  return format("%c%c", floor(pointer / 190) + 0x81, trail)
end

-- Let lead be pointer / 157 + 0x81.
-- If lead is less than 0xA1, return error with code point.
-- Let trail be pointer % 157.
-- Let offset be 0x40 if trail is less than 0x3F and 0x62 otherwise.
-- Return two bytes whose values are lead and trail + offset.
--
-- Note: the "lead is less than 0xA1" check is not performed by this
-- function itself; it encodes whatever pointer it is given.
function bytes.big5 (pointer)
  local trail = pointer % 157
  if trail < 0x3F then
    trail = trail + 0x40
  else
    trail = trail + 0x62
  end
  return format("%c%c", floor(pointer / 157) + 0x81, trail)
end

-- Let byte1 be pointer / 10 / 126 / 10.
-- Set pointer to pointer - byte1 × 10 × 126 × 10.
-- Let byte2 be pointer / 10 / 126.
-- Set pointer to pointer - byte2 × 10 × 126.
-- Let byte3 be pointer / 10.
-- Let byte4 be pointer - byte3 × 10.
-- Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
function bytes.gb18030 (pointer)
  local b1 = floor(pointer / 12600)
  local rest = pointer - b1 * 12600
  local b2 = floor(rest / 1260)
  rest = rest - b2 * 1260
  local b3 = floor(rest / 10)
  local b4 = rest - b3 * 10
  return format("%c%c%c%c", b1 + 0x81, b2 + 0x30, b3 + 0x81, b4 + 0x30)
end

-- Let offset be the last code point in index gb18030 ranges that is equal to or
-- less than code point and let pointer offset be its corresponding pointer.
-- Return a pointer whose value is pointer offset + code point - offset.
do
  local encode, ranges = bytes.gb18030, index["gb18030_ranges"]
  local metatable = { }

  -- Lazy lookup for code points that are not stored in the gb18030 table.
  --   * Non-number keys yield nil.
  --   * Keys below 0x10000 first try the two-byte gbk table.
  --   * Otherwise the four-byte form is computed from the ranges index,
  --     whose entries are read as { pointer, first code point } pairs.
  --   * Keys that precede every range (e.g. ASCII) yield nil instead of
  --     raising "attempt to index a nil value".
  function metatable.__index (_, key)
    if type(key) ~= "number" then
      return nil
    end
    local n = #ranges
    if key < 0x10000 then
      local s = gbk[key]
      if s then
        return s
      end
      -- Walk back from the second-to-last range to the last range that
      -- starts at or before key; stop at 0 when there is none.
      n = n - 1
      while n >= 1 and ranges[n][2] > key do
        n = n - 1
      end
    end
    local range = ranges[n]
    if not range then
      return nil
    end
    return encode(range[1] + key - range[2])
  end

  setmetatable(gb18030, metatable)
end

-- The index Big5 pointer for code point is the return value of these steps:
-- 1. Let index be index Big5 excluding all entries whose pointer
--    is less than (0xA1 - 0x81) × 157.
-- 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345,
--    return the last pointer corresponding to code point in index.
local big5_last = {
  [0x2550] = true,
  [0x255E] = true,
  [0x2561] = true,
  [0x256A] = true,
  [0x5341] = true,
  [0x5345] = true,
}

do
  -- Iterate over t in ascending key order, yielding key, value.
  local function spairs (t)
    local a = { }
    for i, n in pairs(t) do
      insert(a, { i, n })
    end
    sort(a, function (b, c) return b[1] < c[1] end)
    local i = 0
    return function ()
      i = i + 1
      local entry = a[i]
      if entry then
        return entry[1], entry[2]
      end
    end
  end

  -- Big5: pointers are visited in ascending order.  The first pointer for
  -- a code point wins, except for the code points in big5_last, where
  -- every later pointer overwrites the earlier one (so the last wins).
  local encode_big5 = bytes.big5
  local min_pointer = (0xA1 - 0x81) * 157
  for pointer, code_point in spairs(index.big5) do
    if pointer >= min_pointer
       and (big5_last[code_point] or not big5[code_point]) then
      big5[code_point] = encode_big5(pointer)
    end
  end

  -- GBK: pairs() visits entries in unspecified order, so remember the
  -- lowest pointer seen for each code point.  A code point listed under
  -- several pointers is then always encoded with the first one, whatever
  -- the traversal order.
  local encode_gbk = bytes.gbk
  local first_pointer = { }
  for pointer, code_point in pairs(index.gb18030) do
    local seen = first_pointer[code_point]
    if not seen or pointer < seen then
      first_pointer[code_point] = pointer
      gbk[code_point] = encode_gbk(pointer)
    end
  end
end

-- If the gbk flag is set and code point is U+20AC, return byte 0x80.
-- gb18030 keeps the two-byte form, stored explicitly so that the lazy
-- lookup does not pick up the single-byte gbk override.
gb18030[0x20AC] = gbk[0x20AC]
gbk[0x20AC] = char(0x80)

local io_open = io.open
local encode_error = "Encoding %q not available!"
local file_error = "Open file %q failed!"
--- Convert UTF-8 text to GBK, GB18030 or Big5.
-- Characters above U+007F are replaced by their entry in the chosen
-- mapping table; ASCII and characters without an entry are left as is.
-- @param encoding  encoding name, matched case-insensitively against the
--                  keys of the mapping table
-- @param input     with `output`: path of the file to read; a leading
--                  UTF-8 byte order mark is dropped.  Without `output`:
--                  the string to convert.
-- @param output    optional path of the file to write
-- @return the converted string when `output` is not given, nothing otherwise
-- Raises an error if the encoding is unknown, if a file cannot be opened,
-- or if reading or writing fails.
function zhconv.conv (encoding, input, output)
  local name = encoding:lower()
  local map = assert(mapping[name], encode_error:format(name))
  local function encoder (s)
    local code_point = ubyte(s)
    -- false/nil makes gsub keep the original character.
    return code_point > 0x7F and map[code_point]
  end

  if not output then
    -- Parentheses drop gsub's second result (the substitution count).
    return (ugsub(input, ".", encoder))
  end

  local handle = assert(io_open(input, "rb"), file_error:format(input))
  local stream, read_err = handle:read("*all")
  handle:close()
  assert(stream, read_err)

  -- Convert before touching the output file, so that a failure here
  -- neither truncates an existing file nor leaves a handle open.
  stream = stream:gsub("^\xEF\xBB\xBF", "")
  stream = ugsub(stream, ".", encoder)

  handle = assert(io_open(output, "wb"), file_error:format(output))
  local ok, write_err = handle:write(stream)
  handle:close()
  -- Report a failed write instead of ignoring it.
  assert(ok, write_err)
end

return zhconv