User:Erutuon/scripts/UnicodeScriptRecognition.js/documentation

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Documentation for User:Erutuon/scripts/UnicodeScriptRecognition.js. [edit]
This page contains usage information, categories, interwiki links and other content describing the template.

Defines a function that returns the official Unicode script property for a code point, like lookup_script in Module:Unicode data on Wikipedia. Compare to User:Erutuon/scripts/scriptRecognition.js and char_to_script in Module:Unicode data, which returns the Wiktionary script code.

The data is for Unicode 11.0 and was generated with the following Lua 5.3 scripts, placed in the same directory as Scripts.txt and PropertyValueAliases.txt from the Unicode Character Database (the second script also requires the LPeg library):

Lua 5.3 scripts

File 1

-- Slurp the whole of Scripts.txt into memory. The file is opened in binary
-- mode ('rb') and the handle is only needed inside this do-block.
local script_data
do
	local scripts_file = assert(io.open('./Scripts.txt', 'rb'))
	script_data = assert(scripts_file:read('a'))
	scripts_file:close()
end

-- Path of the generated JavaScript file.
local outfile = './data.js'

-- Table returned by name_to_code.lua; indexed by script name below to obtain
-- the value stored as the third element of each range.
local script_name_to_code = dofile('./name_to_code.lua')

-- List of { first_codepoint, last_codepoint, script_code } triples.
local script_ranges = {}

-- Each match is a line that starts with a hexadecimal code point, optionally
-- followed by ".." and a second code point, then ";" and a script name.
-- The frontier %f[^\n%z] anchors the match at the start of a line (or of the
-- whole string), so text in the middle of a line is never picked up.
local prev_last, prev_script_name, prev_script_range
for first, last, script_name in script_data:gmatch '%f[^\n%z](%x+)%.?%.?(%x*)%s+;%s*([%w_]+)' do
	first = tonumber(first, 16)
	-- The second capture is the empty string for single-code-point lines;
	-- tonumber('', 16) is nil, so the range then ends where it starts.
	last = tonumber(last, 16) or first

	if prev_script_range and script_name == prev_script_name and first - prev_last == 1 then
		-- Same script and directly adjacent to the previous line:
		-- extend the previous range instead of starting a new one.
		prev_script_range[2] = last
	else
		local script_code = script_name_to_code[script_name]
		if script_code == nil then
			-- Without this check the missing code would be written to the
			-- output as the string "nil" by the %s format specifier.
			error(('no script code known for script name %q'):format(script_name))
		end
		prev_script_range = { first, last, script_code }
		table.insert(script_ranges, prev_script_range)
	end

	prev_last, prev_script_name = last, script_name
end

-- Order the ranges by their first code point.
table.sort(script_ranges,
	function (a, b)
		return a[1] < b[1]
	end)

-- Emit the ranges as a JavaScript array literal named script_ranges, one
-- [ first, last, "code" ] triple per line, separated by ",\n" so that there
-- is no trailing comma.
--
-- The entries are formatted before the file is opened, so a formatting error
-- cannot leave a truncated output file behind. A dedicated handle is used
-- instead of io.output so the default output stream is not replaced and left
-- closed, and write/close failures raise an error instead of being ignored.
local entries = {}
for i, range in ipairs(script_ranges) do
	entries[i] = ('\t[ 0x%05X, 0x%05X, "%s" ]'):format(range[1], range[2], range[3])
end

local outhandle = assert(io.open(outfile, 'w'))
assert(outhandle:write('var script_ranges = [\n', table.concat(entries, ',\n'), '\n];\n'))
assert(outhandle:close())

File 2: name_to_code.lua

local lpeg = require 'lpeg'

local property_value_aliases_filename = "./PropertyValueAliases.txt"

-- Read the whole file into a string. The handle is closed explicitly instead
-- of being left open until garbage collection, and a failed read raises an
-- error here rather than passing nil on to the pattern match.
local property_value_aliases
do
	local aliases_file = assert(io.open(property_value_aliases_filename, 'rb'))
	property_value_aliases = assert(aliases_file:read('a'))
	aliases_file:close()
end

-- The grammar below only needs these lpeg constructors. Binding them to
-- locals, instead of copying lpeg's capitalised functions into _ENV, avoids
-- creating globals that would leak into whatever script loads this file.
local P, R, S, V, C = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C

-- Result table: script name -> script code.
local script_name_to_code = {}

-- Receives the two captures of a script_line (code first, then name).
local function add_to_table(code, name)
	script_name_to_code[name] = code
end

local patt = P {
	-- Scan the whole input: record a script line where one starts,
	-- otherwise skip a single character and try again.
	(V 'script_line' / add_to_table + 1)^1,
	-- A line break, the literal "sc", then two ";"-separated fields (code and
	-- name); whatever follows up to the next line break is consumed unused.
	script_line = V 'nl' * P 'sc' * V 'sep' * C(V 'code') * V 'sep' * C(V 'name') * (P(1) - V 'nl')^0,
	-- One uppercase letter followed by three lowercase letters.
	code = R 'AZ' * V 'lower' * V 'lower' * V 'lower',
	-- One or more ASCII letters or underscores.
	name = R('AZ', 'az', '__')^1,
	lower = R 'az',
	-- A semicolon with optional spaces/tabs on either side.
	sep = V 'w' * P ';' * V 'w',
	w = S ' \t'^0,
	-- LF, optionally preceded by CR.
	nl = P '\r'^-1 * P '\n'
}

patt:match(property_value_aliases)
-- print(require 'inspect' (script_name_to_code))

-- An empty table means no "sc" line was recognised; fail here with a clear
-- message instead of handing an unusable mapping to the caller.
if next(script_name_to_code) == nil then
	error(('no "sc" script alias lines found in %s'):format(property_value_aliases_filename))
end

return script_name_to_code