Module:encodings

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module defines a set of "encoders", each of which encodes text into a given encoding. More encoders can be added to the module as necessary.

encode

Function encode(text, encoding)

Takes 'text' in UTF-8 encoding, converts it into 'encoding', then %-encodes the result and returns the resulting string.

Explanation

  • UTF-8: the wiki software and all its pages and output use UTF-8.
  • %-encoding: 1) the encoded text will usually contain byte sequences that are invalid UTF-8, and Scribunto does not allow modules to return invalid UTF-8 text (it replaces any invalid bytes in the output with � [U+FFFD REPLACEMENT CHARACTER]); 2) the primary use of this function is to encode text for use in URLs (external links) for certain sites that use older encodings.

Custom encoder methods

Number difference

  • Paste input and output hex-numbers to LibreOffice Calc and sort by input
  • Convert base-16 [hexadecimal] to base-10 [decimal] (e.g. with onlinenumbertools.com, toolslick.com, or see below custom converter)
  • Calculate difference (using references)
  • Sort by difference, color the background of repeating differences, sort by input
Custom converter
<!DOCTYPE html>
<html>
<head>
<script>
/**
 * Convert every line of #text0 from base #base0 to base #base1 and write the
 * results, line for line, into #text1.
 *
 * A line produces an empty output line when it is not a number in the source
 * base (empty, contains non-alphanumerics, or contains a digit that does not
 * exist in that base) or when either base is outside 2..36.
 * Afterwards both textareas are resized to fit their content.
 */
function baseconvert() {
	var base0 = parseInt( document.querySelector("#base0").innerText );
	var base1 = parseInt( document.querySelector("#base1").innerText );
	var dText0 = document.querySelector("#text0");
	var dText1 = document.querySelector("#text1");
	// parseInt() and toString() only support radices 2..36; toString() throws
	// a RangeError outside that range, so reject bad bases up front
	// (a NaN base fails these comparisons as well).
	var basesOk = base0 >= 2 && base0 <= 36 && base1 >= 2 && base1 <= 36;
	var ns = dText0.value.split("\n"); // numbers
	var out = [];
	for (var i=0; i < ns.length; i++) {
		var digits = ns[i];
		if ( base0 == 16 )
			digits = digits.replace(/^0x/i, ""); // prefix that parseInt() accepts for hex
		var ok = basesOk && /^[0-9A-Z]+$/i.test(digits);
		// parseInt() stops silently at the first digit that is invalid for the
		// radix (e.g. "19" in base 8 gives 1), so check every digit on its own.
		for (var j=0; ok && j < digits.length; j++)
			ok = !isNaN( parseInt(digits.charAt(j), base0) );
		out.push( ok ? parseInt(digits, base0).toString(base1) : "" ); // XX > 10 > ZZ
	}
	dText1.value = out.join("\n");
	dText0.style["height"] = "32px";  // variable .text height
	dText1.style["height"] = "32px";
	dText0.style["height"] = dText0.scrollHeight + "px";
	dText1.style["height"] = dText1.scrollHeight + "px";
}
// Run the conversion once the page is ready, then again on every edit.
document.addEventListener("DOMContentLoaded", function() {
	baseconvert();
	// The two base boxes are contenteditable and fire "input" events too;
	// without listening to them, changing a base would not update the output.
	["#text0", "#base0", "#base1"].forEach(function(selector) {
		document.querySelector(selector).addEventListener("input", baseconvert);
	});
});
</script>
<style>
body {background: lightgray; display: grid; grid-template-columns: auto auto; gap: 8px;}
#desc {grid-column: 1 / -1;}
#desc, #base0, #base1 {justify-self: center; width: auto;}
#text0, #text1 {font-family: monospace, monospace; resize: none;}
</style>
</head>
<body>
	<div id="desc">Convert from left to right</div>
	<div id="base0" contenteditable>16</div>
	<div id="base1" contenteditable>10</div>
	<textarea id="text0">BADCAFE</textarea>
	<textarea id="text1" readonly></textarea>
</body>
</html>

Examples

{{R:IEC2}} (Catalan IEC dictionary) required input in ISO 8859-1 encoding:

  • {{#invoke:encodings|encode|abundància|ISO 8859-1}}
  • abund%E0ncia (< abundància)

See also


local export = {}

local m_str_utils = require("Module:string utilities")

local char = string.char
local concat = table.concat
local gcodepoint = m_str_utils.gcodepoint
local insert = table.insert
local u = m_str_utils.char

local encoders = {}

-- Encoder for ISO 8859-1: each code point of the UTF-8 input becomes the
-- single byte with the same numeric value. Raises an error on the first
-- code point that does not fit into one byte.
encoders["ISO 8859-1"] = function(text)
	local bytes, n = {}, 0
	
	for codepoint in gcodepoint(text) do
		if codepoint > 255 then
			error("Invalid ISO 8859-1 character \"" .. u(codepoint) .. "\".")
		end
		
		n = n + 1
		bytes[n] = char(codepoint)
	end
	
	return concat(bytes)
end

-- Encoder for cp1251 (Windows-1251): converts the UTF-8 input into a string
-- of single bytes. Code points that have no cp1251 byte are replaced by '?'.
--
-- The code points are split into ranges by counting how many of the
-- thresholds below a code point exceeds:
--   0: U+0000..U+00BF  ASCII is kept; above that only the code points in `same` are kept
--   1: U+00C0..U+03FF  nothing representable
--   2: U+0400..U+040F  irregular, looked up in diff2
--   3: U+0410..U+044F  contiguous block, byte = code point - 848
--   4: U+0450..U+0491  irregular, looked up in diff4
--   5: U+0492..U+2012  nothing representable
--   6: U+2013..U+2122  irregular, looked up in diff6
--   7: above U+2122    nothing representable
-- Each diffN table is keyed by the code point minus a round base (1000, 1100,
-- 8200); its value is the extra amount to subtract to arrive at the byte.
encoders["cp1251"] = function(text)	-- [[d:Q1748665|cp1251]]
	local ret = {}
	local range -- 0 1 2 3 4 5 6 7
	-- Code points in U+0080..U+00BF whose cp1251 byte equals the code point.
	-- Every other code point in that span must not be passed through: its byte
	-- value means a different character (or nothing) in cp1251.
	local same = {[160]=true, [164]=true, [166]=true, [167]=true, [169]=true, [171]=true, [172]=true, [173]=true, [174]=true, [176]=true, [177]=true, [181]=true, [182]=true, [183]=true, [187]=true}
	local diff2 = {[25]=57, [26]=98, [27]=98, [28]=58, [29]=40, [30]=52, [31]=56, [32]=69, [33]=95, [34]=94, [35]=93, [36]=95, [38]=77, [39]=96}
	local diff4 = {[5]=21, [6]=62, [7]=76, [8]=22, [9]=19, [10]=31, [11]=20, [12]=24, [13]=59, [14]=58, [15]=57, [16]=59, [18]=56, [19]=60, [68]=103, [69]=89}
	local diff6 = {[11]=61, [12]=61, [16]=71, [17]=71, [18]=88, [20]=73, [21]=73, [22]=90, [24]=90, [25]=90, [26]=77, [30]=97, [40]=103, [49]=110, [50]=95, [164]=228, [270]=285, [282]=329}
	
	for cp in gcodepoint(text) do
		-- The contiguous block ends at U+044F (1103). With a threshold of 1104,
		-- U+0450 would be mapped to 256, which string.char rejects.
		range = (191<cp and 1 or 0) + (1023<cp and 1 or 0) + (1039<cp and 1 or 0) + (1103<cp and 1 or 0) + (1169<cp and 1 or 0) + (8210<cp and 1 or 0) + (8482<cp and 1 or 0)
		if range==0 then
			if cp>127 and not same[cp] then
				cp = 63 -- '?'
			end
		elseif range==2 and diff2[cp-1000] then
			cp = cp-800-diff2[cp-1000]
		elseif range==3 then
			cp = cp-848
		elseif range==4 and diff4[cp-1100] then
			cp = cp - 900 - diff4[cp-1100]
		elseif range==6 and diff6[cp-8200] then
			cp = cp -8000 - diff6[cp-8200]
		else
			cp = 63 -- '?'
		end
		insert(ret, char(cp))
	end
	return concat(ret)
end

-- Encodes `text` (UTF-8) with the encoder registered under `encoding` and
-- returns the result passed through mw.uri.encode.
--
-- Can be called in two ways:
--   * from Lua: encode(text, encoding)
--   * via #invoke: the first argument is then a table (the frame); the text
--     is read from its parameter 1 (which may be empty) and the encoding
--     from parameter 2, both validated by Module:parameters.
--
-- Raises an error if no encoder is registered for `encoding`; errors raised
-- by the chosen encoder itself propagate to the caller.
function export.encode(text, encoding)
	if type(text) == "table" then
		local params = {
			[1] = {required = true, allow_empty = true},
			[2] = {required = true},
		}
		
		local args = require("Module:parameters").process(text.args, params)
		text = args[1]
		encoding = args[2]
	end
	
	local encoder = encoders[encoding]
	
	if not encoder then
		-- tostring() keeps this message intact when `encoding` is nil or
		-- otherwise not concatenable (possible when called from Lua).
		error("No encoder exists for the encoding \"" .. tostring(encoding) .. "\".")
	end
	
	return mw.uri.encode(encoder(text))
end

return export