Module:User:Suzukaze-c/punctuation

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Suzukaze-c, for their own experimentation. Items in this module may be added and removed at Suzukaze-c's discretion; do not rely on this module's stability.


-- module table: the data tables and functions below are attached to it,
-- and it is returned at the end of the file
local export = {}

-- open question: do numbers need special handling? (e.g. the commas in 1,000,000)

-- per-language lookup tables: source punctuation character -> "marked up" replacement.
-- outer keys are language codes; "mul" is the fallback table that convChar consults
-- when the language-specific table has no entry for a character.
export.spacing_instructions = {
	-- ◇: there would be a space here in normal text
	-- ◆: there would not be a space here in normal text
	-- i.e. "◆.◇" means "." never has a leading space and always has a trailing space
	-- the markers are resolved later by the patterns in space_management_instructions

	["mul"] = {
		-- NOTE(review): the first key on the next line is a space mapped to a semicolon
		-- form (same value as the ";" entry below) -- confirm the key is the intended character
		[" "] = "◆;◇", ["…"]  = "◆…◆",
		["."] = "◆.◆", ["。"] = "◆.◇",
		[","] = "◆,◇", ["、"] = "◆,◇",
		["!"] = "◆!◇", ["?"] = "◆?◇",
		[":"] = "◆:◇", [";"] = "◆;◇",

		-- brackets: space on the outside, none on the inside
		["("] = "◇(◆", [")"] = "◆)◇",

		-- no space on either side
		["|"]  = "◆|◆", ["—"]  = "◆—◆",
	},
	["zh"] = {
		-- CJK brackets are replaced by curly quotes: space outside, none inside
		["《"] = "◇“◆", ["》"] = "◆”◇",
		["『"] = "◇“◆", ["』"] = "◆”◇",
		["「"] = "◇‘◆", ["」"] = "◆’◇",

		-- replaced by a plain space, with no ◇/◆ markers
		["·"]  = " ",
	},
	["ja"] = {
		-- intentionally empty for now, so every lookup falls through to "mul"
		-- [[Module:ja/data]]
	},
}

-- pattern -> replacement rules that turn the ◇/◆ markers into real spacing.
-- keys are Lua patterns passed to mw.ustring.gsub by export.main.
-- NOTE: this is a hash table, so pairs() does not guarantee the order written here;
-- the rules are order-sensitive (the two-marker rules have to run before the
-- single-marker ones for the examples below to hold), so any consumer that needs
-- that order has to impose it itself.
export.space_management_instructions = {
	["◇◆"] = "", -- 「◆!◇◆”◇」→「!”」
	["◆◇"] = "", -- 「◇‘◆◇(◆」→「‘(」
	[" *◆ *"] = "", -- remove spaces near ◆
	[" *◇ *"] = " ", -- keep spaces near ◇
}

-- return all the characters that can be converted for a language
--   lang: language code (a key of export.spacing_instructions); may be nil or
--         unknown, in which case only the "mul" characters are returned
--   returns: one string holding each convertible character exactly once, in
--            byte-sorted order so the result is the same on every call
-- the characters are concatenated as-is; nothing is escaped for use in a pattern
function export.langRegexRange(lang)
	local seen = {}
	local chars = {}

	-- append every key of one language table, skipping characters already collected
	local function collect(code)
		local instructions = export.spacing_instructions[code]
		if instructions then
			for punctuation in pairs(instructions) do
				if not seen[punctuation] then
					seen[punctuation] = true
					chars[#chars + 1] = punctuation
				end
			end
		end
	end

	-- two explicit calls instead of ipairs over { lang, 'mul' }: with a nil lang
	-- that list starts with a hole, so ipairs would stop before reaching 'mul'
	collect(lang)
	collect('mul')

	-- pairs() order is unspecified; sort so the output is stable
	table.sort(chars)

	return table.concat(chars)
end

-- surround every convertible punctuation character with one space on each side;
-- the character itself is left as it is (no conversion happens here)
--   text: string to process
--   lang: language code handed through to export.convChar
function export.space(text, lang)
	local pieces = {}

	for ch in mw.ustring.gmatch(text, '.') do
		local piece = ch
		if export.convChar(ch, lang) then
			-- convChar knows this character, so pad it
			piece = ' ' .. ch .. ' '
		end
		pieces[#pieces + 1] = piece
	end

	return table.concat(pieces)
end

-- look up the marked up (◇/◆) form of a single punctuation character
--   char: the character to look up
--   lang: language code; its table is tried first, then the 'mul' table
--   returns: the marked up string, or false when neither table has the character
function export.convChar(char, lang)
	local instructions = export.spacing_instructions

	-- language-specific entry wins when there is one
	local specific = instructions[lang]
	local found = specific and specific[char]
	if found then
		return found
	end

	-- otherwise fall back to the shared table
	return instructions['mul'][char] or false
end

-- replace each convertible punctuation character in text with its marked up
-- (◇/◆) form; all other characters are copied through unchanged
--   text: string to process
--   lang: language code handed through to export.convChar
function export.conv(text, lang)
	local pieces = {}

	for ch in mw.ustring.gmatch(text, '.') do
		local marked = export.convChar(ch, lang)
		-- convChar gives false for characters it does not know
		pieces[#pieces + 1] = marked or ch
	end

	return table.concat(pieces)
end

-- convert marked up forms to final form
--   text: string containing ◇/◆ markers (as produced by export.conv)
--   returns: text with the markers resolved into spacing, trimmed with mw.text.trim
--
-- the rules in export.space_management_instructions are order-sensitive: the
-- two-marker rules have to run before the single-marker ones, otherwise e.g.
-- "◇◆" is reduced to a space instead of to nothing. pairs() gives no ordering
-- guarantee, so the known rules are applied in an explicit order here.
function export.main(text)
	local rules = export.space_management_instructions

	-- same order as the rules are written in the table
	local order = { "◇◆", "◆◇", " *◆ *", " *◇ *" }
	local applied = {}

	for _, pattern in ipairs(order) do
		applied[pattern] = true
		local replacement = rules[pattern]
		-- skip a rule that has been removed from the table
		if replacement then
			text = mw.ustring.gsub(text, pattern, replacement)
		end
	end

	-- rules added to the table that are not in the list above still run,
	-- afterwards and in sorted order so the result stays deterministic
	local extra = {}
	for pattern in pairs(rules) do
		if not applied[pattern] then
			extra[#extra + 1] = pattern
		end
	end
	table.sort(extra)

	for _, pattern in ipairs(extra) do
		text = mw.ustring.gsub(text, pattern, rules[pattern])
	end

	return mw.text.trim(text)
end

-- hand the module table (data tables and functions above) to whoever loads this module
return export