-- Module:bn-translit/sandbox
--
-- From Wiktionary, the free dictionary
-- (page chrome from the web copy: "Jump to navigation", "Jump to search")

-- Transliteration for Bengali

-- Module table; it is returned at the end of the file.
local export = {}

-- Short aliases for the mw.ustring helpers used by the rest of the module.
local ustring = mw.ustring
local u = ustring.char
local gsub = ustring.gsub
local match = ustring.match
local sub = ustring.sub

-- U+003F built from its code point; spliced into patterns in export.tr.
local QO = u(0x3F) -- question mark

-- Lookup table from Bengali characters to their raw Latin equivalents.
-- export.tr passes this table as the replacement argument of gsub, first for
-- a character followed by an optional ় or ’ and then for every remaining
-- single character; anything without an entry here is left untouched.
local char = {
	-- consonants
	["ক"] = "k",	["খ"] = "kh",	["গ"] = "g",	["ঘ"] = "gh",	["ঙ"] = "ṅ",
	["চ"] = "c",	["ছ"] = "ch",	["জ"] = "j",	["ঝ"] = "jh",	["ঞ"] = "ñ",
	["ট"] = "ṭ",	["ঠ"] = "ṭh",	["ড"] = "ḍ",	["ঢ"] = "ḍh",	["ণ"] = "ṇ",
	["ত"] = "t",	["থ"] = "th",	["দ"] = "d",	["ধ"] = "dh",	["ন"] = "n",
	["প"] = "p",	["ফ"] = "f",	["ব"] = "b",	["ভ"] = "bh",	["ম"] = "m",
	["য"] = "j",	["র"] = "r",	["ল"] = "l",
	["শ"] = "ś",	["ষ"] = "ṣ",	["স"] = "s",	["হ"] = "h",	
	-- consonants written with the ় sign
	["ড়"] = "ṛ",	["ঢ়"] = "ṛh",	["য়"] = "y",

	-- dependent vowel signs (diacritics attached to a consonant)
	["ি"] = "i",	["ু"] = "u",	
	["ৃ"] = "ri",	["ে"] = "e",	["ো"] = "ō",
	["া"] = "a",	["ী"] = "i",	["ূ"] = "u",	["ৈ"] = "ōi",	["ৌ"] = "ōu",

	-- archaic dependent vowel signs
	["ৄ"] = "ri",	["ৢ"] = "li",	["ৣ"] = "li",

	-- visarga
	["ঃ"] = "ḥ",

	-- independent vowel letters
	["অ"] = "o", 	["ই"] = "i",	["উ"] = "u",	
	["ঋ"] = "ri",	["এ"] = "e",	["ও"] = "ō",
	["আ"] = "a",	["ঈ"] = "i",	["ঊ"] = "u",	["ঐ"] = "ōi",	["ঔ"] = "ōu",

	-- archaic independent vowel letters
	["ৠ"] = "ri",	["ঌ"] = "li",	["ৡ"] = "li",

	-- virama: maps to the empty string, i.e. it is simply removed
	["্"] = "",

	-- chandrabindu: becomes a combining tilde on the preceding letter
	["ঁ"] = "̃",
	
	-- avagraha
	['ঽ']='’',
		
	-- anusvara
	["ং"] = "ṅ",

	-- khanda ta
	["ৎ"] = "t",

	-- numerals
	["০"] = "0", ["১"] = "1", ["২"] = "2", ["৩"] = "3", ["৪"] = "4", 
	["৫"] = "5", ["৬"] = "6", ["৭"] = "7", ["৮"] = "8", ["৯"] = "9",
 
	-- punctuation
	["।"] = ".", -- dãri (sentence-final stroke)
}

-- Contents of character sets (to be wrapped in [...]) for the letter groups.
local consonant = "ক-হড়-য়"
local vowel = "oা-ৌ’"
local vowel_sign = "অ-ঔ"

-- c:  exactly one consonant.
-- cc: an optional ় followed by one consonant.
-- v:  one vowel of either group, or the placeholder "o".
local c = "[" .. consonant .. "]"
local cc = "়?" .. c
local v = "[" .. vowel .. vowel_sign .. "o]"

-- Matches a placeholder "o" sandwiched between two captured stretches of
-- vowels and consonants; export.tr applies it to reversed words and drops
-- the "o" by rejoining the two captures.
local syncope_pattern = table.concat({
	"(", v, cc, v, cc, ")o(", cc, "ঁ?", v, ")",
})

-- Consonants after which a following হ is written as 'h in export.tr.
local deaspirate = "[কগচজটডতদপব]"

--- Reverse a string one mw.ustring character at a time.
-- Combining marks are treated as characters of their own, so they end up
-- in front of the letter they originally followed.
-- @param text string to reverse
-- @return a new string holding the characters of `text` last-to-first
local function rev_string(text)
	local take = mw.ustring.sub
	local pieces = {}
	-- Walk from the last character to the first, collecting each in turn.
	for pos = mw.ustring.len(text), 1, -1 do
		pieces[#pieces + 1] = take(text, pos, pos)
	end
	return table.concat(pieces)
end

--- Transliterate Bengali-script text into Latin script.
--
-- Works in four stages: (1) Bengali-script preprocessing that inserts the
-- placeholder "o" for the inherent vowel and removes it again where the
-- syncope pattern matches; (2) character-by-character conversion through the
-- `char` table; (3) a long, order-dependent list of Latin-script rewrite
-- rules; (4) NFC normalisation.
--
-- @param text     the string to transliterate
-- @param lang     not used by this function
-- @param sc       not used by this function
-- @param override when equal to "debug", the converted text is returned even
--                 if characters in the range ঁ-৽ are still present
-- @return the NFC-normalised transliteration, or nil when characters in the
--         range ঁ-৽ survive the conversion and `override` is not "debug"
function export.tr(text, lang, sc, override)
	-- Separate a consonant from a following ও with a virama, except when the
	-- consonant is the very first character of the text.
	text = gsub(text, "(" .. c .. ")ও", "%1্ও")
	text = gsub(text, "^(" .. c .. ")্ও", "%1ও")

	-- An explicitly broken conjunct at the end of the text or before a space
	-- gets a trailing virama, so no inherent vowel is added after it below.
	text = gsub(text, "(" .. c .. ")্‌(" .. c .. ")$", "%1্%2্")
	text = gsub(text, "(" .. c .. ")্‌(" .. c .. ") ", "%1্%2্ ")

	-- ঞ between two vowels becomes a combining tilde on the first vowel.
	text = gsub(text, "(" .. v .. ")ঞ(" .. v .. ")", "%1̃%2")

	-- Append the placeholder "o" to every consonant that is not followed by
	-- a vowel sign, ’, ? or virama.
	text = gsub(text, "(" .. c .. "়?)([" .. vowel .. "’?্]?)", function(a, b)
		return a .. (b == "" and "o" or b) end)
	
	-- Syncope: work on each word reversed, drop a leading (i.e. word-final)
	-- "o" and then every "o" matched by syncope_pattern, and put the word
	-- back in its original order.
	for word in mw.ustring.gmatch(text, "[ঁ-৽o’]+") do
		local orig_word = word
		word = rev_string(word)
		word = gsub(word, "^o(়?" .. c .. ")(ঁ?" .. v .. ")", "%1%2")
		while match(word, syncope_pattern) do
			word = gsub(word, syncope_pattern, "%1%2")
		end
		text = gsub(text, orig_word, rev_string(word))
	end

	text = gsub(text, "(".. deaspirate .. ")হ", "%1'h")

	-- Conjunct-final ম, য and ব get dedicated placeholders that the Latin
	-- rules further down resolve.
	text = gsub(text, "্ম", "ṃ")
	text = gsub(text, "্য", "ẏ")
	text = gsub(text, "্ব", "v")

	text = gsub(text, "িত$", "ito")
	text = gsub(text, "িত ", "ito ")

	text = gsub(text, "ৃত$", "rito")
	text = gsub(text, "ৃত ", "rito ")

	text = gsub(text, "িব$", "ibo")
	text = gsub(text, "িব ", "ibo ")

	text = gsub(text, "র্চ$", "র্চ্‌")
	text = gsub(text, "র্চ ", "র্চ্‌ ")

	text = gsub(text, "ছিল$", "chilo")
	text = gsub(text, "ছিল ", "chilo ")

	text = gsub(text, "র([মফ])o", "রo%1")

	text = gsub(text, "(".. cc .. ")o([অআ])", "%1%2")
	text = gsub(text, "(".. cc .. ")ও", "%1oō")

	-- Script conversion: first a character together with a following ় or ’,
	-- then whatever single characters are left.
	text = gsub(text, ".[়’]?", char)
	text = gsub(text, ".", char)

	-- Latin-script character classes used by the rules below.
	local v_Latn = "[oaiueō]̃?"
	local v_Latn_wo_e = "[oaiuō]̃?"
	local c_Latn = "[bcdḍfghjklmṃnṇprsśṣtṭvwyẏ]"
	local consonants_no_h = "[bcdfgjklmnpsśtṭḍ]"

	-- inherent vowel deletion
	text = gsub(text, "(".. v_Latn .. ")bo([bdps])(".. v_Latn .. ")", "%1b%2%3")
	text = gsub(text, "(".. v_Latn .. ")cho([bpt])(".. v_Latn .. ")", "%1ch%2%3")
	text = gsub(text, "(".. v_Latn .. ")do([bp])(".. v_Latn .. ")", "%1d%2%3")
	text = gsub(text, "(".. v_Latn .. ")dho([bp])(".. v_Latn .. ")", "%1dh%2%3")

	text = gsub(text, "(".. v_Latn .. ")ḍo([bnp])(".. v_Latn .. ")", "%1ḍ%2%3")
	text = gsub(text, "(".. v_Latn .. ")fo([bdjmtpz]?)(".. v_Latn_wo_e .. ")", "%1f%2%3")
	text = gsub(text, "(".. v_Latn .. ")go([bpr])(".. v_Latn .. ")", "%1g%2%3")

	text = gsub(text, "(".. v_Latn .. ")jo([bpr])(".. v_Latn .. ")", "%1j%2%3")
	text = gsub(text, "(".. v_Latn .. ")ko([bcmprsśtṭ])(".. v_Latn .. ")", "%1k%2%3")
	text = gsub(text, "(".. v_Latn .. ")kho([bcmpt])(".. v_Latn .. ")", "%1kh%2%3")
	-- "lô" is a temporary marker; it is turned back into "l" in the
	-- exceptional section, after the "lp" rule has run.
	text = gsub(text, "(".. v_Latn .. ")lo([bfd]h?)(".. v_Latn .. ")", "%1lô%2%3")
	-- Fixed: the pattern has two captures, so the trailing vowel is %2
	-- (the former %3 raised "invalid capture index" on every match).
	text = gsub(text, "(".. v_Latn .. ")lodv(".. v_Latn .. ")", "%1ldv%2")

	text = gsub(text, "(".. v_Latn .. ")mo([bcdkprṛś])(".. v_Latn .. ")", "%1m%2%3")
	text = gsub(text, "(".. v_Latn .. ")no([bcdglpṭ]?)(".. v_Latn .. ")", "%1n%2%3")
	text = gsub(text, "(".. v_Latn .. ")ṅo([blmp]h?)(".. v_Latn .. ")", "%1ṅ%2%3")
	text = gsub(text, "(".. v_Latn .. ")po([bcp])(".. v_Latn .. ")", "%1p%2%3")

	text = gsub(text, "(".. v_Latn .. ")ro([bcdghjklmsṣś]h?)(".. v_Latn .. ")", "%1r%2%3")
	text = gsub(text, "(".. v_Latn .. ")ro([bcp])r(".. v_Latn .. ")", "%1r%2r%3")
	text = gsub(text, "(".. v_Latn .. ")ṣo([bjlmp])(".. v_Latn .. ")", "%1ṣ%2%3")
	text = gsub(text, "(".. v_Latn .. ")śo([bgjklmp])(".. v_Latn .. ")", "%1ś%2%3")
	text = gsub(text, "(".. v_Latn .. ")so([bjlmp])(".. v_Latn .. ")", "%1s%2%3")
	text = gsub(text, "(".. v_Latn .. ")ṭo([bgklp])(".. v_Latn .. ")", "%1ṭ%2%3")

	text = gsub(text, "(".. v_Latn .. ")yo([j])(".. v_Latn .. ")", "%1y%2%3")

	-- exceptional
	text = gsub(text, "([cr])ch$", "%1cho")
	text = gsub(text, "([cr])ch ", "%1cho ")
	text = gsub(text, "([cr])ch(" .. QO .. ")", "%1cho%2")

	text = gsub(text, "apon(".. v_Latn .. ")", "apn%1")
	text = gsub(text, "arbi", "arobi")

	text = gsub(text, "goñjo$", "gonj")
	text = gsub(text, "goñjo ", "gonj ")
	text = gsub(text, "got", "goto")

	text = gsub(text, "hojjo", "hojj")

	text = gsub(text, "ikta$", "ikota")
	text = gsub(text, "ikta ", "ikota ")

	text = gsub(text, "iy$", "iyo")
	text = gsub(text, "iy ", "iyo ")

	text = gsub(text, "ken$", "keno")
	text = gsub(text, "ken ", "keno ")
	text = gsub(text, "ken(" .. QO .. ")", "keno%1")

	text = gsub(text, "korob", "korbo")
	text = gsub(text, "okso", "oks")

	text = gsub(text, "sṭo$", "sṭ")
	text = gsub(text, "sṭo ", "sṭ ")

	text = gsub(text, "(" .. v_Latn .. ")śot$", "%1śt")
	text = gsub(text, "(" .. v_Latn .. ")śot ", "%1śt ")

	-- Fixed: keep the captured vowel, as the parallel "apon" rule does
	-- (the replacement used to discard it).
	text = gsub(text, "ajon(".. v_Latn .. ")", "ajn%1")
	text = gsub(text, "(".. v_Latn .. ")koṭr(".. v_Latn .. ")", "%1kṭr%2")
	text = gsub(text, "(".. v_Latn .. ")khost(".. v_Latn .. ")", "%1khst%2")
	text = gsub(text, "(".. v_Latn .. ")jost(".. v_Latn .. ")", "%1jst%2")
	-- Fixed: two captures only, so the trailing vowel is %2, not %3.
	text = gsub(text, "(".. v_Latn .. ")lp(".. v_Latn .. ")", "%1lop%2")
	text = gsub(text, "(".. v_Latn .. ")lô([bd]h?)(".. v_Latn .. ")", "%1l%2%3")
	text = gsub(text, "(".. v_Latn .. ")no(".. c_Latn .. "h?)(".. c_Latn .. "h?)(".. v_Latn .. ")", "%1n%2%3%4")
	text = gsub(text, "(".. v_Latn .. ")rkoṭ(".. v_Latn .. ")", "%1rkṭ%2")
	text = gsub(text, "(".. v_Latn .. ")ṣdh(".. v_Latn .. ")", "%1ṣodh%2")
	text = gsub(text, "(".. v_Latn .. ")sm(".. v_Latn .. ")", "%1śom%2")

	-- up prefix
	text = gsub(text, "^up(".. c_Latn .. ")", "upo%1")
	text = gsub(text, " up(".. c_Latn .. ")", " upo%1")
	text = gsub(text, "ōp(".. c_Latn .. ")", "ōpo%1")

	-- qualifiers
	text = gsub(text, "(".. c_Latn .. ")oṭa$", "%1ṭa")
	text = gsub(text, "(".. c_Latn .. ")oṭa ", "%1ṭa ")
	text = gsub(text, "(".. c_Latn .. ")oṭi$", "%1ṭi")
	text = gsub(text, "(".. c_Latn .. ")oṭi ", "%1ṭi ")

	-- Cv
	text = gsub(text, "([bgmr])v", "%1b")
	text = gsub(text, "udv", "udb")
	text = gsub(text, "ttv", "tt")
	text = gsub(text, "^sv", "ś") -- initial
	text = gsub(text, "([sś])v", "śś") -- medial

	text = gsub(text, "^(" .. consonants_no_h .. "h?)v", "%1") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)v", " %1") -- initial
	text = gsub(text, "([lṅ])(" .. consonants_no_h .. "h?)v", "%1%2")
	text = gsub(text, "(" .. consonants_no_h .. ")v", "%1%1") -- medial
	text = gsub(text, "(" .. consonants_no_h .. ")hv", "%1%1h") -- medial_h

	--ahv, ihv
	text = gsub(text, "ahv", "aubh")
	text = gsub(text, "ihv", "iubh")

	text = gsub(text, "hv", "hb")

	-- kṣ
	text = gsub(text, "^kṣ", "kh") -- initial
	text = gsub(text, " kṣ", " kh") -- initial
	text = gsub(text, "ṅkṣ", "ṅkh") -- after_ṅ
	text = gsub(text, "kṣ", "kkh") -- medial
	text = gsub(text, "kkhṃ", "kkh") -- before_ṃ

	-- sm
	text = gsub(text, "^([ṣs])ṃa", "śã") -- initial
	text = gsub(text, " ([ṣs])ṃa", " śã") -- initial
	text = gsub(text, "([ṣs])ṃa", "śśã") -- medial

	text = gsub(text, "^([ṣs])ṃ", "ś") -- initial
	text = gsub(text, " ([ṣs])ṃ", " ś") -- initial
	text = gsub(text, "([ṣs])ṃ", "śś") -- medial

	-- tm
	text = gsub(text, "^tṃ", "t") -- initial
	text = gsub(text, "tṃ", "tt") -- medial

	-- Any placeholder left over falls back to its plain letter.
	text = gsub(text, "ṃ", "m")
	text = gsub(text, "ṣ", "ś")

	-- visarga deletion
	text = gsub(text, "ḥ(" .. consonants_no_h .. ")h", "%1%1h")
	text = gsub(text, "ḥ", "")

	-- foreign conjuncts
	text = gsub(text, "([ln])ḍo$", "%1ḍ")
	-- Fixed: a stray "n" in the pattern made this before-space variant
	-- disagree with the end-of-text rule on the previous line.
	text = gsub(text, "([ln])ḍo ", "%1ḍ ")

	text = gsub(text, "rko$", "rk")
	text = gsub(text, "rko ", "rk ")

	text = gsub(text, "(" .. v_Latn .. ")h$", "%1ho")
	text = gsub(text, "(" .. v_Latn .. ")h ", "%1ho ")

	text = gsub(text, "([glś])aho$", "%1ah")
	text = gsub(text, "([glś])aho ", "%1ah ")

	text = gsub(text, "ṇn", "ṇon")
	text = gsub(text, "ṇ", "n")

	-- Cẏ
	text = gsub(text, "^eẏa", "ê")
	text = gsub(text, " eẏa", " ê")
	text = gsub(text, "^oẏa", "ê")
	text = gsub(text, " oẏa", " ê")

	text = gsub(text, "^(" .. consonants_no_h .. "h?)ẏa", "%1ê") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)ẏa", " %1ê") -- initial
	text = gsub(text, "^(" .. consonants_no_h .. "h?)(" .. consonants_no_h .. "h?)ẏa", "%1%2ê") -- initial_double
	text = gsub(text, " (" .. consonants_no_h .. "h?)(" .. consonants_no_h .. "h?)ẏa", " %1%2ê") -- initial_double
	text = gsub(text, "^hẏa", "hê") -- h_initial
	text = gsub(text, "ẏal$", "êl") -- final_l

	text = gsub(text, "^jñan", "gên") -- jñan_initial
	text = gsub(text, " jñan", " gên") -- jñan_initial
	text = gsub(text, "jñan", "ggên") -- jñan_medial

	text = gsub(text, "ñ", "n")

	text = gsub(text, "ẏanḍ", "ênḍ")

	text = gsub(text, "^(" .. consonants_no_h .. "h?)ẏo", "%1ê") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)ẏo", " %1ê") -- initial

	text = gsub(text, "^(" .. consonants_no_h .. "h?)ẏ", "%1") -- initial
	text = gsub(text, "ṅ(" .. consonants_no_h .. "h?)ẏ", "ṅ%1")
	text = gsub(text, "(" .. consonants_no_h .. ")ẏ", "%1%1") -- medial
	text = gsub(text, "(" .. consonants_no_h .. ")hẏ", "%1%1h") -- medial_h

	-- hẏ
	text = gsub(text, "^hẏ", "h") -- initial
	text = gsub(text, " hẏ", " h") -- initial
	text = gsub(text, "hẏ", "jjh") -- medial

	-- rẏ
	text = gsub(text, "rẏ", "rj")

	text = gsub(text, "yo([gklmn])([aeiīōuū])", "y%1%2")
	text = gsub(text, "yoō", "yō")
	text = gsub(text, "oō$", "ō")

	-- NOTE(review): `consonant` is the Bengali-script range, so this rule can
	-- only fire on letters that were not romanised above; confirm whether
	-- c_Latn was intended here.
	text = gsub(text, "([ei])y([" .. consonant .. "])", "%1yo%2")

	-- rules for changing s to ś (applicable for native words only)
	-- "ŝ" is a temporary marker: it lets the exceptions below restore "s"
	-- before the remaining markers are turned into "ś".
	text = gsub(text, "s(".. v_Latn .. ")$", "ś%1") -- final
	text = gsub(text, "s(".. v_Latn .. ") ", "ś%1 ") -- final
	text = gsub(text, "s(" .. v_Latn .. ")", "ŝ%1") -- medial

	text = gsub(text, "([ai])s$", "%1ś")
	text = gsub(text, "([ai])s ", "%1ś ")

	text = gsub(text, "os$", "oŝ")
	text = gsub(text, "os ", "oŝ ")

	text = gsub(text, "^(" .. c_Latn .. ")oŝ$", "%1os")
	text = gsub(text, " (" .. c_Latn .. ")oŝ$", " %1os")
	text = gsub(text, "^(" .. c_Latn .. ")oŝ ", "%1os ")

	-- Fixed: "^" is an anchor only in the pattern; in the replacement it is
	-- a literal character and used to be written into the output.
	text = gsub(text, "^ŝe(" .. c_Latn .. ")$", "se%1")
	text = gsub(text, " ŝe(" .. c_Latn .. ")$", " se%1")
	text = gsub(text, "^ŝe(" .. c_Latn .. ") ", "se%1 ")
	text = gsub(text, " ŝe(" .. c_Latn .. ") ", " se%1 ")

	text = gsub(text, "ŝalam", "salam")

	text = gsub(text, "ŝ", "ś")

	text = gsub(text, "śl", "sl")
	text = gsub(text, "śr", "sr")
	text = gsub(text, "sp", "śp")
	text = gsub(text, "^śp", "sp")
	text = gsub(text, " śp", " sp")

	text = gsub(text, "śṭh$", "śṭho")

	text = gsub(text, "^([kg]h?)([dḍtṭ])", "%1o%2")
	-- Fixed: keep the captured vowel (%2), matching the "bh" rules below;
	-- the replacement used to drop it.
	text = gsub(text, "^(" .. c_Latn .. ")([aou])b$", "%1%2bo")
	text = gsub(text, "^(" .. c_Latn .. ")([aou])b ", "%1%2bo ")

	text = gsub(text, "^([bcdḍghjkmṃnṇprsśṣtṭwẏ])([aou])bh$", "%1%2bho")
	text = gsub(text, "^([bcdḍghjkmṃnṇprsśṣtṭwẏ])([aou])bh ", "%1%2bho ")

	text = gsub(text, "lona$", "lna")
	text = gsub(text, "nola$", "nla")

	-- y after ō/u becomes w, except at the end of the text or before a space.
	text = gsub(text, "ōy", "ōw")
	text = gsub(text, "ō̃y", "ō̃w")

	text = gsub(text, "uy", "uw")
	text = gsub(text, "ũy", "ũw")

	text = gsub(text, "ōw$", "ōy")
	text = gsub(text, "ōw ", "ōy ")

	text = gsub(text, "uw$", "uy")
	text = gsub(text, "uw ", "uy ")

	-- Collapse doubled placeholders produced by the rules above.
	text = gsub(text, "oo", "o")

	-- Fixed: this used to test an undeclared global `mode` (always nil), so
	-- the debug escape hatch could never be taken; the fourth parameter is
	-- what callers actually pass.
	if match(text, "[ঁ-৽]") and override ~= "debug" then
		return nil
	else
		return mw.ustring.toNFC(text)
	end
end
 
return export