Module:ne-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate Nepali language text per WT:NE TR. It is also used to transliterate Doteli. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:ne-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Transliteration for Nepali
local export = {}

local m_str_utils = require("Module:string utilities")

local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local match = m_str_utils.match
local reverse = m_str_utils.reverse
local toNFC = mw.ustring.toNFC

-- Lookup table from one Devanagari unit to its Latin output.
-- export.tr applies it with gsub(text, '.़?', conv), so every key is a single
-- character optionally followed by a nukta (़).
local conv = {
	-- consonants
	['क'] = 'k', ['ख'] = 'kh', ['ग'] = 'g', ['घ'] = 'gh', ['ङ'] = 'ṅ',
	['च'] = 'c', ['छ'] = 'ch', ['ज'] = 'j', ['झ'] = 'jh', ['ञ'] = 'ñ',
	['ट'] = 'ṭ', ['ठ'] = 'ṭh', ['ड'] = 'ḍ', ['ढ'] = 'ḍh', ['ण'] = 'ṇ',
	['त'] = 't', ['थ'] = 'th', ['द'] = 'd', ['ध'] = 'dh', ['न'] = 'n',
	['प'] = 'p', ['फ'] = 'ph', ['ब'] = 'b', ['भ'] = 'bh', ['म'] = 'm',
	['य'] = 'y', ['र'] = 'r', ['ल'] = 'l', ['व'] = 'w',
	['श'] = 'ś', ['ष'] = 'ṣ', ['स'] = 's', ['ह'] = 'h',

	-- nukta-modified and other additional consonant letters
	['क़'] = 'q', ['ख़'] = 'x', ['ग़'] = 'ġ', ['ऴ'] = 'ḻ',
	['ळ'] = 'ḷ', ['ज़'] = 'z', ['श़'] = 'ž', ['झ़'] = 'ž',
	['ड़'] = 'ṛ', ['ढ़'] = 'ṛh', ['फ़'] = 'f', ['थ़'] = 'θ',
	['द़'] = 'ð', ['ऩ'] = 'ṉ', ['ऱ'] = 'ṟ', ['ॽ'] = "'",
	['व़'] = 'V', ['ॹ'] = 'ž',
	-- dependent vowel signs (written attached to a consonant)
	['ि'] = 'i', ['ु'] = 'u', ['े'] = 'e', ['ो'] = 'o', ['ा'] = 'ā', ['ी'] = 'ī', ['ू'] = 'ū', ['ृ'] = 'r̥',
	['ॄ'] = 'r̥̄', ['ॢ'] = 'l̥', ['ॣ'] = 'l̥̄', ['ै'] = 'ai', ['ौ'] = 'au', ['ॉ'] = 'ŏ', ['ॅ'] = 'ĕ',
	-- independent vowel letters (the set this module calls `vowel_sign`)
	['अ'] = 'a', ['इ'] = 'i', ['उ'] = 'u', ['ए'] = 'e', ['ओ'] = 'o', ['आ'] = 'ā', ['ई'] = 'ī', ['ऊ'] = 'ū',
	['ऋ'] = 'r̥', ['ॠ'] = 'r̥̄', ['ऌ'] = 'l̥', ['ॡ'] = 'l̥̄', ['ऐ'] = 'ai', ['औ'] = 'au', ['ऑ'] = 'ŏ',
	['ॲ'] = 'ĕ', ['ऍ'] = 'ĕ',
	-- chandrabindu: becomes a combining mark on the preceding Latin letter
	['ँ'] = '̃',
	-- anusvara: same output as chandrabindu (only reached for anusvaras that
	-- export.tr did not already rewrite in its per-word pass)
	['ं'] = '̃',
	-- visarga
	['ः'] = 'ḥ',
	-- virama: produces no output
	['्'] = '',
	-- om
	['ॐ'] = 'om̐',
	-- zero-width non joiner (invisible character between the quotes): removed
	['‌'] = '',
	-- zero-width joiner (invisible character between the quotes): rendered as 'a'
	['‍'] = 'a',
	-- numerals
	['०'] = '0', ['१'] = '1', ['२'] = '2', ['३'] = '3', ['४'] = '4', ['५'] = '5', ['६'] = '6', ['७'] = '7',
	['८'] = '8', ['९'] = '9',
	-- punctuation
	['।'] = '.', -- danda
	['॥'] = '.', -- double danda
	['+'] = '', -- compound separator: removed from the output

	-- abbreviation sign
	['॰'] = '.',
}

-- Replacement for an anusvara, keyed by the character that follows it in the
-- word (export.tr looks it up while rewriting anusvaras on the reversed word).
-- The table is filled from groups that share the same replacement.
local nasal_assim = {}
do
	local followers_by_nasal = {
		['ङ'] = { 'क', 'ख', 'ग', 'घ', 'श', 'ह' },
		['ञ'] = { 'च', 'छ', 'ज', 'झ' },
		['ण'] = { 'ट', 'ठ', 'ड', 'ढ' },
		['म'] = { 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'व' },
		['न'] = { 'त', 'थ', 'द', 'ध', 'न', 'ष', 'स' },
		-- before ल the anusvara becomes a chandrabindu rather than a consonant
		['ँ'] = { 'ल' },
	}
	-- Every follower appears in exactly one group, so pairs() order is irrelevant.
	for nasal, followers in pairs(followers_by_nasal) do
		for _, follower in ipairs(followers) do
			nasal_assim[follower] = nasal
		end
	end
end
-- Set of consonant clusters exempt from the "virama before the last consonant
-- keeps the final a" rule in export.tr. The lookup key there is
-- first .. second .. third taken from the REVERSED word, so each entry is
-- spelled last consonant, virama, preceding consonant.
local perm_cl = {}
for _, cluster in ipairs({ 'ज्न', 'ज्ञ', 'ट्र', 'ड्र', 'ट्स', 'ड्स', 'स्ड' }) do
	perm_cl[cluster] = true
end

-- Character inventories used to build the patterns in this module.
-- all_cons: every consonant letter the module recognises.
local all_cons = 'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवसशषह'
-- special_cons: the consonants tested by the final-a rule in export.tr.
local special_cons = 'कखगघचछजझटठडढणतथदधनपफबभमयरलवशषसह'
-- vowel: Latin 'a' (the explicit inherent vowel) plus the dependent vowel signs.
local vowel = 'aिुृेोाीूैौॉॅॆॊॄॢॣ'
-- vowel_sign: the independent vowel letters.
local vowel_sign = 'अइउएओआईऊऋॠॡऌऐऔऑऍ'

-- Pattern for medial 'a' deletion. It is matched against the reversed word and
-- has nine captures around one uncaptured 'a':
--   C V C V C  a  C V C V
-- where C is a consonant (with the optional nukta preceding it, because the
-- word is reversed) and V is a vowel optionally preceded by ं or ँ.
local syncope_pattern
do
	local cons_slot = '(़?[' .. all_cons .. '])'
	local vowel_slot = '([ंँ]?[' .. vowel .. vowel_sign .. '])'
	syncope_pattern = table.concat({
		cons_slot, vowel_slot, cons_slot, vowel_slot, cons_slot,
		'a',
		cons_slot, vowel_slot, cons_slot, vowel_slot,
	})
end

-- Consonant inventories for koka_pattern. nor_cons lists some letters more
-- than once; that is harmless inside a character class.
local nor_cons, sp_cons = 'कखगघङचछजझञटठडढतथदधपफबभशषसयरलवणनमयरलवनम'
	, 'कलम'
-- `vowel` and `vowel_sign` are the locals already declared earlier in this
-- file; they used to be redeclared here with identical values, which only
-- shadowed the earlier declarations.
local koka_sign = 'ोीाैे'
-- Second 'a'-deletion pattern, matched against the reversed word. Captures:
--   1: one of the vowel signs in koka_sign
--   2: a consonant from sp_cons (optional nukta first, as the word is reversed)
--   -- an uncaptured 'a', which the caller drops by replacing with '%1%2%3%4'
--   3: a consonant from nor_cons other than य
--   4: a vowel, optionally preceded by ं or ँ
local koka_pattern = '([' ..
	koka_sign .. '])(़?[' .. sp_cons .. '])a(़?[' ..
	gsub(nor_cons, "य", "") .. '])([ंँ]?[' .. vowel .. vowel_sign .. '])'

--- Transliterates Devanagari text into Latin script.
--
-- Outline:
--   1. Write the inherent vowel explicitly as Latin 'a' after every consonant
--      that carries no vowel sign or virama.
--   2. For each word, work on the reversed string: decide whether the
--      word-final 'a' survives, delete medial 'a' where syncope_pattern or
--      koka_pattern match, and rewrite anusvara according to its neighbours.
--   3. Map the remaining Devanagari characters through `conv`.
--   4. Apply a fixed, ordered list of clean-up substitutions on the Latin text.
--
-- @param text  the string to transliterate
-- @param lang  language code; not read by this function
-- @param sc    script code; not read by this function
-- @return the transliterated string, normalised with toNFC
function export.tr(text, lang, sc)
	-- Step 1: consonant (+ optional nukta) not followed by a vowel sign or a
	-- virama receives an explicit 'a'.
	text = gsub(text, '([' .. all_cons .. ']़?)([' .. vowel .. '्]?)', function(c, d)
		return c .. (d == "" and 'a' or d)
	end)

	-- Patterns that are the same for every word are built once.
	local final_a_pattern = '^a(़?)([' .. all_cons .. '])(.)(.?)'
	local special_class = '[' .. special_cons .. ']'
	local vowel_class = '[' .. vowel .. ']'

	-- Step 2: per-word processing. gmatch iterates over the string as it is at
	-- this point; `text` is updated separately inside the loop.
	for word in gmatch(text, "[ऀ-ॿa]+") do
		local orig_word = word
		-- Reversing turns "end of word" rules into "start of string" rules.
		word = reverse(word)

		-- Word-final 'a': `first` is the last consonant, `second` the character
		-- before it and `third` the one before that (original order). The 'a'
		-- is kept when
		--   * the consonant is in special_cons and is preceded by an anusvara,
		--   * or it is in special_cons, preceded by a virama, and the cluster
		--     is not listed in perm_cl,
		--   * or it is य or ह directly preceded by one of the listed vowels;
		-- otherwise it is dropped.
		word = gsub(word, final_a_pattern, function(opt, first, second, third)
			return (((match(first, special_class) and match(second, 'ं')
				or match(first, special_class) and match(second, '्') and not perm_cl[first .. second .. third])
				or match(first .. second, 'य[aिुृेोाीूैौॉॅॆॊआईउऊइएऐओऔ]') or
				match(first .. second, 'ह[अaिुृेोाीूैौॉॅॆॊआईउऊइएऐओऔ]'))
				and 'a' or "") .. opt .. first .. second .. third
		end)

		-- Medial 'a' deletion; repeated until the pattern no longer matches.
		while match(word, syncope_pattern) do
			word = gsub(word, syncope_pattern, '%1%2%3%4%5%6%7%8%9')
		end
		while match(word, koka_pattern) do
			word = gsub(word, koka_pattern, '%1%2%3%4')
		end

		-- Anusvara. In the reversed word `succ` is the character that follows
		-- the anusvara in normal order (empty at the end of the word) and
		-- `prev` is the character it sits on.
		--   * word-final after 'a'          -> म with virama
		--   * word-final after a vowel sign -> nasalisation mark
		--   * otherwise                     -> nasal_assim[succ], else "ṃ"
		word = gsub(word, '(.?)ं(.)', function(succ, prev)
			return succ .. (succ .. prev == "a" and "्म" or
				(succ == "" and match(prev, vowel_class) and "̃" or nasal_assim[succ] or "ṃ")) .. prev
		end)

		-- Write the processed word back (every occurrence of the original
		-- spelling in the text is replaced).
		text = gsub(text, orig_word, reverse(word))
		-- Restore final 'a' after a word-final छ / न / र that follows one of
		-- the listed vowels, and spell out word-final बाट.
		text = gsub(text, '[इईउऊएऐिीुूेै]ँ?[छनर]%f[%z%s]', '%0a')
		text = gsub(text, 'बाट%f[%z%s]', 'bāṭa')
	end

	-- ज्ञ is romanised as "gy". This must happen only after ALL words have
	-- been processed: doing it inside the loop rewrote the cluster in words
	-- that were still waiting their turn, so their original spelling could no
	-- longer be found in `text` and they skipped the per-word rules entirely.
	text = gsub(text, 'ज्ञ', 'gy')

	-- Step 3: character-by-character conversion (a unit is one character plus
	-- an optional nukta).
	text = gsub(text, '.़?', conv)
	text = gsub(text, 'a([iu])̃', 'a͠%1')
	text = gsub(text, "[<>]", "")
	text = gsub(text, "ॱ", "")

	-- Step 4: ordered clean-up rules on the Latin output. The order matters;
	-- later rules see the result of earlier ones.

	-- final 'a' after certain endings
	text = gsub(text, 'dach%f[%z%s]', 'dacha')
	text = gsub(text, 'ain%f[%z%s]', 'aina')
	text = gsub(text, 'nach%f[%z%s]', 'nacha')
	-- contexts in which 'w' is written 'v'
	text = gsub(text, 'wai', 'vai')
	text = gsub(text, 'w%f[%z%s]', 'v')
	text = gsub(text, '([raäāiīuūeo]r)w', '%1v')
	text = gsub(text, 'w([iīewoy])', 'v%1')
	text = gsub(text, 'w([rl]̥̄?)', 'v%1')
	text = gsub(text, 'w(a[krjtcṅñysśdphṇn][tnrṇṣcśkghjsueoayd])', 'v%1')
	text = gsub(text, 'w(ā[cgjṇtdmyshśṣn])', 'v%1')
	text = gsub(text, 'w(ār[tdābuṇṣh])', 'v%1')
	text = gsub(text, 'w(ālm)', 'v%1')
	text = gsub(text, 'w(a[sśṣṅñṃypdtnc])', 'v%1')
	text = gsub(text, 'rh̥', 'hr̥')
	---text = gsub(text, 'kṣ', 'ch̥')
	-- drop 'a' directly before "haru"/"harū" and "bāṭa" ...
	text = gsub(text, 'a(har[uū])', '%1')
	text = gsub(text, 'abāṭa', 'bāṭa')

	text = gsub(text, 'kan%f[%z%s]', 'kana')
	-- ... but put it back after a two-consonant sequence, and break up
	-- three-consonant sequences ending in m / l / k
	text = gsub(text, '([ptkbdgṭṇñḍmṅnlrwyhṣśs][ptkbdgṭḍmṇñnlrwyṣśs])(har[uū])', '%1a%2')
	text = gsub(text, '([ptkbdgṭṇñḍmṅnlrwyhṣśs][ptkbdgṭḍmṇñnlrwyṣśs])bāṭa', '%1abāṭa')
	text = gsub(text, '([ptkbdgṭḍmṅṇñnlrwyhṣśs][ptkbdgṭḍmnlrṇñwyś])([mlk])', '%1a%2')
	-- nasal adjustments
	text = gsub(text, 'ñz', 'nz')
	text = gsub(text, 'ñgy', 'ṅgy')
	text = gsub(text, 'ãla', 'amla') -- assim case l
	text = gsub(text, '([eāuūiīo][īuiū])mm', '%1ṃm') -- assim case m
	text = gsub(text, 'a([īuiū])mm', 'a͠%1m') -- assim case m2
	text = gsub(text, '([eāuūiīo][īuiū])n([st])', '%1ṃ%2') -- assim case s
	text = gsub(text, 'a([īuiū])n([st])', 'a͠%1%2') -- assim case s2
	text = gsub(text, 'a([uū])ṅ([hk])', 'a͠%1%2') -- assim case kh
	text = gsub(text, '([eāuūiīo][īuiū])ṅ([kh])', '%1ṃ%2') -- assim case kh2
	return toNFC(text)
end

return export