Module:hi-IPA

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Hindi IPA pronunciation module. See {{hi-IPA}}.

Testcases

Module:hi-IPA/testcases:

12 of 59 tests failed. (refresh)

Text | Expected | Actual | Comments
test_all:
Passedअशिष्ट॰ता (aśiṣṭ.tā)ə.ʃɪʂʈ.t̪ɑːə.ʃɪʂʈ.t̪ɑːsyllabification
Passedअशिष्ट-ता (aśiṣṭ-tā)ə.ʃɪʂʈ.t̪ɑːə.ʃɪʂʈ.t̪ɑːsyllabification
Passedअल्प्संख्यक (alpsaṅkhyak)əlp.səŋ.kʰjəkəlp.səŋ.kʰjəksyllabification
Passedअंडकोष (aṇḍkoṣ)əɳɖ.koːʂəɳɖ.koːʂsyllabification
Passedअंग्रेज़ (aṅgrez)əŋ.ɡɾeːzəŋ.ɡɾeːzsyllabification
Passedअंटर्क्टिका (aṇṭarkṭikā)əɳ.ʈəɾk.ʈɪ.kɑːəɳ.ʈəɾk.ʈɪ.kɑːsyllabification
Passedमैं (ma͠i)mɛ̃ːmɛ̃ː
Passedदेश (deś)d̪eːʃd̪eːʃ
Passedमेरा (merā)meː.ɾɑːmeː.ɾɑː
Passedखिलौना (khilaunā)kʰɪ.lɔː.nɑːkʰɪ.lɔː.nɑː
Passedनौटंकी (nauṭaṅkī)nɔː.ʈəŋ.kiːnɔː.ʈəŋ.kiː
Passedहौं (ha͠u)ɦɔ̃ːɦɔ̃ː
Failedमुँह (mũh)mũːʱmũːɦ
Failedमाह (māh)mɑːʱmɑːɦ
Failedबहना (bahnā)bəʱ.nɑːbəɦ.nɑː
Failedविवाह (vivāh)ʋɪ.ʋɑːʱʋɪ.ʋɑːɦ
Passedग़म (ġam)ɣəmɣəm
Passedख़रगोश (xargoś)xəɾ.ɡoːʃxəɾ.ɡoːʃ
Passedइकट्ठा (ikaṭṭhā)ɪ.kəʈ.ʈʰɑːɪ.kəʈ.ʈʰɑː
Passedसंस्थान (sansthān)sən.st̪ʰɑːnsən.st̪ʰɑːn
Passedमधु (madhu)mə.d̪ʱuːmə.d̪ʱuːfinal u is lengthened, aspiration should not be split in syllabification
Failedमियाँ (miyā̃)miː.jɑ̃ːmɪ.jɑ̃ːi + y lengthens i
Failedमुहाफ़ज़ाह (muhāfzāh)mʊ.ɦɑːf.zɑːʱmʊ.ɦɑːf.zɑːɦ
Passedस्त्रीत्व (strītva)st̪ɾiːt̪.ʋᵊst̪ɾiːt̪.ʋᵊ
Passedशास्त्र (śāstra)ʃɑːs.t̪ɾᵊʃɑːs.t̪ɾᵊ
Passedसमाचार (samācār)sə.mɑː.t͡ʃɑːɾsə.mɑː.t͡ʃɑːɾ
Passedश्रावण (śrāvaṇ)ʃɾɑː.ʋəɳʃɾɑː.ʋəɳ
Passedहमें (hamẽ)ɦə.mẽːɦə.mẽː
Passedमें (mẽ)mẽːmẽː
Failedभैया (bhaiyā)bʱə.iː.jɑːbʱə̯i.jɑː
Failedसुलह (sulah)sʊ.ləʱsʊ.ləɦ
Passedदृष्टि (dŕṣṭi)d̪ɾɪʂ.ʈiːd̪ɾɪʂ.ʈiː
Passedसोई (soī)soː.iːsoː.iː
Passedखाइए (khāie)kʰɑː.ɪ.eːkʰɑː.ɪ.eː
Passedशक्ति (śakti)ʃək.t̪iːʃək.t̪iː
Passedउस्ताद (ustād)ʊs.t̪ɑːd̪ʊs.t̪ɑːd̪
Passedपंकज (paṅkaj)pəŋ.kəd͡ʒpəŋ.kəd͡ʒ
Passedमाला (mālā)mɑː.lɑːmɑː.lɑː
Passedदीवार (dīvār)d̪iː.ʋɑːɾd̪iː.ʋɑːɾ
Passedसुरुची (surucī)sʊ.ɾʊ.t͡ʃiːsʊ.ɾʊ.t͡ʃiː
Passedनिरस्त्र (nirastra)nɪ.ɾəs.t̪ɾᵊnɪ.ɾəs.t̪ɾᵊ
Passedनिर्वृत्त (nirvŕtt)nɪɾ.ʋɾɪt̪t̪nɪɾ.ʋɾɪt̪t̪
Passedमृत्युंजय (mŕtyuñjay)mɾɪt̪.jʊn.d͡ʒəjmɾɪt̪.jʊn.d͡ʒəj
Passedपितृओं (pitŕõ)pɪt̪.ɾõːpɪt̪.ɾõː
Passedगर्भ॰पात (garbh.pāt)ɡəɾbʱ.pɑːt̪ɡəɾbʱ.pɑːt̪
Passedगर्भ (garbh)ɡəɾbʱɡəɾbʱ
Passedवस्त्र (vastra)ʋəs.t̪ɾᵊʋəs.t̪ɾᵊ
Passedयक्ष्मा (yakṣmā)jək.ʂmɑːjək.ʂmɑː
Passedउत्प्रेक्षा (utprekṣā)ʊt̪.pɾeːk.ʂɑːʊt̪.pɾeːk.ʂɑː
Passedझुंझलाहट (jhuñjhlāhaṭ)d͡ʒʱʊn.d͡ʒʱlɑː.ɦəʈd͡ʒʱʊn.d͡ʒʱlɑː.ɦəʈ
Passedसंख्या (saṅkhyā)səŋ.kʰjɑːsəŋ.kʰjɑː
Passedघुँघरू (ghuṅghrū)ɡʱʊŋ.ɡʱɾuːɡʱʊŋ.ɡʱɾuː
Passedसंभ्रांत (sambhrānt)səm.bʱɾɑːnt̪səm.bʱɾɑːnt̪
Passedइन्फ़्लुएंज़ा (influenzā)ɪn.flʊ.eːn.zɑːɪn.flʊ.eːn.zɑː
Failedइंफ़्लुएंज़ा (imfluenzā)ɪn.flʊ.eːn.zɑːɪm.flʊ.eːn.zɑː
Failedहिमाचल प्रदेश (himācal pradeś)/ɦɪ.mɑː.t͡ʃəl pɾə.d̪eːʃ/ɦɪ.mɑː.t͡ʃəl pɾə.d̪eːʃ
Failedतंक़ीद (taṅqīd)[t̪ɐ̃ɴ.qiːd̪]t̪əŋ.qiːd̪anusvara before uvulars
Passedचेरापूंजी (cerāpūñjī)t͡ʃeː.ɾɑː.puːn.d͡ʒiːt͡ʃeː.ɾɑː.puːn.d͡ʒiː
Failedचेरापूंजी (cerāpūñjī)t͡ʃeː.ɾäː.pũːn.d͡ʒiːt͡ʃeː.ɾäː.pũːn.d͡ʒiːnasal allophone before postalveolar

local export = {}

local lang = require("Module:languages").getByCode("hi")
local sc = require("Module:scripts").getByCode("Deva")
local m_IPA = require("Module:IPA")
local m_a = require("Module:accent qualifier")

local m_str_utils = require("Module:string utilities")

local find = m_str_utils.find
local gcodepoint = m_str_utils.gcodepoint
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local u = m_str_utils.char

-- Transliteration-to-IPA lookup table. export.toIPA applies it with
-- gsub(translit, ".", correspondences), i.e. one matched character at a time.
-- The letters listed in `identical` (further down in this file) are added to
-- this table as self-mappings after it is built.
local correspondences = {
	-- consonants
	["ṅ"] = "ŋ", ["g"] = "ɡ", 
	["c"] = "t͡ʃ", ["j"] = "d͡ʒ", 
	["ṭ"] = "ʈ", ["ḍ"] = "ɖ", ["ṇ"] = "ɳ",
	["t"] = "t̪", ["d"] = "d̪",
	["y"] = "j", ["r"] = "ɾ", ["v"] = "ʋ",
	["ś"] = "ʃ", ["ṣ"] = "ʂ", ["ź"] = "ʒ", ["ž"] = "ʒ", ["h"] = "ɦ",
	["ṛ"] = "ɽ", ["ẓ"] = "ʒ", ["ḷ"] = "l", ["ḻ"] = "l", ["ġ"] = "ɣ", ["q"] = "q", ["x"] = "x", ["ṉ"] = "n", ["ṟ"] = "ɾ",

	-- vowels without a tilde
	["a"] = "ə", ["ā"] = "ɑː", ["i"] = "ɪ",
	["ī"] = "iː", ["o"] = "oː", ["e"] = "eː",
	["u"] = "ʊ", ["ū"] = "uː", ["ŏ"] = "ɔ", ["ĕ"] = "æ",

	-- vowels carrying a tilde
	-- NOTE(review): keys spelled with more than one code point (base letter
	-- plus combining mark) presumably cannot be reached by the "." lookup in
	-- export.toIPA -- confirm against the normalisation of this file.
	["ẽ"] = "ẽː", ["ũ"] = "ʊ̃", ["õ"] = "õː", ["ã"] = "ə̃", ["ā̃"] = "ɑ̃ː",  ["ĩ"] = "ɪ̃", ["ī̃"] = "ĩː",

	-- other signs; the parenthesised values are emitted literally
	["ॐ"] = "oːm", ["ḥ"] = "(ɦ)", ["'"] = "(ʔ)",
}

-- Replacement table used by export.toIPA when style == "nonpersianized":
-- each character matched by "[xġqźzf']" in the transliteration is swapped
-- for the value given here.
local perso_arabic = {
	["x"] = "kh", ["ġ"] = "g", ["q"] = "k", ["ź"] = "z", ["z"] = "j", ["f"] = "ph", ["'"] = "",
}

-- Replacement table used by export.toIPA when style == "desanskritanize"
-- (applied to the characters matched by "[ṣṇ]").
local urdu = {
	["ṣ"] = "ʃ", ["ṇ"] = "n",
}

-- Replacement table used by export.toIPA when style == "dakhini"
-- (applied to the characters matched by "[q]").
local deccani = {
	["q"] = "x",
}

-- Short-to-long vowel pairs.
-- NOTE(review): nothing in the visible part of this file reads this table.
local lengthen = {
	["a"] = "ā", ["i"] = "ī", ["u"] = "ū",
}

-- Characters treated as vowel material; spliced into [...] character classes
-- by syllabify_pattern below. The list also contains the length mark ː.
local vowels = "aāiīuūoǒŏěĕʊɪɔɔ̃ɛeæãā̃ẽĩī̃õũū̃ː"
-- Pattern for one vowel character optionally followed by ː; used by
-- syllabify and export.toIPA.
local vowel = "[aāiīuūoǒŏěĕʊɪɔɔ̃ɛeæãā̃ẽĩī̃õũū̃]ː?"
-- One of the captured consonants followed by h; export.toIPA rewrites the
-- pair as the captured consonant plus ʱ.
local weak_h = "([gjdḍbṛnm])h"
-- Captured consonant which export.toIPA, when it is followed by h, rewrites
-- as the consonant plus ʰ.
local aspirate = "([kctṭp])"
-- Three captures: a vowel character (optionally followed by a combining
-- tilde), a run of characters that are neither vowel characters nor "." or
-- "-", and another vowel character (optionally followed by a tilde).
local syllabify_pattern = "([" .. vowels .. "]̃?)([^" .. vowels .. "%.%-]+)([" .. vowels .. "]̃?)"

-- Split `text` into a list of consonant units.
-- A unit is either a single character from the consonant class below or a
-- two-character pair (consonant + h) matching the aspirated-pair pattern.
-- Any character that cannot extend the unit being built closes that unit and
-- starts the next one. A space is appended to the input so that the last unit
-- is flushed; the space itself is never stored.
-- Returns the list of units (it may start with an empty string when the first
-- character is not a consonant).
local function find_consonants(text)
	local single = "^[kgṅcjñṭḍṇtdnpbmyrlvśṣshqxġzžḻṛṟfθṉḥ]$"
	local with_h = "^[kgcjṭḍtdpbṛ]h$"

	local units = {}
	local pending = ""
	for codepoint in gcodepoint(text .. " ") do
		local ch = u(codepoint)
		local candidate = pending .. ch
		if not (find(candidate, single) or find(candidate, with_h)) then
			-- cannot grow the pending unit: store it and restart from ch
			units[#units + 1] = pending
			pending = ch
		else
			pending = candidate
		end
	end
	return units
end

-- Insert syllable breaks (".") into a transliterated string.
-- @param text transliteration; export.toIPA passes it with blanks already
--             rewritten as ".."
-- @return the same string with "." inserted at syllable boundaries
local function syllabify(text)
	-- Each match of syllabify_pattern consumes both flanking vowels, and gsub
	-- matches do not overlap, so the pass is run twice to reach the clusters
	-- that sat between two matches of the first pass.
	for _ = 1, 2 do
		text = gsub(text, syllabify_pattern, function(a, b, c)
			-- `local` keeps the scratch table out of the global environment
			local b_set = find_consonants(b)
			-- several consonants: break after the first one;
			-- a single consonant: break before it
			table.insert(b_set, #b_set > 1 and 2 or 1, ".")
			return a .. table.concat(b_set) .. c
		end)
		-- NOTE(review): Lua patterns have no lookahead, so "(?=" is read as
		-- literal characters and this substitution presumably never fires on
		-- ordinary input. Left unchanged; the loop below separates adjacent
		-- vowels.
		text = gsub(text, "(" .. vowel .. ")(?=" .. vowel .. ")", "%1.")
	end
	-- Put a break between two adjacent vowels (twice, for runs of three).
	for _ = 1, 2 do
		text = gsub(text, "(" .. vowel .. ")(" .. vowel .. ")", "%1.%2")
	end
	-- syllabification corrections
	-- ([^.]) is added in front, just in case one of the (unlikely) clusters
	-- would occur after a blank space (temporarily reformatted as '..')
	text = gsub(text, '([^.])%.([kqgcjṭḍtdpb])(h?)([kqgcjṭḍtdpbxġfnɳmsśzź])', '%1%2%3.%4')
	text = gsub(text, '([^.])%.([qgcjṭḍtdpb])(h?)ṣ', '%1%2%3.ṣ')
	text = gsub(text, '([^.])%.khṣ', '%1kh.ṣ') 						-- not kṣ/क्ष
	text = gsub(text, '([^.])%.([xġfnɳmzźyrlv])([kqgcjṭḍtdpbxġfnɳmsśṣzźh])', '%1%2.%3')
	text = gsub(text, '([^.])%.([sśṣ])([gjḍdbġsśṣzźh])', '%1%2.%3')
	return text
end

-- Letters whose IPA symbol is the letter itself: register each one in
-- `correspondences` as a self-mapping.
local identical = "knlsfzθ"
for letter in gmatch(identical, ".") do
	correspondences[letter] = letter
end

-- Transliterate `text` with the module-level language object.
-- Only the first value returned by lang:transliterate is passed on.
local function transliterate(text)
	local first_result = lang:transliterate(text)
	return first_result
end

-- Build a link for `term` through Module:links, using this module's
-- language and script objects.
function export.link(term)
	local m_links = require("Module:links")
	return m_links.full_link({ term = term, lang = lang, sc = sc })
end

-- Convert a term to a broad IPA transcription.
-- @param text  the term. It is passed through transliterate() when
--              lang:findBestScript(text):isTransliterated() is true;
--              otherwise the text itself is used as the transliteration.
-- @param style selects extra substitutions: "nonpersianized", "dakhini" and
--              "desanskritanize" are tested below; any other value (callers
--              in this file also pass "persianized") adds none.
-- @return the transcription as a string, without enclosing slashes
-- Raises an error when the transliteration step yields a false value.
function export.toIPA(text, style)
	-- ॰ is handled exactly like "-"
	text = gsub(text, '॰', '-')
	local translit = text
	if lang:findBestScript(text):isTransliterated() then
		translit = transliterate(text)
	end
	if not translit then
		error('The term "' .. text .. '" could not be transliterated.')
	end
	
	-- style-specific letter replacements (see the perso_arabic table)
	if style == "nonpersianized" then
		translit = gsub(translit, "[xġqźzf']", perso_arabic)
	end

	-- see the deccani table
	if style == "dakhini" then
		translit = gsub(translit, "[q]", deccani)
	end
	
	-- force final schwa for Hindi
	-- ("a~" at the very end of the input becomes ə directly, ahead of the
	-- character table lookup)
	translit = gsub(translit, "a~$", "ə")

	if style == "desanskritanize" then
		-- a forced final ə preceded by at least three characters becomes ɑ(ː)
		translit = gsub(translit, "(...)ə$", "%1ɑ(ː)")
		translit = gsub(translit, "[ṣṇ]", urdu)
	end
	
	-- vowels
	-- the double tilde is replaced by the ordinary combining tilde
	translit = gsub(translit, "͠", "̃")
	-- ai / au (with an optional tilde after the a) become ɛː / ɔː
	translit = gsub(translit, 'a(̃?)i', 'ɛ%1ː')
	translit = gsub(translit, 'a(̃?)u', 'ɔ%1ː')
	-- drop a hyphen at either edge of the string
	translit = gsub(translit, "%-$", "")
	translit = gsub(translit, "^%-", "")
	-- ŕ: "r" at the end of the string or before a vowel, "ri" elsewhere
	translit = gsub(translit, "ŕ$", "r")
	translit = gsub(translit, "ŕ(" .. vowel .. ")", "r%1")
	translit = gsub(translit, "ŕ", "ri")
    
	translit = gsub(translit, 'jñ', 'gy')
	translit = gsub(translit, ",", "")
	-- blanks are written as ".." until the formatting step further down
	translit = gsub(translit, " ", "..")
	translit = syllabify(translit)
	-- a break that landed before ː is moved behind it; a break that landed
	-- before a combining tilde is removed
	translit = gsub(translit, "%.ː", "ː.")
	translit = gsub(translit, "%.̃", "̃")

	-- consonant + h pairs get ʰ / ʱ (see the aspirate and weak_h patterns)
	translit = gsub(translit, aspirate .. "h", '%1ʰ')
	translit = gsub(translit, weak_h, '%1ʱ')
	
	-- per-character lookup in the correspondences table
	local result = gsub(translit, ".", correspondences)
	
	-- remove final schwa (Pandey, 2014)
	-- actually weaken
	result = gsub(result, "(...)ə$", "%1ᵊ")
	-- NOTE(review): blanks were rewritten as ".." above and are only restored
	-- in the formatting step below, so this pattern presumably never matches
	-- here -- confirm whether word-final ə inside phrases should be weakened.
	result = gsub(result, "(...)ə ", "%1ᵊ ")
	result = gsub(result, "(...)ə%.?%-", "%1ᵊ-")
	
	-- formatting	
	-- hyphens (with a preceding break, if any) become a single break
	result = gsub(result, "%.?%-", ".")
	-- ".." goes back to a blank
	result = gsub(result, "%.%.", " ")
	-- put the tilde before the length mark
	result = gsub(result, "ː̃", "̃ː")
	result = gsub(result, "ː%.̃", "̃ː.")
	result = gsub(result, "%.$", "")
    
    -- ñ
    result = gsub(result, "ñ", "n")

	-- i and u lengthening
	-- (only at the very end of the string, optionally before a final ɦ)
	result = gsub(result, "ʊ(̃?)(ɦ?)$", "u%1ː%2")
	result = gsub(result, "ɪ(̃?)(ɦ?)$", "i%1ː%2")
	
	-- deaffricate first affricate in geminates
	result = gsub(result, "t͡ʃ(%.?)t͡ʃ", "t̪%1t͡ʃ")	
	result = gsub(result, "d͡ʒ(%.?)d͡ʒ", "d̪%1d͡ʒ")
	
	-- silent h in 'lh-', 'vh-' (Ohala 1983, p.45)
	result = gsub(result, "^([lʋ])ɦ", "%1")  
    result = gsub(result, "([ .])([lʋ])ɦ", "%1%2")
    
	-- ɛː before j (across an optional break) is rewritten as ə̯i
	result = gsub(result, "ɛː(%.?)j", function(a)
		local res = "ə̯i"
		res = res .. a .. "j"
		return res
	end)
	-- ɔː before ʋ (across an optional break) is rewritten as ə̯u
	result = gsub(result, "ɔː(%.?)ʋ", function(a)
		local res = "ə̯u"
		res = res .. a .. "ʋ"
		return res
	end)
	
	return result
end

-- Ordered rewrite rules for export.narrow_IPA. Each entry is
-- { pattern, replacement }; the rules are applied one after another, each on
-- the output of the previous one, so their order matters.
local narrow_rules = {
	-- what /ɑ/ and /ə/ really are
	{ 'ɑ', 'ä' },
	{ 'ə', 'ɐ' },
	-- uvular /x/, /ɣ/ ?? (kept disabled)
	-- { 'x', 'χ' },
	-- { 'ɣ', 'ʁ' },
	-- retroflex s rules
	{ 'ʂ(%.?)([^ʈɖ.])', 'ʃ%1%2' },
	{ 'ʂ$', 'ʃ' },
	-- nasal allophones
	{ 'ŋ(%.?)([qχʁ])', 'ɴ%1%2' },
	{ 'n%.j', 'ɲ.j' },
	-- this nasal is likely more front than before /j/, but not doing a too
	-- narrow transcription seems preferable
	{ '[nɳ](%.?)ʃ', 'ɲ%1ʃ' },
	{ 'n(%.?)([td])̪', 'n̪%1%2̪' },
	{ 'm(%.?)f', 'ɱ%1f' },
	-- nasals induce nasalization
	{ '([ɐäɪiʊueɛoɔæ])(ː?)([nɳɲŋɴmɱ])', '%1̃%2%3' },
	-- cc, jj
	{ 't̪(%.?)t͡ʃ', 't̚%1t͡ʃ' },
	{ 'd̪(%.?)d͡ʒ', 'd̚%1d͡ʒ' },
	-- syllable boundary consonants
	{ '([kɡ])%.([kɡ])', '%1̚.%2' },
	{ '([ʈɖ])%.([ʈɖ])', '%1̚.%2' },
	{ '([td]̪?)%.([tdn])', '%1̚.%2' },
	{ '([pb])%.([pb])', '%1̚.%2' },
	-- aspiration rules
	{ 'ɐɦ([%. ])', 'ɛɦ%1' },
	{ 'ɐɦ$', 'ɛɦ' },
	{ 'ɐ%.ɦɐ', 'ɛ.ɦɛ' },
	{ 'ɐ%(ɦ%)', 'ɛ(ɦ)' },
	{ 'ʊɦ%.', 'ɔɦ.' },
	{ 'ʊ%.ɦɐ', 'ɔ.ɦɔ' },
	{ 'ɐ%.ɦʊ', 'ɔ.ɦɔ' },
	{ '([ɐäɪiʊueɛoɔæ])(̃?)(ː?)ɦ', '%1%2%3ʱ' },
	-- v/w
	{ '([kɡŋtdɲʈɖɳnpbm]̪?%.?)ʋ', '%1w' },
	-- geminate /ɾ/ is trill
	{ "ɾ%.ɾ", "r.r" },
	-- for onomatopoeic words ending in -र्र
	{ "ɾɾ", "rː" },
	-- final geminates often pronounced as singletons
	{ "kk", "k(ː)" },
	{ "ɡɡ", "ɡ(ː)" },
	{ "ʈʈ", "ʈ(ː)" },
	{ "ɖɖ", "ɖ(ː)" },
	{ "ɳɳ", "ɳ(ː)" },
	{ "t̪t̪", "t̪(ː)" },
	{ "d̪d̪", "d̪(ː)" },
	{ "nn", "n(ː)" },
	{ "pp", "p(ː)" },
	{ "bb", "b(ː)" },
	{ "mm", "m(ː)" },
	{ "ll", "l(ː)" },
	-- final cc, jj
	{ "t̚t͡ʃ", "(t̚)t͡ʃ" },
	{ "d̚d͡ʒ", "(d̚)d͡ʒ" },
	-- ɪ before a break and j
	{ "ɪ%.j", "i.j" },
	-- blanks become tie bars
	{ " ", "‿" },
}

-- Derive a narrower transcription from a broad one by running every entry of
-- narrow_rules over the string in order.
-- @param ipa broad transcription (callers in this file pass the result of
--            export.toIPA)
-- @return the rewritten string
function export.narrow_IPA(ipa)
	for _, rule in ipairs(narrow_rules) do
		ipa = gsub(ipa, rule[1], rule[2])
	end
	return ipa
end

-- Template entry point: formats pronunciations for the numbered arguments of
-- the parent frame, or for the current page title when the first argument is
-- absent. Empty-string arguments are skipped.
-- For every term the "persianized" transcription is listed in slashes,
-- followed by its narrow form in brackets when that differs; the
-- "nonpersianized" pair is listed as well when it differs from the
-- persianized one.
-- @param frame frame object whose parent supplies the arguments
-- @return the 'Delhi' accent qualifier, a blank, and the formatted IPA
function export.make(frame)
	local args = frame:getParent().args
	local pagetitle = mw.title.getCurrentTitle().text

	local terms, prons = {}, {}

	if args[1] then
		for _, item in ipairs(args) do
			if item ~= "" then
				terms[#terms + 1] = item
			end
		end
	else
		terms = { pagetitle }
	end

	-- List a broad transcription and, if it is different, its narrow form.
	local function add_pair(broad)
		prons[#prons + 1] = { pron = "/" .. broad .. "/" }
		local narrowed = export.narrow_IPA(broad)
		if narrowed ~= broad then
			prons[#prons + 1] = { pron = "[" .. narrowed .. "]" }
		end
	end

	for _, Hindi in ipairs(terms) do
		local persianized = export.toIPA(Hindi, "persianized")
		local nonpersianized = export.toIPA(Hindi, "nonpersianized")
		add_pair(persianized)
		if nonpersianized ~= persianized then
			add_pair(nonpersianized)
		end
	end

	return m_a.show({'Delhi'}) .. ' ' .. m_IPA.format_IPA_full(lang, prons)
end

-- Template entry point: formats one "desanskritanize"-style transcription
-- per non-empty numbered argument of the parent frame (presumably
-- transliterations, going by the error message -- confirm against the
-- calling template).
-- @param frame frame object whose parent supplies the arguments
-- @return the 'urd' accent qualifier, a blank, and the formatted IPA
-- Raises "No transliterations given." when the first argument is absent.
function export.make_ur(frame)
	local args = frame:getParent().args
	-- shadows the module-level Hindi language object for the formatting call
	local lang = require("Module:languages").getByCode("ur")

	if not args[1] then
		error("No transliterations given.")
	end

	local p, results = {}, {}

	for _, item in ipairs(args) do
		-- empty arguments are skipped
		if item ~= "" then
			table.insert(p, item)
		end
	end

	for _, Urdu in ipairs(p) do
		local desanskritanize = export.toIPA(Urdu, "desanskritanize")
		table.insert(results, { pron = "/" .. desanskritanize .. "/" })
	end

	return m_a.show({'urd'}) .. ' ' .. m_IPA.format_IPA_full(lang, results)
end

-- Template entry point: formats one "dakhini"-style transcription per
-- non-empty numbered argument of the parent frame (presumably
-- transliterations, going by the error message -- confirm against the
-- calling template).
-- @param frame frame object whose parent supplies the arguments
-- @return the 'Deccani' accent qualifier, a blank, and the formatted IPA
-- Raises "No transliterations given." when the first argument is absent.
function export.make_deccani(frame)
	local args = frame:getParent().args
	-- shadows the module-level Hindi language object for the formatting call
	local lang = require("Module:languages").getByCode("ur")

	if not args[1] then
		error("No transliterations given.")
	end

	local p, results = {}, {}

	for _, item in ipairs(args) do
		-- empty arguments are skipped
		if item ~= "" then
			table.insert(p, item)
		end
	end

	for _, Urdu in ipairs(p) do
		local dakhini = export.toIPA(Urdu, "dakhini")
		table.insert(results, { pron = "/" .. dakhini .. "/" })
	end

	return m_a.show({'Deccani'}) .. ' ' .. m_IPA.format_IPA_full(lang, results)
end

return export