Module:User:Sameerhameedy/Sandbox

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Sameerhameedy, for their own experimentation. Items in this module may be added and removed at Sameerhameedy's discretion; do not rely on this module's stability.


-- Ordered list of { pattern, replacement } pairs.
-- NOTE(review): the consumer is outside this view; presumably each pair is
-- applied in the listed order with a Unicode-aware gsub, so the order of the
-- entries matters — confirm against the caller before reordering anything.
local has_diacritics_subs
do
	-- Character-class fragments shared by several rules below.
	local cons_or_alif = "[" .. consonants .. alif .. "]"
	local cons2 = "[" .. consonants2 .. "]"
	local cons2_or_alif = "[" .. consonants2 .. alif .. "]"
	local semi = "[" .. semivowel .. "]"
	local zzp = "[" .. ZZP .. "]"
	local zzp_cap = "([" .. ZZP .. "])"
	local zer_pesh_cap = "([" .. zer .. pesh .. "])"

	has_diacritics_subs = {
		-- Strip everything in the punctuation, tashdid, highhmz, numbers and
		-- fatHataan sets.
		{"[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .. "]", ""},

		-- A (consonant or alif) + optional semivowel is dropped when it sits
		-- at the end of the string, before a space-like character (which is
		-- kept), or before a hyphen (which is kept).
		{cons_or_alif .. semi .. "?$", ""},
		{cons_or_alif .. semi .. "?(" .. space_like_class .. ")", "%1"},
		{cons_or_alif .. semi .. "?%-", "-"},

		-- Needed for Arabic al- to work: consonant + zer/pesh + (optional
		-- hyphen) + alif + laam collapses to a bare laam.
		{cons2 .. zer_pesh_cap .. alif .. laam, laam},
		{cons2 .. zer_pesh_cap .. "%-" .. alif .. laam, laam},

		-- CVV sequences.
		{cons2 .. jazm .. semi .. semi, ""},
		{cons2 .. jazm .. semi .. zzp_cap, "%1"},
		{cons2_or_alif .. semi .. semi .. "[" .. semivowel .. jazm .. "]", ""},
		{cons2_or_alif .. semi .. semi .. zzp_cap, "%1"},
		{"[" .. consonants2 .. alif .. ZZP .. "]" .. semi .. semi, ""},

		-- CV sequences.
		{alif .. semi, ""},
		{cons2 .. jazm, ""},
		{cons2_or_alif .. zzp_cap, "%1"},
		{malif, ""}, -- treated as a CV pair on its own
		{jazm .. alif .. zzp, ""},
		{cons2_or_alif .. "[" .. ZZP .. semivowel .. "]", ""},

		-- Consonants paired with alif, and vowel-mark + letter pairs.
		{cons2 .. jazm .. malif, ""},
		{cons2 .. zabar .. alif, ""},
		{zer .. ye, ""},
		{pesh .. vao, ""},
		{zabar .. alif, ""},

		-- Leftovers: numbers, alif wasla (U+0671) and alif madda (U+0622),
		-- then whitespace, hyphens and any remaining ZZP characters.
		{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
		{"%s", ""},
		{"%-", ""},
		{zzp, ""},

		-- Finally drop every character outside the ranges U+0600-06FF,
		-- U+0750-077F, U+08A0-08FF, U+FB50-FDFF and U+FE70-FEFF.
		{
			"[^" .. U(0x0600) .. "-" .. U(0x06FF)
				.. U(0x0750) .. "-" .. U(0x077F)
				.. U(0x08A0) .. "-" .. U(0x08FF)
				.. U(0xFB50) .. "-" .. U(0xFDFF)
				.. U(0xFE70) .. "-" .. U(0xFEFF) .. "]",
			"",
		},
	}
end