Module:tok-hyph

From Wiktionary, the free dictionary
Jump to navigation Jump to search

-- Primary module authorship: Chernorizets (original Bulgarian syllabification code)
-- Port to Lua: Kiril Kovachev
-- Adaptation to Toki Pona: Kiril Kovachev
-- 17 April 2024.

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Short local aliases for the Unicode-aware Scribunto string helpers.
local substring = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local U = mw.ustring.char
-- Language and script objects handed to Module:hyphenation when formatting.
local lang = require("Module:languages").getByCode("tok")
local script = require("Module:scripts").getByCode("Latn")

-- Pattern character class matching a single (lowercase) vowel letter.
local hvowels_c = "[aioeu]"

-- U+2027 HYPHENATION POINT: the separator placed between syllables.
local HYPH = U(0x2027)
-- NOTE(review): BREAK_MARKER is not referenced anywhere in the visible code.
local BREAK_MARKER = "."

-- Substitution helper: same as rsubn(), but only the resulting string is
-- returned (the replacement count is dropped).
-- term: the string to operate on
-- foo:  the pattern to search for
-- bar:  the replacement
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Returns the single character of `str` found at the 1-based position
-- `index`, using the Unicode-aware substring helper.
local function char_at(str, index)
	local first, last = index, index
	return substring(str, first, last)
end

-- Counts the vowel letters in `word`, ignoring letter case.
-- word: the string to scan
-- Returns the number of characters matching hvowels_c.
local function count_vowels(word)
	-- hvowels_c only lists lowercase letters, while the syllabifier lowercases
	-- each character before testing it. Lowercase here too, so that capitalised
	-- words (e.g. "Inli") are not miscounted as having a single vowel and left
	-- unsplit.
	local lowered = mw.ustring.lower(word)
	-- gsub's second return value is the number of matches that were replaced.
	local _, vowel_count = mw.ustring.gsub(lowered, hvowels_c, "")
	return vowel_count
end

-- Tells whether `ch` is exactly one of the five lowercase vowel letters.
-- Anything else (uppercase letters, longer strings, non-strings) yields false.
local function is_vowel(ch)
	return ch == "a" or ch == "e" or ch == "i" or ch == "o" or ch == "u"
end

---- Main syllabification code
-- Given the positions of two consecutive vowels, returns the position at
-- which the syllable containing the right-hand vowel begins.
-- word: the word being scanned (not consulted by this function)
-- left_vowel/right_vowel: integer positions of the two vowels
local function find_next_syllable_onset(word, left_vowel, right_vowel)
	local gap = right_vowel - left_vowel - 1

	if gap == 0 then
		-- Adjacent vowels: the new syllable opens on the right vowel itself.
		return right_vowel
	elseif gap == 1 then
		-- A lone consonant between the vowels opens the new syllable.
		return left_vowel + 1
	end

	-- Two ("or more") consonants. Toki Pona phonotactics only permit this
	-- when the first syllable closes with a nasal and the next one opens with
	-- a consonant, so the break falls right after the first consonant.
	return left_vowel + 2
end

-- Splits `word` into its syllables.
-- Returns a table of strings (list), one entry per syllable, in order.
local function syllabify_poly(word)
	local pieces = {}
	local last_vowel = nil -- position of the most recent vowel, if any
	local onset = 1        -- position where the current syllable starts

	for pos = 1, mw.ustring.len(word) do
		if is_vowel(mw.ustring.lower(char_at(word, pos))) then
			-- Every vowel after the first one closes off a syllable: the
			-- break lies somewhere between the previous vowel and this one,
			-- and the character right after the break starts a new syllable.
			if last_vowel then
				local next_onset = find_next_syllable_onset(word, last_vowel, pos)
				pieces[#pieces + 1] = substring(word, onset, next_onset - 1)
				onset = next_onset
			end
			last_vowel = pos
		end
	end

	-- Whatever remains after the last break is the final syllable.
	pieces[#pieces + 1] = substring(word, onset)

	return pieces
end

-- Syllabifies a single word.
-- word: the word to split (no spaces expected)
-- Returns a string: the word's syllables joined with HYPH. An empty word
-- yields the empty string.
function export.syllabify_word(word)
	-- Always return a string. The previous empty-table result for an empty
	-- word made table.concat() in export.syllabify raise an error whenever a
	-- term contained leading, trailing or doubled spaces.
	if mw.ustring.len(word) == 0 then return "" end

	local syllables
	if count_vowels(word) <= 1 then
		-- Zero or one vowel: the whole word is a single syllable.
		syllables = { word }
	else
		syllables = syllabify_poly(word)
	end

	return table.concat(syllables, HYPH)
end

-- Syllabifies every space-separated word of `term`.
-- term: a string of one or more words separated by single spaces
-- Returns a string in which each word has been passed through
-- export.syllabify_word and the results are re-joined with spaces.
function export.syllabify(term)
	local words = rsplit(term, " ")

	local out = {}
	-- ipairs (not pairs) is used because the word order must be preserved and
	-- pairs makes no promise about traversal order.
	for i, word in ipairs(words) do
		out[i] = export.syllabify_word(word)
	end
	return table.concat(out, " ")
end

-- Template entry point: produces the formatted hyphenation of a term.
-- frame: the Scribunto frame; the arguments are read from its parent frame.
-- The term is the first positional argument when given; otherwise it is
-- "sitelen" on pages in the Template namespace, or the current page's
-- subpage text elsewhere.
-- Returns whatever Module:hyphenation's format_hyphenations returns.
function export.show_syllabification(frame)
	local params = { [1] = {} }

	local title = mw.title.getCurrentTitle()
	local args = require("Module:parameters").process(frame:getParent().args, params)

	local term = args[1]
	if not term then
		if title.nsText == "Template" then
			term = "sitelen"
		else
			term = title.subpageText
		end
	end

	-- The syllabified string is split on HYPH only, so for multi-word terms
	-- a piece may contain the space that separates two words.
	local pieces = rsplit(export.syllabify(term), HYPH)

	local hyphenation_module = require("Module:hyphenation")
	local data = {
		lang = lang,
		hyphs = { { hyph = pieces } },
		sc = script,
		caption = "Hyphenation",
	}
	return hyphenation_module.format_hyphenations(data)
end

return export