Jump to content

မေႃႇၵျူး:lo-translit

လုၵ်ႉတီႈ ဝိၵ်ႇသျိၼ်ႇၼရီႇ မႃး

Documentation for this module may be created at မေႃႇၵျူး:lo-translit/doc

local export = {}

local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local gsub = mw.ustring.gsub
local sub = mw.ustring.sub

-- Mapping of initial consonants.
-- Keys are Lao strings that may form a syllable onset; values are their Latin
-- transliterations. export.split_syll also uses the table as a membership test:
-- a consonant is appended to the current onset only when the concatenated
-- result is a key here. export.tr uses it for the final onset lookup.
local initial_conv = {
	-- Single consonant letters.
	['ກ'] = 'k', ['ຂ'] = 'kh', ['ຄ'] = 'kh', ['ງ'] = 'ng',
	['ຈ'] = 'ch', ['ສ'] = 's', ['ຊ'] = 's', ['ຍ'] = 'ny',
	['ດ'] = 'd', ['ຕ'] = 't', ['ຖ'] = 'th', ['ທ'] = 'th', ['ນ'] = 'n',
	['ບ'] = 'b', ['ປ'] = 'p', ['ຜ'] = 'ph', ['ຝ'] = 'f', ['ພ'] = 'ph', ['ຟ'] = 'f', ['ມ'] = 'm',
	['ຢ'] = 'y', ['ຣ'] = 'r', ['ລ'] = 'l', ['ວ'] = 'w',
	['ຫ'] = 'h', ['ອ'] = 'ʼ', ['ຮ'] = 'h',

	-- ຫ-led digraphs and the ligatures ໜ / ໝ: the ຫ adds nothing to the
	-- transliteration, only the second consonant is written.
	['ຫງ'] = 'ng',
	['ຫຍ'] = 'ny',
	['ຫນ'] = 'n', ['ໜ'] = 'n',
	['ຫມ'] = 'm', ['ໝ'] = 'm',
	['ຫຣ'] = 'r',
	['ຫລ'] = 'l', ['ຫຼ'] = 'l',
	['ຫວ'] = 'w',

	-- Two-consonant clusters ending in ຣ or ລ.
	['ກຣ'] = 'kr', ['ກລ'] = 'kl',
	['ຂຣ'] = 'khr', ['ຄຣ'] = 'khr', ['ຂລ'] = 'khl', ['ຄລ'] = 'khl',
	['ປຣ'] = 'pr', ['ປລ'] = 'pl',
	['ພຣ'] = 'phr', ['ຟຣ'] = 'fr', ['ພລ'] = 'phl', ['ຟລ'] = 'fl',
	['ດຣ'] = 'dr', ['ຕຣ'] = 'tr'
}

-- Mapping of glides.
-- Keyed by the concatenated `glide` field of a syllable; export.tr falls back
-- to '' when the key is absent (e.g. no glide at all).
local glide_conv = {
	['ຼ'] = 'r'
}

-- Mapping of vowel combinations.
-- Keys are the vowel characters of one syllable concatenated in the order
-- export.split_syll collected them (prefix vowel first, then the characters
-- that follow the onset); values are the Latin transliterations.
-- export.split_syll also consults the table to decide whether an ambiguous
-- letter (ຍ, ວ, ອ) may extend the vowel collected so far.
local vowel_conv = {
	-- Entries transliterated without a macron.
	['ະ'] = 'a', ['ັ'] = 'a',
	['ິ'] = 'i',
	['ຶ'] = 'ư', ['ຸ'] = 'u', ['ຸຍ'] = 'ui',
	['ເະ'] = 'e', ['ເັ'] = 'e',
	['ແະ'] = 'æ', ['ແັ'] = 'æ',
	['ໂະ'] = 'o', ['ົ'] = 'o',
	['ເາະ'] = 'ǫ', ['ັອ'] = 'ǫ',
	['ເິ'] = 'œ',
	['ເັຍ'] = 'ia', ['ັຽ'] = 'ia',
	['ເຶອ'] = 'ưa',
	['ົວະ'] = 'ua', ['ັວ'] = 'ua', ['ວັ'] = 'ua',
	['ໄ'] = 'ai', ['ໃ'] = 'ai', ['ັຍ'] = 'ai',
	['ເົາ'] = 'ao',
	['ົາວ'] = 'uau',
	['ຳ'] = 'am', ['ໍາ'] = 'am',
	['ວຳ'] = 'uam',

	-- Entries whose transliteration carries a macron.
	['າ'] = 'ā',
	['າວ'] = 'āo',
	['ີ'] = 'ī',
	['ື'] = 'ư̄',
	['ູ'] = 'ū',
	['ເ'] = 'ē',
	['ແ'] = 'ǣ',
	['ໂ'] = 'ō',
	['ໂຍ'] = 'ōi', ['ໂຽ'] = 'ōi',
	['ໍ'] = 'ǭ', ['ອ'] = 'ǭ',
	['ອຍ'] = 'ǭi', ['ອຽ'] = 'ǭi',
	['ເີ'] = 'œ̄',
	['ເີຽ'] = 'œ̄i', ['ເີຍ'] = 'œ̄i',
	['ເຍ'] = 'īa', ['ເັຽ'] = 'īa', ['ຽ'] = 'īa',
	['ເືອ'] = 'ư̄a', ['ເືອຍ'] = 'ư̄ai',
	['ົວ'] = 'ūa', ['ວ'] = 'ūa',
	['ວຍ'] = 'uāi', ['ວຽ'] = 'uāi',
	['າຍ'] = 'āi', ['າຽ'] = 'āi',
	['ວາ'] = 'uā',
	['ວາຍ'] = 'uāi', ['ວາຽ'] = 'uāi',
	['ແວ'] = 'ǣu', -- ແ_ວ can be either ǣu or uǣ, the first being more common; export.tr overrides this to uǣ for certain onsets when a coda is present.
	['ີວ'] = 'īu', ['ິວ'] = 'iu',
	['ຽວ'] = 'iāu',
	['ວີວ'] = 'uīu',
}

-- Mapping of coda consonants.
-- Keyed by the concatenated `coda` field of a syllable. Several letters that
-- have distinct onset values in initial_conv collapse to the same final here
-- (e.g. ຈ/ຊ/ດ/ຕ/ຖ/ທ -> 't', ຣ/ລ -> 'n').
local coda_conv = {
	['ກ'] = 'k', ['ຂ'] = 'k', ['ຄ'] = 'k',
	['ງ'] = 'ng',
	['ຈ'] = 't', ['ຊ'] = 't',
	['ດ'] = 't', ['ຕ'] = 't', ['ຖ'] = 't', ['ທ'] = 't',
	['ສ'] = 's',
	['ນ'] = 'n',
	['ບ'] = 'p', ['ປ'] = 'p', ['ພ'] = 'p', ['ຟ'] = 'p',
	['ມ'] = 'm',
	['ຢ'] = 'y',
	['ຣ'] = 'n', ['ລ'] = 'n',
	['ວ'] = 'w',
	-- A syllable without a coda concatenates to '', which maps to ''.
	[''] = '',
}

-- Special symbols.
-- export.tr looks up every character of a syllable here and appends the hits
-- to the transliterated syllable; characters that are not keys contribute ''.
local sp_symbols = {
	-- Iteration marks become a ditto mark (export.tr strips it again when it
	-- expands the repetition).
	['ຯ'] = '〃', ['ໆ'] = '〃',
	-- The cancel mark itself produces no output.
	['໌'] = '',
	-- Lao digits map to ASCII digits.
	['໐'] = '0', ['໑'] = '1', ['໒'] = '2', ['໓'] = '3', ['໔'] = '4',
	['໕'] = '5', ['໖'] = '6', ['໗'] = '7', ['໘'] = '8', ['໙'] = '9'
}

-- List of character types.
-- Maps a single Lao character to the category export.split_syll branches on:
--   'coda'        consonant that may either start or end a syllable
--   'cons'        consonant that may only start a syllable
--   'ambig'       ຍ / ວ / ອ, which may act as onset, vowel part or coda
--   'pref_vowel'  vowel written before the onset; always opens a new syllable
--   'suf_vowel', 'vowel_let'  vowel signs/letters that stay in the current syllable
--   'glide'       the glide sign ຼ
--   'tone'        tone marks
--   'iter_symbol', 'canc_symbol'  iteration and cancel marks
--   'number'      Lao digits
-- Characters not listed here yield nil.
local char_type = {
	['ກ'] = 'coda', ['ຂ'] = 'coda', ['ຄ'] = 'coda', ['ງ'] = 'coda',
	['ຈ'] = 'coda', ['ຊ'] = 'coda', ['ຍ'] = 'ambig',
	['ດ'] = 'coda', ['ຕ'] = 'coda', ['ຖ'] = 'coda', ['ທ'] = 'coda', ['ນ'] = 'coda',
	['ບ'] = 'coda', ['ປ'] = 'coda', ['ຜ'] = 'cons', ['ຝ'] = 'cons', ['ພ'] = 'coda', ['ຟ'] = 'coda', ['ມ'] = 'coda',
	['ຢ'] = 'coda', ['ຣ'] = 'coda', ['ລ'] = 'coda', ['ວ'] = 'ambig',
	['ສ'] = 'coda', ['ຫ'] = 'cons', ['ອ'] = 'ambig', ['ຮ'] = 'cons',
	['ໜ'] = 'cons', ['ໝ'] = 'cons',
	['ຯ'] = 'iter_symbol',
	['ະ'] = 'vowel_let', ['ັ'] = 'suf_vowel', ['າ'] = 'vowel_let', ['ຳ'] = 'suf_vowel',
	['ິ'] = 'suf_vowel', ['ີ'] = 'suf_vowel', ['ຶ'] = 'suf_vowel', ['ື'] = 'suf_vowel',
	['ຸ'] = 'suf_vowel', ['ູ'] = 'suf_vowel', ['ົ'] = 'suf_vowel',
	['ຼ'] = 'glide',
	['ຽ'] = 'vowel_let',
	['ເ'] = 'pref_vowel', ['ແ'] = 'pref_vowel',
	['ໂ'] = 'pref_vowel', ['ໃ'] = 'pref_vowel', ['ໄ'] = 'pref_vowel',
	['ໆ'] = 'iter_symbol',
	['່'] = 'tone', ['້'] = 'tone', ['໊'] = 'tone', ['໋'] = 'tone',
	['໌'] = 'canc_symbol', ['ໍ'] = 'suf_vowel',
	['໐'] = 'number', ['໑'] = 'number', ['໒'] = 'number', ['໓'] = 'number', ['໔'] = 'number',
	['໕'] = 'number', ['໖'] = 'number', ['໗'] = 'number', ['໘'] = 'number', ['໙'] = 'number'
}

-- List of consonant classes
-- Maps each single consonant letter to 'high', 'mid' or 'low'.
-- NOTE(review): no function in this file reads this table, and being `local`
-- it is not visible to other modules; presumably kept for future tone
-- handling -- confirm before removing.
local cons_class = {
	['ກ'] = 'mid', ['ຂ'] = 'high', ['ຄ'] = 'low', ['ງ'] = 'low',
	['ຈ'] = 'mid', ['ສ'] = 'high', ['ຊ'] = 'low', ['ຍ'] = 'low',
	['ດ'] = 'mid', ['ຕ'] = 'mid', ['ຖ'] = 'high', ['ທ'] = 'low', ['ນ'] = 'low',
	['ບ'] = 'mid', ['ປ'] = 'mid', ['ຜ'] = 'high', ['ຝ'] = 'high', ['ພ'] = 'low', ['ຟ'] = 'low', ['ມ'] = 'low',
	['ຢ'] = 'mid', ['ຣ'] = 'low', ['ລ'] = 'low', ['ວ'] = 'low',
	['ຫ'] = 'high', ['ອ'] = 'mid', ['ຮ'] = 'low'
}

-- Create a blank syllable record.
-- Every field starts as its own empty list: `curr` (all characters of the
-- syllable), `initial`, `glide`, `vowel`, `tone`, `coda` and `sp` (special
-- symbols).
local function reset_syllable()
	local blank = {}
	for _, field in ipairs({ 'curr', 'initial', 'glide', 'vowel', 'tone', 'coda', 'sp' }) do
		blank[field] = {}
	end
	return blank
end

-- Append a snapshot of the syllable being built to `syllables` and hand back
-- a blank record to continue with.
-- The snapshot is a new table that shares the field lists of `curr_syll`;
-- callers are expected to replace `curr_syll` with the returned record.
local function store_and_reset(syllables, curr_syll)
	local snapshot = {}
	for _, field in ipairs({ 'curr', 'initial', 'glide', 'vowel', 'tone', 'coda', 'sp' }) do
		snapshot[field] = curr_syll[field]
	end
	syllables[#syllables + 1] = snapshot
	return reset_syllable()
end

-- Split the entry into individual syllables.
--
-- Every maximal run of characters in the range ກ-ໝ is scanned left to right.
-- Each character is classified through `char_type` and either attached to the
-- syllable being built or used as the start of a new one. Anything outside
-- that range only separates runs.
--
-- @param text   string containing Lao text
-- @param debug  when truthy, return a single string with the characters of
--               each syllable concatenated and the syllables joined by '-'
-- @return       by default, a list of syllable records with the list fields
--               `curr`, `initial`, `glide`, `vowel`, `tone`, `coda` and `sp`
function export.split_syll(text, debug)
	-- Store the split syllables.
	local syllables = {}
	local curr_syll = reset_syllable()

	-- Iterate through Lao characters.
	for lao_text in gmatch(text, '[ກ-ໝ]+') do
		local c, c_types = {}, {}

		-- Classify each character in the run.
		for i = 1, len(lao_text) do
			c[i] = sub(lao_text, i, i)
			c_types[i] = char_type[c[i]]
		end

		-- Parse the entry by identifying each character's type.
		-- The extra pass (i == #c + 1) flushes the last syllable of the run.
		for i = 1, #c + 1 do
			local type_curr, type_next = c_types[i], c_types[i+1]
			-- Two-character lookahead. Past the end of the run (or for an
			-- unclassified character) fall back to '' so that match() is never
			-- handed nil, which would raise an error.
			local type_after_next = c_types[i+2] or ''
			local char_after_next = c[i+2] or ''

			-- A run of digits forms a syllable of its own: close it before any
			-- non-digit is processed so that following letters are not merged
			-- into it.
			local in_digit_run = #curr_syll.curr ~= 0 and char_type[curr_syll.curr[1]] == 'number'
			if in_digit_run and type_curr ~= 'number' then
				curr_syll = store_and_reset(syllables, curr_syll)
				in_digit_run = false
			end

			local curr_vowel_full = table.concat(curr_syll.vowel)

			-- Prefix vowels are always the start of a new syllable.
			if type_curr == 'pref_vowel' or i == #c + 1 then
				if #curr_syll.curr ~= 0 then
					curr_syll = store_and_reset(syllables, curr_syll)
				end
				-- On the flushing pass there is no character to add.
				if c[i] then
					table.insert(curr_syll.vowel, c[i])
					table.insert(curr_syll.curr, c[i])
				end

			-- Glide consonants always follow the initial consonant.
			elseif type_curr == 'glide' then
				table.insert(curr_syll.glide, c[i])
				table.insert(curr_syll.curr, c[i])

			-- Suffix vowels and vowel letters are always part of the same syllable.
			elseif type_curr == 'suf_vowel' or type_curr == 'vowel_let' then
				table.insert(curr_syll.vowel, c[i])
				table.insert(curr_syll.curr, c[i])

			-- Same with tone marks.
			elseif type_curr == 'tone' then
				table.insert(curr_syll.tone, c[i])
				table.insert(curr_syll.curr, c[i])

			-- Some consonants can end a syllable.
			elseif type_curr == 'coda' then
				-- Extend the onset when the result is a known initial and no
				-- vowel other than a prefix vowel has been seen yet.
				if #curr_syll.coda == 0 and initial_conv[table.concat(curr_syll.initial)..c[i]] and (#curr_syll.vowel == 0 or char_type[curr_vowel_full] == 'pref_vowel') then
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				-- Otherwise take it as the coda unless what follows shows that
				-- it must be the onset of the next syllable.
				elseif #curr_syll.coda == 0 and #curr_syll.initial ~= 0 and (type_next ~= 'glide' and type_next ~= 'suf_vowel' and type_next ~= 'vowel_let' and type_next ~= 'tone')
				and not (type_next == 'ambig' and match(type_after_next, 'co'))
				and not ((c_types[i-1] ~= 'tone' and c_types[i-1] ~= 'suf_vowel' and c[i-1] ~= 'ອ') and type_next == 'ambig' and match(char_after_next, '[ຍາ]')) then
					table.insert(curr_syll.coda, c[i])
					table.insert(curr_syll.curr, c[i])
				else
					curr_syll = store_and_reset(syllables, curr_syll)
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				end

			-- However, some consonants can only start a syllable.
			elseif type_curr == 'cons' then
				if #curr_syll.coda == 0 and initial_conv[table.concat(curr_syll.initial)..c[i]] and (#curr_syll.vowel == 0 or char_type[curr_vowel_full] == 'pref_vowel') then
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				else
					curr_syll = store_and_reset(syllables, curr_syll)
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				end

			-- Ambiguous characters can both start or end a syllable.
			elseif type_curr == 'ambig' then
				if #curr_syll.curr > 0 and c[i] == 'ອ' and type_next == 'suf_vowel' then
					curr_syll = store_and_reset(syllables, curr_syll)
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				elseif #curr_syll.initial == 0 or char_type[curr_vowel_full] == 'pref_vowel' then
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				elseif c[i] == 'ຍ' and c[i-1] == 'າ' then -- quick hack (FIXME)
					table.insert(curr_syll.vowel, c[i])
					table.insert(curr_syll.curr, c[i])
				elseif c[i] == 'ຍ' and c[i-1] ~= 'ຫ' and #curr_vowel_full == 0 then
					curr_syll = store_and_reset(syllables, curr_syll)
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				elseif #curr_syll.initial ~= 0 and (#curr_vowel_full == 0 or vowel_conv[curr_vowel_full..c[i]] and
				(type_next ~= 'glide' and type_next ~= 'suf_vowel' and type_next ~= 'vowel_let' and type_next ~= 'tone')) then
					table.insert(curr_syll.vowel, c[i])
					table.insert(curr_syll.curr, c[i])
				else
					curr_syll = store_and_reset(syllables, curr_syll)
					table.insert(curr_syll.initial, c[i])
					table.insert(curr_syll.curr, c[i])
				end

			-- Numbers are kept in a syllable of their own; consecutive digits
			-- share one syllable.
			elseif type_curr == 'number' then
				if #curr_syll.curr ~= 0 and not in_digit_run then
					curr_syll = store_and_reset(syllables, curr_syll)
				end
				table.insert(curr_syll.curr, c[i])
				table.insert(curr_syll.sp, c[i])

			-- Iteration and cancel symbols are part of the current syllable.
			-- Characters inside the scanned range that `char_type` does not
			-- classify are kept the same way so that they are not dropped.
			else
				table.insert(curr_syll.curr, c[i])
				table.insert(curr_syll.sp, c[i])
			end
		end
	end

	-- Default: return full syllable information.
	if not debug then
		return syllables
	end

	-- For debug mode, return concatenated `curr` values.
	local debug_syllables = {}
	for _, syll in ipairs(syllables) do
		table.insert(debug_syllables, table.concat(syll.curr))
	end
	return table.concat(debug_syllables, '-')
end

-- Generate the transliteration of a Lao entry.
--
-- The text is split with export.split_syll, each syllable is converted through
-- the onset, glide, vowel, coda and special-symbol tables, and the results are
-- joined with single spaces.
--
-- @param text  string containing Lao text
-- @param lang  not used by this function
-- @param sc    not used by this function
-- @return      the transliteration as one string
function export.tr(text, lang, sc)
	local join = table.concat
	local pieces = {}

	for _, syllable in ipairs(export.split_syll(text, false)) do
		-- ຫ followed by the glide sign is the ligature onset ຫຼ, not a glide.
		if join(syllable.initial) == 'ຫ' and join(syllable.glide) == 'ຼ' then
			syllable.initial, syllable.glide = {'ຫ', 'ຼ'}, {}
		end
		-- An onset without any written vowel is read as if ະ were present.
		if join(syllable.initial) ~= '' and join(syllable.vowel) == '' then
			syllable.vowel = {'ະ'}
		end

		-- An ambiguous letter left at the end of a multi-letter onset is
		-- really a vowel part: move it over (ວ is checked before ຍ).
		for _, semivowel in ipairs({'ວ', 'ຍ'}) do
			local onset = syllable.initial
			if #onset > 1 and onset[#onset] == semivowel then
				onset[#onset] = nil
				syllable.vowel[#syllable.vowel + 1] = semivowel
			end
		end

		-- Conversely, a leading ຍ in the vowel slot belongs to the onset when
		-- the onset starts with ຫ.
		if #syllable.vowel > 1 and syllable.vowel[1] == 'ຍ' and syllable.initial[1] == 'ຫ' then
			table.remove(syllable.vowel, 1)
			syllable.initial[#syllable.initial + 1] = 'ຍ'
		end

		-- The syllable is final from here on; flatten each part once.
		local initial_key = join(syllable.initial)
		local vowel_key = join(syllable.vowel)
		local coda_key = join(syllable.coda)
		local sp_key = join(syllable.sp)

		-- Look up each part; unknown combinations transliterate to ''.
		local initial = initial_conv[initial_key] or ''
		local glide = glide_conv[join(syllable.glide)] or ''
		local vowel = vowel_conv[vowel_key] or ''
		local coda = coda_conv[coda_key] or ''

		-- Special symbols are collected from every character of the syllable.
		local sp_buf = {}
		for ch in gmatch(join(syllable.curr), ".") do
			sp_buf[#sp_buf + 1] = sp_symbols[ch] or ''
		end
		local sp = join(sp_buf)

		-- Context-dependent vowel readings; a later rule overrides an earlier one.
		-- ແ_ວ is uǣ with certain initial consonants (ກຂຄງຈສຊຖທລອຮ) plus a coda.
		if match(initial_key, '[ກຂຄງຈສຊຖທລອຮ]') and match(vowel_key, 'ແວ') and coda ~= '' then
			vowel = 'uǣ'
		end
		-- _ວຍ is ūai when the initial consonant is ຫ.
		if match(initial_key, 'ຫ') and match(vowel_key, 'ວຍ') then
			vowel = 'ūai'
		end
		-- _ວຽ is uīa when the coda is ນ.
		if match(coda_key, 'ນ') and match(vowel_key, 'ວຽ') then
			vowel = 'uīa'
		end

		local body = initial .. glide .. vowel .. coda
		local syll_string = body .. sp

		-- A cancel mark strikes out the last character of the syllable.
		if match(sp_key, '໌') then
			syll_string = gsub(syll_string, '.$', '<small><del>%0</del></small>')
		end

		if match(sp_key, '[ຯໆ]') and body ~= '' then
			-- An iteration mark repeats the syllable: emit it once plainly and
			-- once small and underlined, without the ditto placeholder.
			local repeated = gsub(syll_string, '〃', '')
			pieces[#pieces + 1] = repeated
			pieces[#pieces + 1] = '<small><u>' .. repeated .. '</u></small>'
		else
			pieces[#pieces + 1] = syll_string
		end
	end

	-- Return the transliteration as a concatenated string.
	return join(pieces, ' ')
end

return export