Модуль:Ru-pron

Модуль для генерации транскрипции русских слов. Используется в шаблоне {{transcription-ru}}.
local ut = require("Module:utils")
local com = require("Module:ru-common")
local m_ru_translit = require("Module:ru-translit")
local strutils = require("Module:string utilities")

local export = {}
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

-- apply rsub() repeatedly until no change
local function rsub_repeatedly(term, foo, bar)
	while true do
		local new_term = rsub(term, foo, bar)
		if new_term == term then
			return term
		end
		term = new_term
	end
end

-- If enabled, compare this module with new version of module to make
-- sure all pronunciations are the same. Eventually consider removing this;
-- but useful as new code is created.
local test_new_ru_pron_module = false
-- If enabled, do new code for final -е; else, the old way
local new_final_e_code = false

local AC = u(0x0301) -- acute =  ́
local GR = u(0x0300) -- grave =  ̀
local CFLEX = u(0x0302) -- circumflex =  ̂
local DUBGR = u(0x030F) -- double grave =  ̏
local DOTABOVE = u(0x0307) -- dot above =  ̇
local DOTBELOW = u(0x0323) -- dot below =  ̣

local vow = 'aeiouyɛəäạëöü'
local ipa_vow = vow .. 'ɐɪʊɨæɵʉ'
local vowels, vowels_c = '[' .. vow .. ']', '([' .. vow .. '])'
-- No need to include DUBGR here because we rewrite it to CFLEX very early
local acc = AC .. GR .. CFLEX .. DOTABOVE .. DOTBELOW
local accents = '[' .. acc .. ']'

local perm_syl_onset = ut.list_to_set({
	'bd', 'bz', 'bl', 'br', 'vd', 'vz', 'vzv', 'vl', 'vm', 'vr', 'gl', 'gn', 'gr', 'dž', 'dn', 'dv', 'dl', 'dr', 'žd', 'zb', 'zd', 'zm', 'zn', 'zv', 'zr', 'kl', 'kr', 'kt', 'ml', 'mn', 'nr', 'pl', 'pr', 'ps', 'pt', 'pš', 'stv', 'str', 'sp', 'st', 'sk', 'skl', 'skr', 'sl', 'sf', 'sx', 'sc', 'sm', 'sn', 'sv','sʺ', 'sʹj', 'spr', 'sr', 'tk', 'tv', 'tl', 'tr', 'fk', 'fl', 'fr', 'fs', 'ft', 'fš', 'xl', 'xm', 'xr', 'šk', 'šl', 'št', 'šč',
})

-- FIXME: Consider changing ӂ internally to ʑ to match ɕ (it is used externally
-- in e.g. дроӂӂи (pronunciation spelling of дрожжи)
local translit_conv = {
	['c'] = 't͡s', ['č'] = 't͡ɕ', ['ĉ'] = 't͡ʂ',
	['g'] = 'ɡ', ['ĝ'] = 'd͡ʐ',
	['ĵ'] = 'd͡z', ['ǰ'] = 'd͡ʑ', ['ӂ'] = 'ʑ',
	['š'] = 'ʂ', ['ž'] = 'ʐ'
}

local translit_conv_j = {
	['cʲ'] = 'tʲ͡sʲ',
	['ĵʲ'] = 'dʲ͡zʲ'
}

local allophones = {
	['a'] = { 'a', 'ɐ', 'ə' },
	['e'] = { 'e', 'ɪ', 'ɪ' },
	['i'] = { 'i', 'ɪ', 'ɪ' },
	['o'] = { 'o', 'ɐ', 'ə' },
	['u'] = { 'u', 'ʊ', 'ʊ' },
	['y'] = { 'ɨ', 'ɨ', 'ɨ' },
	['ɛ'] = { 'ɛ', 'ɛ', 'ɛ' },
	['ä'] = { 'a', 'ɪ', 'ɪ' },
	['ạ'] = { 'a', 'ɐ', 'ə' },
	['ë'] = { 'e', 'ɪ', 'ɪ' },
	['ö'] = { 'ɵ', 'ɪ', 'ɪ' },
	['ü'] = { 'u', 'ʊ', 'ʊ' },
	['ə'] = { 'ə', 'ə', 'ə' },
}

local devoicing = {
	['b'] = 'p', ['d'] = 't', ['g'] = 'k',
	['z'] = 's', ['v'] = 'f',
	['ž'] = 'š', ['ɣ'] = 'x',
	['ĵ'] = 'c', ['ǰ'] = 'č', ['ĝ'] = 'ĉ',
	['ӂ'] = 'ɕ',
}

local voicing = {
	['p'] = 'b', ['t'] = 'd', ['k'] = 'g',
	['s'] = 'z', ['f'] = 'v',
	['š'] = 'ž', ['c'] = 'ĵ', ['č'] = 'ǰ', ['ĉ'] = 'ĝ',
	['x'] = 'ɣ', ['ɕ'] = 'ӂ'
}

local iotating = {
	['a'] = 'ä',
	['e'] = 'ë',
	['o'] = 'ö',
	['u'] = 'ü'
}

local retracting = {
	['e'] = 'ɛ',
	['i'] = 'y',
}

local fronting = {
	['a'] = 'æ',
	['u'] = 'ʉ',
	['ʊ'] = 'ʉ',
}

-- Prefixes that we recognize specially when they end in a geminated
-- consonant. The first element is the result after applying voicing/devoicing,
-- gemination and other changes. The second element is the original spelling,
-- so that we don't overmatch and get cases like Поттер. We check for these
-- prefixes at the beginning of words and also preceded by ne-, po- and nepo-.
local geminate_pref = {
	--'abː', --'adː',
	{'be[szšž]ː', 'be[sz]'},
	--'braomː',
	{'[vf]ː', 'v'},
	{'vo[szšž]ː', 'vo[sz]'},
	{'i[szšž]ː', 'i[sz]'},
	--'^inː',
	{'kontrː', 'kontr'},
	{'superː', 'super'},
	{'tran[szšž]ː', 'trans'},
	{'na[tdcč]ː', 'nad'},
	{'ni[szšž]ː', 'ni[sz]'},
	{'o[tdcč]ː', 'ot'}, --'^omː',
	{'o[bp]ː', 'ob'},
	{'obe[szšž]ː', 'obe[sz]'},
    {'po[tdcč]ː', 'pod'},
	{'pre[tdcč]ː', 'pred'}, --'^paszː', '^pozː',
	{'ra[szšž]ː', 'ra[sz]'},
	{'[szšž]ː', '[szšž]'}, -- ž on right for жжёт etc., ш on left for США
	{'me[žš]ː', 'mež'},
	{'če?re[szšž]ː', 'če?re[sz]'},
	-- certain double prefixes involving ra[zs]-
	{'predra[szšž]ː', 'predra[sz]'},
	{'bezra[szšž]ː', 'bezra[sz]'},
	{'nara[szšž]ː', 'nara[sz]'},
	{'vra[szšž]ː', 'vra[sz]'},
	{'dora[szšž]ː', 'dora[sz]'},
	-- '^sverxː', '^subː', '^tröxː', '^četyröxː',
}

local sztab = { s='cs', z='ĵz' }
local function ot_pod_sz(pre, sz)
	return pre .. sztab[sz]
end

local phon_respellings = {
	{'h', 'ɣ'},

	{'šč', 'ɕː'}, -- conversion of šč to geminate

	-- the following group is ordered before changes that affect ts
	-- FIXME!!! Should these also pay attention to grave accents?
	{'́tʹ?sja⁀', '́cca⁀'},
	{'([^́])tʹ?sja⁀', '%1ca⁀'},
	{'n[dt]sk', 'n(t)sk'},
	{'s[dt]sk', 'sck'},

	-- Add / before цз, чж sequences (Chinese words) and assimilate чж
	{'cz', '/cz'},
	{'čž', '/ĝž'},

	-- main changes for affricate assimilation of [dt] + sibilant, including ts;
	-- we either convert to "short" variants t͡s, d͡z, etc. or to "long" variants
	-- t͡ss, d͡zz, etc.
	-- 1. т с, д з across word boundary, also т/с, д/з with explicitly written
	--    slash, use long variants.
	{'[dt](ʹ?[ ‿⁀/]+)s', 'c%1s'},
	{'[dt](ʹ?[ ‿⁀/]+)z', 'ĵ%1z'},
	-- 2. тс, дз + vowel use long variants.
	{'[dt](ʹ?)s(j?' .. vowels .. ')', 'c%1s%2'},
	{'[dt](ʹ?)z(j?' .. vowels .. ')', 'ĵ%1z%2'},
	-- 3. тьс, дьз use long variants.
	{'[dt]ʹs', 'cʹs'},
	{'[dt]ʹz', 'ĵʹz'},
	-- 4. word-initial от[сз]-, под[сз]- use long variants because there is
	--    a morpheme boundary.
	{'(⁀o' .. accents .. '?)t([sz])', ot_pod_sz},
	{'(⁀po' .. accents .. '?)d([sz])', ot_pod_sz},
	-- 5. other тс, дз use short variants.
	{'[dt]s', 'c'},
	{'[dt]z', 'ĵ'},
	-- 6. тш, дж always use long variants (FIXME, may change)
	{'[dt](ʹ?[ %-‿⁀/]*)š', 'ĉ%1š'},
	{'[dt](ʹ?[ %-‿⁀/]*)ž', 'ĝ%1ž'},
	-- 7. soften palatalized hard hushing affricates resulting from the previous
	{'ĉʹ', 'č'},
	{'ĝʹ', 'ǰ'},

	-- changes that generate ɕː and ɕč through assimilation:
	-- зч and жч become ɕː, as does сч at the beginning of a word and in the
	-- sequence счёт when not following [цдт] (подсчёт); else сч becomes ɕč
	-- (отсчи́тываться), as щч always does (рассчитáть written ращчита́ть)
	{'[cdt]sč', 'čɕː'},
	{'ɕːč', 'ɕč'},
	{'[zž]č', 'ɕː'},
	{'[szšž]ɕː?', 'ɕː'},
	{'⁀sč', '⁀ɕː'},
	{'sčjo(' .. accents .. '?)t', 'ɕːjo%1t'},
	{'sč', 'ɕč'},

	-- misc. changes for assimilation of [dtsz] + sibilants and affricates
	{'[sz][dt]c', 'sc'},
	{'([rn])[dt]([cč])', '%1%2'},
	{'[dt]([cč])', '%1%1'},
	-- the following is ordered before the next one, which applies assimilation
	-- of [тд] to щ (including across word boundaries)
	{'n[dt]ɕ', 'nɕ'},
	-- [сз] and [сз]ь before soft affricates [щч], including across word
	-- boundaries; note that the common sequence сч has already been handled
	{'[zs]ʹ?([ ‿⁀/]*[ɕč])', 'ɕ%1'},
	-- reduction of too many ɕ's, which can happen from the previous
	{'ɕɕː', 'ɕː'},
	-- assimilation before [тдц] and [тдц]ь before щ
	{'[cdt]ʹ?([ ‿⁀/]*)ɕ', 'č%1ɕ'},
	-- assimilation of [сз] and [сз]ь before [шж]
	{'[zs]([ ‿⁀/]*)š', 'š%1š'},
	{'[zs]([ ‿⁀/]*)ž', 'ž%1ž'},
	{'[zs]ʹ([ ‿⁀/]*)š', 'ɕ%1š'},
	{'[zs]ʹ([ ‿⁀/]*)ž', 'ӂ%1ž'},

	{'sverxi', 'sverxy'},
	{'stʹd', 'zd'},
	{'tʹd', 'dd'},

	-- loss of consonants in certain clusters
	{'([ns])[dt]g', '%1g'},
	{'zdn', 'zn'},
	{'lnc', 'nc'},
	{'[sz]t([ln])', 's%1'},
	
	 -- backing of /i/ after hard consonants in close juncture
	{'([mnpbtdkgfvszxɣrlšžcĵĉĝ])⁀‿⁀i', '%1⁀‿⁀y'},
}

local cons_assim_palatal = {
	-- assimilation of tn, dn, nč, nɕ is handled specially
	compulsory = ut.list_to_set({'stʲ', 'zdʲ', 'ntʲ', 'ndʲ', 'xkʲ',
	    'csʲ', 'ĵzʲ', 'ncʲ', 'nĵʲ'}),
	optional = ut.list_to_set({'slʲ', 'zlʲ', 'snʲ', 'znʲ', 'nsʲ', 'nzʲ',
		'mpʲ', 'mbʲ', 'mfʲ', 'fmʲ'})
}

-- words which will be treated as accentless (i.e. their vowels will be
-- reduced), and which will liaise with a preceding or following word;
-- this will not happen if the words have an accent mark, cf.
-- по́ небу vs. по не́бу, etc.
local accentless = {
	-- class 'pre': particles that join with a following word
	pre = ut.list_to_set({'bez', 'bliz', 'v', 'vo', 'do',
       'iz-pod', 'iz-za', 'za', 'iz', 'izo',
       'k', 'ko', 'mež', 'na', 'nad', 'nado', 'ob', 'obo', 'ot',
       'po', 'pod', 'podo', 'pred', 'predo', 'pri', 'pro', 'pered', 'peredo',
       'čerez', 's', 'so', 'u', 'ne'}),
	-- class 'prespace': particles that join with a following word, but only
	--   if a space (not a hyphen) separates them; hyphens are used here
	--   to spell out letters, e.g. а-эн-бэ́ for АНБ (NSA = National Security
	--   Agency) or о-а-э́ for ОАЭ (UAE = United Arab Emirates)
	prespace = ut.list_to_set({'a', 'o'}),
	-- class 'post': particles that join with a preceding word
	post = ut.list_to_set({'libo', 'nibudʹ', 'by', 'b', 'že', 'ž',
       'ka', 'tka', 'li'}),
	-- class 'posthyphen': particles that join with a preceding word, but only
	--   if a hyphen (not a space) separates them
	posthyphen = ut.list_to_set({'to'}),
}

-- Pronunciation of final unstressed -е, depending on the part of speech and
--   exact ending.
--
-- Endings:
--   oe = -ое
--   ve = any other vowel plus -е (FIXME, may have to split out -ее)
--   je = -ье
--   softpaired = soft paired consonant + -е
--   hardsib = hard sibilant (ц, ш, ж) + -е
--   softsib = soft sibilant (ч, щ) + -е
--
-- Parts of speech:
--   def = default used in absence of pos
--   n/noun = neuter noun in the nominative/accusative singular (but not ending
--     in adjectival -ое or -ее; those should be considered as adjectives)
--   pre = prepositional case singular
--   dat = dative case singular (treated same as prepositional case singular)
--   voc = vocative case (currently treated as 'mid')
--   nnp = noun nominative plural in -е (гра́ждане, боя́ре, армя́не); not
--     adjectival plurals in -ие or -ые, including adjectival nouns
--     (да́нные, а́вторские)
--   inv = invariable noun or other word (currently treated as 'mid')
--   a/adj = adjective or adjectival noun (typically either neuter in -ое or
--     -ее, or plural in -ие, -ые, or -ье, or short neuter in unpaired
--     sibilant + -е)
--   c/com = comparative (typically either in -ее or sibilant + -е)
--   adv = adverb
--   p = preposition (treated same as adverb)
--   v/vb/verb = finite verbal form (usually 2nd-plural in -те); not
--     participle forms, which should be treated as adjectives
--   pro = pronoun (кое-, какие-, ваше, сколькие)
--   num = number (двое, трое, обе, четыре; currently treated as 'mid')
--   pref = prefix (treated as 'high' because integral part of word)
--   hi/high = force high values ([ɪ] or [ɨ])
--   mid = force mid values ([e] or [ɨ])
--   lo/low/schwa = force low, really schwa, values ([ə])
--
-- Possible values:
--   1. ə [ə], e [e], i [ɪ] after a vowel or soft consonant
--   2. ə [ə] or y [ɨ] after a hard sibilant
--
-- If a part of speech doesn't have an entry for a given type of ending,
--   it receives the default value. If a part of speech's entry is a string,
--   it's an alias for another way of specifying the same part of speech
--   (e.g. n=noun).
local final_e = {
	def={oe='ə', ve='e', je='e', softpaired='e', hardsib='y', softsib='e'},
	noun={oe='ə', ve='e', je='e', softpaired='e', hardsib='ə', softsib='e'},
	n='noun',
	pre={oe='e', ve='e', softpaired='e', hardsib='y', softsib='e'},
	dat='pre',
	voc='mid',
	nnp={softpaired='e'}, -- FIXME, not sure about this
	inv='mid', --FIXME, not sure about this (e.g. вице-, кофе)
	adj={oe='ə', ve='e', je='ə'}, -- FIXME: Not sure about -ее, e.g. neut adj си́нее; FIXME, not sure about short neuter adj, e.g. похо́же from похо́жий, дорогосто́яще from дорогосто́ящий, should this be treated as neuter noun?
	a='adj',
	com={ve='e', hardsib='y', softsib='e'},
	c='com',
	adv={softpaired='e', hardsib='y', softsib='e'},
	p='adv', --FIXME, not sure about prepositions
	verb={softpaired='e'},
	v='verb',
	vb='verb',
	pro={oe='i', ve='i'}, --FIXME, not sure about ваше, сколькие, какие-, кое-
	num='mid', --FIXME, not sure about обе
	pref='high',
	-- forced values
	high={oe='i', ve='i', je='i', softpaired='i', hardsib='y', softsib='i'},
	hi='high',
	mid={oe='e', ve='e', je='e', softpaired='e', hardsib='y', softsib='e'},
	low={oe='ə', ve='ə', je='ə', softpaired='ə', hardsib='ə', softsib='ə'},
	lo='low',
	schwa='low'
}

local function ine(x)
	return x ~= "" and x or nil
end

local function track(page)
	local m_debug = require("Module:debug")
	m_debug.track("ru-pron/" .. page)
	return true
end

-- For use with {{ru-IPA|phon=...}}; remove accents that we don't want
-- to appear in the phonetic respelling
function export.phon_respelling(text)
	if type(text) == 'table' then
		text = ine(text.args[1])
	end
	text = rsub(text, '[' .. CFLEX .. DUBGR .. DOTABOVE .. DOTBELOW .. ']', '')
	return text
end

function export.ipa(text, adj, gem, bracket, pos)
	local new_module_result
	-- Test code to compare existing module to new one.
	if test_new_ru_pron_module then
		local m_new_ru_pron = require("Module:User:Benwing2/ru-pron")
		new_module_result = m_new_ru_pron.ipa(text, adj, gem, bracket, pos)
	end

	if type(text) == 'table' then
		text, adj, gem, bracket, pos = (ine(text.args.phon) or ine(text.args[1])), ine(text.args.adj), ine(text.args.gem), ine(text.args.bracket), ine(text.args.pos)
		if not text then
			text = mw.title.getCurrentTitle().text
		end
	end
	gem = usub(gem or '', 1, 1)
	bracket = ine(usub(bracket or '', 1, 1))
	if bracket == 'n' then
		bracket = nil
	end

	pos = pos or "def"
	-- If a multipart part of speech, split into components, and convert
	-- each blank component to the default.
	if rfind(pos, "/") then
		pos = rsplit(pos, "/")
		for i=1,#pos do
			if pos[i] == "" then
				pos[i] = "def"
			end
		end
	end
	-- Verify that pos (or each part of multipart pos) is recognized
	for _, p in ipairs(type(pos) == "table" and pos or {pos}) do
		if not final_e[pos] then
			error("Unrecognized part of speech '" .. pos .. "': Should be n/noun/neut, a/adj, c/com, pre, dat, adv, inv, voc, v/verb, pro, hi/high, mid, lo/low/schwa or omitted")
		end
	end

	text = ulower(text)

	if gem ~= '' then
		track("gem")
		track("gem/" .. gem)
	end
	if adj then
		track("adj")
	end
	if rfind(text, "[a-zščžáéíóúýàèìòùỳ]") then
		track("latin-text")
	end
	if rfind(text, "[сз]ч") then
		track("sch")
	end
	if rfind(text, "[шж]ч") then
		track("shch")
	end
	if rfind(text, CFLEX) then
		track("cflex")
	end
	if rfind(text, DUBGR) then
		track("dubgr")
	end

	text = rsub(text, "``", DUBGR)
	text = rsub(text, "`", GR)
	text = rsub(text, "@", DOTABOVE)
	text = rsub(text, "%^", CFLEX)
	text = rsub(text, DUBGR, CFLEX)

	-- translit doesn't always convert э to ɛ (depends on whether a consonant
	-- precedes), so do it ourselves before translit
	text = rsub(text, 'э', 'ɛ')
	-- vowel + йе should have double jj, but the translit module will translit
	-- it the same as vowel + е, so do it ourselves before translit
	text = rsub(text, '([' .. com.vowel .. ']' .. com.opt_accent .. ')й([еѐ])',
		'%1йй%2')
	-- transliterate and decompose Latin vowels with accents, recomposing
	-- certain key combinations
	text = com.translit(text)

	-- handle old ě (e.g. сѣдло́), and ě̈ from сѣ̈дла
	text = rsub(text, 'ě̈', 'jo' .. AC)
	text = rsub(text, 'ě', 'e')
	-- handle sequences of accents (esp from ё with secondary/tertiary stress)
	text = rsub(text, accents .. '+(' .. accents .. ')', '%1')

	-- canonicalize multiple spaces
	text = rsub(text, '%s+', ' ')

	-- Add primary stress to single-syllable words preceded or followed by
	-- unstressed particle or preposition. Add "tertiary" stress to remaining
	-- single-syllable words that aren't a particle, preposition, prefix or
	-- suffix and don't already bear an accent (including force-reduction
	-- accents, i.e. dot-above/dot-below); "tertiary stress" means a vowel is
	-- treated as stressed for the purposes of vowel reduction but isn't
	-- marked with a primary or secondary stress marker; we repurpose a
	-- circumflex for this purpose. We need to preserve the distinction
	-- between spaces and hyphens because (1) we only recognize certain
	-- post-accentless particles following a hyphen (to distinguish e.g.
	-- 'то' from '-то'); (2) we only recognize certain pre-accentless
	-- particles preceding a space (to distinguish particles 'о' and 'а' from
	-- spelled letters о and а, which should not be reduced); and (3) we
	-- recognize hyphens for the purpose of marking unstressed prefixes and
	-- suffixes.
	local word = strutils.capturing_split(text, "([ %-]+)")
	for i = 1, #word do
		-- check for single-syllable words that need a stress; they must meet
		-- the following conditions:
		-- 1. must not be an accentless word, which is any of the following:
		--         1a. in the "pre" class, or
		if not (accentless['pre'][word[i]] or
				-- 1b. in the "prespace" class if followed by space and another word, or
				i < #word - 1 and accentless['prespace'][word[i]] and word[i+1] == " " or
				-- 1c. in the "post" class if preceded by another word, or
				i > 2 and accentless['post'][word[i]] or
				-- 1d. in the "posthyphen" class preceded by a hyphen and another word;
				i > 2 and accentless['posthyphen'][word[i]] and word[i-1] == "-") and
		-- 2. must be one syllable;
			ulen(rsub(word[i], '[^' .. vow .. ']', '')) == 1 and
		-- 3. must not have any accents (including dot-above, forcing reduction);
			not rfind(word[i], accents) and
		-- 4. must not be a prefix or suffix, identified by a preceding or trailing hyphen, i.e. one of the following:
		--         4a. utterance-initial preceded by a hyphen, or
			not (i == 3 and word[2] == "-" and word[1] == "" or
			    -- 4b. non-utterance-initial preceded by a hyphen, or
				i >= 3 and word[i-1] == " -" or
			    -- 4c. utterance-final followed by a hyphen, or
				i == #word - 2 and word[i+1] == "-" and word[i+2] == "" or
			    -- 4d. non-utterance-final followed by a hyphen;
				i <= #word - 2 and word[i+1] == "- ") then

		-- OK, we have a stressable single-syllable word; either add primary
		-- or tertiary stress:
		-- 1. add primary stress if preceded or followed by an accentless word,
			if (i > 2 and accentless['pre'][word[i-2]] or
				i > 2 and word[i-1] == " " and accentless['prespace'][word[i-2]] or
				i < #word - 1 and accentless['post'][word[i+2]] or
				i < #word - 1 and word[i+1] == "-" and accentless['posthyphen'][word[i+2]]) then
				word[i] = rsub(word[i], vowels_c, '%1' .. AC)
		-- 2. else add tertiary stress
			else
				word[i] = rsub(word[i], vowels_c, '%1' .. CFLEX)
			end
		end
	end

	-- make unaccented prepositions and particles liaise with the following or
	-- preceding word
	for i = 1, #word do
		if i < #word - 1 and (accentless['pre'][word[i]] or accentless['prespace'][word[i]] and word[i+1] == " ") then
			word[i+1] = '‿'
		elseif i > 2 and (accentless['post'][word[i]] or accentless['posthyphen'][word[i]] and word[i-1] == "-") then
			word[i-1] = '‿'
		end
	end

	-- rejoin words, convert hyphens to spaces and eliminate stray spaces
	-- resulting from this
	text = table.concat(word, "")
	text = rsub(text, '[%-%s]+', ' ')
	text = rsub(text, '^ ', '')
	text = rsub(text, ' $', '')

	-- convert commas and en/en dashes to IPA foot boundaries
	text = rsub(text, '%s*[,–—]%s*', ' | ')

	-- add a ⁀ at the beginning and end of every word and at close juncture
	-- boundaries; we will remove this later but it makes it easier to do
	-- word-beginning and word-end rsubs
	text = rsub(text, ' ', '⁀ ⁀')
	text = '⁀' .. text .. '⁀'
	text = rsub(text, '‿', '⁀‿⁀')

	-- save original word spelling before respellings, (de)voicing changes,
	-- geminate changes, etc. for implementation of geminate_pref
	local orig_word = rsplit(text, " ", true)

	-- insert or remove /j/ before [aou] so that palatal versions of these
	-- vowels are always preceded by /j/ and non-palatal versions never are
	-- (do this before the change below adding tertiary stress to final
	-- palatal о):
	-- (1) Non-palatal [ou] after always-hard шж (e.g. in брошю́ра, жю́ри)
	--     despite the spelling (FIXME, should this also affect [a]?)
	text = rsub(text, '([šž])j([ou])', '%2%3')
	-- (2) Palatal [aou] after always-soft щчӂ and voiced variant ǰ (NOTE:
	--     this happens before the change šč -> ɕː in phon_respellings)
	text = rsub(text, '([čǰӂ])([aou])', '%1j%2')
	-- (3) ьо is pronounced as ьйо, i.e. like (possibly unstressed) ьё, e.g.
	--     in Асунсьо́н
	text = rsub(text, 'ʹo', 'ʹjo')

	-- add tertiary stress to some final -о (this needs to be done before
	-- eliminating dot-above, after adding ⁀, after adding /j/ before palatal о):
	-- (1) after vowels, e.g. То́кио
	text = rsub(text, '(' .. vowels .. accents .. '?o)⁀', '%1' .. CFLEX .. '⁀')
	-- (2) when palatal, e.g. ра́нчо, га́учо, ма́чо, Ога́йо
	text = rsub(text, 'jo⁀', 'jo' .. CFLEX .. '⁀')

	-- eliminate dot-above, which has served its purpose of preventing any
	-- sort of stress
	text = rsub(text, DOTABOVE, '')
	-- eliminate dot-below
	text = rsub(text, 'ja' .. DOTBELOW, 'jạ')
	if rfind(text, DOTBELOW) then
		error("Dot-below accent can only be placed on я or palatal а")
	end

	text = adj and rsub(text, '(.[aoe]́?)go(' .. AC .. '?)⁀', '%1vo%2⁀') or text
	text = adj and rsub(text, '(.[aoe]́?)go(' .. AC .. '?)⁀', '%1vo%2sja⁀') or text

	--phonetic respellings
	for _, respell in ipairs(phon_respellings) do
		text = rsub(text, respell[1], respell[2])
	end

	--voicing, devoicing
	--NOTE: v before an obstruent assimilates in voicing and triggers voicing
	--assimilation of a preceding consonant; neither happens before a sonorant
	--1. absolutely final devoicing
	text = rsub(text, '([bdgvɣzžĝĵǰӂ])(ʹ?⁀)$', function(a, b)
		return devoicing[a] .. b end)
	--2. word-final devoicing before another word
	text = rsub(text, '([bdgvɣzžĝĵǰӂ])(ʹ?⁀ ⁀[^bdgɣzžĝĵǰӂ])', function(a, b)
		return devoicing[a] .. b end)
	--3. voicing/devoicing assimilation; repeat to handle recursive assimilation
	while true do
		local new_text = rsub(text, '([bdgvɣzžĝĵǰӂ])([ ‿⁀ʹː()/]*[ptkfxsščɕcĉ])', function(a, b)
			return devoicing[a] .. b end)
		new_text = rsub(new_text, '([ptkfxsščɕcĉ])([ ‿⁀ʹː()/]*v?[ ‿⁀ʹː()/]*[bdgɣzžĝĵǰӂ])', function(a, b)
			return voicing[a] .. b end)
		if new_text == text then
			break
		end
		text = new_text
	end

	--re-notate orthographic geminate consonants
	text = rsub(text, '([^' .. vow .. '.%-_])' .. '%1', '%1ː')
	text = rsub(text, '([^' .. vow .. '.%-_])' .. '%(%1%)', '%1(ː)')

	--rewrite iotated vowels
	text = rsub(text, '(j[%(ː%)]*)([aeou])', function(a, b)
		return a .. iotating[b] end)
	-- eliminate j after consonant and before iotated vowel (including
	-- semi-reduced ạ)
	text = rsub(text, '([^' .. vow .. acc .. 'ʹʺ‿⁀ ]/?)j([äạëöü])', '%1%2')

	--split by word and process each word
	word = rsplit(text, " ", true)

	if type(pos) == "table" and #pos ~= #word then
		error("Number of parts of speech (" .. #pos .. ") should match number of combined words (" .. #word .. ")")
	end
	for i = 1, #word do
		local pron = word[i]

		-- Check for gemination at prefix boundaries; if so, convert the
		-- regular gemination symbol ː to a special symbol ˑ that indicates
		-- we always preserve the gemination unless gem=n. We look for
		-- certain sequences at the beginning of a word, but make sure that
		-- the original spelling is appropriate as well (see comment above
		-- for geminate_pref).
		if rfind(pron, 'ː') then
			local orig_pron = orig_word[i]
			local deac = rsub(pron, accents, '')
			local orig_deac = rsub(orig_pron, accents, '')
			for _, gempref in ipairs(geminate_pref) do
				local newspell = gempref[1]
				local oldspell = gempref[2]
				-- FIXME! The rsub below will be incorrect if there is
				-- gemination in a joined preposition or particle
				if rfind(orig_deac, '⁀' .. oldspell) and rfind(deac, '⁀' .. newspell) or
					rfind(orig_deac, '⁀ne' .. oldspell) and rfind(deac, '⁀ne' .. newspell) then
					pron = rsub(pron, '(⁀[^‿⁀ː]*)ː', '%1ˑ')
				end
			end
		end

		--degemination, optional gemination
		if gem == 'y' then
			-- leave geminates alone, convert ˑ to regular gemination; ˑ is a
			-- special gemination symbol used at prefix boundaries that we
			-- remove only when gem=n, else we convert it to regular gemination
			pron = rsub(pron, 'ˑ', 'ː')
		elseif gem == 'o' then
			-- make geminates optional, except for ɕӂ, also ignore left paren
			-- in (ː) sequence
			pron = rsub(pron, '([^ɕӂ%(%)])[ːˑ]', '%1(ː)')
		elseif gem == 'n' then
			-- remove gemination, except for ɕӂ
			pron = rsub(pron, '([^ɕӂ%(%)])[ːˑ]', '%1')
		else
			-- degeminate l's
			pron = rsub(pron, '(l)ː', '%1')
			-- preserve gemination between vowels immediately after the stress,
			-- special gemination symbol ˑ also remains, ɕӂ remain geminated,
			-- žn remain geminated between vowels even not immediately after
			-- the stress, n becomes optionally geminated when after but not
			-- immediately after the stress, ssk and zsk remain geminated
			-- immediately after the stress, else degeminate; we signal that
			-- gemination should remain by converting to special symbol ˑ,
			-- then removing remaining ː not after ɕӂ and left paren; do
			-- various subs repeatedly in case of multiple geminations in a word
			-- 1. immediately after the stress
			pron = rsub_repeatedly(pron, '(' .. vowels .. accents .. '[^ɕӂ%(%)])ː(' .. vowels .. ')', '%1ˑ%2')
			-- 2. remaining geminate n after the stress between vowels
			pron = rsub_repeatedly(pron, '(' .. AC .. '.-' .. vowels .. accents .. '?n)ː(' .. vowels .. ')', '%1(ː)%2')
			-- 3. remaining ž and n between vowels
			pron = rsub_repeatedly(pron, '(' .. vowels .. accents .. '?[žn])ː(' .. vowels .. ')', '%1ˑ%2')
			-- 4. ssk (and zsk, already normalized) immediately after the stress
			pron = rsub(pron, '(' .. vowels .. accents .. '[^' .. vow .. ']*s)ː(k)', '%1ˑ%2')
			-- 5. eliminate remaining gemination
			pron = rsub(pron, '([^ɕӂ%(%)])ː', '%1')
			-- 6. convert special gemination symbol ˑ to regular gemination
			pron = rsub(pron, 'ˑ', 'ː')
		end

		-- handle soft and hard signs, assimilative palatalization
		-- 1. insert j before i when required
		pron = rsub(pron, 'ʹi', 'ʹji')
		-- 2. insert glottal stop after hard sign if required
		pron = rsub(pron, 'ʺ([aɛiouy])', 'ʔ%1')
		-- 3. assimilative palatalization of consonants when followed by
		--    front vowels or soft sign
		pron = rsub(pron, '([mnpbtdkgfvszxɣrl])([ː()]*[eiäạëöüʹ])', '%1ʲ%2')
		pron = rsub(pron, '([cĵ])([ː()]*[äạöüʹ])', '%1ʲ%2')
		-- 4. remove hard and soft signs
		pron = rsub(pron, "[ʹʺ]", "")

		-- reduction of unstressed word-final -я, -е; but special-case
		-- unstressed не, же. Final -я always becomes [ə]; final -е may
		-- become [ə], [e], [ɪ] or [ɨ] depending on the part of speech and
		-- the preceding consonants/vowels.
		pron = rsub(pron, '[äạ]⁀', 'ə⁀')
		pron = rsub(pron, '⁀nʲe⁀', '⁀nʲi⁀')
		pron = rsub(pron, '⁀že⁀', '⁀žy⁀')
		-- function to fetch the appropriate value for ending and part of
		-- speech, handling aliases and defaults and converting 'e' to 'ê'
		-- so that the unstressed [e] sound is preserved
		function fetch_e_sub(ending)
			local thispos = type(pos) == "table" and pos[i] or pos
			local chart = final_e[thispos]
			while type(chart) == "string" do -- handle aliases
				chart = final_e[chart]
			end
			assert(type(chart) == "table")
			local sub = chart[ending] or final_e['def'][ending]
			assert(sub)
			if sub == 'e' then
				-- add CFLEX to preserve the unstressed [e] sound, which
				-- will otherwise be converted to [ɪ]; NOTE: DO NOT use ê
				-- here directly because it's a single composed char, when
				-- we need the e and accent to be separate
				return 'e' .. CFLEX
			else
				return sub
			end
		end
		if new_final_e_code then
			-- handle substitutions in two parts, one for vowel+j+e sequences
			-- and the other for cons+e sequences
			pron = rsub(pron, vowels_c .. '(' .. accents .. '?j)ë⁀', function(v, ac)
				 local ty = v == 'o' and 'oe' or 've'
				 return v .. ac .. fetch_e_sub(ty)
			end)
			-- consonant may palatalized, geminated or optional-geminated
			pron = rsub(pron, '(.)(ʲ?[ː()]*)[eë]⁀', function(ch, mod)
				 local ty = ch == 'j' and 'je' or
					rfind(ch, '[cĵšžĉĝ]') and 'hardsib' or
					rfind(ch, '[čǰɕӂ]') and 'softsib' or
					'softpaired'
				 return ch ..mod .. fetch_e_sub(ty)
			end)
		else
			-- Do the old way, which mostly converts final -е to schwa, but
			-- has highly broken retraction code for vowel + [шжц] + е (but
			-- not with accent on vowel!) before it that causes final -е in
			-- this circumstance to become [ɨ], and a special hack for кое-.
			pron = rsub(pron, vowels_c .. '([cĵšžĉĝ][ː()]*)[eë]', '%1%2ɛ')
			pron = rsub(pron, '⁀ko(' .. accents .. ')jë⁀', '⁀ko%1ji⁀')
			pron = rsub(pron, '[eë]⁀', 'ə⁀')
		end

		-- retraction of е and и after цшж
		pron = rsub(pron, '([cĵšžĉĝ][ː()]*)([ei])', function(a, b)
			return a .. retracting[b] end)

		--syllabify, inserting @ at syllable boundaries
		--1. insert @ after each vowel
		pron = rsub(pron, '(' .. vowels .. accents .. '?)', '%1@')
		--2. eliminate word-final @
		pron = rsub(pron, '@+⁀$', '⁀')
		--3. move @ forward directly before any ‿⁀, as long as at least
		--   one consonant follows that; we will move it across ‿⁀ later
		pron = rsub(pron, '@([^@' .. vow .. acc .. ']*)([‿⁀]+[^‿⁀@' .. vow .. acc .. '])', '%1@%2')
		--4. in a consonant cluster, move @ forward so it's before the
		--   last consonant
		pron = rsub(pron, '@([^‿⁀@' .. vow .. acc .. ']*)([^‿⁀@' .. vow .. acc .. 'ːˑ()ʲ]ʲ?[ːˑ()]*‿?[' .. vow .. acc .. '])', '%1@%2')
		--5. move @ backward if in the middle of a "permanent onset" cluster,
		--   e.g. sk, str, that comes before a vowel, putting the @ before
		--   the permanent onset cluster
		pron = rsub(pron, '([^‿⁀@' .. vow .. acc .. ']?)([^‿⁀@' .. vow .. acc .. '])@([^‿⁀@' .. vow .. acc .. 'ːˑ()ʲ])(ʲ?[ːˑ()]*[‿⁀]*[' .. vow .. acc .. '])', function(a, b, c, d)
			if perm_syl_onset[a .. b .. c] or c == 'j' and rfind(b, '[čǰɕӂʲ]') then
				return '@' .. a .. b .. c .. d
			elseif perm_syl_onset[b .. c] then
				return a .. '@' .. b .. c .. d
			end end)
		--6. if / is present (explicit syllable boundary), remove any @
		--   (automatic boundary) and convert / to @
		if rfind(pron, '/') then
			pron = rsub(pron, '[^' .. vow .. acc .. ']+', function(x)
				if rfind(x, '/') then
					x = rsub(x, '@', '')
					x = rsub(x, '/', '@')
				end
				return x
			end)
		end
		--7. remove @ followed by a final consonant cluster
		pron = rsub(pron, '@([^‿⁀@' .. vow .. ']+⁀)$', '%1')
		--8. remove @ preceded by an initial consonant cluster (should only
		--   happen when / is inserted by user or in цз, чж sequences)
		pron = rsub(pron, '^(⁀[^‿⁀@' .. vow .. ']+)@', '%1')
		--9. make sure @ isn't directly before linking ‿⁀
		pron = rsub(pron, '@([‿⁀]+)', '%1@')

		-- handle word-initial unstressed o and a; note, vowels always
		-- followed by at least one char because of word-final ⁀
		-- do after syllabification because syllabification doesn't know
		-- about ɐ as a vowel
		pron = rsub(pron, '^⁀[ao]([^' .. acc .. '])', '⁀ɐ%1')

		--split by syllable
		local syllable = rsplit(pron, '@', true)

		--create set of 1-based syllable indexes of stressed syllables
		--(acute, grave, circumflex)
		local stress = {}
		for j = 1, #syllable do
			if rfind(syllable[j], accents) then
				stress[j] = true
			end
		end

		-- iterate syllable by syllable to handle stress marks, vowel allophony
		local syl_conv = {}
		for j = 1, #syllable do
			local syl = syllable[j]

			local alnum

			--vowel allophony
			if stress[j] then
				-- convert acute/grave/circumflex accent to appropriate
				-- IPA marker of primary/secondary/unmarked stress
				alnum = 1
				syl = rsub(syl, '(.*)́', 'ˈ%1')
				syl = rsub(syl, '(.*)̀', 'ˌ%1')
				syl = rsub(syl, CFLEX, '')
			elseif stress[j+1] then
				alnum = 2
			else
				alnum = 3
			end
			syl = rsub(syl, vowels_c, function(a)
				if a ~= '' then
					return allophones[a][alnum]
				end end)
			syl_conv[j] = syl
		end

		pron = table.concat(syl_conv, "")

		-- Optional (j) before ɪ, which is always unstressed
		pron = rsub(pron, "⁀jɪ", "⁀(j)ɪ")
		pron = rsub(pron, '([' .. ipa_vow .. '])jɪ', "%1(j)ɪ")

		--consonant assimilative palatalization of tn/dn, depending on
		--whether [rl] precedes
		pron = rsub(pron, '([rl]?)([ˈˌ]?[dt])([ˈˌ]?nʲ)', function(a, b, c)
			if a == '' then
				return a .. b .. 'ʲ' .. c
			else
				return a .. b .. '⁽ʲ⁾' .. c
			end end)

		--general consonant assimilative palatalization
		pron = rsub_repeatedly(pron, '([szntdpbmfcĵx])([ˈˌ]?)([szntdpbmfcĵlk]ʲ)', function(a, b, c)
			if cons_assim_palatal['compulsory'][a..c] then
				return a .. 'ʲ' .. b .. c
			elseif cons_assim_palatal['optional'][a..c] then
				return a .. '⁽ʲ⁾' .. b .. c
			else
				return a .. b .. c
			end end)

		-- further assimilation before alveolopalatals
		pron = rsub(pron, 'n([ˈˌ]?)([čǰɕӂ])', 'nʲ%1%2')

		-- optional palatal assimilation of вп, вб only word-initially
		pron = rsub(pron, '⁀([ˈˌ]?[fv])([ˈˌ]?[pb]ʲ)', '⁀%1⁽ʲ⁾%2')

		-- optional palatal assimilation of бв but not in обв-
		pron = rsub(pron, 'b([ˈˌ]?vʲ)', 'b⁽ʲ⁾%1')
		if rfind(word[i], '⁀o' .. accents .. '?bv') then
			-- ə in case of a word with a preceding preposition
			pron = rsub(pron, '⁀([ˈˌ]?[ɐəo][ˈˌ]?)b⁽ʲ⁾([ˈˌ]?vʲ)', '⁀%1b%2')
		end

		if rfind(word[i], 's[äạ]⁀') then
			pron = rsub(pron, 'sʲə⁀', 's⁽ʲ⁾ə⁀')
		end

		word[i] = pron
	end

	text = table.concat(word, " ")
	if bracket then
		text = '[' .. text .. ']'
	end

	-- Front a and u between soft consonants. If between a soft and
	-- optionally soft consonant (should only occur in that order, shouldn't
	-- ever have a or u preceded by optionally soft consonant),
	-- split the result into two. We only split into two even if there
	-- happen to be multiple optionally fronted a's and u's to avoid
	-- excessive numbers of possibilities (and it simplifies the code).
	-- 1. First, temporarily add soft symbol to inherently soft consonants.
	text = rsub(text, '([čǰɕӂj])', '%1ʲ')
	-- 2. Handle case of [au] between two soft consonants
	text = rsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.ʲ)', function(a, b, c)
		return a .. fronting[b] .. c end)
	-- 3. Handle [au] between soft consonant and optional j, which is still fronted
	text = rsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?%(jʲ%))', function(a, b, c)
			return a .. fronting[b] .. c end)
	-- 4. Handle case of [au] between soft and optionally soft consonant
	if rfind(text, 'ʲ[ː()]*[auʊ][ˈˌ]?.⁽ʲ⁾') then
		local opt_hard = rsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.)⁽ʲ⁾', '%1%2%3')
		local opt_soft = rsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.)⁽ʲ⁾', function(a, b, c)
			return a .. fronting[b] .. c .. 'ʲ' end)
		text = opt_hard .. ', ' .. opt_soft
	end
	-- 5. Undo addition of soft symbol to inherently soft consonants.
	text = rsub(text, '([čǰɕӂj])ʲ', '%1')

	-- convert special symbols to IPA
	text = rsub(text, '[cĵ]ʲ', translit_conv_j)
	text = rsub(text, '[cčgĉĝĵǰšžɕӂ]', translit_conv)

	-- Assimilation involving hiatus of ɐ and ə
	text = rsub(text, 'ə([‿⁀]*)[ɐə]', 'ɐ%1ɐ')

	-- eliminate ⁀ symbol at word boundaries
	-- eliminate _ symbol that prevents assimilations
	text = rsub(text, '[⁀_]', '')
	
	text = rsub(text, 'ʐ(ː?)ɛ([^ˈˌ]*)([ˈˌ])', 'ʐ%1ɨ%2%3')
	text = rsub(text, 'ʂ(ː?)ɛ([^ˈˌ]*)([ˈˌ])', 'ʂ%1ɨ%2%3')
	text = rsub(text, 't͡s(ː?)ɛ([^ˈˌ]*)([ˈˌ])', 't͡s%1ɨ%2%3')
	text = rsub(text, 't͡sɛ$', 't͡sə')

	if test_new_ru_pron_module then
		if new_module_result ~= text then
			--error(text .. " || " .. new_module_result)
			track("different-pron")
		end
	end

	return text
end

return export

-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet:
Модуль:Ru-pron

Навигация

Поиск