import unicodedata
from pycantonese.jyutping.parse_jyutping import parse_jyutping
from pycantonese.util import _deprecate
# Mapping from Jyutping onsets to their Yale spellings.
# Only three onsets are spelled differently in Yale: z -> j, c -> ch, j -> y.
# The labialized velars keep their "w" in Yale (gw -> gw, kw -> kw), so that
# e.g. "kw" stays distinct from plain "k".
# The empty string stands for a syllable with no onset.
ONSETS_YALE = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    "z": "j",
    "p": "p",
    "t": "t",
    "k": "k",
    "kw": "kw",
    "c": "ch",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "y",
    "": "",
}
# Mapping from Jyutping nuclei to their Yale spellings.
# Every nucleus is spelled the same in both systems, except that Jyutping
# "oe" and "eo" are both written "eu" in Yale. The syllabic nasals "m" and
# "ng" are listed here as nuclei as well.
NUCLEI_YALE = {
    nucleus: nucleus
    for nucleus in (
        "aa", "a", "i", "yu", "u", "oe", "e", "eo", "o", "m", "ng"
    )
}
# Overriding existing keys keeps their original insertion position.
NUCLEI_YALE.update(oe="eu", eo="eu")
# Mapping from Jyutping codas to their Yale spellings.
# All codas listed here are spelled identically in both systems; the empty
# string stands for a syllable with no coda.
_CODAS = ("p", "t", "k", "m", "n", "ng", "i", "u", "")
CODAS_YALE = dict(zip(_CODAS, _CODAS))
# Tone number -> (suffix of the Unicode character name for the precomposed
# letter, combining mark used when no precomposed letter exists).
# Tones 3 and 6 are deliberately absent: they carry no diacritic in Yale.
_YALE_TONE_MARKS = {
    "1": (" WITH MACRON", "\u0304"),
    "2": (" WITH ACUTE", "\u0301"),
    "5": (" WITH ACUTE", "\u0301"),
    "4": (" WITH GRAVE", "\u0300"),
}

# Tones that Yale marks with an "h" after the vowel(s).
_YALE_LOW_TONES = frozenset({"4", "5", "6"})

# A syllable boundary is unclear when one of these could be read either as
# the end of the first syllable or as the start of the second one.
_YALE_AMBIGUOUS_CONSONANTS = frozenset({"h", "p", "t", "k", "m", "n", "ng"})

# Plain and tone-marked vowel letters that a Yale syllable can start with.
_YALE_VOWEL_LETTERS = frozenset("aeiouáéíóúàèìòùāēīōū")


def _add_yale_tone_mark(letter, tone):
    """Return *letter* carrying the Yale diacritic for *tone*.

    Tone 1 adds a macron, tones 2 and 5 an acute, tone 4 a grave; any other
    tone leaves the letter unchanged. If Unicode has no precomposed character
    for the letter/diacritic pair (e.g., "m" with grave), the combining
    accent is appended instead.
    """
    mark = _YALE_TONE_MARKS.get(tone)
    if mark is None:
        return letter
    name_suffix, combining_accent = mark
    try:
        return unicodedata.lookup(unicodedata.name(letter) + name_suffix)
    except KeyError:
        return letter + combining_accent


def _parsed_jyutping_to_yale(jp_parsed):
    """Convert one parsed Jyutping syllable into its Yale spelling.

    *jp_parsed* must expose ``onset``, ``nucleus``, ``coda`` and ``tone``
    attributes, as the items returned by ``parse_jyutping`` do.
    """
    onset = ONSETS_YALE[jp_parsed.onset]
    nucleus = NUCLEI_YALE[jp_parsed.nucleus]
    coda = CODAS_YALE[jp_parsed.coda]
    tone = jp_parsed.tone

    # In Yale, the long "aa" vowel with no coda is written as "a".
    if nucleus == "aa" and coda == "":
        nucleus = "a"

    # The tone diacritic goes on the first letter of the nucleus, except
    # that for "yu" it must sit on "u" and for syllabic "ng" on "g".
    # The leading letter is split off here and glued back afterwards.
    unmarked_prefix = ""
    if nucleus == "yu":
        # Avoid "yyu" when the Yale onset is already "y".
        if onset == "y":
            onset = ""
        unmarked_prefix, nucleus = "y", "u"
    elif nucleus == "ng":
        unmarked_prefix, nucleus = "n", "g"

    nucleus = (
        unmarked_prefix + _add_yale_tone_mark(nucleus[0], tone) + nucleus[1:]
    )

    # The Jyutping final "eu" is written "ew" in Yale, because Yale "eu"
    # already denotes the Jyutping nuclei "oe"/"eo". The test is done on the
    # Jyutping nucleus so that it also holds once the Yale nucleus carries a
    # tone diacritic (tones 1, 2, 4 and 5).
    if jp_parsed.nucleus == "e" and coda == "u":
        coda = "w"

    # The low-tone "h" follows the whole vowel sequence: after a vocalic
    # coda (i, u, w), but before a consonantal coda.
    low_tone_h = "h" if tone in _YALE_LOW_TONES else ""
    if coda in {"i", "u", "w"}:
        return onset + nucleus + coda + low_tone_h
    return onset + nucleus + low_tone_h + coda


def _is_ambiguous_yale_boundary(yale1, yale2):
    """Tell whether concatenating *yale1* + *yale2* hides the boundary.

    Case 1: *yale1* ends with an ambiguous consonant and *yale2* starts with
    a vowel letter.
    Case 2: *yale1* does not end with an ambiguous consonant and *yale2*
    starts with one, e.g., hei3hau6 --> "heihauh", where the middle "h"
    could be the low-tone marker of the first syllable or the onset of the
    second one.
    """
    if _endswithoneof(yale1, _YALE_AMBIGUOUS_CONSONANTS):
        return bool(_startswithoneof(yale2, _YALE_VOWEL_LETTERS))
    return bool(_startswithoneof(yale2, _YALE_AMBIGUOUS_CONSONANTS))


def jyutping_to_yale(jp_str, as_list=True):
    """Convert Jyutping romanization into Yale romanization.

    .. versionadded:: 3.0.0
        This function replaces the deprecated equivalent ``jyutping2yale``.

    .. versionchanged:: 3.0.0
        ``as_list`` has its default value switched from ``False`` to ``True``,
        so that by default the function returns a list, which is in line with
        the other "jyutping_to_X" functions.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters
    as_list : bool, optional
        If False (default is True), the output is a string with a single quote
        ``'`` to disambiguate unclear syllable boundaries (e.g., a consonant
        or the low-tone marker "h" being ambiguous as an onset or as
        part of the previous syllable).

    Returns
    -------
    list[str], or str if as_list is False

    Raises
    ------
    ValueError
        If the Jyutping romanization is illegal (e.g., with unrecognized
        elements).

    Examples
    --------
    >>> jyutping_to_yale("gwong2dung1waa2")  # 廣東話, Cantonese
    ['gwóng', 'dūng', 'wá']
    >>> jyutping_to_yale("gwong2dung1waa2", as_list=False)
    'gwóngdūngwá'
    >>>
    >>> # 'heihauh' would be ambiguous between hei3hau6 and hei6au6.
    >>> jyutping_to_yale("hei3hau6", as_list=False)  # 氣候, climate
    "hei'hauh"
    """
    yale_list = [
        _parsed_jyutping_to_yale(jp_parsed)
        for jp_parsed in parse_jyutping(jp_str)
    ]

    if as_list:
        return yale_list

    if not yale_list:
        return ""

    # Join the syllables, inserting a single quote wherever the boundary
    # between two neighbouring syllables would otherwise be unclear.
    pieces = []
    for yale1, yale2 in zip(yale_list, yale_list[1:]):
        pieces.append(yale1)
        if _is_ambiguous_yale_boundary(yale1, yale2):
            pieces.append("'")
    pieces.append(yale_list[-1])
    return "".join(pieces)
@_deprecate("jyutping2yale", "jyutping_to_yale", "3.0.0", "4.0.0")
def jyutping2yale(*args, **kwargs):
    """Deprecated alias of ``jyutping_to_yale``.

    All positional and keyword arguments are forwarded unchanged, and the
    result of ``jyutping_to_yale`` is returned as is.

    .. deprecated:: 3.0.0
    """
    converted = jyutping_to_yale(*args, **kwargs)
    return converted
def _startswithoneof(inputstr, seq):
"""
Check if *inputstr* starts with one of the items in seq. If it does, return
the item that it starts with. If it doe not, return ``None``.
:param inputstr: input string
:param seq: sequences of items to check
:return: the item the the input string starts with (``None`` if not found)
:rtype: str or None
"""
seq = set(seq)
for item in seq:
if inputstr.startswith(item):
return item
else:
return None
def _endswithoneof(inputstr, seq):
"""
Check if *inputstr* ends with one of the items in seq. If it does, return
the item that it ends with. If it doe not, return ``None``.
:param inputstr: input string
:param seq: sequences of items to check
:return: the item the the input string ends with (``None`` if not found)
:rtype: str or None
"""
seq = set(seq)
for item in seq:
if inputstr.endswith(item):
return item
else:
return None