"""Source code for pycantonese.jyutping.yale (Jyutping-to-Yale conversion)."""

import unicodedata

from pycantonese.jyutping.parse_jyutping import parse_jyutping
from pycantonese.util import _deprecate


# Jyutping onset -> Yale onset.
# Most onsets are spelled identically in both systems; the ones that differ
# are "z" -> "j", "c" -> "ch" and "j" -> "y". The empty string stands for a
# syllable with no onset.
ONSETS_YALE = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    "z": "j",
    "p": "p",
    "t": "t",
    "k": "k",
    # Yale keeps the labialised onset distinct from plain "k", in parallel
    # with "gw" above; mapping it to "k" would merge the two onsets.
    "kw": "kw",
    "c": "ch",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "y",
    "": "",
}

# Jyutping nucleus -> Yale nucleus.
# Both "oe" and "eo" are written "eu" in Yale; every other nucleus keeps its
# Jyutping spelling. "m" and "ng" are the nasals that can serve as a nucleus.
NUCLEI_YALE = dict(
    aa="aa",
    a="a",
    i="i",
    yu="yu",
    u="u",
    oe="eu",
    e="e",
    eo="eu",
    o="o",
    m="m",
    ng="ng",
)

# Jyutping coda -> Yale coda.
# Every coda is spelled the same way in both systems, so this is an identity
# mapping; the empty string stands for a syllable with no coda.
CODAS_YALE = {
    coda: coda for coda in ("p", "t", "k", "m", "n", "ng", "i", "u", "")
}


# Tone number -> (suffix of the Unicode name of the precomposed accented
# letter, combining mark used when no precomposed letter exists).
# Tones 3 and 6 carry no diacritic and are therefore absent.
_YALE_TONE_DIACRITICS = {
    "1": (" WITH MACRON", "\u0304"),
    "2": (" WITH ACUTE", "\u0301"),
    "5": (" WITH ACUTE", "\u0301"),
    "4": (" WITH GRAVE", "\u0300"),
}

# Tones that Yale marks with an "h" after the vowel(s).
_YALE_LOW_TONES = frozenset({"4", "5", "6"})


def _add_tone_diacritic(letter, tone):
    """Return *letter* carrying the Yale diacritic for Jyutping *tone*.

    Tone 1 adds a macron, tones 2 and 5 an acute, and tone 4 a grave. Any
    other tone (i.e., 3 or 6) leaves the letter untouched. When Unicode has
    no precomposed form of the accented letter, the plain letter followed by
    the combining accent is returned instead.
    """
    diacritic = _YALE_TONE_DIACRITICS.get(tone)
    if diacritic is None:
        return letter

    name_suffix, combining_mark = diacritic
    try:
        return unicodedata.lookup(unicodedata.name(letter) + name_suffix)
    except KeyError:
        return letter + combining_mark


def jyutping_to_yale(jp_str, as_list=True):
    """Convert Jyutping romanization into Yale romanization.

    .. versionadded:: 3.0.0
        This function replaces the deprecated equivalent ``jyutping2yale``.

    .. versionchanged:: 3.0.0
        ``as_list`` has its default value switched from ``False`` to ``True``,
        so that by default the function returns a list, which is in line with
        the other "jyutping_to_X" functions.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters
    as_list : bool, optional
        If False (default is True), the output is a string with a single quote
        ``'`` to disambiguate unclear syllable boundaries (e.g., a consonant
        or the low-tone marker "h" being ambiguous as an onset or as
        part of the previous syllable).

    Returns
    -------
    list[str], or str if as_list is False

    Raises
    ------
    ValueError
        If the Jyutping romanization is illegal (e.g., with unrecognized
        elements).

    Examples
    --------
    >>> jyutping_to_yale("gwong2dung1waa2")  # 廣東話, Cantonese
    ['gwóng', 'dūng', 'wá']
    >>> jyutping_to_yale("gwong2dung1waa2", as_list=False)
    'gwóngdūngwá'
    >>>
    >>> # 'heihauh' would be ambiguous between hei3hau6 and hei6au6.
    >>> jyutping_to_yale("hei3hau6", as_list=False)  # 氣候, climate
    "hei'hauh"
    """
    yale_list = []

    for jp_parsed in parse_jyutping(jp_str):
        onset = ONSETS_YALE[jp_parsed.onset]
        nucleus = NUCLEI_YALE[jp_parsed.nucleus]
        coda = CODAS_YALE[jp_parsed.coda]
        tone = jp_parsed.tone  # still a Jyutping tone number

        # Yale uses "h" to mark the three low tones.
        low_tone_h = "h" if tone in _YALE_LOW_TONES else ""

        # In Yale, the long "aa" vowel with no coda is written "a".
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # When the nucleus is "yu":
        # 1. disallow "yyu" (when the onset is "y");
        # 2. temporarily reduce the nucleus to "u" so that the tone
        #    diacritic lands on "u" rather than on "y".
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # When the nucleus is "ng", the tone diacritic goes on "g", not "n".
        # Pretend the nucleus is "g" for now; "n" is prepended again below.
        if nucleus == "ng":
            nucleus = "g"

        # The first remaining nucleus letter bears the tone diacritic.
        nucleus = _add_tone_diacritic(nucleus[0], tone) + nucleus[1:]

        # Restore the letters that were set aside for diacritic placement.
        if jp_parsed.nucleus == "yu":
            nucleus = "y" + nucleus
        if jp_parsed.nucleus == "ng":
            nucleus = "n" + nucleus

        # Jyutping final "eu" is written "ew" (not "eu") in the Yale output.
        # Test the parsed Jyutping nucleus rather than ``nucleus``: the
        # latter may already carry a tone diacritic (e.g., "é"), which would
        # make the rewrite apply to the unmarked tones 3 and 6 only.
        if coda == "u" and jp_parsed.nucleus == "e":
            coda = "w"

        # The low-tone "h" follows the whole vowel sequence, so it goes after
        # a vocalic coda but before a consonantal one.
        if low_tone_h and coda in {"i", "u", "w"}:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda
        yale_list.append(yale)

    if as_list:
        return yale_list

    # Output yale_list as a string, inserting "'" wherever concatenating two
    # neighbouring syllables would leave the syllable boundary unclear.
    #
    # Ambiguity case 1:
    #   the 1st syllable ends with one of the "ambiguous_consonants"
    #   and the 2nd syllable starts with a vowel *letter*
    #
    # Ambiguity case 2:
    #   the 1st syllable does not end with one of the "ambiguous_consonants"
    #   and the 2nd syllable starts with one of them
    #   e.g., hei3hau6 'climate' --> heihauh
    #   (middle "h" for tone in 1st syllable or being onset of 2nd syllable?)

    if not yale_list:
        return ""

    ambiguous_consonants = {"h", "p", "t", "k", "m", "n", "ng"}
    vowel_letters = {
        "a",
        "e",
        "i",
        "o",
        "u",
        "á",
        "é",
        "í",
        "ó",
        "ú",
        "à",
        "è",
        "ì",
        "ò",
        "ù",
        "ā",
        "ē",
        "ī",
        "ō",
        "ū",
    }

    pieces = [yale_list[0]]

    for previous, current in zip(yale_list, yale_list[1:]):
        if _endswithoneof(previous, ambiguous_consonants):
            # case 1
            ambiguous = bool(_startswithoneof(current, vowel_letters))
        else:
            # case 2
            ambiguous = bool(_startswithoneof(current, ambiguous_consonants))

        if ambiguous:
            pieces.append("'")
        pieces.append(current)

    return "".join(pieces)


@_deprecate("jyutping2yale", "jyutping_to_yale", "3.0.0", "4.0.0")
def jyutping2yale(*args, **kwargs):
    """Deprecated alias of ``jyutping_to_yale``.

    All positional and keyword arguments are forwarded unchanged, and
    whatever ``jyutping_to_yale`` returns is handed back as is.

    .. deprecated:: 3.0.0
    """
    converted = jyutping_to_yale(*args, **kwargs)
    return converted


def _startswithoneof(inputstr, seq):
    """
    Check if *inputstr* starts with one of the items in seq. If it does, return
        the item that it starts with. If it doe not, return ``None``.

    :param inputstr: input string

    :param seq: sequences of items to check

    :return: the item the the input string starts with (``None`` if not found)

    :rtype: str or None
    """
    seq = set(seq)
    for item in seq:
        if inputstr.startswith(item):
            return item
    else:
        return None


def _endswithoneof(inputstr, seq):
    """
    Check if *inputstr* ends with one of the items in seq. If it does, return
        the item that it ends with. If it doe not, return ``None``.

    :param inputstr: input string

    :param seq: sequences of items to check

    :return: the item the the input string ends with (``None`` if not found)

    :rtype: str or None
    """
    seq = set(seq)
    for item in seq:
        if inputstr.endswith(item):
            return item
    else:
        return None