# Source code for pycantonese.jyutping.parse_jyutping

import dataclasses
from typing import List


# Legal syllable onsets. The empty string is a member, so a syllable that
# begins directly with its nucleus still passes an ``in ONSETS`` check.
ONSETS = {
    "", "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "ng", "h",
    "gw", "kw", "w",
    "z", "c", "s", "j",
}

# Legal nuclei; "m" and "ng" are listed here as well as among the codas.
NUCLEI = {
    "aa", "a",
    "i", "yu", "u",
    "oe", "e", "eo", "o",
    "m", "ng",
}

# Legal codas; the empty string means the syllable has no coda.
CODAS = {"", "p", "t", "k", "m", "n", "ng", "i", "u"}

# Tone marks are the single-character strings "1" through "6".
TONES = set("123456")


@dataclasses.dataclass
class Jyutping:
    """Jyutping representation of a Chinese/Cantonese character.

    Attributes
    ----------
    onset : str
        Onset
    nucleus : str
        Nucleus
    coda : str
        Coda
    tone : str
        Tone
    """

    # Explicit slots: instances carry no per-instance ``__dict__``. This works
    # with ``dataclass`` because none of the fields declares a default value.
    __slots__ = ("onset", "nucleus", "coda", "tone")
    onset: str
    nucleus: str
    coda: str
    tone: str

    def __str__(self) -> str:
        """Combine onset + nucleus + coda + tone."""
        return f"{self.onset}{self.nucleus}{self.coda}{self.tone}"

    @property
    def final(self) -> str:
        """Return the final (= nucleus + coda)."""
        return f"{self.nucleus}{self.coda}"


def parse_jyutping(jp_str) -> List[Jyutping]:
    """Parse Jyutping romanization into onset, nucleus, coda, and tone.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters. The string is
        lowercased before parsing. Any falsy value (e.g., ``""`` or ``None``)
        yields an empty list.

    Returns
    -------
    List[Jyutping]
        One element per syllable, in the order they appear in ``jp_str``.

    Raises
    ------
    ValueError
        If ``jp_str`` is a truthy non-string, or if the Jyutping romanization
        is illegal (e.g., with unrecognized elements).

    Examples
    --------
    >>> parse_jyutping("gwong2dung1waa2")  # 廣東話, Cantonese
    [Jyutping(onset='gw', nucleus='o', coda='ng', tone='2'),
     Jyutping(onset='d', nucleus='u', coda='ng', tone='1'),
     Jyutping(onset='w', nucleus='aa', coda='', tone='2')]
    """
    if not jp_str:
        return []

    # check jp_str as a valid argument string
    if not isinstance(jp_str, str):
        raise ValueError("argument needs to be a string -- " + repr(jp_str))
    jp_str = jp_str.lower()

    # Every syllable must end in a tone digit, so the whole string must too;
    # otherwise trailing characters would be silently dropped by the split.
    if not jp_str[-1].isdigit():
        # TODO: error msg should be "no invalid tone detected" or something?
        raise ValueError("tone error -- " + repr(jp_str[-1]))

    # Split into syllables: each one runs up to and including a digit.
    jp_list = []
    start = 0
    for end, char in enumerate(jp_str, start=1):
        if char.isdigit():
            jp_list.append(jp_str[start:end])
            start = end

    jp_parsed_list = []

    for jp in jp_list:
        # A lone digit (e.g. the second "1" in "a11") has no segmental part.
        if len(jp) < 2:
            raise ValueError(
                "jyutping string has fewer than 2 characters -- " + repr(jp)
            )

        tone = jp[-1]
        cvc = jp[:-1]

        # tone
        if tone not in TONES:
            raise ValueError("tone error -- " + repr(jp))

        # coda: the last segmental character must be a vowel letter or one
        # of the letters that can close a syllable
        if cvc[-1] not in "ieaouptkmng":
            raise ValueError("coda error -- " + repr(jp))

        # Syllables consisting solely of one of these are taken whole as the
        # nucleus, with empty onset and coda.
        if cvc in ("m", "n", "ng", "i", "e", "aa", "o", "u"):
            jp_parsed_list.append(Jyutping("", cvc, "", tone))
            continue

        if cvc[-2:] == "ng":
            coda = "ng"
            cv = cvc[:-2]
        elif (
            (cvc[-1] in "ptkmn")
            # a trailing "i"/"u" is a coda only right after certain vowels;
            # otherwise it belongs to the nucleus (e.g. the "u" of "yu")
            or ((cvc[-1] == "i") and (cvc[-2] in "eaou"))
            or ((cvc[-1] == "u") and (cvc[-2] in "ieao"))
        ):
            coda = cvc[-1]
            cv = cvc[:-1]
        else:
            coda = ""
            cv = cvc

        # nucleus, and then onset: peel vowel letters off the right end.
        # ``cv`` is already empty when the syllable was nothing but a coda
        # consonant (e.g. "p1"); the ``cv and`` guard lets that case fall
        # through to the nucleus ValueError instead of raising IndexError.
        nucleus = ""
        while cv and cv[-1] in "ieaouy":
            nucleus = cv[-1] + nucleus
            cv = cv[:-1]

        if not nucleus:
            raise ValueError("nucleus error -- " + repr(jp))

        # whatever is left in front of the nucleus is the onset
        onset = cv

        if onset not in ONSETS:
            raise ValueError("onset error -- " + repr(jp))

        jp_parsed_list.append(Jyutping(onset, nucleus, coda, tone))

    return jp_parsed_list


def _parse_final(final):
    """Split a final into a (nucleus, coda) pair.

    Every split point is tried from left to right, so the shortest prefix
    that is a member of ``NUCLEI`` and whose remainder is a member of
    ``CODAS`` wins.

    Parameters
    ----------
    final : str

    Returns
    -------
    tuple[str]
        ``(nucleus, coda)`` for the first valid split, or ``None`` when no
        split is valid (including when ``final`` is empty).
    """
    cut = 1
    while cut <= len(final):
        head, tail = final[:cut], final[cut:]
        if head in NUCLEI and tail in CODAS:
            return head, tail
        cut += 1
    return None