# Source code for pycantonese.corpus

import dataclasses
import functools
import os
from typing import List, Optional, Union, Tuple

from pylangacq.chat import Reader, _params_in_docstring
from pylangacq.chat import read_chat as pylangacq_read_chat
from pylangacq.objects import Gra

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.jyutping.parse_jyutping import parse_jyutping
from pycantonese.search import _perform_search
from pycantonese.util import _deprecate


_ENCODING = "utf-8"


@dataclasses.dataclass
class Token:
    """Token with attributes as parsed from a CHAT utterance.

    Attributes
    ----------
    word : str
        Word form of the token
    pos : str
        Part-of-speech tag
    jyutping : str
        Jyutping romanization
    mor : str
        Morphological information
    gloss : str
        Gloss in English
    gra : Gra
        Grammatical relation
    """

    # No field has a default value, so declaring ``__slots__`` by hand does not
    # clash with the class attributes that ``dataclass`` would otherwise create.
    __slots__ = ("word", "pos", "jyutping", "mor", "gloss", "gra")

    word: str
    pos: Optional[str]
    jyutping: Optional[str]
    mor: Optional[str]
    gloss: Optional[str]
    gra: Optional[Gra]

    def to_mor_tier(self) -> str:
        """Return this token formatted as ``pos|jyutping<mor>=gloss``.

        A word found in ``_PUNCTUATION_MARKS`` is returned unchanged.
        For any other word, each of the four parts is included only when it
        is truthy, so missing (``None``) or empty attributes simply leave
        their part out; a token with none of them yields ``""``.

        Returns
        -------
        str
        """
        if self.word in _PUNCTUATION_MARKS:
            return self.word
        pieces = (
            f"{self.pos}|" if self.pos else "",
            self.jyutping or "",
            self.mor or "",
            f"={self.gloss}" if self.gloss else "",
        )
        return "".join(pieces)

    def to_gra_tier(self) -> str:
        """Return the grammatical relation formatted as ``dep|head|rel``.

        Returns
        -------
        str

        Raises
        ------
        AttributeError
            If ``gra`` is ``None``.
        """
        return f"{self.gra.dep}|{self.gra.head}|{self.gra.rel}"


class CHATReader(Reader):
    """A reader for Cantonese CHAT corpus files.

    .. note:: Some of the methods are inherited from the parent class
        :class:`~pylangacq.Reader` for language acquisition,
        which may or may not be applicable to your use case.
    """

    def ipsyn(self, *args, **kwargs):
        """(Not implemented - the upstream ``ipsyn`` method works for English only.)"""
        # Arguments are accepted and ignored so that any call, whatever it
        # passes, reaches the explanatory error below rather than a TypeError.
        raise NotImplementedError(
            "The upstream `ipsyn` method works for English only. "
            "There isn't yet a Cantonese version of IPSyn."
        )

    @staticmethod
    def _partition_maybe_none(
        x: Optional[str], sep: str
    ) -> Tuple[Optional[str], Optional[str]]:
        """Split ``x`` at the first occurrence of ``sep``, tolerating ``None``.

        Parameters
        ----------
        x : str or None
            The string to split.
        sep : str
            The separator to split at.

        Returns
        -------
        Tuple[Optional[str], Optional[str]]
            ``(None, None)`` if ``x`` is ``None``;
            ``(x, None)`` if ``sep`` does not occur in ``x``;
            otherwise the text before and the text after the first ``sep``
            (the latter may be an empty string).
        """
        if x is None:
            return None, None
        before, found, after = x.partition(sep)
        # ``found`` is the empty string exactly when ``sep`` is absent.
        return before, (after if found else None)

    def _preprocess_token(self, t) -> Token:
        """Convert a parsed token ``t`` into a :class:`Token`.

        ``t.mor`` is taken apart into Jyutping, morphological information and
        gloss; ``t.word``, ``t.pos`` and ``t.gra`` are passed through as is.
        """
        # Examples from the CHILDES LeeWongLeung corpus, child mhz
        # e.g., mor is suk1&DIM=uncle, word is 叔叔
        # e.g., mor is ngo5-PL=I, word is 我

        # Peel the pieces off from right to left: "=gloss", then "-...",
        # then "&...". What is left over is the Jyutping candidate.
        remainder, gloss = self._partition_maybe_none(t.mor, "=")
        remainder, after_dash = self._partition_maybe_none(remainder, "-")
        jyutping, after_amp = self._partition_maybe_none(remainder, "&")

        # Reassemble the morphological part, "&" piece first, with markers.
        mor = ""
        if after_amp:
            mor += f"&{after_amp}"
        if after_dash:
            mor += f"-{after_dash}"

        # The parse is used purely as a validity check: its result is discarded,
        # and a candidate that raises ValueError is dropped.
        try:
            parse_jyutping(jyutping)
        except ValueError:
            jyutping = None

        # ``or None`` normalizes empty strings to None.
        return Token(t.word, t.pos, jyutping or None, mor or None, gloss or None, t.gra)

    def _reshape_nested_items(self, nested, by_utterances, by_files):
        """Collapse a files > utterances > items nesting to the requested shape.

        ``nested`` is returned untouched when both flags are truthy. The
        utterance level is flattened away when ``by_utterances`` is falsy,
        and the file level when ``by_files`` is falsy.
        """
        if by_files and by_utterances:
            return nested
        if by_files:
            return [self._flatten(list, f) for f in nested]
        if by_utterances:
            return self._flatten(list, nested)
        return self._flatten(list, (self._flatten(list, f) for f in nested))

    # NOTE(review): the empty "Parameters" section below is presumably filled in
    # by the decorator -- confirm against pylangacq before editing its layout.
    @_params_in_docstring("participants", "exclude", "by_utterances", "by_files")
    def jyutping(
        self, participants=None, exclude=None, by_utterances=False, by_files=False
    ) -> Union[List[str], List[List[str]], List[List[List[str]]]]:
        """Return the data in Jyutping romanization.

        Parameters
        ----------

        Returns
        -------
        List[List[List[str]]] if both by_utterances and by_files are True
        List[List[str]] if by_utterances is True and by_files is False
        List[List[str]] if by_utterances is False and by_files is True
        List[str] if both by_utterances and by_files are False
        """
        # Always fetch the fully nested form, then reduce it to what was asked.
        tokens_by_file = self.tokens(
            participants=participants,
            exclude=exclude,
            by_utterances=True,
            by_files=True,
        )
        result = [
            [[token.jyutping for token in utterance] for utterance in utterances]
            for utterances in tokens_by_file
        ]
        return self._reshape_nested_items(result, by_utterances, by_files)

    def jyutping_sents(self, participants=None, exclude=None, by_files=False):
        """Deprecated: use :meth:`jyutping` with ``by_utterances=True`` instead."""
        _deprecate(
            "jyutping_sents", "jyutping with by_utterances=True", "3.2.0", "4.0.0"
        )
        return self.jyutping(
            participants=participants,
            exclude=exclude,
            by_utterances=True,
            by_files=by_files,
        )

    def jyutpings(
        self, participants=None, exclude=None, by_utterances=False, by_files=False
    ):
        """Deprecated: use :meth:`jyutping` instead."""
        _deprecate("jyutpings", "jyutping", "3.2.0", "4.0.0")
        return self.jyutping(
            participants=participants,
            exclude=exclude,
            by_utterances=by_utterances,
            by_files=by_files,
        )

    @staticmethod
    def _get_chars_from_sent(sent: List[str]) -> List[str]:
        """Break the words of one utterance into individual characters.

        A word is split into its characters when its first character lies in
        the range U+4E00 to U+9FFF; only that first character is inspected.
        Every other word, including an empty or ``None`` one, is kept whole.
        """
        result = []
        for word in sent:
            if word and "\u4e00" <= word[0] <= "\u9fff":
                result.extend(word)
            else:
                result.append(word)
        return result

    # NOTE(review): the empty "Parameters" section below is presumably filled in
    # by the decorator -- confirm against pylangacq before editing its layout.
    @_params_in_docstring("participants", "exclude", "by_utterances", "by_files")
    def characters(
        self, participants=None, exclude=None, by_utterances=False, by_files=False
    ) -> Union[List[str], List[List[str]], List[List[List[str]]]]:
        """Return the data in individual Chinese characters.

        Parameters
        ----------

        Returns
        -------
        List[List[List[str]]] if both by_utterances and by_files are True
        List[List[str]] if by_utterances is True and by_files is False
        List[List[str]] if by_utterances is False and by_files is True
        List[str] if both by_utterances and by_files are False
        """
        # Always fetch the fully nested form, then reduce it to what was asked.
        words_by_file = self.words(
            participants=participants,
            exclude=exclude,
            by_utterances=True,
            by_files=True,
        )
        result = [
            [self._get_chars_from_sent(utterance) for utterance in utterances]
            for utterances in words_by_file
        ]
        return self._reshape_nested_items(result, by_utterances, by_files)

    def character_sents(self, participants=None, exclude=None, by_files=False):
        """Deprecated: use :meth:`characters` with ``by_utterances=True`` instead."""
        _deprecate(
            "character_sents", "characters with by_utterances=True", "3.2.0", "4.0.0"
        )
        return self.characters(
            participants=participants,
            exclude=exclude,
            by_utterances=True,
            by_files=by_files,
        )

    def search(
        self,
        *,
        onset=None,
        nucleus=None,
        coda=None,
        tone=None,
        initial=None,
        final=None,
        jyutping=None,
        character=None,
        pos=None,
        word_range=(0, 0),
        utterance_range=(0, 0),
        sent_range=(0, 0),  # Deprecated
        by_tokens=True,
        by_utterances=False,
        tagged=None,  # Deprecated
        sents=None,  # Deprecated
        participants=None,
        exclude=None,
        by_files=False,
    ):
        """Search the data for the given criteria.

        For examples, please see https://pycantonese.org/searches.html.

        Parameters
        ----------
        onset : str, optional
            Onset to search for. A regex is supported.
        nucleus : str, optional
            Nucleus to search for. A regex is supported.
        coda : str, optional
            Coda to search for. A regex is supported.
        tone : str, optional
            Tone to search for. A regex is supported.
        initial : str, optional
            Initial to search for. A regex is supported.
            An initial, a term more prevalent in traditional Chinese
            phonology, is the equivalent of an onset.
        final : str, optional
            Final to search for.
            A final, a term more prevalent in traditional Chinese
            phonology, is the equivalent of a nucleus plus a coda.
        jyutping : str, optional
            Jyutping romanization of one Cantonese character to search for.
            If the romanization contains more than one character, a ValueError
            is raised.
        character : str, optional
            One or more Cantonese characters (within a segmented word) to
            search for.
        pos : str, optional
            A part-of-speech tag to search for. A regex is supported.
        word_range : Tuple[int, int], optional
            Span of words to the left and right of a matching word to include
            in the output. The default is `(0, 0)` to disable a range.
            If `utterance_range` is used, `word_range` is ignored.
        utterance_range : Tuple[int, int], optional
            Span of utterances before and after an utterance containing a matching
            word to include in the output.
            If set to ``(0, 0)`` (the default), no utterance range output is generated.
            If `utterance_range` is used, `word_range` is ignored.
        sent_range : Tuple[int, int], optional
            [Deprecated; please use utterance_range instead]
        by_tokens : bool, optional
            If ``True`` (the default), words in the output are in the token form
            (i.e., with Jyutping and part-of-speech tags).
            Otherwise just words as text strings are returned.
        by_utterances : bool, optional
            If ``True`` (default is False), utterances containing matching words
            are returned. Otherwise, only matching words are returned.
        tagged : bool, optional
            [Deprecated; please use by_tokens instead]
        sents : bool, optional
            [Deprecated; please use by_utterances instead]
        participants : str or iterable[str], optional
            One or more participants to include in the search.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants to exclude in the search.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list

        Raises
        ------
        TypeError
            If both `utterance_range` and the deprecated `sent_range` are set
            to something other than ``(0, 0)``.
        """
        # Map each deprecated keyword onto its replacement. A deprecated keyword
        # counts as "used" only when it differs from its default.
        if sent_range != (0, 0):
            _deprecate("sent_range", "utterance_range", "3.2.0", "4.0.0")
            if utterance_range != (0, 0):
                raise TypeError(
                    "Do not use both utterance_range and sent_range "
                    f"(you've passed in {utterance_range} and {sent_range}, "
                    f"respectively). "
                    f"Please use utterance_range; "
                    f"sent_range has been deprecated."
                )
            utterance_range = sent_range
        if tagged is not None:
            _deprecate("tagged", "by_tokens", "3.2.0", "4.0.0")
            by_tokens = tagged
        if sents is not None:
            _deprecate("sents", "by_utterances", "3.2.0", "4.0.0")
            by_utterances = sents

        # The search always runs over the fully nested token structure.
        tokens_by_file = self.tokens(
            participants=participants,
            exclude=exclude,
            by_utterances=True,
            by_files=True,
        )
        result_by_files = _perform_search(
            tokens_by_file,
            onset=onset,
            nucleus=nucleus,
            coda=coda,
            tone=tone,
            initial=initial,
            final=final,
            jyutping=jyutping,
            character=character,
            pos=pos,
            word_range=word_range,
            utterance_range=utterance_range,
            by_tokens=by_tokens,
            by_utterances=by_utterances,
        )

        if by_files:
            return result_by_files
        # Merge the per-file results into a single list.
        return self._flatten(list, result_by_files)


class _HKCanCorReader(CHATReader):
    """Reader specialized for the HKCanCor corpus.

    Part-of-speech tags are forced to uppercase. The original HKCanCor tags
    mix upper- and lowercase (e.g., v and Vg), and a single case makes it
    easier to write a clean(er) regex for a corpus search.
    """

    @staticmethod
    def _preprocess_pos(pos: str) -> str:
        """Override the parent Reader class's method.

        Return ``pos`` in uppercase, or ``pos`` itself when uppercasing it
        raises ``AttributeError`` (e.g., when it is ``None``).
        """
        try:
            normalized = pos.upper()
        except AttributeError:
            normalized = pos
        return normalized


@functools.lru_cache(maxsize=1)
def hkcancor() -> CHATReader:
    """Create a corpus object for the Hong Kong Cantonese Corpus.

    The corpus is read from the ``data/hkcancor`` directory located next to
    this module. The result is cached, so repeated calls return the same
    reader object rather than reading the data again.

    Returns
    -------
    :class:`~pycantonese.CHATReader`
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data", "hkcancor")
    reader = _HKCanCorReader.from_dir(data_dir)
    # Drop the data directory (and any leading path separator) from each file
    # path, leaving only the part of the path below ``data_dir``.
    for f in reader._files:
        f.file_path = f.file_path.replace(data_dir, "").lstrip(os.sep)
    return reader


# NOTE(review): only ``path`` is documented in the docstring below; the entries
# for ``match``, ``exclude`` and ``encoding`` are presumably injected by this
# decorator -- confirm against pylangacq before editing the docstring layout.
@_params_in_docstring("match", "exclude", "encoding", class_method=False)
def read_chat(
    path: str,
    match: Optional[str] = None,
    exclude: Optional[str] = None,
    encoding: str = _ENCODING,
) -> CHATReader:
    """Read Cantonese CHAT data files.

    Parameters
    ----------
    path : str
        A path that points to one of the following:

        - ZIP file. Either a local ``.zip`` file path or a URL (one that begins with
          ``"https://"`` or ``"http://"``).
          URL example: ``"https://childes.talkbank.org/data/Biling/YipMatthews.zip"``
        - A local directory, for files under this directory recursively.
        - A single ``.cha`` CHAT file.

    Returns
    -------
    :class:`~pycantonese.CHATReader`
    """
    # Delegate to pylangacq's reader function, asking for a CHATReader via ``cls``.
    return pylangacq_read_chat(
        path, match=match, exclude=exclude, encoding=encoding, cls=CHATReader
    )