Source code for pycantonese.parsing

import collections
import concurrent.futures as cf
import functools
import multiprocessing as mp
import re
import uuid
from string import ascii_uppercase

from pylangacq.chat import _File, Utterance

from pycantonese.corpus import CHATReader, Token
from pycantonese.jyutping.characters import characters_to_jyutping
from pycantonese.pos_tagging.tagger import pos_tag


# Punctuation marks for utterance segmentation.
_UTTERANCE_PUNCT_MARKS = frozenset(("。", "!", "?"))
_ASCII_UPPERCASE = frozenset(ascii_uppercase)

_UNKNOWN_PARTICIPANT = "X"

_CPU_COUNT = mp.cpu_count()
_CHUNK_SIZE = 4
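# These two constants gate parallel parsing in `parse_text` below: parallelism
# is used only when there are at least _CPU_COUNT * _CHUNK_SIZE utterances to
# parse, since for smaller inputs the overhead of spawning worker processes
# would likely outweigh any speed-up.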


def _parse_text(text: str, segment_kwargs, pos_tag_kwargs):
    chars_jps = characters_to_jyutping(text, **(segment_kwargs or {}))
    segmented, jyutping = [], []
    for chars, jps in chars_jps:
        segmented.append(chars)
        jyutping.append(jps)
    tags = [pos for _, pos in pos_tag(segmented, **(pos_tag_kwargs or {}))]
    return segmented, tags, jyutping
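
# Illustrative sketch (not part of the library): `_parse_text` returns three
# parallel lists: the segmented words, their part-of-speech tags, and their
# Jyutping romanizations (an entry may be None when no romanization can be
# derived). The exact words and tags depend on the segmenter and tagger in use.
#
#     >>> words, tags, jps = _parse_text("廣東話好難學", None, None)
#     >>> len(words) == len(tags) == len(jps)
#     True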


def _get_utterance(
    unparsed_sent, segment_kwargs, pos_tag_kwargs, participant
) -> Utterance:
    if participant is not None:
        pass
    elif isinstance(unparsed_sent, str):
        participant = _UNKNOWN_PARTICIPANT
    elif isinstance(unparsed_sent, tuple):
        participant, unparsed_sent, *_ = unparsed_sent
    else:
        raise TypeError(
            "Utterance must be either a string or "
            f"a tuple of (participant, utterance): {unparsed_sent}"
        )
    participant = str(participant)

    if not unparsed_sent:
        return Utterance(
            participant=participant, tokens=[], time_marks=None, tiers={participant: ""}
        )
    words, tags, jps = _parse_text(unparsed_sent, segment_kwargs, pos_tag_kwargs)

    tokens = [
        Token(word, pos, jp, None, None, None)
        for word, pos, jp in zip(words, tags, jps)
    ]

    return Utterance(
        participant=participant,
        tokens=tokens,
        time_marks=None,
        tiers={
            # TODO or question: Convert full-width punct to CHAT-styled punct?
            participant: " ".join(words),
            "%mor": " ".join(
                word
                if pos == "PUNCT" or pos[0].upper() not in _ASCII_UPPERCASE
                else f"{pos}|{jp or ''}"
                for word, pos, jp in zip(words, tags, jps)
            ),
        },
    )
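
# Illustrative sketch (not part of the library): for a (participant, utterance)
# input such as ("小明", "食咗飯"), `_get_utterance` returns something roughly
# of this shape:
#
#     Utterance(
#         participant="小明",
#         tokens=[Token(<word>, <pos>, <jyutping>, None, None, None), ...],
#         time_marks=None,
#         tiers={"小明": "<space-joined words>", "%mor": "<POS|jyutping items>"},
#     )
#
# where punctuation and tags that do not start with an ASCII letter are kept
# as the bare word in the "%mor" tier.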


def parse_text(
    data,
    *,
    segment_kwargs=None,
    pos_tag_kwargs=None,
    participant: str = None,
    parallel: bool = True,
) -> CHATReader:
    """Parse raw Cantonese text.

    Parameters
    ----------
    data : str or Iterable[str] or Iterable[Tuple[str, str]]
        Raw Cantonese text data, in one of the following formats:

        - A single string, e.g., ``"廣東話好難學?都唔係吖!"`` (which would be
          two utterances). Basic utterance segmentation (i.e., splitting by
          the end-of-line character ``\\n`` or one of the Chinese full-width
          punctuation marks from {"。", "!", "?"}) will be applied to this
          string, and each segmented utterance will be an utterance in the
          resulting CHAT reader.
        - An iterable of strings, e.g., ``["廣東話好難學?", "都唔係吖!"]``.
          No utterance segmentation will be done. Use this option to pass in
          data that's utterance-segmented to your liking.
        - An iterable of tuples, where each tuple has two strings, one for
          the participant and the other for the utterance, e.g.,
          ``[("小芬", "你食咗飯未呀?"), ("小明", "我食咗喇。")]``.
    segment_kwargs : dict, optional
        To customize word segmentation, provide a dictionary here, which will
        then be passed as keyword arguments to :func:`~pycantonese.segment`.
    pos_tag_kwargs : dict, optional
        To customize part-of-speech tagging, provide a dictionary here, which
        will then be passed as keyword arguments to
        :func:`~pycantonese.pos_tag`.
    participant : str, optional
        If provided, this will be the participant in the output
        CHAT-formatted data (and will override all the participants if your
        input to ``data`` is an iterable of tuples). If not provided, a
        default dummy participant ``"X"`` is used when your ``data`` is
        either a single string or an iterable of strings.
    parallel : bool, optional
        If ``True`` (the default), this function attempts to parallelize
        parsing for speed-up. (If the data volume is very small, parsing is
        not parallelized even if you pass in ``True``.) Under certain
        circumstances (e.g., your application is already parallelized, and
        further parallelization from within this function might be
        undesirable), you may want to set this parameter to ``False``.

    Returns
    -------
    :class:`~pycantonese.CHATReader`
    """
    if isinstance(data, str):
        # Perform basic sentence segmentation.
        for punct in _UTTERANCE_PUNCT_MARKS:
            data = data.replace(punct, f"{punct}\n")
        data = data.replace("\r\n", "\n")
        data = re.sub(r"\n{2,}", "\n", data)
        data = data.strip().split("\n")

    # Internally, word segmentation is actually going to be done by
    # `characters_to_jyutping` instead of `segment`.
    # `characters_to_jyutping`'s segmenter kwarg is called `segmenter`,
    # while that of `segment` is called `cls`.
    segment_kwargs = segment_kwargs or {}
    if "cls" in segment_kwargs:
        segment_kwargs["segmenter"] = segment_kwargs["cls"]
        del segment_kwargs["cls"]

    # If there's not much data, don't bother with parallelization.
    if parallel and len(data) < (_CPU_COUNT * _CHUNK_SIZE):
        parallel = False

    if parallel:
        func = functools.partial(
            _get_utterance,
            segment_kwargs=segment_kwargs,
            pos_tag_kwargs=pos_tag_kwargs,
            participant=participant,
        )
        with cf.ProcessPoolExecutor() as executor:
            utterances = list(executor.map(func, data, chunksize=_CHUNK_SIZE))
    else:
        utterances = [
            _get_utterance(sent, segment_kwargs, pos_tag_kwargs, participant)
            for sent in data
        ]

    f = _File(
        file_path=str(uuid.uuid4()),
        header={},
        utterances=utterances,
    )
    reader = CHATReader()
    reader._files = collections.deque([f])
    return reader
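

# Usage sketch (assumes the top-level re-export ``pycantonese.parse_text``;
# segmentation, tagging, and romanization outputs depend on the bundled
# models, so they are not shown verbatim here):
#
#     >>> import pycantonese
#     >>> reader = pycantonese.parse_text("廣東話好難學?都唔係吖!")
#     >>> len(reader.utterances())  # segmented into two utterances
#     2
#     >>> reader = pycantonese.parse_text(
#     ...     [("小芬", "你食咗飯未呀?"), ("小明", "我食咗喇。")],
#     ...     parallel=False,  # e.g., when the caller already parallelizes
#     ... )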