Source code for pycantonese.word_segmentation

from functools import lru_cache
from typing import List

from wordseg import LongestStringMatching

from pycantonese.corpus import hkcancor
from pycantonese.data.rime_cantonese import CHARS_TO_JYUTPING, LETTERED
from pycantonese.util import _split_chars_with_alphanum


_MAX_WORD_LENGTH = 5

_ALLOWED_WORDS = None
_DISALLOWED_WORDS = None


[docs]class Segmenter(LongestStringMatching):
    """A customizable word segmentation model.

    .. versionadded:: 3.0.0
    """

[docs]    def __init__(
        self,
        *,
        max_word_length=_MAX_WORD_LENGTH,
        allow=None,
        disallow=None,
    ):
        """Initialize a Segmenter object.

        Parameters
        ----------
        max_word_length : int, optional
            Maximum word length this model allows.
        allow : iterable[str], optional
            Words to allow in word segmentation.
        disallow : iterable[str], optional
            Words to disallow in word segmentation.
        """
        super(Segmenter, self).__init__(max_word_length=max_word_length)

        # Train with HKCanCor data.
        self.fit(hkcancor().words(by_utterances=True))

        # Train with rime-cantonese data.
        self._words |= CHARS_TO_JYUTPING.keys()
        self._words |= LETTERED.keys()

        # Adjust with the allowed and disallowed words.
        self._words |= allow or set()
        self._words -= disallow or set()

        # Turn everything from strings to tuples due to alphanumeric chars.
        self._words = {_split_chars_with_alphanum(x) for x in self._words}

    def _predict_sent(self, sent_str):
        chars = _split_chars_with_alphanum(sent_str)
        segmented = super(Segmenter, self)._predict_sent(chars)
        # Turn the result back from tuples to strings.
        segmented = ["".join(x) for x in segmented]
        return segmented


@lru_cache(maxsize=1)
def _get_default_segmenter():
    return Segmenter()


[docs]def segment(unsegmented: str, cls: Segmenter = None) -> List[str]:
    """Segment the unsegmented input.

    The word segmentation model is the longest string matching approach,
    trained by (i) the HKCanCor corpus included in this library and
    (ii) the rime-cantonese data.
    The segmented sentence does not contain words longer than five
    characters.

    Parameters
    ----------
    unsegmented : str
        Unsegmented input.
    cls: Segmenter, optional
        A custom :class:`~pycantonese.word_segmentation.Segmenter` instance
        for setting the maximal
        word length (default = 5) and words to allow or disallow.
        If not provided, a default segmenter is used, with maximum word
        length = 5.

    Returns
    -------
    List[str]

    Examples
    --------
    >>> segment("廣東話容唔容易學？")  # "Is Cantonese easy to learn?"
    ['廣東話', '容', '唔', '容易', '學', '？']
    >>>
    >>> # Customizing the segmentation behavior.
    >>> from pycantonese.word_segmentation import Segmenter
    >>> segmenter = Segmenter(allow={"容唔容易"})
    >>> segment("廣東話容唔容易學？", cls=segmenter)
    ['廣東話', '容唔容易', '學', '？']
    """
    if not unsegmented:
        return []
    if cls is None:
        cls = _get_default_segmenter()
    elif type(cls) != Segmenter:
        raise TypeError(f"`segmenter` must be a Segmenter object: {cls}")
    # Strip all whitespace.
    unsegmented = "".join(unsegmented.split())
    segmented = list(cls.predict([unsegmented]))[0]
    return segmented