from functools import lru_cache
from typing import List

from wordseg import LongestStringMatching

from pycantonese.corpus import hkcancor
from pycantonese.data.rime_cantonese import CHARS_TO_JYUTPING, LETTERED
from pycantonese.util import _split_chars_with_alphanum

_MAX_WORD_LENGTH = 5
_ALLOWED_WORDS = None
_DISALLOWED_WORDS = None


class Segmenter(LongestStringMatching):
    """A customizable word segmentation model.

    .. versionadded:: 3.0.0
    """

    def __init__(
        self,
        *,
        max_word_length=_MAX_WORD_LENGTH,
        allow=None,
        disallow=None,
    ):
        """Initialize a Segmenter object.

        Parameters
        ----------
        max_word_length : int, optional
            Maximum word length this model allows.
        allow : iterable[str], optional
            Words to allow in word segmentation.
        disallow : iterable[str], optional
            Words to disallow in word segmentation.
        """
        super().__init__(max_word_length=max_word_length)
        # Train with HKCanCor data.
        self.fit(hkcancor().words(by_utterances=True))
        # Train with rime-cantonese data.
        self._words |= CHARS_TO_JYUTPING.keys()
        self._words |= LETTERED.keys()
        # Adjust with the allowed and disallowed words; accept any iterable.
        self._words |= set(allow or ())
        self._words -= set(disallow or ())
        # Turn everything from strings to tuples due to alphanumeric chars.
        self._words = {_split_chars_with_alphanum(x) for x in self._words}

    def _predict_sent(self, sent_str):
        chars = _split_chars_with_alphanum(sent_str)
        segmented = super()._predict_sent(chars)
        # Turn the result back from tuples to strings.
        segmented = ["".join(x) for x in segmented]
        return segmented
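

# A minimal sketch (not part of pycantonese or wordseg) of the longest string
# matching idea that `LongestStringMatching` implements: scan left to right,
# greedily take the longest known word (up to `max_word_length`) starting at
# the current position, and fall back to a single character when nothing
# matches. The helper name `_toy_longest_match` and the word set in its
# doctest are illustrative assumptions, not library code.
def _toy_longest_match(sentence, words, max_word_length=_MAX_WORD_LENGTH):
    """Greedy longest-match segmentation over a known word set.

    >>> _toy_longest_match("廣東話好", {"廣東話", "廣東"})
    ['廣東話', '好']
    """
    segmented = []
    i = 0
    while i < len(sentence):
        # Try the longest candidate first, down to a single character.
        for length in range(min(max_word_length, len(sentence) - i), 0, -1):
            candidate = sentence[i : i + length]
            if length == 1 or candidate in words:
                segmented.append(candidate)
                i += length
                break
    return segmented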


@lru_cache(maxsize=1)
def _get_default_segmenter():
    # Cache the default segmenter so the model is only trained once.
    return Segmenter()


def segment(unsegmented: str, cls: Segmenter = None) -> List[str]:
    """Segment the unsegmented input.

    The word segmentation model uses the longest string matching approach,
    trained on (i) the HKCanCor corpus included in this library and
    (ii) the rime-cantonese data. The segmented sentence does not contain
    words longer than five characters.

    Parameters
    ----------
    unsegmented : str
        Unsegmented input.
    cls : Segmenter, optional
        A custom :class:`~pycantonese.word_segmentation.Segmenter` instance
        for setting the maximum word length (default = 5) and the words to
        allow or disallow. If not provided, a default segmenter with a
        maximum word length of 5 is used.

    Returns
    -------
    List[str]

    Examples
    --------
    >>> segment("廣東話容唔容易學?")  # "Is Cantonese easy to learn?"
    ['廣東話', '容', '唔', '容易', '學', '?']
    >>>
    >>> # Customizing the segmentation behavior.
    >>> from pycantonese.word_segmentation import Segmenter
    >>> segmenter = Segmenter(allow={"容唔容易"})
    >>> segment("廣東話容唔容易學?", cls=segmenter)
    ['廣東話', '容唔容易', '學', '?']
    """
    if not unsegmented:
        return []
    if cls is None:
        cls = _get_default_segmenter()
    elif not isinstance(cls, Segmenter):
        raise TypeError(f"`cls` must be a Segmenter object: {cls}")
    # Strip all whitespace.
    unsegmented = "".join(unsegmented.split())
    segmented = list(cls.predict([unsegmented]))[0]
    return segmented
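

# Usage sketch (illustrative, not part of the library): segmenting with the
# default model and with a customized Segmenter. The expected outputs shown
# in the comments are the ones given in the `segment` docstring above.
if __name__ == "__main__":
    print(segment("廣東話容唔容易學?"))
    # ['廣東話', '容', '唔', '容易', '學', '?']
    custom_segmenter = Segmenter(allow={"容唔容易"})
    print(segment("廣東話容唔容易學?", cls=custom_segmenter))
    # ['廣東話', '容唔容易', '學', '?']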