Source code for pycantonese.corpus

from functools import lru_cache
import os

from pylangacq.chat import Reader

from pycantonese.search import perform_search
from pycantonese.util import ENCODING, get_jyutping_from_mor, ListFromIterables


class CantoneseCHATReader(Reader):
    """A reader for Cantonese CHAT corpus files.

    .. note::
        Some of the methods are inherited from the parent class
        ``pylangacq.chat.Reader`` for language acquisition, which may or
        may not be applicable to your use case.
    """

    def __init__(self, *filenames, **kwargs):
        """Initialize a reader for Cantonese CHAT corpus files.

        Parameters
        ----------
        *filenames : iterable[str]
            File paths to Cantonese CHAT data files.
            Glob filename matching is supported.
        **kwargs
            Keyword arguments passed to CantoneseCHATReader. Currently,
            only the ``encoding`` kwarg is supported (default: 'utf8').
        """
        encoding = kwargs.get("encoding", ENCODING)
        super(CantoneseCHATReader, self).__init__(
            *filenames, encoding=encoding
        )
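
    # Illustrative only (not part of the original module): a reader can be
    # constructed directly from CHAT files; the file path below is a
    # placeholder.
    #
    #     corpus = CantoneseCHATReader("path/to/*.cha", encoding="utf8")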

    def concordance(
        self,
        search_item,
        participant=None,
        exclude=None,
        match_entire_word=True,
        lemma=False,
        by_files=False,
    ):
        raise NotImplementedError("coming soon!")

    def _get_jyutping_sents(self, participant=None, exclude=None, sents=True):
        # Each tagged word is a 4-tuple; the Jyutping romanization is parsed
        # out of its third element (the mor field). When ``sents`` is True,
        # the romanizations are grouped by utterance; otherwise they are
        # flattened per file.
        fname_to_tagged_sents = self.tagged_sents(
            participant=participant, exclude=exclude, by_files=True
        )
        fn_to_jyutpings = {}
        jyutpings_list = []

        for fn in self.filenames():
            fn_to_jyutpings[fn] = []
            tagged_sents = fname_to_tagged_sents[fn]

            for tagged_sent in tagged_sents:
                if sents:
                    jyutpings_list = []

                for tagged_word in tagged_sent:
                    _, _, mor, _ = tagged_word
                    jyutping = get_jyutping_from_mor(mor)

                    if sents:
                        jyutpings_list.append(jyutping)
                    else:
                        fn_to_jyutpings[fn].append(jyutping)

                if sents:
                    fn_to_jyutpings[fn].append(jyutpings_list)

        return fn_to_jyutpings

    def jyutpings(self, participant=None, exclude=None, by_files=False):
        """Return the words in Jyutping romanization.

        Parameters
        ----------
        participant : str or iterable[str], optional
            One or more participants whose data are included.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants whose data are excluded.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list[str], or dict[str, list[str]] if by_files is True
        """
        fn_to_jyutpings = self._get_jyutping_sents(
            participant=participant, exclude=exclude, sents=False
        )

        if by_files:
            return fn_to_jyutpings
        else:
            return ListFromIterables(
                *(v for _, v in sorted(fn_to_jyutpings.items()))
            )
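
    # Illustrative usage (not part of the original module), assuming the
    # bundled HKCanCor data:
    #
    #     corpus = hkcancor()
    #     corpus.jyutpings()               # flat list of romanized words
    #     corpus.jyutpings(by_files=True)  # dict keyed by file path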

    def jyutping_sents(self, participant=None, exclude=None, by_files=False):
        """Return the sentences in Jyutping romanization.

        Parameters
        ----------
        participant : str or iterable[str], optional
            One or more participants whose data are included.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants whose data are excluded.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list[list[str]], or dict[str, list[list[str]]] if by_files is True
        """
        fn_to_jyutpings = self._get_jyutping_sents(
            participant=participant, exclude=exclude, sents=True
        )

        if by_files:
            return fn_to_jyutpings
        else:
            return ListFromIterables(
                *(v for _, v in sorted(fn_to_jyutpings.items()))
            )
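
    # Illustrative usage (not part of the original module):
    #
    #     corpus = hkcancor()
    #     corpus.jyutping_sents()[0]   # one utterance as a list of romanized words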

    def _get_character_sents(self, participant=None, exclude=None, sents=True):
        fname_to_tagged_sents = self.tagged_sents(
            participant=participant, exclude=exclude, by_files=True
        )
        fn_to_characters = {}
        characters_list = []

        for fn in self.filenames():
            fn_to_characters[fn] = []
            tagged_sents = fname_to_tagged_sents[fn]

            for tagged_sent in tagged_sents:
                if sents:
                    characters_list = []

                for tagged_word in tagged_sent:
                    chs, _, _, _ = tagged_word

                    # Split a word into individual characters only if it
                    # starts with a CJK character (U+4E00 through U+9FFF);
                    # otherwise keep the token (e.g., a Latin-alphabet word)
                    # intact.
                    if chs and "\u4e00" <= chs[0] <= "\u9fff":
                        characters_in_word = list(chs)
                    else:
                        characters_in_word = [chs]

                    if sents:
                        characters_list.extend(characters_in_word)
                    else:
                        fn_to_characters[fn].extend(characters_in_word)

                if sents:
                    fn_to_characters[fn].append(characters_list)

        return fn_to_characters

    def characters(self, participant=None, exclude=None, by_files=False):
        """Return the data split into individual Cantonese characters.

        Parameters
        ----------
        participant : str or iterable[str], optional
            One or more participants whose data are included.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants whose data are excluded.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list[str], or dict[str, list[str]] if by_files is True
        """
        fn_to_characters = self._get_character_sents(
            participant=participant, exclude=exclude, sents=False
        )

        if by_files:
            return fn_to_characters
        else:
            return ListFromIterables(
                *(v for _, v in sorted(fn_to_characters.items()))
            )
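
    # Illustrative usage (not part of the original module):
    #
    #     corpus = hkcancor()
    #     corpus.characters()   # flat list of single characters
    #                           # (non-CJK tokens are kept whole)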

    def character_sents(self, participant=None, exclude=None, by_files=False):
        """Return the data as sentences of individual Cantonese characters.

        Parameters
        ----------
        participant : str or iterable[str], optional
            One or more participants whose data are included.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants whose data are excluded.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list[list[str]], or dict[str, list[list[str]]] if by_files is True
        """
        fn_to_characters = self._get_character_sents(
            participant=participant, exclude=exclude, sents=True
        )

        if by_files:
            return fn_to_characters
        else:
            return ListFromIterables(
                *(v for _, v in sorted(fn_to_characters.items()))
            )
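
    # Illustrative usage (not part of the original module):
    #
    #     corpus = hkcancor()
    #     corpus.character_sents()[0]   # one utterance as a list of characters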

    def search(
        self,
        *,
        onset=None,
        nucleus=None,
        coda=None,
        tone=None,
        initial=None,
        final=None,
        jyutping=None,
        character=None,
        pos=None,
        word_range=(0, 0),
        sent_range=(0, 0),
        tagged=True,
        sents=False,
        participant=None,
        exclude=None,
        by_files=False,
    ):
        """Search the data for the given criteria.

        For examples, please see https://pycantonese.org/searches.html.

        Parameters
        ----------
        onset : str, optional
            Onset to search for. A regex is supported.
        nucleus : str, optional
            Nucleus to search for. A regex is supported.
        coda : str, optional
            Coda to search for. A regex is supported.
        tone : str, optional
            Tone to search for. A regex is supported.
        initial : str, optional
            Initial to search for. A regex is supported.
            An initial, a term more prevalent in traditional Chinese
            phonology, is the equivalent of an onset.
        final : str, optional
            Final to search for.
            A final, a term more prevalent in traditional Chinese phonology,
            is the equivalent of a nucleus plus a coda.
        jyutping : str, optional
            Jyutping romanization of one Cantonese character to search for.
            If the romanization corresponds to more than one character,
            a ValueError is raised.
        character : str, optional
            One or more Cantonese characters (within a segmented word) to
            search for.
        pos : str, optional
            A part-of-speech tag to search for. A regex is supported.
        word_range : tuple[int, int], optional
            Span of words to the left and right of a matching word to
            include in the output. The default is `(0, 0)` to disable a
            range. If `sent_range` is used, `word_range` is ignored.
        sent_range : tuple[int, int], optional
            Span of sentences before and after a sentence containing a
            matching word to include in the output. The default is `(0, 0)`
            to disable a range. If `sent_range` is used, `word_range` is
            ignored.
        tagged : bool, optional
            If True (the default), words in the output are in the tagged
            form. Otherwise, just the word token strings are returned.
        sents : bool, optional
            If True (default: False), sentences containing matching words
            are returned. Otherwise, only matching words are returned.
        participant : str or iterable[str], optional
            One or more participants to include in the search.
            If unspecified, all participants are included.
        exclude : str or iterable[str], optional
            One or more participants to exclude from the search.
            If unspecified, no participants are excluded.
        by_files : bool, optional
            If True (default: False), return data organized by the
            individual file paths.

        Returns
        -------
        list
        """
        tagged_sents = self.tagged_sents(
            participant=participant, exclude=exclude, by_files=True
        )
        fn_to_results = perform_search(
            tagged_sents,
            onset=onset,
            nucleus=nucleus,
            coda=coda,
            tone=tone,
            initial=initial,
            final=final,
            jyutping=jyutping,
            character=character,
            pos=pos,
            word_range=word_range,
            sent_range=sent_range,
            tagged=tagged,
            sents=sents,
        )

        if by_files:
            return fn_to_results
        else:
            return ListFromIterables(
                *(v for _, v in sorted(fn_to_results.items()))
            )
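
# Illustrative search usage (not part of the original module), assuming the
# bundled HKCanCor data; the specific search values are examples only:
#
#     corpus = hkcancor()
#     corpus.search(character="蛋")                # words containing 蛋
#     corpus.search(onset="b")                     # syllables with onset "b"
#     corpus.search(jyutping="gwo3", sents=True)   # return whole utterances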


@lru_cache(maxsize=1)
def hkcancor():
    """Create a corpus object for the Hong Kong Cantonese Corpus.

    Returns
    -------
    :class:`~pycantonese.corpus.CantoneseCHATReader`
    """
    data_path = os.path.join(
        os.path.dirname(__file__), "data", "hkcancor", "*.cha"
    )
    return CantoneseCHATReader(data_path, encoding="utf8")
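
# Illustrative usage (not part of the original module):
#
#     corpus = hkcancor()
#     corpus is hkcancor()   # True: lru_cache(maxsize=1) reuses the same reader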


def read_chat(*filenames, **kwargs):
    """Read Cantonese CHAT data files into a reader object.

    Parameters
    ----------
    *filenames : iterable[str]
        File paths to Cantonese CHAT data files.
        Glob filename matching is supported.
    **kwargs
        Keyword arguments passed to CantoneseCHATReader. Currently, only
        the ``encoding`` kwarg is supported (default: 'utf8').

    Returns
    -------
    :class:`~pycantonese.corpus.CantoneseCHATReader`
    """
    encoding = kwargs.get("encoding", ENCODING)
    return CantoneseCHATReader(*filenames, encoding=encoding)
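
# Illustrative usage (not part of the original module); the file path below is
# a placeholder:
#
#     reader = read_chat("path/to/corpus/*.cha", encoding="utf8")
#     reader.jyutping_sents()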