Source code for bella.tokenisers

'''
Functions that tokenise text and return a list of tokens, all of which are
Strings.

Functions:

1. :py:func:`bella.tokenisers.whitespace` -- tokenises on whitespace.
2. :py:func:`bella.tokenisers.ark_twokenize` -- A Twitter tokeniser from
   `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_
3. :py:func:`bella.tokenisers.stanford` -- Stanford tokeniser from
   `CoreNLP <https://nlp.stanford.edu/software/tokenizer.html>`_
4. :py:func:`bella.tokenisers.moses` -- Tokeniser used in the
   `moses toolkit <https://github.com/moses-smt>`_
5. :py:func:`bella.tokenisers.spacy_tokeniser` -- 
   `SpaCy tokeniser <https://spacy.io/>`_
'''
from typing import List, Dict

import twokenize
import spacy
from spacy.language import Language as SpacyModelType

from bella import stanford_tools
from bella.moses_tools import MosesTokenizer


def whitespace(text: str) -> List[str]:
    '''
    Tokenises on whitespace.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return text.split()
    raise ValueError(f'The parameter must be of type str not {type(text)}')
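
# A minimal usage sketch (not part of the original module): `whitespace`
# simply delegates to str.split, so punctuation stays attached to tokens.
#
# >>> whitespace('The drinks were tasty!')
# ['The', 'drinks', 'were', 'tasty!']
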
def ark_twokenize(text: str) -> List[str]:
    '''
    A Twitter tokeniser from
    `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_

    This is a wrapper around
    `ark-twokenize-py <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
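
# Illustrative sketch (hedged): the Ark tokeniser is designed to keep
# Twitter-specific tokens such as hashtags and emoticons intact, which
# general-purpose tokenisers tend to split apart. Exact output depends on
# the installed twokenize version.
#
# >>> ark_twokenize('This is #awesome :-)')
# ['This', 'is', '#awesome', ':-)']
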
def stanford(text: str) -> List[str]:
    '''
    Stanford tokeniser from
    `CoreNLP <https://nlp.stanford.edu/software/tokenizer.html>`_

    Requires a CoreNLP server to be running.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return stanford_tools.tokenise(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
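
# Hedged sketch: this call only succeeds when a CoreNLP server is reachable
# (bella.stanford_tools handles the connection). The output shown is
# illustrative of Penn Treebank style tokenisation, which splits clitics
# such as "n't" into their own tokens.
#
# >>> stanford("It isn't hard.")
# ['It', 'is', "n't", 'hard', '.']
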
def moses(text: str, aggressive_dash_splits: bool = False,
          escape: bool = True) -> List[str]:
    '''
    Tokeniser used in the `Moses toolkit <https://github.com/moses-smt>`_

    :param text: A string to be tokenised.
    :param aggressive_dash_splits: Option to trigger dash split rules.
    :param escape: Whether to escape characters, e.g. "'s" escaped becomes
                   "&apos;s"
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        moses_tokeniser = MosesTokenizer()
        return moses_tokeniser.tokenize(
            text, aggressive_dash_splits=aggressive_dash_splits,
            escape=escape)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
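
# Illustrative sketch of the `escape` flag (exact output depends on the
# Moses tokeniser version): with escaping on, apostrophes and XML-special
# characters are replaced by entities; with it off, the raw characters are
# kept.
#
# >>> moses("it's fun")
# ['it', '&apos;s', 'fun']
# >>> moses("it's fun", escape=False)
# ['it', "'s", 'fun']
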
LOADED_SPACY_MODELS: Dict[str, SpacyModelType] = {}


def _get_spacy_model(language: str) -> SpacyModelType:
    """
    To avoid loading lots of SpaCy models, the model specific to a language
    is loaded once and cached in a global dictionary. This has been mainly
    taken from the `AllenNLP package
    <https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py>`_

    :param language: Language of the SpaCy model to load.
    :returns: The relevant SpaCy model.
    """
    if language not in LOADED_SPACY_MODELS:
        LOADED_SPACY_MODELS[language] = spacy.blank(language)
        # If statistical models are required in the future the code below
        # will be needed, but as we only require a tokeniser the above
        # will do.
        # from spacy.cli.download import download as spacy_download
        # from spacy.cli import link
        # from spacy.util import get_package_path
        # disable = ['vectors', 'textcat', 'tagger', 'parser', 'ner']
        # try:
        #     spacy_model = spacy.load(language, disable=disable)
        # except:
        #     print(f"SpaCy models '{language}' not found. "
        #           "Downloading and installing.")
        #     spacy_download(language)
        #     package_path = get_package_path(language)
        #     spacy_model = spacy.load(language, disable=disable)
        #     link(language, language, model_path=package_path)
        # LOADED_SPACY_MODELS[language] = spacy_model
    return LOADED_SPACY_MODELS[language]
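
# The cache means repeated calls for the same language return the very same
# model object, so the relatively expensive model construction happens once
# per language:
#
# >>> _get_spacy_model('en') is _get_spacy_model('en')
# True
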
def spacy_tokeniser(text: str) -> List[str]:
    '''
    `SpaCy tokeniser <https://spacy.io/>`_

    Assumes the language to be English.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if not isinstance(text, str):
        raise ValueError('The parameter must be of type '
                         f'str not {type(text)}')
    spacy_model = _get_spacy_model('en')
    spacy_document = spacy_model(text)
    tokens = []
    for token in spacy_document:
        if not token.is_space:
            tokens.append(token.text)
    return tokens
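
# A short usage sketch: unlike `whitespace`, the SpaCy tokeniser separates
# punctuation into its own tokens, and the `is_space` filter drops the
# whitespace tokens SpaCy would otherwise emit for runs of spaces.
#
# >>> spacy_tokeniser('The drinks were  tasty!')
# ['The', 'drinks', 'were', 'tasty', '!']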