Source code for bella.tokenisers

'''
Functions that tokenise text and return a list of tokens, all of which are
Strings.

Functions:

1. :py:func:`bella.tokenisers.whitespace` -- tokenises on whitespace.
2. :py:func:`bella.tokenisers.ark_twokenize` -- A Twitter tokeniser from
   `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_
3. :py:func:`bella.tokenisers.stanford` -- Stanford tokeniser from
   `CoreNLP <https://nlp.stanford.edu/software/tokenizer.html>`_
4. :py:func:`bella.tokenisers.moses` -- Tokeniser used in the
   `moses toolkit <https://github.com/moses-smt>`_
5. :py:func:`bella.tokenisers.spacy_tokeniser` -- 
   `SpaCy tokeniser <https://spacy.io/>`_
'''
from typing import List, Dict

import twokenize
import spacy
from spacy.language import Language as SpacyModelType

from bella import stanford_tools
from bella.moses_tools import MosesTokenizer


def whitespace(text: str) -> List[str]:
    '''
    Tokenises on whitespace.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return text.split()
    raise ValueError(f'The parameter must be of type str not {type(text)}')
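
# A minimal usage sketch (not part of the original module): `whitespace`
# simply delegates to str.split, so punctuation stays attached to tokens.
#
# >>> whitespace('The drinks were tasty!')
# ['The', 'drinks', 'were', 'tasty!']
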
def ark_twokenize(text: str) -> List[str]:
    '''
    A Twitter tokeniser from
    `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_

    This is a wrapper around
    `ark-twokenize-py <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
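
# Illustrative sketch (hedged): the Ark tokeniser is designed to keep
# Twitter-specific tokens such as hashtags and emoticons intact, which
# general-purpose tokenisers tend to split apart. Exact output depends on
# the installed twokenize version.
#
# >>> ark_twokenize('This is #awesome :-)')
# ['This', 'is', '#awesome', ':-)']
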
def stanford(text: str) -> List[str]:
    '''
    Stanford tokeniser from
    `CoreNLP <https://nlp.stanford.edu/software/tokenizer.html>`_

    Requires a CoreNLP server to be running.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return stanford_tools.tokenise(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
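
# Hedged sketch: this call only succeeds when a CoreNLP server is reachable
# (bella.stanford_tools handles the connection). The output shown is
# illustrative of Penn Treebank style tokenisation, which splits clitics
# such as "n't" into their own tokens.
#
# >>> stanford("It isn't hard.")
# ['It', 'is', "n't", 'hard', '.']
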
def moses(text: str, aggressive_dash_splits: bool = False,
          escape: bool = True) -> List[str]:
    '''
    Tokeniser used in the `Moses toolkit <https://github.com/moses-smt>`_

    :param text: A string to be tokenised.
    :param aggressive_dash_splits: Option to trigger dash split rules.
    :param escape: Whether to escape characters, e.g. "'s" escaped becomes
                   "&apos;s"
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        moses_tokeniser = MosesTokenizer()
        return moses_tokeniser.tokenize(
            text, aggressive_dash_splits=aggressive_dash_splits,
            escape=escape)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
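
# Illustrative sketch of the `escape` flag (exact output depends on the
# Moses tokeniser version): with escaping on, apostrophes and XML-special
# characters are replaced by entities; with it off, the raw characters are
# kept.
#
# >>> moses("it's fun")
# ['it', '&apos;s', 'fun']
# >>> moses("it's fun", escape=False)
# ['it', "'s", 'fun']
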
LOADED_SPACY_MODELS: Dict[str, SpacyModelType] = {}


def _get_spacy_model(language: str) -> SpacyModelType:
    """
    To avoid loading lots of SpaCy models, the model specific to a language
    is loaded once and cached in a global dictionary. This has been mainly
    taken from the `AllenNLP package
    <https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py>`_

    :param language: Language of the SpaCy model to load.
    :returns: The relevant SpaCy model.
    """
    if language not in LOADED_SPACY_MODELS:
        LOADED_SPACY_MODELS[language] = spacy.blank(language)
        # If statistical models are required in the future the code below
        # will be needed, but as we only require a tokeniser the above
        # will do.
        # from spacy.cli.download import download as spacy_download
        # from spacy.cli import link
        # from spacy.util import get_package_path
        # disable = ['vectors', 'textcat', 'tagger', 'parser', 'ner']
        # try:
        #     spacy_model = spacy.load(language, disable=disable)
        # except:
        #     print(f"SpaCy models '{language}' not found. "
        #           "Downloading and installing.")
        #     spacy_download(language)
        #     package_path = get_package_path(language)
        #     spacy_model = spacy.load(language, disable=disable)
        #     link(language, language, model_path=package_path)
        # LOADED_SPACY_MODELS[language] = spacy_model
    return LOADED_SPACY_MODELS[language]
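
# The cache means repeated calls for the same language return the very same
# model object, so the relatively expensive model construction happens once
# per language:
#
# >>> _get_spacy_model('en') is _get_spacy_model('en')
# True
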
def spacy_tokeniser(text: str) -> List[str]:
    '''
    `SpaCy tokeniser <https://spacy.io/>`_

    Assumes the language to be English.

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if not isinstance(text, str):
        raise ValueError('The parameter must be of type '
                         f'str not {type(text)}')
    spacy_model = _get_spacy_model('en')
    spacy_document = spacy_model(text)
    tokens = []
    for token in spacy_document:
        if not token.is_space:
            tokens.append(token.text)
    return tokens
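
# A short usage sketch: unlike `whitespace`, the SpaCy tokeniser separates
# punctuation into its own tokens, and the `is_space` filter drops the
# whitespace tokens SpaCy would otherwise emit for runs of spaces.
#
# >>> spacy_tokeniser('The drinks were  tasty!')
# ['The', 'drinks', 'were', 'tasty', '!']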