# Source code for target_extraction.tokenizers

'''
This module contains a set of functions that return tokenization functions
which can be defined by the following typing: Callable[[str], List[str]].
All of the functions take no positional arguments but can take
keyword arguments.
'''
import copy
import re
from typing import List, Callable, Optional, Tuple
from pathlib import Path
import pkgutil

import spacy
import stanfordnlp
from stanfordnlp.utils import resources
import twokenize

from target_extraction.taggers_helper import stanford_downloader
from target_extraction.data_types_util import Span


def is_character_preserving(original_text: str, text_tokens: List[str]
                            ) -> bool:
    '''
    :param original_text: Text that has been tokenized
    :param text_tokens: List of tokens after the text has been tokenized
    :returns: True if the tokenized text, when all characters are joined 
              together, is equal to the original text with all of its 
              characters joined together.
    '''
    text_tokens_copy = copy.deepcopy(text_tokens)
    # Required as some tokenizers produce tokens with trailing whitespace;
    # this appears to be due to the Stanford method being a neural network.
    text_tokens_copy = [token.strip(' ') for token in text_tokens_copy]
    tokens_text = ''.join(text_tokens_copy)
    original_text = ''.join(original_text.split())
    return tokens_text == original_text
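
# Illustrative example (not part of the original module): a tokenizer that
# drops characters, e.g. punctuation, is not character preserving.
#
# >>> is_character_preserving('hello, world', ['hello', ',', 'world'])
# True
# >>> is_character_preserving('hello, world', ['hello', 'world'])
# False
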
def spacy_tokenizer(lang: str = 'en') -> Callable[[str], List[str]]:
    '''
    Given optionally the language (default English), returns the spaCy 
    rule based tokeniser for that language, wrapped so that it returns a 
    List of Strings rather than spaCy tokens.

    If the whitespace between two words is longer than a single space, the 
    spaCy tokenizer treats it as, in effect, a special space token; these 
    special space tokens are removed.

    :param lang: Language of the rule based spaCy tokeniser to use.
    :returns: A callable that takes a String and returns the tokens for 
              that String.
    '''
    spacy_lang_modules = pkgutil.iter_modules(spacy.lang.__path__)
    spacy_lang_codes = [lang_code for _, lang_code, _ in spacy_lang_modules 
                        if len(lang_code) == 2]
    if lang not in spacy_lang_codes:
        raise ValueError('spaCy does not support the following language '
                         f'{lang}. These languages are supported '
                         f'{spacy_lang_codes}')
    spacy_tokenizer_func = spacy.blank(lang)

    def _spacy_token_to_text(text: str) -> List[str]:
        return [spacy_token.text for spacy_token in spacy_tokenizer_func(text)
                if not spacy_token.is_space]
    return _spacy_token_to_text
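
# Illustrative example (not part of the original module; the expected output
# assumes spaCy's standard English tokenisation rules). Note that the run of
# spaces does not produce an empty or space token in the output:
#
# >>> tokenizer = spacy_tokenizer(lang='en')
# >>> tokenizer('another day  another dollar')
# ['another', 'day', 'another', 'dollar']
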
def whitespace() -> Callable[[str], List[str]]:
    '''
    Standard whitespace tokeniser.

    :returns: A callable that takes a String and returns the tokens for 
              that String.
    '''
    return str.split
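
# Illustrative example (not part of the original module): `str.split` with no
# arguments splits on runs of any whitespace and drops empty strings.
#
# >>> whitespace()('The laptop  case\twas great ')
# ['The', 'laptop', 'case', 'was', 'great']
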
def ark_twokenize() -> Callable[[str], List[str]]:
    '''
    A Twitter tokeniser from `CMU Ark 
    <https://github.com/brendano/ark-tweet-nlp>`_ and the relevant 
    `paper <https://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf>`_.

    :returns: A callable that takes a String and returns the tokens for 
              that String.
    '''
    return twokenize.tokenizeRawTweetText
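
# Illustrative example (not part of the original module; the expected output
# assumes the `twokenize` package's behaviour of keeping emoticons, hashtags,
# and @-mentions as single tokens):
#
# >>> ark_twokenize()('@user loving this phone :-) #happy')
# ['@user', 'loving', 'this', 'phone', ':-)', '#happy']
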
def stanford(lang: str = 'en', treebank: Optional[str] = None,
             download: bool = False) -> Callable[[str], List[str]]:
    '''
    Stanford neural network tokeniser that uses a BiLSTM and CNN at the 
    character and token level.

    ASSUMPTIONS: The returned callable tokeniser assumes that all text 
    given to it is a single sentence. The underlying method performs 
    sentence splitting, but the sentence splits are ignored and each text 
    is treated as one sentence.

    For Vietnamese, syllables are used instead of characters.

    `Languages supported 
    <https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp>`_

    `Reference paper <https://www.aclweb.org/anthology/K18-2016>`_

    :param lang: Language of the neural network tokeniser.
    :param treebank: The neural network model to use, based on the treebank 
                     it was trained on. If not given, the default treebank 
                     for the language will be used. To see which treebank 
                     is the default and which treebanks are available for 
                     each language, go to this `link 
                     <https://stanfordnlp.github.io/stanfordnlp/models.html#human-languages-supported-by-stanfordnlp>`_
    :param download: Whether to re-download the model.
    :returns: A callable that takes a String and returns the tokens for 
              that String.
    '''
    full_treebank_name = stanford_downloader(lang, treebank, download)
    nlp = stanfordnlp.Pipeline(lang=lang, processors='tokenize',
                               treebank=full_treebank_name)

    def _stanford_doc_to_text(text: str) -> List[str]:
        '''
        Returns all of the words in each sentence. The stanfordnlp 
        documentation also exposes tokens, but the words are what the 
        downstream applications use, hence words were chosen over tokens. 
        See here for more `details 
        <https://stanfordnlp.github.io/stanfordnlp/pipeline.html#accessing-word-information>`_
        '''
        if text.strip() == '':
            return []
        doc = nlp(text)
        sentences = doc.sentences
        tokens = []
        for sentence in sentences:
            for word in sentence.words:
                tokens.append(word.text)
        return tokens
    return _stanford_doc_to_text
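
# Illustrative example (not part of the original module): requires the
# relevant stanfordnlp model to be available locally, e.g. after calling
# `stanford(lang='en', download=True)` once. The exact tokens depend on the
# downloaded model.
#
# >>> tokenizer = stanford(lang='en')
# >>> tokenizer('The laptop case was great.')
# ['The', 'laptop', 'case', 'was', 'great', '.']
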
def token_index_alignment(text: str, tokens: List[str]
                          ) -> List[Span]:
    '''
    :param text: Text that has been tokenized
    :param tokens: The tokens produced by applying a tokenizer to the text 
                   (the tokenizer has to be character preserving)
    :returns: A list of Spans, where each Span contains two ints 
              representing the start and end character index of the 
              associated token.
    '''
    if not is_character_preserving(text, tokens):
        raise ValueError('The tokenization method used is not character'
                         f' preserving. Original text `{text}`\n'
                         f'Tokenized text `{tokens}`')
    token_index_list: List[Span] = []
    char_index = 0
    # Handle whitespace at the start of the text
    if len(text) > char_index:
        while text[char_index] == ' ':
            char_index += 1
            if len(text) <= char_index:
                break
    for token_index, token in enumerate(tokens):
        token_start = char_index
        token_end = token_start
        for token_char_index, token_char in enumerate(token):
            char = text[char_index]
            if token_char == char:
                char_index += 1
            else:
                raise ValueError('The tokenised output within the token '
                                 f'should be the same as the text. Token {token}\n'
                                 f'Text: {text}\nCharacter index {char_index}\n'
                                 f'Token index: {token_index}\nToken char '
                                 f'index {token_char_index}\nTokens {tokens}')
        token_end = char_index
        token_index_list.append(Span(token_start, token_end))
        # Covers whitespace of any length between tokens and after the text
        if len(text) > char_index:
            while re.search(r'\s', text[char_index]):
                char_index += 1
                if len(text) <= char_index:
                    break
    if char_index != len(text):
        raise ValueError(f'Did not get to the end of the text: {text}\n'
                         f'Character index {char_index}\n'
                         f'Token index list {token_index_list}')
    return token_index_list
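
# Illustrative example (not part of the original module; assumes `Span` is a
# (start, end) named tuple, as its use above suggests). Each Span covers a
# token's start and end character offsets, skipping the whitespace between
# tokens:
#
# >>> token_index_alignment('The laptop case', ['The', 'laptop', 'case'])
# [Span(start=0, end=3), Span(start=4, end=10), Span(start=11, end=15)]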