'''
This module contains a set of functions that return tokenization functions,
each of which can be described by the following type: Callable[[str], List[str]].
None of the functions require positional arguments, but they can take
keyword arguments.
'''
import copy
import re
from typing import List, Callable, Optional, Tuple
from pathlib import Path
import pkgutil
import spacy
import stanfordnlp
from stanfordnlp.utils import resources
import twokenize
from target_extraction.taggers_helper import stanford_downloader
from target_extraction.data_types_util import Span


def is_character_preserving(original_text: str, text_tokens: List[str]
                            ) -> bool:
    '''
    :param original_text: Text that has been tokenized
    :param text_tokens: List of tokens after the text has been tokenized
    :returns: True if the tokenized text, when all of its characters are
              joined together, is equal to the original text with all of its
              characters joined together.
    '''
    text_tokens_copy = copy.deepcopy(text_tokens)
    # Required as some tokenizers produce tokens with trailing whitespace,
    # most likely because the Stanford method is a Neural Network
    text_tokens_copy = [token.strip(' ') for token in text_tokens_copy]
    tokens_text = ''.join(text_tokens_copy)
    original_text = ''.join(original_text.split())
    return tokens_text == original_text
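
# A minimal usage sketch (not part of the original module): joining the
# tokens reproduces the original text's non-whitespace characters, so the
# check passes; dropping or altering characters makes it fail.
#
#     >>> is_character_preserving('hello  world', ['hello', 'world'])
#     True
#     >>> is_character_preserving('hello world', ['hello', 'wor'])
#     False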


def spacy_tokenizer(lang: str = 'en') -> Callable[[str], List[str]]:
    '''
    Given, optionally, a language (default English), it returns the Spacy
    rule based tokeniser for that language, wrapped so that it returns a
    List of Strings rather than Spacy tokens.

    If the whitespace between two words spans more than one character, the
    Spacy tokenizer in effect treats it as a special space token; we remove
    these special space tokens.

    :param lang: Language of the rule based Spacy tokeniser to use.
    :returns: A callable that takes a String and returns the tokens for that
              String.
    '''
    spacy_lang_modules = pkgutil.iter_modules(spacy.lang.__path__)
    spacy_lang_codes = [lang_code for _, lang_code, _ in spacy_lang_modules
                        if len(lang_code) == 2]
    if lang not in spacy_lang_codes:
        raise ValueError('Spacy does not support the following language '
                         f'{lang}. These languages are supported '
                         f'{spacy_lang_codes}')
    spacy_tokenizer_func = spacy.blank(lang)
    def _spacy_token_to_text(text: str) -> List[str]:
        return [spacy_token.text for spacy_token in spacy_tokenizer_func(text)
                if not spacy_token.is_space]
    return _spacy_token_to_text
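
# A minimal usage sketch (not part of the original module): spacy.blank only
# needs the language data shipped with Spacy, so no model download is
# required. The run of spaces produces a space token that the wrapper
# filters out, giving tokens along the lines of:
#
#     >>> tokenize = spacy_tokenizer()
#     >>> tokenize("The laptop's case   was great")
#     ['The', 'laptop', "'s", 'case', 'was', 'great']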


def whitespace() -> Callable[[str], List[str]]:
    '''
    Standard whitespace tokeniser.

    :returns: A callable that takes a String and returns the tokens for that
              String.
    '''
    return str.split
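
# A minimal usage sketch (not part of the original module): str.split with no
# arguments splits on any run of whitespace.
#
#     >>> whitespace()('The laptop case\twas  great')
#     ['The', 'laptop', 'case', 'was', 'great']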


def ark_twokenize() -> Callable[[str], List[str]]:
    '''
    A Twitter tokeniser from
    `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_ and the relevant
    `paper <https://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf>`_.

    :returns: A callable that takes a String and returns the tokens for
              that String.
    '''
    return twokenize.tokenizeRawTweetText
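
# A minimal usage sketch (not part of the original module, and assumes the
# `twokenize` package is installed): the CMU Ark tokeniser keeps
# Twitter-specific units such as @-mentions, hashtags and emoticons as
# single tokens, roughly:
#
#     >>> tokenize = ark_twokenize()
#     >>> tokenize('@someone loved the #screen :)')
#     ['@someone', 'loved', 'the', '#screen', ':)']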


def stanford(lang: str = 'en', treebank: Optional[str] = None,
             download: bool = False) -> Callable[[str], List[str]]:
    '''
    Stanford neural network tokeniser, which uses a BiLSTM and CNN at the
    character and token level.

    ASSUMPTIONS: The returned callable tokeniser assumes that all text given
    to it is one sentence. The underlying method performs sentence splitting,
    but each text is treated as a single sentence and the sentence splitting
    is ignored.

    For Vietnamese, syllables are used instead of characters.

    `Languages supported <https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp>`_

    `Reference paper <https://www.aclweb.org/anthology/K18-2016>`_

    :param lang: Language of the Neural Network tokeniser
    :param treebank: The neural network model to use based on the treebank
                     it was trained on. If not given, the default treebank
                     will be used. To see which treebank is the default and
                     which treebanks are available for each language, go to
                     this
                     `link <https://stanfordnlp.github.io/stanfordnlp/models.html#human-languages-supported-by-stanfordnlp>`_
    :param download: Whether to re-download the model.
    :returns: A callable that takes a String and returns the tokens for that
              String.
    '''
    full_treebank_name = stanford_downloader(lang, treebank, download)
    nlp = stanfordnlp.Pipeline(lang=lang, processors='tokenize',
                               treebank=full_treebank_name)
    def _stanford_doc_to_text(text: str) -> List[str]:
        '''
        This returns all of the words in each sentence. The documentation
        does offer the option of using the tokens instead, but the words are
        what downstream applications use, hence why the words were chosen
        over the tokens. See here for more
        `details <https://stanfordnlp.github.io/stanfordnlp/pipeline.html#accessing-word-information>`_
        '''
        if text.strip() == '':
            return []
        doc = nlp(text)
        sentences = doc.sentences
        tokens = []
        for sentence in sentences:
            for word in sentence.words:
                tokens.append(word.text)
        return tokens
    return _stanford_doc_to_text
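
# A minimal usage sketch (not part of the original module): building the
# stanfordnlp Pipeline assumes the relevant model is available locally (the
# stanford_downloader call above resolves the treebank and handles the
# download), so the first call can be slow. The exact split depends on the
# treebank model, but for the default English treebank the output is along
# the lines of:
#
#     >>> tokenize = stanford(lang='en')
#     >>> tokenize("The laptop's case was great")
#     ['The', 'laptop', "'s", 'case', 'was', 'great']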


def token_index_alignment(text: str, tokens: List[str]
                          ) -> List[Span]:
    '''
    :param text: Text that has been tokenized
    :param tokens: The tokens that were the output of the text and a
                   tokenizer (the tokenizer has to be character preserving)
    :returns: A list of Spans where each Span contains two ints representing
              the start and end character index of the associated token given
              as an argument.
    '''
    if not is_character_preserving(text, tokens):
        raise ValueError('The tokenization method used is not character'
                         f' preserving. Original text `{text}`\n'
                         f'Tokenized text `{tokens}`')
    token_index_list: List[Span] = []
    char_index = 0
    # Handle whitespace at the start of the text
    if len(text) > char_index:
        while re.search(r'\s', text[char_index]):
            char_index += 1
            if len(text) <= char_index:
                break
    for token_index, token in enumerate(tokens):
        token_start = char_index
        token_end = token_start
        for token_char_index, token_char in enumerate(token):
            char = text[char_index]
            if token_char == char:
                char_index += 1
            else:
                raise ValueError('The tokenised output within the token '
                                 f'should be the same as the text. Token '
                                 f'{token}\nText: {text}\n'
                                 f'Character index {char_index}\n'
                                 f'Token index: {token_index}\nToken char '
                                 f'index {token_char_index}\nTokens {tokens}')
        token_end = char_index
        token_index_list.append(Span(token_start, token_end))
        # Covers whitespace runs of any length between tokens and after
        # the text
        if len(text) > char_index:
            while re.search(r'\s', text[char_index]):
                char_index += 1
                if len(text) <= char_index:
                    break
    if char_index != len(text):
        raise ValueError(f'Did not get to the end of the text: {text}\n'
                         f'Character index {char_index}\n'
                         f'Token index list {token_index_list}')
    return token_index_list
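
# A minimal usage sketch (not part of the original module): the returned
# Spans index into the original string, so slicing the text with a Span
# recovers the matching token even with leading or repeated whitespace
# (assuming Span is a NamedTuple of start and end ints):
#
#     >>> text = ' great  laptop'
#     >>> token_index_alignment(text, whitespace()(text))
#     [Span(start=1, end=6), Span(start=8, end=14)]
#     >>> text[1:6], text[8:14]
#     ('great', 'laptop')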