'''
This module contains a set of functions that return POS tagger functions,
each of which can be described by the following type:
Callable[[str], Tuple[List[str], List[str]]].
None of the functions take positional arguments, but they can take keyword
arguments.
Each returned tagger takes a String, performs tokenisation and POS tagging
in one pass, and returns two Lists of Strings, where the first List
contains the tokens and the second the POS tags.
Functions:
1. stanford -- Can return either UPOS or XPOS tags, with UPOS the default.
Stanford Neural Network POS tagger, with models available for different
languages and treebanks.
2. spacy_tagger -- Can return either UPOS or XPOS tags, with UPOS the
default. spaCy Neural Network POS tagger, with models available for
different languages.
'''
from typing import List, Callable, Optional, Tuple
from pathlib import Path
import stanfordnlp
from stanfordnlp.utils import resources
from target_extraction.taggers_helper import stanford_downloader, spacy_downloader
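
# Editorial illustration of the signature shared by every tagger returned
# from this module. The `PosTagger` alias is an assumption added for
# readability and is not referenced elsewhere in the codebase.
PosTagger = Callable[[str], Tuple[List[str], List[str]]]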
def spacy_tagger(fine: bool = False, spacy_model_name: str = 'en_core_web_sm'
                 ) -> Callable[[str], Tuple[List[str], List[str]]]:
    '''
    spaCy Neural Network POS tagger that returns either UPOS or XPOS tags.
    Choice of two different POS tag sets:
    1. UPOS - Universal POS tags, coarse grained POS tags.
    2. XPOS - Target language fine grained POS tags.
    The XPOS tag set for English is, we believe, the Penn Treebank tag set.
    If the whitespace between two words is longer than a single space, the
    spaCy tokeniser emits a whitespace token; these tokens and their tags
    are removed from the output.
    Languages supported:
    https://spacy.io/usage/models
    :param fine: If True then returns XPOS tags, else returns UPOS tags.
    :param spacy_model_name: Name of the spaCy model e.g. en_core_web_sm
    :returns: A callable that takes a String and returns the tokens and
              associated POS tags for that String.
    '''
    spacy_model = spacy_downloader(spacy_model_name, pos_tags=True,
                                   parse=False, ner=False)

    def _spacy_tagger(text: str) -> Tuple[List[str], List[str]]:
        # Empty or whitespace-only text produces no tokens or tags.
        if text.strip() == '':
            return [], []
        doc = spacy_model(text)
        pos_tokens = []
        tokens = []
        for token in doc:
            # Drop the whitespace tokens that spaCy emits for runs of
            # whitespace, along with their tags.
            if token.is_space:
                continue
            if fine:
                pos_tokens.append(token.tag_)
            else:
                pos_tokens.append(token.pos_)
            tokens.append(token.text)
        return tokens, pos_tokens
    return _spacy_tagger
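
# Example usage of `spacy_tagger` (a minimal sketch; assumes the
# `en_core_web_sm` model has been downloaded, e.g. via
# `python -m spacy download en_core_web_sm`):
#
#     tagger = spacy_tagger(fine=False)
#     tokens, tags = tagger('The laptop case was great')
#     # tokens -> ['The', 'laptop', 'case', 'was', 'great']
#     # tags   -> UPOS tags, e.g. ['DET', 'NOUN', 'NOUN', 'AUX', 'ADJ']
#     # (exact tags depend on the spaCy model and version)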
def stanford(fine: bool = False, lang: str = 'en',
             treebank: Optional[str] = None,
             download: bool = False
             ) -> Callable[[str], Tuple[List[str], List[str]]]:
    '''
    Stanford Neural Network (NN) tagger that uses a highway BiLSTM taking
    as input: 1. Word2Vec and FastText embeddings, 2. a trainable word
    vector, and 3. a uni-directional LSTM over character embeddings. The
    predicted UPOS tag is used as a feature for predicting the XPOS tag
    within the NN.
    Choice of two different POS tag sets:
    1. UPOS - Universal POS tags, coarse grained POS tags.
    2. XPOS - Target language fine grained POS tags.
    The XPOS tag set for English is, we believe, the Penn Treebank tag set.
    ASSUMPTION: The returned callable treats all text given to it as one
    sentence. The underlying pipeline performs sentence splitting, but the
    sentence boundaries are ignored and the results are flattened into a
    single list of tokens and a single list of tags.
    Languages supported:
    https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp
    Reference paper:
    https://www.aclweb.org/anthology/K18-2016
    :param fine: If True then returns XPOS tags, else returns UPOS tags.
    :param lang: Language of the Neural Network tokeniser.
    :param treebank: The neural network model to use, based on the treebank
                     it was trained on. If not given, the default treebank
                     for the language will be used. To see which treebank
                     is the default and which treebanks are available for
                     each language, go to:
                     https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp
    :param download: Whether to re-download the model.
    :returns: A callable that takes a String and returns the tokens and
              associated POS tags for that String.
    '''
    full_treebank_name = stanford_downloader(lang, treebank, download)
    nlp = stanfordnlp.Pipeline(lang=lang, processors='tokenize,mwt,pos',
                               treebank=full_treebank_name)

    def _stanford_doc_to_text(text: str) -> Tuple[List[str], List[str]]:
        '''
        Returns the POS tags of every word in every sentence. According to
        the documentation it is also possible to iterate over tokens rather
        than words, but words are what downstream applications consume,
        hence words were chosen over tokens. See here for more details:
        https://stanfordnlp.github.io/stanfordnlp/pipeline.html#accessing-word-information
        '''
        # Empty or whitespace-only text produces no tokens or tags.
        if text.strip() == '':
            return [], []
        doc = nlp(text)
        sentences = doc.sentences
        pos_tokens = []
        tokens = []
        # Sentence boundaries are ignored: all sentences are flattened
        # into one list of tokens and one list of tags.
        for sentence in sentences:
            for word in sentence.words:
                if fine:
                    pos_tokens.append(word.xpos)
                else:
                    pos_tokens.append(word.upos)
                tokens.append(word.text)
        return tokens, pos_tokens
    return _stanford_doc_to_text
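
# Example usage of `stanford` (a minimal sketch; the helper is assumed to
# fetch the default English treebank model on first use, which can take a
# while, and `download=True` forces a re-download):
#
#     tagger = stanford(fine=True, lang='en')
#     tokens, tags = tagger('The laptop case was great')
#     # tags -> Penn Treebank XPOS tags, e.g. ['DT', 'NN', 'NN', 'VBD', 'JJ']
#     # (exact tags depend on the model and treebank used)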