Source code for target_extraction.taggers_helper

'''
This module contains code that will help the following modules:

1. tokenizers
2. pos_taggers

Functions:

1. stanford_downloader - Downloads the specified Stanford NLP Neural Network 
   pipeline.
2. spacy_downloader - This in effect downloads the relevant spacy model and 
   loads the model with the relevant taggers e.g. POS, Parse, and NER taggers 
   for that spacy model, which is language dependent.
'''
from typing import Optional, Dict, Tuple
from pathlib import Path

import spacy
from spacy.cli.download import download as spacy_download
from spacy.language import Language as SpacyModelType
import stanfordnlp
from stanfordnlp.utils import resources

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}

def spacy_downloader(spacy_model_name: str, pos_tags: bool, parse: bool,
                     ner: bool) -> SpacyModelType:
    '''
    This is a copy of the allennlp.common.util.get_spacy_model function.

    This in effect downloads the relevant spacy model and loads the model
    with the relevant taggers e.g. POS, Parse, and NER taggers for that
    spacy model, which is language dependent.

    Spacy can have multiple trained models per language based on size.

    :param spacy_model_name: Name of the Spacy model e.g. en_core_web_sm
    :param pos_tags: Whether or not the returned Spacy model should perform
                     POS tagging.
    :param parse: Whether or not the returned Spacy model should perform
                  Parsing.
    :param ner: Whether or not the returned Spacy model should perform NER.
    :returns: The relevant Spacy model.
    '''
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        # This needs manually updating each time Spacy is updated. Supported
        # languages can be found here: https://spacy.io/usage/models
        supported_codes = ['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt',
                           'xx']
        lang_code = spacy_model_name[:2]
        if lang_code not in supported_codes:
            raise ValueError('Spacy does not support the following language '
                             f'{lang_code}. These languages are supported '
                             f'{supported_codes}')
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy models '{spacy_model_name}' not found. "
                  "Downloading and installing.")
            spacy_download(spacy_model_name)
            from spacy.cli import link
            from spacy.util import get_package_path
            package_path = get_package_path(spacy_model_name)
            link(spacy_model_name, spacy_model_name, model_path=package_path)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
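# A minimal usage sketch (illustrative only, not part of the module's code);
# the 'en_core_web_sm' model name and the sample sentence are assumptions:
#
# >>> nlp = spacy_downloader('en_core_web_sm', pos_tags=True, parse=False,
# ...                        ner=False)
# >>> doc = nlp('The screen is bright')
# >>> [(token.text, token.tag_) for token in doc]  # (token, POS tag) pairs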
def stanford_downloader(lang: str, treebank: Optional[str] = None,
                        download: bool = False) -> str:
    '''
    Downloads the Stanford NLP Neural Network pipelines that can be used
    for the following tagging tasks:

    1. Tokenizing
    2. Multi Word Tokens (MWT)
    3. POS tagging - Universal POS (UPOS) tags and, depending on the
       language, language specific POS tags (XPOS)
    4. Lemmatization
    5. Dependency Parsing

    Each pipeline is trained per language and per treebank, hence why the
    language and treebank are required as arguments. When the treebank is
    not given the default treebank is used. If download is True then the
    pipeline is re-downloaded even if it already exists, which might be
    useful if a new version has become available.

    Languages supported:
    https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp

    Reference paper:
    https://www.aclweb.org/anthology/K18-2016

    :param lang: Language of the Neural Network Pipeline to download.
    :param treebank: The neural network model to use based on the treebank
                     it was trained from. If not given the default treebank
                     will be used. To see which is the default treebank and
                     which treebanks are available for each language go to:
                     https://stanfordnlp.github.io/stanfordnlp/installation_download.html#human-languages-supported-by-stanfordnlp
    :param download: Whether to re-download the model.
    :returns: The treebank's full name, which this method resolves in order
              to find the model's directory.
    :raises ValueError: If the treebank does not exist for the given
                        language, or if there is no pipeline for the given
                        language.
    '''
    if lang not in resources.default_treebanks:
        pipeline_langs = list(resources.default_treebanks.keys())
        raise ValueError(f'There is no pipeline for the language {lang}. '
                         'There are pipelines for the following languages:'
                         f' {pipeline_langs}')
    if treebank is None:
        treebank = resources.default_treebanks[lang]
    else:
        treebank = f'{lang}_{treebank}'
        if treebank not in resources.conll_shorthands:
            raise ValueError(f'The treebank {treebank} does not exist for '
                             f'{lang}. Here is a list of languages and '
                             'treebanks that do exist:\n'
                             f'{resources.conll_shorthands}')
    model_dir_name = f'{treebank}_models'
    model_download_dir = Path(Path.home(), 'stanfordnlp_resources',
                              model_dir_name)
    if download:
        stanfordnlp.download(treebank, force=True)
    elif model_download_dir.exists():
        pass
    else:
        stanfordnlp.download(treebank, force=True)
    return treebank
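# A minimal usage sketch (illustrative only, not part of the module's code);
# the 'en' language code and the 'ewt' treebank are assumptions for the
# example. The resolved name 'en_ewt' follows from f'{lang}_{treebank}':
#
# >>> full_treebank = stanford_downloader('en', treebank='ewt')
# >>> full_treebank
# 'en_ewt'
# >>> pipeline = stanfordnlp.Pipeline(lang='en', treebank=full_treebank)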