Source code for bella.stanford_tools

import json
from pathlib import Path
from typing import Tuple

from filelock import FileLock
from nltk.tree import Tree
from ruamel.yaml import YAML
from stanfordcorenlp import StanfordCoreNLP

BELLA_CONFIG_FP = Path.home().joinpath('.Bella', 'config.yaml')
BELLA_CONFIG_LOCK_FP = Path.home().joinpath('.Bella', 'config.yaml.lock')
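
# On first use, StanfordNlp.get_config() below creates this file with the
# defaults shown here; edit it to point at a different CoreNLP server:
#
#     stanford_core_nlp:
#         hostname: http://localhost
#         port: 9000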

class StanfordNlp(object):
    '''
    Singleton class: every instantiation returns the same shared
    StanfordCoreNLP client.
    '''

    instance = None

    @staticmethod
    def get_config() -> Tuple[str, int]:
        '''
        Reads (creating or updating as needed) ~/.Bella/config.yaml and
        returns the CoreNLP server's hostname and port, falling back to
        the defaults when no value is configured.
        '''
        hostname = 'http://localhost'
        port = 9000
        yaml = YAML()
        config_data = {}
        BELLA_CONFIG_FP.parent.mkdir(parents=True, exist_ok=True)
        lock = FileLock(BELLA_CONFIG_LOCK_FP)
        with lock.acquire(60):
            if BELLA_CONFIG_FP.exists():
                with BELLA_CONFIG_FP.open('r') as config_file:
                    # An empty config file loads as None, so fall back to {}.
                    config_data = yaml.load(config_file) or {}
                if 'stanford_core_nlp' in config_data:
                    stanford_config = config_data['stanford_core_nlp']
                    if 'hostname' in stanford_config:
                        hostname = stanford_config['hostname']
                    if 'port' in stanford_config:
                        port = stanford_config['port']
            # Write the resolved values back so the file always reflects
            # the configuration in use.
            config_data['stanford_core_nlp'] = {}
            config_data['stanford_core_nlp']['hostname'] = hostname
            config_data['stanford_core_nlp']['port'] = port
            with BELLA_CONFIG_FP.open('w') as config_file:
                yaml.dump(config_data, config_file)
        return hostname, port

    def __new__(cls):
        # Lazily create the shared client on first use; subsequent calls
        # return the same StanfordCoreNLP object.
        if StanfordNlp.instance is None:
            hostname, port = cls.get_config()
            StanfordNlp.instance = StanfordCoreNLP(hostname, port)
        return StanfordNlp.instance

    def __getattr__(self, name):
        return getattr(self.instance, name)

    def __setattr__(self, name, value):
        return setattr(self.instance, name, value)
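
# A minimal usage sketch: instantiating StanfordNlp returns the cached
# StanfordCoreNLP client itself (not a StanfordNlp object), so repeated
# calls are cheap and share one connection:
#
#     nlp = StanfordNlp()
#     assert nlp is StanfordNlp()  # same shared client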

def tokenise(text):
    '''
    :param text: The text you want to tokenise
    :type text: String
    :returns: The tokens of the text, in order, across all of its sentences
    :rtype: list
    '''
    stanford_nlp = StanfordNlp()
    output_dict = stanford_nlp.annotate(text,
                                        {'annotators': 'ssplit,tokenize',
                                         'tokenize.language': 'English',
                                         'outputFormat': 'json'})
    # strict=False lets json accept control characters that CoreNLP can
    # emit inside strings.
    output_dict = json.loads(output_dict, strict=False)
    tokens = [token['originalText'] for s in output_dict['sentences']
              for token in s['tokens']]
    return tokens
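
# Usage sketch, assuming a CoreNLP server is reachable at the configured
# hostname and port (http://localhost:9000 by default); the sentence is
# hypothetical and the output indicative:
#
#     tokenise('The laptop case is great.')
#     # ['The', 'laptop', 'case', 'is', 'great', '.']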

def constituency_parse(text):
    '''
    :param text: The text you want to parse
    :type text: String
    :returns: A list of parse trees, each represented by an nltk.tree.Tree. \
    Each parse tree corresponds to one sentence in the text, so a \
    one-sentence text yields a list of length 1.
    :rtype: list
    '''
    if text.strip() == '':
        raise ValueError('There has to be some text to parse. Text given: {}'
                         .format(text))
    stanford_nlp = StanfordNlp()
    output_dict = stanford_nlp.annotate(text,
                                        {'annotators': 'pos,parse',
                                         'tokenize.language': 'English',
                                         'outputFormat': 'json'})
    output_dict = json.loads(output_dict)
    parse_trees = [Tree.fromstring(sent['parse'])
                   for sent in output_dict['sentences']]
    return parse_trees
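
# Usage sketch (hypothetical text; requires a running CoreNLP server). Each
# sentence yields one tree, whose root label CoreNLP emits as 'ROOT':
#
#     trees = constituency_parse('The phone is great. I like it.')
#     len(trees)        # 2
#     trees[0].label()  # 'ROOT'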

def dependency_parse(text, dep_type='basicDependencies'):
    '''
    :param text: The text you want to parse
    :param dep_type: The dependency type to use, either: 'basicDependencies', \
    'enhancedDependencies', or 'enhancedPlusPlusDependencies'. For more \
    details see \
    https://nlp.stanford.edu/~sebschu/pubs/schuster-manning-lrec2016.pdf
    :type text: String
    :type dep_type: String. Default 'basicDependencies'
    :returns: A tuple of (dependency dictionaries, token dictionaries), one \
    entry of each per sentence. A dependency dictionary maps a token index \
    to a (relation, governor index) tuple; a token dictionary maps a token \
    index to that token's data.
    :rtype: tuple
    '''
    if text.strip() == '':
        raise ValueError('There has to be some text to parse. Text given: {}'
                         .format(text))
    stanford_nlp = StanfordNlp()
    output_dict = stanford_nlp.annotate(text,
                                        {'annotators': 'pos,depparse',
                                         'tokenize.language': 'English',
                                         'outputFormat': 'json'})
    sentences = json.loads(output_dict)['sentences']
    tokens_dicts = []
    dep_dicts = []
    for sentence in sentences:
        tokens_dict = {token_data['index']: token_data
                       for token_data in sentence['tokens']}
        dep_dict = {}
        for dep_data in sentence[dep_type]:
            dep_rel = dep_data['dep']
            dep_token_index = dep_data['governor']
            current_token_index = dep_data['dependent']
            dep_dict[current_token_index] = (dep_rel, dep_token_index)
        tokens_dicts.append(tokens_dict)
        dep_dicts.append(dep_dict)
    return dep_dicts, tokens_dicts
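
# Usage sketch (hypothetical text; relation labels are indicative). Governor
# index 0 denotes CoreNLP's artificial ROOT node:
#
#     dep_dicts, token_dicts = dependency_parse('The phone is great.')
#     dep_dicts[0]    # e.g. {4: ('ROOT', 0), 1: ('det', 2), ...}
#     token_dicts[0][4]['originalText']  # 'great'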