Source code for bella.dependency_parsers

Contains functions that perform dependency parsing.
from collections import defaultdict
from pathlib import Path
from typing import List, Tuple

from filelock import FileLock
import networkx as nx
from ruamel.yaml import YAML
from tweebo_parser import API

from bella.dependency_tokens import DependencyToken
from bella import stanford_tools

BELLA_CONFIG_FP = Path.home().joinpath('.Bella', 'config.yaml')
BELLA_CONFIG_LOCK_FP = Path.home().joinpath('.Bella', 'config.yaml.lock')

class TweeboParser(object):
    '''
    Singleton Class instance.

    Creating a ``TweeboParser()`` always returns the same shared
    ``tweebo_parser.API`` object. The object is built on first use from the
    hostname and port returned by :py:meth:`get_config`.
    '''

    # The shared API client; None until the class is first instantiated.
    instance = None

    @staticmethod
    def get_config() -> Tuple[str, int]:
        '''
        Reads the Tweebo parser connection settings from the Bella config
        file and writes the resolved settings back to that file.

        If the config file does not exist, is empty, or lacks a
        ``tweebo_parser`` entry, the defaults defined in this method are
        used. The read and the write are both performed while holding the
        config file lock.

        :returns: The hostname and port of the Tweebo parser server.
        :rtype: tuple
        '''
        hostname = ''
        port = 8000
        yaml = YAML()
        config_data = {}

        BELLA_CONFIG_FP.parent.mkdir(parents=True, exist_ok=True)
        lock = FileLock(BELLA_CONFIG_LOCK_FP)
        # The argument to acquire is the lock timeout.
        with lock.acquire(60):
            if BELLA_CONFIG_FP.exists():
                with BELLA_CONFIG_FP.open('r') as config_file:
                    # An empty YAML document loads as None, fall back to an
                    # empty mapping so the look ups below do not fail.
                    config_data = yaml.load(config_file) or {}

            tweebo_config = config_data.get('tweebo_parser') or {}
            hostname = tweebo_config.get('hostname', hostname)
            port = tweebo_config.get('port', port)

            # Persist the resolved values so the file always contains a
            # complete tweebo_parser section.
            config_data['tweebo_parser'] = {}
            config_data['tweebo_parser']['hostname'] = hostname
            config_data['tweebo_parser']['port'] = port
            with BELLA_CONFIG_FP.open('w') as config_file:
                yaml.dump(config_data, config_file)
        return hostname, port

    def __new__(cls):
        # Only the first call creates the API client, every later call
        # returns that same object.
        if TweeboParser.instance is None:
            hostname, port = cls.get_config()
            TweeboParser.instance = API(hostname=hostname, port=port,
                                        log_errors=True)
        return TweeboParser.instance

    def __getattr__(self, name):
        # Delegate attribute reads to the shared API client.
        return getattr(self.instance, name)

    def __setattr__(self, name, value):
        # Delegate attribute writes to the shared API client.
        return setattr(self.instance, name, value)
def _to_dependencies_tokens(token_dep_sentence: List[Tuple[str, int]]
                            ) -> List[DependencyToken]:
    '''
    Builds one DependencyToken for each token of a dependency parsed text.

    NOTE: The DependencyToken allows easy access to all the dependency
    links for that token.

    :param token_dep_sentence: list of tuples that contain (token, linked \
    index token). A linked index of -1 or -2 means the token is not linked \
    to a head token.
    :type token_dep_sentence: list
    :returns: A list of DependencyToken instances one for each tuple/token.
    :rtype: list
    '''

    # Head index values that mark a token as having no head.
    no_head = {-2, -1}

    def dep_search(dep_index, sentence, dep_info):
        '''
        Recursive function that walks from a token up through its chain of
        head tokens, recording for every head which token sits below it at
        each dependency depth along that chain.

        :param dep_index: The index of the token whose dependencies are \
        being collected
        :param sentence: List of tuples which contain (token, index of \
        head word)
        :param dep_info: default dict whose keys are indexes from the \
        sentence and value is a dict whose keys are dependency depths and \
        value is the sentence index related to that dependency depth.
        :type dep_index: int
        :type sentence: list
        :type dep_info: defaultdict
        :returns: The dep_info argument after it has been filled in for \
        every head on the chain.
        :rtype: defaultdict

        :Example:
        >>> sentence = [('To', -1), ('appear', 0), ('(', -2),
                        ('EMNLP', 1), ('2014', 3)]
        >>> dep_info = defaultdict(lambda: dict())
        >>> dict(dep_search(4, sentence, dep_info))
        {4: {}, 3: {1: 4}, 1: {2: 4, 1: 3}, 0: {3: 4, 2: 3, 1: 1}}
        '''
        head_index = sentence[dep_index][1]
        if head_index in no_head:
            return dep_info
        # Everything below the current token is one level deeper when seen
        # from the head, and the current token itself is at depth 1.
        shifted = {depth + 1: dependent
                   for depth, dependent in dep_info[dep_index].items()}
        shifted[1] = dep_index
        dep_info[head_index] = shifted
        return dep_search(head_index, sentence, dep_info)

    # token index -> dependency depth -> set of token indexes at that depth
    descendants = defaultdict(lambda: defaultdict(set))
    for start in range(len(token_dep_sentence)):
        chain = dep_search(start, token_dep_sentence, defaultdict(dict))
        for head, by_depth in chain.items():
            for depth, dependent in by_depth.items():
                descendants[head][depth].add(dependent)

    # Undirected graph of head links, used to find every token that is
    # connected to a given token.
    graph = nx.Graph()
    for node, (_, head) in enumerate(token_dep_sentence):
        graph.add_node(node)
        if head not in no_head:
            graph.add_edge(node, head)

    # Convert each of the tokens in the sentence into a dependent token
    # using the results from searching through the dependencies
    dep_tokens = []
    for position, (word, _) in enumerate(token_dep_sentence):
        component = {reached
                     for _, successors in nx.bfs_successors(graph, position)
                     for reached in successors}
        component.add(position)

        connected_words = []
        for member in sorted(component):
            text = token_dep_sentence[member][0].strip()
            label = 'CURRENT' if member == position else 'CONNECTED'
            connected_words.append((text, label))

        # Get the tokens relations
        relations = defaultdict(list)
        for depth, dependents in descendants[position].items():
            for dependent in dependents:
                relations[depth].append(token_dep_sentence[dependent][0])
        dep_tokens.append(DependencyToken(word, relations, connected_words))
    return dep_tokens
def tweebo(texts: List[str]) -> List[List[DependencyToken]]:
    '''
    Given a list of Strings will tokenise, pos tag and then dependency parse
    the text using Tweebo, a Tweet specific parser.

    The Tweebo parser cannot handle empty strings therefore a special empty
    string symbol is required.

    If one of the texts is an empty String then an empty list will be
    returned for that index of the returned list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of a list of DependencyToken instances. A list per \
    text in the texts argument.
    :rtype: list
    '''
    tweebo_api = TweeboParser()
    # New lines are replaced with a space before the texts are sent to the
    # parser.
    texts = [text.replace('\n', ' ') for text in texts]
    processed_texts = tweebo_api.parse_conll(texts)
    return [_to_dependencies_tokens(_convert_conll(processed_text))
            for processed_text in processed_texts]
def _convert_conll(conll_data: str) -> List[Tuple[str, int]]: token_dep_indexs = [] for line in conll_data.split('\n'): if not line: continue column_data = line.split('\t') word = column_data[1].strip() # All the word indexs start at 1 not 0. # Need to take into account the previous sentence words dep_token_index = int(column_data[6]) - 1 token_dep_indexs.append((word, dep_token_index)) return token_dep_indexs
def stanford(texts: List[str]) -> List[List[DependencyToken]]:
    '''
    Dependency parses each text using `stanford_tools.dependency_parse` and
    converts the result into DependencyToken instances.

    A text can contain more than one sentence. The tokens of all of its
    sentences are placed into one list and the head indexes are shifted so
    that they index into that list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of a list of DependencyToken instances. A list per \
    text in the texts argument.
    :rtype: list
    '''
    dep_texts = []
    for text in texts:
        dep_dicts, tokens_dicts = stanford_tools.dependency_parse(text)
        token_dep_indexs = []
        # Number of tokens in the sentences of this text processed so far.
        prev_sent_length = 0
        for sentence_index, _ in enumerate(tokens_dicts):
            tokens_dict = tokens_dicts[sentence_index]
            dep_dict = dep_dicts[sentence_index]
            for i in range(1, len(tokens_dict) + 1):
                word = tokens_dict[i]['word']
                # All the word indexes start at 1 not 0.
                # Need to take into account the previous sentence words
                dep_word_index = (dep_dict[i][1] - 1) + prev_sent_length
                # If True then it is the root word
                if dep_word_index + 1 == prev_sent_length:
                    dep_word_index = -1
                token_dep_indexs.append((word, dep_word_index))
            prev_sent_length += len(tokens_dict)
        dep_texts.append(_to_dependencies_tokens(token_dep_indexs))
    return dep_texts