'''
Contains functions that perform depdency parsing.
'''
from collections import defaultdict
from pathlib import Path
from typing import List, Tuple
from filelock import FileLock
import networkx as nx
from ruamel.yaml import YAML
from tweebo_parser import API
from bella.dependency_tokens import DependencyToken
from bella import stanford_tools
# Shared Bella configuration file and the lock file that serialises
# concurrent reads/writes of it (used by TweeboParser.get_config).
BELLA_CONFIG_FP = Path.home().joinpath('.Bella', 'config.yaml')
BELLA_CONFIG_LOCK_FP = Path.home().joinpath('.Bella', 'config.yaml.lock')
class TweeboParser(object):
    '''
    Singleton proxy around :class:`tweebo_parser.API`.

    The first construction reads the Tweebo server hostname/port from the
    shared Bella config file (writing defaults back if absent) and creates
    the API instance; every later construction returns that same instance.
    '''

    # Shared API instance, created lazily on first construction.
    instance = None

    @staticmethod
    def get_config() -> Tuple[str, int]:
        '''
        Read the Tweebo server hostname and port from the Bella config
        file, falling back to the defaults ('0.0.0.0', 8000) when the
        file or the ``tweebo_parser`` section is missing.

        The resolved values are always written back so the config file
        ends up containing a complete ``tweebo_parser`` section. Access
        is guarded by a file lock because other processes may read or
        write the same config file.

        :returns: (hostname, port) of the Tweebo API server.
        '''
        hostname = '0.0.0.0'
        port = 8000
        yaml = YAML()
        config_data = {}
        BELLA_CONFIG_FP.parent.mkdir(parents=True, exist_ok=True)
        lock = FileLock(BELLA_CONFIG_LOCK_FP)
        # Wait up to 60 seconds for exclusive access to the config file.
        with lock.acquire(60):
            if BELLA_CONFIG_FP.exists():
                with BELLA_CONFIG_FP.open('r') as config_file:
                    # BUG FIX: yaml.load returns None for an empty file,
                    # which previously crashed the membership test below.
                    config_data = yaml.load(config_file) or {}
                if 'tweebo_parser' in config_data:
                    tweebo_config = config_data['tweebo_parser']
                    if 'hostname' in tweebo_config:
                        hostname = tweebo_config['hostname']
                    if 'port' in tweebo_config:
                        port = tweebo_config['port']
            # Re-write the section so the defaults are persisted the
            # first time round.
            config_data['tweebo_parser'] = {}
            config_data['tweebo_parser']['hostname'] = hostname
            config_data['tweebo_parser']['port'] = port
            with BELLA_CONFIG_FP.open('w') as config_file:
                yaml.dump(config_data, config_file)
        return hostname, port

    def __new__(cls):
        # NOTE(review): this returns the shared API object itself rather
        # than a TweeboParser wrapper, so the delegating __getattr__ /
        # __setattr__ below are only reached if an actual TweeboParser
        # instance is ever created some other way.
        if TweeboParser.instance is None:
            hostname, port = cls.get_config()
            TweeboParser.instance = API(hostname=hostname, port=port,
                                        log_errors=True)
        return TweeboParser.instance

    def __getattr__(self, name):
        # Delegate attribute reads to the shared API instance.
        return getattr(self.instance, name)

    def __setattr__(self, name, value):
        # Delegate attribute writes to the shared API instance.
        return setattr(self.instance, name, value)
def _to_dependencies_tokens(token_dep_sentence: List[Tuple[str, int]]
                            ) -> List[DependencyToken]:
    '''
    Convert (token, head index) tuples into DependencyToken instances,
    one per token in the sentence.

    NOTE:
    The DependencyToken allows easy access to all the dependency links for
    that token.

    :param token_dep_sentence: list of tuples that contain (token, linked \
    index token). A head index of -1 denotes the root and -2 an \
    unattached token.
    :type token_dep_sentence: list
    :returns: A list of DependencyToken instances one for each tuple/token.
    :rtype: list
    '''
    def dep_search(dep_index, sentence, dep_info):
        '''
        Walk from ``dep_index`` up the chain of head words, recording for
        each head which sentence indexes depend on it and at what
        dependency depth.

        Iterative rewrite of the original tail-recursive search: avoids
        hitting the interpreter recursion limit on very long sentences,
        and terminates (rather than recursing forever) on malformed,
        cyclic head links.

        :param dep_index: The index of the token whose dependencies are \
        being collected.
        :param sentence: List of tuples which contain (token, index of \
        head word).
        :param dep_info: default dict whose keys are indexes from the \
        sentence and value is a dict whose keys are dependency depths and \
        value is the sentence index related to that dependency depth.
        :type dep_index: int
        :type sentence: list
        :type dep_info: defaultdict
        :returns: Default dictionary whose keys are sentence indexes, \
        values are dictionaries whose keys are dependency depths and \
        value is the associated sentence index to that depth.
        :rtype: defaultdict

        :Example:

        >>> sentence = [('To', -1), ('appear', 0), ('(', -2),
                        ('EMNLP', 1), ('2014', 3)]
        >>> dep_info = defaultdict(lambda: dict())
        >>> print(dep_search(4, sentence, dep_info))
        >>> {0 : {1 : 1, 2 : 3, 3 : 4},
             1 : {1 : 3, 2 : 3},
             3 : {1 : 4},
             4 : {}}
        '''
        visited = set()
        while True:
            head_index = sentence[dep_index][1]
            # -1 is the root and -2 an unattached token: chain ends.
            # The visited check only fires on malformed cyclic data.
            if head_index in (-1, -2) or head_index in visited:
                return dep_info
            visited.add(dep_index)
            # Everything already recorded under dep_index sits one level
            # deeper when viewed from its head.
            head_deps = {dep_level + 1: deps
                         for dep_level, deps in dep_info[dep_index].items()}
            head_deps[1] = dep_index
            dep_info[head_index] = head_deps
            dep_index = head_index

    # Aggregate, across a search started at every token, which indexes
    # relate to each token and at which dependency depth.
    dep_results = defaultdict(lambda: defaultdict(set))
    for index, _ in enumerate(token_dep_sentence):
        dep_result = dep_search(index, token_dep_sentence,
                                defaultdict(lambda: dict()))
        for token_index, dependencies in dep_result.items():
            for dep_level, dependent in dependencies.items():
                dep_results[token_index][dep_level].add(dependent)
    # Undirected graph of head links: each token's connected component is
    # the set of words it is transitively linked to.
    dep_graph = nx.Graph()
    for index, (_, related_index) in enumerate(token_dep_sentence):
        dep_graph.add_node(index)
        if related_index not in {-2, -1}:
            dep_graph.add_edge(index, related_index)
    # Convert each of the tokens in the sentence into a dependent token
    # using the results from searching through the dependencies
    dep_tokens = []
    for token_index, (token, _) in enumerate(token_dep_sentence):
        depth_related = dep_results[token_index]
        token_relations = defaultdict(list)
        connected_indexs = set()
        for _, node_relations in nx.bfs_successors(dep_graph, token_index):
            connected_indexs.update(node_relations)
        # A set add is a no-op when already present, so no guard needed.
        connected_indexs.add(token_index)
        connected_words = []
        for connected_index in sorted(connected_indexs):
            connected_word = token_dep_sentence[connected_index][0].strip()
            if connected_index == token_index:
                connected_words.append((connected_word, 'CURRENT'))
            else:
                connected_words.append((connected_word, 'CONNECTED'))
        # Get the tokens relations
        for depth, related_tokens_index in depth_related.items():
            for related_token_index in related_tokens_index:
                related_token = token_dep_sentence[related_token_index][0]
                token_relations[depth].append(related_token)
        dep_tokens.append(DependencyToken(token, token_relations,
                                          connected_words))
    return dep_tokens
def tweebo(texts: List[str]) -> List[DependencyToken]:
    '''
    Given a list of Strings will tokenise, pos tag and then dependency
    parse the text using
    `Tweebo <https://github.com/ikekonglp/TweeboParser>`_, a Tweet
    specific parser.

    The Tweebo parser cannot handle no strings therefore a special empty
    string symbol is required. If one of the texts is an empty String then
    an empty list will be returned for that index of the returned list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of of a list of DependencyToken instances. A list \
    per text in the texts argument.
    :rtype: list
    '''
    tweebo_api = TweeboParser()
    # Strip newlines first — presumably they would break Tweebo's
    # line-oriented input handling (TODO confirm against the API).
    cleaned_texts = [text.replace('\n', ' ') for text in texts]
    conll_sentences = tweebo_api.parse_conll(cleaned_texts)
    return [_to_dependencies_tokens(_convert_conll(conll_sentence))
            for conll_sentence in conll_sentences]
def _convert_conll(conll_data: str) -> List[Tuple[str, int]]:
token_dep_indexs = []
for line in conll_data.split('\n'):
if not line:
continue
column_data = line.split('\t')
word = column_data[1].strip()
# All the word indexs start at 1 not 0.
# Need to take into account the previous sentence words
dep_token_index = int(column_data[6]) - 1
token_dep_indexs.append((word, dep_token_index))
return token_dep_indexs
def stanford(texts: List[str]) -> List[DependencyToken]:
    '''
    Dependency parse each text with the Stanford parser
    (``stanford_tools.dependency_parse``) and convert each result into a
    list of DependencyToken instances via ``_to_dependencies_tokens``.

    :param texts: The texts that are to be parsed.
    :type texts: list
    :returns: A list of a list of DependencyToken instances, one inner \
    list per text in the texts argument.
    :rtype: list
    '''
    dep_texts = []
    for text in texts:
        # One dep dict and one tokens dict per sentence of the text.
        dep_dicts, tokens_dicts = stanford_tools.dependency_parse(text)
        token_dep_indexs = []
        # Per-sentence token indexes are flattened into one sequence
        # spanning the whole text; prev_sent_length is the offset.
        prev_sent_length = 0
        for sentence, _ in enumerate(tokens_dicts):
            tokens_dict = tokens_dicts[sentence]
            dep_dict = dep_dicts[sentence]
            for i in range(1, len(tokens_dict) + 1):
                word = tokens_dict[i]['word']
                # All the word indexs start at 1 not 0.
                # Need to take into account the previous sentence words
                dep_word_index = (dep_dict[i][1] - 1) + prev_sent_length
                # If True then it is the root word (only a head index of
                # 0 can land exactly on prev_sent_length - 1 here).
                if dep_word_index + 1 == prev_sent_length:
                    dep_word_index = -1
                token_dep_indexs.append((word, dep_word_index))
            prev_sent_length += len(tokens_dict)
        dep_texts.append(_to_dependencies_tokens(token_dep_indexs))
    return dep_texts