'''
A set of functions which either produce contexts and related targets based
on syntactic parsing, or normalise targets and text so that the targets can
be identified within the parsed text.
'''
import re
from bella.data_types import Target
from bella.dependency_parsers import stanford
def normalise_target(target, text, sorted_target_spans, renormalise=False,
                     parser=None):
    '''
    Normalises a target word so that it can be found as a single token in
    the text once the text has been dependency parsed.

    Multi-word targets are joined with `_` (or fully concatenated with
    special `#`/`@` handling when `renormalise` is True) and the normalised
    target is substituted into the text at each of the given spans. If the
    normalised target then matches more words in the text than there are
    spans, it is made unique by stripping `@`/`#` characters or by wrapping
    it in `$` (or `<...>` for the stanford parser).

    :param target: the target word(s) to normalise.
    :param text: the text that the target spans index into.
    :param sorted_target_spans: (start, end) character offsets of each
        occurrence of the target, sorted by start offset in reverse order.
    :param renormalise: use the stricter second-pass normalisation.
    :param parser: the dependency parser function the text is destined for;
        the stanford parser requires extra character stripping.
    :type target: String
    :type text: String
    :type sorted_target_spans: list
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: the normalised target, which occurs in the normalised text
        exactly as many times as it has spans.
    :rtype: String
    :raises ValueError: if no unique normalisation could be found.
    '''
    def add_target_to_text(target):
        # Substitute `target` into the text at every span, padded with
        # spaces so it cannot join onto neighbouring words, then collapse
        # the whitespace again.
        target_added_text = text
        for start, end in sorted_target_spans:
            start_text = target_added_text[: start]
            end_text = target_added_text[end:]
            start_text += ' {} '.format(target)
            target_added_text = start_text + end_text
        return ' '.join(target_added_text.split())

    def check_target_unique(target, text, num_target_spans):
        # True when the target matches the text exactly as many times as it
        # has spans i.e. there are no accidental matches on other words.
        escaped_target = re.escape(target)
        num_target_occurences = len(re.findall(escaped_target, text))
        return num_target_occurences == num_target_spans

    norm_target = target.strip()
    norm_text = text
    num_target_spans = len(sorted_target_spans)
    split_target = target.split()
    num_spaces_in_target = len(split_target)
    # Converts words such as `LG Flat Screen` into `LGFlatScreen`
    # (renormalise) or `LG_FlatScreen`. This is done as the parser won't
    # keep a word as a word if it is `LG_Flat_Screen`
    if renormalise:
        norm_target = ''.join(split_target)
        if norm_target[0] == '#' or norm_target[0] == '@':
            # Keep only the leading `#` or `@` and drop any repeated ones
            # later in the word.
            if '#' in norm_target[1:]:
                norm_target = norm_target.replace('#', '')
                norm_target = '#{}'.format(norm_target)
            elif '@' in norm_target[1:]:
                norm_target = norm_target.replace('@', '')
                norm_target = '@{}'.format(norm_target)
        if parser == stanford:
            # presumably the stanford tokeniser splits on `@` and on a
            # trailing `#` -- TODO confirm against the parser
            norm_target = norm_target.replace('@', '')
            if norm_target[-1] == '#':
                norm_target = norm_target.replace('#', '')
    elif num_spaces_in_target > 2:
        first_word = split_target[0]
        joined_rest_words = ''.join(split_target[1:])
        norm_target = '_'.join([first_word, joined_rest_words])
    # Keeps the word as a whole word.
    elif num_spaces_in_target == 2:
        temp_split_target = []
        for word_index, split_word in enumerate(split_target):
            if word_index != 0:
                split_word = split_word.replace('#', '')
            temp_split_target.append(split_word)
        norm_target = '_'.join(temp_split_target)
    # Gets rid of anything that is not a word, @ or #
    norm_target = re.sub(r'[^\w@#]', '', norm_target)
    # Put the normalised targets into the text
    norm_text = add_target_to_text(norm_target)
    # Checks whether a word that is the normalised target word already
    # exists in the text and if so changes it to a word that does not by
    # stripping special characters or putting dollar signs around it.
    if not check_target_unique(norm_target, norm_text, num_target_spans):
        if parser == stanford:
            norm_target = norm_target.replace('@', '')
            norm_target = norm_target.replace('#', '')
        elif '@' in norm_target:
            norm_target = norm_target.replace('@', '')
        else:
            if renormalise and len(split_target) == 1:
                if norm_target[0] == '#':
                    norm_target = '${}$'.format(norm_target[1:])
            else:
                norm_target = '${}$'.format(norm_target)
        norm_text = add_target_to_text(norm_target)
        if not check_target_unique(norm_target, norm_text, num_target_spans):
            # Last resort: wrap the target in characters that cannot occur
            # in a normal token.
            if parser == stanford:
                norm_target = '<{}>'.format(norm_target)
                norm_text = add_target_to_text(norm_target)
                if check_target_unique(norm_target,
                                       norm_text, num_target_spans):
                    return norm_target
            elif '$' not in norm_target:
                norm_target = '${}$'.format(norm_target)
                norm_text = add_target_to_text(norm_target)
                if check_target_unique(norm_target,
                                       norm_text, num_target_spans):
                    return norm_target
            raise ValueError('Normalised word {} occurs in the text more times '
                             'than it spans {}. Text {}'
                             .format(norm_target, num_target_spans, norm_text))
    return norm_target
def target_normalisation(target_dict, renormalise=False, parser=None):
    '''
    Given a target instance it normalises the target by removing whitespaces
    between target words and inserting `_`. Then inserting the normalised word
    into where the target spans appear and adding whitespace around the target
    word incase other words are joined on. Returns the text with the inserted
    normalised words and the normalised target word.

    :param target_dict: target instance.
    :param renormalise: whether to apply the stricter second-pass
        normalisation in `normalise_target`.
    :param parser: the dependency parser function the text is destined for.
    :type target_dict: Target
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: Tuple of two Strings containing. Text with normalised targets and \
    the normalised target
    :rtype: tuple
    '''
    # Spans are processed in reverse start-offset order so that replacing a
    # later span does not shift the character offsets of an earlier one.
    sorted_spans = sorted(target_dict['spans'], key=lambda span: span[0],
                          reverse=True)
    org_text = target_dict['text']
    target = target_dict['target']
    target = normalise_target(target, org_text, sorted_spans, renormalise,
                              parser=parser)
    # Substitute the normalised target into the text at each span, padded
    # with whitespace so it cannot join onto neighbouring words.
    for start_index, end_index in sorted_spans:
        start_text = org_text[: start_index]
        end_text = org_text[end_index:]
        start_text += ' {} '.format(target)
        org_text = start_text + end_text
    # Collapse the extra whitespace introduced by the padding.
    org_text = ' '.join(org_text.split())
    return org_text, target
def normalise_context(target_dicts, lower, renormalise=False, parser=None):
    '''
    Given a list of target dicts and if the text should be lower cased returns
    all of the text and targets within those target dicts as lists where the
    text and targets have been normalised to ensure the targets within the
    text can be identified.

    :param target_dicts: list of dicts
    :param lower: state if the text within the dicts should be lower cased
    :param renormalise: whether to apply the stricter second-pass
        normalisation in `normalise_target`.
    :param parser: the dependency parser function the texts are destined for.
    :type target_dicts: list
    :type lower: bool
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: A tuple of length two which contains a list of normalised texts \
    and a list of normalised targets.
    :rtype: tuple
    '''
    # Normalise the target and text
    all_text = []
    all_norm_targets = []
    for target_dict in target_dicts:
        norm_text, norm_target = target_normalisation(target_dict, renormalise,
                                                      parser=parser)
        if lower:
            norm_text = norm_text.lower()
            norm_target = norm_target.lower()
        all_text.append(norm_text)
        all_norm_targets.append(norm_target)
    return all_text, all_norm_targets
def dependency_relation_context(target_dicts, parser, lower=False,
                                n_relations=(1, 1)):
    '''
    Given a list of target dicts where each target dict has a sentence that
    contains one or more of the same target. Returns a list of a list of Strings
    where each String is associated to a target within the target sentence.
    The String is a concatenation of n_relations depth of dependency relations
    where each relation is a child of the target. e.g. n_relations = (1, 1)
    will return a String of the concatenation of the children of the target
    within the dependency tree. n_relations = (1, 2) will return the children
    of the target and the children of the children.

    :param target_dicts: list of dictionaries where each dictionary is associated \
    to a target sentence.
    :param parser: function that performs dependency parsing
    :param lower: Whether to lower case the text
    :param n_relations: The depth of the dependency relation text from the target \
    to return. Represented as a tuple of two ints the first defining the \
    starting depth the second end depth e.g. (1, 2) will return depths one and \
    two of dependency tree.
    :type target_dicts: list
    :type parser: function
    :type lower: bool. Default False.
    :type n_relations: tuple. Default (1, 1).
    :returns: A list of a list of Strings where each String represents a specific \
    target word within a target sentence dependency related text at n_relations \
    depth.
    :rtype: list
    :raises ValueError: if, even after re-normalisation, the number of
        targets found in the parse does not match the number of spans.
    '''
    # Normalise the target and text
    targets = [target_dict['target'] for target_dict in target_dicts]
    norm_texts, norm_targets = normalise_context(target_dicts, lower,
                                                 parser=parser)
    # Get contexts
    all_dependency_tokens = parser(norm_texts)
    all_contexts = []
    for index, dependency_tokens in enumerate(all_dependency_tokens):
        for attempts in range(1, 3):
            contexts = []
            norm_target = norm_targets[index]
            # This only happens if the first normalisation does not work
            if attempts == 2:
                text, norm_target = normalise_context([target_dicts[index]],
                                                      lower=lower, parser=parser,
                                                      renormalise=True)
                norm_target = norm_target[0]
                dependency_tokens = parser(text)[0]
            for dependency_token in dependency_tokens:
                current_token = targets[index]
                if lower:
                    current_token = current_token.lower()
                if dependency_token.token == norm_target:
                    # Collect the dependency related words and map the
                    # original target surface form back to the normalised one.
                    all_related_words = dependency_token.get_n_relations(n_relations)
                    related_text = ' '.join(all_related_words)
                    related_text = related_text.replace(current_token, norm_target)
                    contexts.append(related_text)
            rel_target = target_dicts[index]
            valid_num_targets = len(rel_target['spans'])
            if valid_num_targets != len(contexts):
                # First failure: retry with the stricter re-normalisation.
                if attempts == 1:
                    continue
                raise ValueError('The number of identified targets `{}` not equal '\
                                 'to the number of targets in the data `{}`'\
                                 .format(contexts, rel_target))
            # Ensure the returned data type is consistent
            if contexts == []:
                raise ValueError('This should not happen as each data type should '\
                                 'have a target {}'.format(rel_target))
            all_contexts.append(contexts)
            break
    return all_contexts
def dependency_context(target_dicts, parser, lower=False):
    '''
    Given a list of target dicts it will normalise the target word to ensure
    that it is seperated and if it is a multi word target join the target words
    together to ensure when it is processed by the dependency parser it is treated
    as a singular word.

    Given a list of target sentences returns a list of contexts where each contexts
    is associated to a target sentence and each contexts contains a target context
    for each target word in the target sentence. A target context is a dict which
    contains `text` and `span` keys where the values correspond to all the
    dependency related words as a String and the span are the indexs to the target
    word within the text.

    :param target_dicts: list of dictionaries where each dictionary is associated \
    to a target sentence.
    :param parser: function that performs dependency parsing
    :param lower: Whether to lower case the texts before processing them with the \
    parser.
    :type target_dicts: list
    :type parser: function
    :type lower: bool Default False
    :returns: A list of a list of dicts where each list is contains many contexts.
    :rtype: list
    :raises ValueError: if, even after re-normalisation, the number of
        targets found in the parse does not match the number of spans.
    '''
    # Normalise the target and text
    targets = [target_dict['target'] for target_dict in target_dicts]
    norm_texts, norm_targets = normalise_context(target_dicts, lower,
                                                 parser=parser)
    # Get contexts
    all_dependency_tokens = parser(norm_texts)
    all_contexts = []
    for index, dependency_tokens in enumerate(all_dependency_tokens):
        for attempts in range(1, 3):
            contexts = []
            norm_target = norm_targets[index]
            # This only happens if the first normalisation does not work
            if attempts == 2:
                text, norm_target = normalise_context([target_dicts[index]],
                                                      lower=lower, parser=parser,
                                                      renormalise=True)
                norm_target = norm_target[0]
                dependency_tokens = parser(text)[0]
            for dependency_token in dependency_tokens:
                current_target = targets[index]
                if lower:
                    current_target = current_target.lower()
                if dependency_token.token == norm_target:
                    # Pass the (normalised, original) pair so that the
                    # connected text can be de-normalised to the original
                    # target surface form.
                    norm_org = (norm_target, current_target)
                    text_span = dependency_token\
                                .connected_target_span(renormalise=norm_org)
                    connected_text, target_span = text_span
                    contexts.append({'text' : connected_text,
                                     'span' : target_span})
            rel_target = target_dicts[index]
            valid_num_targets = len(rel_target['spans'])
            if valid_num_targets != len(contexts):
                # First failure: retry with the stricter re-normalisation.
                if attempts == 1:
                    continue
                raise ValueError('The number of identified targets `{}` not equal '\
                                 'to the number of targets in the data `{}`'\
                                 ' norm target {}'\
                                 .format(contexts, rel_target, norm_target))
            # Ensure the returned data type is consistent
            if contexts == []:
                raise ValueError('This should not happen as each data type should '\
                                 'have a target {}'.format(rel_target))
            all_contexts.append(contexts)
            break
    return all_contexts
def context(all_context_dicts, specific_context, inc_target=False):
    '''
    Returns a list of a list of Strings based on the location of the target word
    in the text within the target dict (NOTE the target word can occur more than
    once hence why a list is returned as the context is returned for each
    occurence). Context can be one of the following:

    1. left - left of the target occurence.
    2. right - right of the target occurence.
    3. target - target word/words of each target occurence.
    4. full - whole text repeated for each occurence.

    If the target only occur once in the text then for that text occurence the
    length of the list will be one.

    :param all_context_dicts: A list of dicts where each dict contains text and \
    span keys.
    :param specific_context: String specifying the context e.g. left.
    :param inc_target: Whether to include the target word in the context text. \
    (Only applies for left and right context.)
    :type all_context_dicts: list
    :type specific_context: String
    :type inc_target: Boolean Default False
    :returns: A list of of a list of context strings
    :rtype: list
    :raises ValueError: if specific_context is not one of the acceptable
        context names.
    '''
    acceptable_contexts = {'left', 'right', 'target', 'full'}
    if specific_context not in acceptable_contexts:
        # BUG FIX: the message previously formatted the `context` function
        # object itself instead of the offending `specific_context` value.
        raise ValueError('context parameter can only be one of the following {}'\
                         ' not {}'.format(acceptable_contexts, specific_context))
    all_contexts = []
    for context_dicts in all_context_dicts:
        contexts = []
        for context_dict in context_dicts:
            text = context_dict['text']
            target_span = context_dict['span']
            start_char = target_span[0]
            end_char = target_span[1]
            if specific_context == 'left':
                if inc_target:
                    contexts.append(text[:end_char])
                else:
                    contexts.append(text[:start_char])
            elif specific_context == 'right':
                if inc_target:
                    contexts.append(text[start_char:])
                else:
                    contexts.append(text[end_char:])
            elif specific_context == 'target':
                contexts.append(text[start_char:end_char])
            elif specific_context == 'full':
                contexts.append(text)
            else:
                raise ValueError('context parameter should only be `right` or '\
                                 '`left` not {} there must be a logic error'\
                                 .format(specific_context))
        all_contexts.append(contexts)
    return all_contexts