'''
Module that contains the two main data types
`target_extraction.data_types.TargetText` and
`target_extraction.data_types.TargetTextCollection` where the latter is a
container for the former.
classes:
1. `target_extraction.data_types.TargetText`
2. `target_extraction.data_types.TargetTextCollection`
'''
from collections.abc import MutableMapping
from collections import OrderedDict, Counter, defaultdict, deque
import copy
import json
import itertools
import functools
from pathlib import Path
from typing import Optional, List, Tuple, Iterable, NamedTuple, Any, Callable
from typing import Union, Dict, Set
import traceback
import random
from target_extraction.tokenizers import is_character_preserving, token_index_alignment
from target_extraction.data_types_util import (Span, OverLappingTargetsError,
AnonymisedError, OverwriteError)
def check_anonymised(func):
'''
Assumes the first argument of the given function is a TargetText object,
i.e. `self`.
:raises AnonymisedError: If the `anonymised` attribute of the TargetText
object given to `func` is True.
'''
@functools.wraps(func)
def wrapper_func(*args, **kwargs):
target_text_object = args[0]
if target_text_object.anonymised:
anonymised_err = (f'Cannot perform this function as the Target '
f'{target_text_object} has been anonymised '
'and therefore has no `text`')
raise AnonymisedError(anonymised_err)
return func(*args, **kwargs)
return wrapper_func
class TargetText(MutableMapping):
'''
This is a data structure that inherits from MutableMapping which is
essentially a python dictionary.
The following are the default keys that are in all `TargetText`
objects, additional items can be added through __setitem__
1. text - The text associated to all of the other items
2. text_id -- The unique ID associated to this object
3. targets -- List of all target words that occur in the text. A special
placeholder of None (python None value) can exist where the
target does not exist but a related Category does; this would
mean though that the related span is Span(0, 0). This type of
special placeholder is in place for the SemEval 2016 Restaurant
dataset where they link the categories to the targets but
not all categories have related targets, thus None.
4. spans -- List of Span NamedTuples where each one specifies the start and
end of the respective targets within the text.
5. target_sentiments -- List specifying the sentiment of the respective
targets within the text.
6. categories -- List of categories that exist in the data which may or
may not link to the targets (this is dataset specific). NOTE:
depending on the dataset and how it is parsed the category can exist
but the target does not, as the category is a latent variable; in
these cases the category and category sentiments will be the same size,
which could be a different size to the target and target sentiments
size. E.g. this can happen where the dataset has targets and categories
but they do not map to each other in a one to one manner e.g. the
SemEval 2014 restaurant dataset, where some samples contain
categories but no targets. Another word for category can be aspect.
7. category_sentiments -- List of the sentiments associated to the
categories. If the categories and targets map to each other then
this will be empty and you will only use the target_sentiments.
Attributes:
1. anonymised -- If True then the data within the TargetText object has
no text but the rest of the metadata should exist.
Methods:
1. to_json -- Returns the object as a dictionary and then encoded using
json.dumps
2. to_conll -- Returns a CONLL formatted string where the format will be the
following: `TOKEN#GOLD LABEL#PREDICTION 1# PREDICTION 2`. Where each
token and relevant labels are on separate new lines. The first line will
always contain the following: `# {text_id: `value`}` where the `text_id`
represents the `text_id` of this TargetText; this will allow the CONLL
string to be uniquely identified back to this TargetText object.
3. from_conll -- Adds the gold labels and/or predicted sequence labels from
the CONLL formatted string.
4. tokenize -- This will add a new key `tokenized_text` to this TargetText
instance that will store the tokens of the text that is associated to
this TargetText instance.
5. pos_text -- This will add a new key `pos_tags` to this TargetText
instance. This key will store the pos tags of the text that is
associated to this Target Text instance.
6. force_targets -- Does not return anything but modifies the `spans` and
`text` values as whitespace is prefixed and suffixed to the target unless
the prefix or suffix is already whitespace. NOTE that this is the only method
that currently can change the `spans` and `text` key values after they
have been set.
7. sequence_labels -- Adds the `sequence_labels` key to this TargetText
instance which can be used to train a machine learning algorithm to
detect targets.
8. get_sequence_indexs -- The indexes related to the tokens, pos tags etc
for each labelled sequence span.
9. get_sequence_spans -- The span indexes from the sequence labels given
assuming that the sequence labels are in BIO format.
10. get_targets_from_sequence_labels -- Retrieves the target words given the
sequence labels.
11. one_sample_per_span -- This returns a similar TargetText instance
where the new instance will only contain one target per span.
12. left_right_target_contexts -- This will return the sentence that is
left and right of the target as well as the words in the target for
each target in the sentence.
13. replace_target -- Given an index and a new target word it will replace
the target at the index with the new target word and return a new
TargetText object with everything the same apart from this new target.
14. de_anonymise -- This will set the `anonymised` attribute to False
from True and set the `text` key value to the value in the `text`
key within the `text_dict` argument.
15. in_order -- True if all the `targets` within this TargetText
are in sequential left to right order within the text.
16. re_order -- Re-Orders the TargetText object targets so that they are in
a left to right order within the text, this will then re-order all
values within this object that are in a list format into this order.
Once the TargetText has been re-ordered it will return True when
:py:meth:`target_extraction.data_types.TargetText.in_order` is called.
17. add_unique_key -- Given a key e.g. `targets` it will create a new value
in the TargetText object that is a list of strings which are unique IDs
based on the `text_id` and the index the `targets` occur in e.g.
if the `targets` contain [`food`, `service`] and the `text_id` is
`12a5` then the `target_id` created will contain `[`12a5::0`, `12a5::1`]`
Static Functions:
1. from_json -- Returns a TargetText object given a json string. For
example the json string can be the return of TargetText.to_json.
2. targets_from_spans -- Given a sequence of spans and the associated text
it will return the targets that are within the text based on the spans
3. target_text_from_prediction -- Creates a TargetText object from data
that has come from predictions of a Target Extract tagger
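:Example: An illustrative sketch of constructing a TargetText object
(all values here are made up for demonstration):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> example = TargetText(text='The food was great', text_id='1',
...                      targets=['food'], spans=[Span(4, 8)],
...                      target_sentiments=['positive'])
>>> example['targets']
['food']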
'''
def _check_is_list(self, item: List[Any], item_name: str) -> None:
'''
This will check that the argument given is a List and if not will raise
a TypeError.
:param item: The argument that is going to be checked to ensure it is a
list.
:param item_name: Name of the item. This is used within the raised
error message, if an error is raised.
:raises TypeError: If the item given is not of type List.
'''
type_err = f'{item_name} should be a list not {type(item)} {item}'
if not isinstance(item, list):
raise TypeError(type_err)
def sanitize(self) -> None:
'''
This performs a check on all of the lists that can be given at
object construction time to ensure that the following conditions are
met:
1. The target, spans and target_sentiments lists are all of the same
size if set.
2. The categories and the category_sentiments lists are all of the
same size if set.
Furthermore it checks the following:
1. If targets or spans are set then both have to exist.
2. If targets and spans are set then the span text must match the
associated target words e.g. if the target is `barry davies` in
`today barry davies went` then the spans should be [[6,18]]
3. If anonymised, ensures that the `text` key does not exist.
The 2nd check is not performed if `self.anonymised` is True, as there
is no text to check the spans against.
:raises ValueError: If any of the above conditions are not True.
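:Example: As an illustration of the 2nd check (values are made up):
a TargetText with text `today barry davies went`,
targets=[`barry davies`], and spans=[Span(0, 5)] would fail
`sanitize` with a ValueError, as text[0:5] is `today` and not
`barry davies`.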
'''
def length_mis_match(lists_to_check: List[Any],
text_id_msg: str) -> None:
length_mismatch_msg = 'The following lists do not match '\
f'{lists_to_check}'
list_lengths = [len(_list) for _list in lists_to_check
if _list is not None]
current_list_size = -1
for list_length in list_lengths:
if current_list_size == -1:
current_list_size = list_length
else:
if current_list_size != list_length:
raise ValueError(text_id_msg + length_mismatch_msg)
targets = self._storage['targets']
target_sentiments = self._storage['target_sentiments']
spans = self._storage['spans']
categories = self._storage['categories']
category_sentiments = self._storage['category_sentiments']
text_id = self._storage['text_id']
text_id_msg = f'Text id that this error refers to {text_id}\n'
# Checking the length mismatches for the two different lists
length_mis_match([targets, target_sentiments, spans], text_id_msg)
length_mis_match([categories, category_sentiments], text_id_msg)
# Checking that if targets are set then so are spans
if targets is not None and spans is None:
spans_none_msg = f'If the targets are a list: {targets} then spans'\
f' should also be a list and not None: {spans}'
raise ValueError(text_id_msg + spans_none_msg)
# Checking that the words Spans reference in the text match the
# respective target words. Edge case is the case of None targets which
# should have a Span value of (0, 0)
if targets is not None:
for target, span in zip(targets, spans):
if target is None:
target_span_msg = 'As the target value is None the span '\
'it refers to should be of value '\
f'Span(0, 0) and not {span}'
if span != Span(0, 0):
raise ValueError(text_id_msg + target_span_msg)
else:
if span == Span(0, 0) and target != '':
target_span_msg = (f'The Span is {Span(0, 0)} and the '
f'target is {target} therefore the '
'span must be incorrect for this'
f' target {self}.')
raise ValueError(target_span_msg)
# Cannot check the text value when the data has been anonymised
if self.anonymised:
continue
text = self._storage['text']
start, end = span.start, span.end
text_target = text[start:end]
target_span_msg = 'The target the spans reference in the '\
f'text: {text_target} does not match '\
f'the target in the targets list: {target}'
if text_target != target:
raise ValueError(text_id_msg + target_span_msg)
if self.anonymised and 'text' in self._storage:
raise ValueError('The TargetText object is anonymised and therefore'
f' should not contain a `text` key. {self}')
def __init__(self, text: Union[str, None], text_id: str,
targets: Optional[List[str]] = None,
spans: Optional[List[Span]] = None,
target_sentiments: Optional[List[Union[int, str]]] = None,
categories: Optional[List[str]] = None,
category_sentiments: Optional[List[Union[int, str]]] = None,
anonymised: bool = False,
**additional_data):
'''
:param additional_data: Any other data that is to be added to the
object at construction.
'''
# Ensure that the arguments that should be lists are lists.
self._list_argument_names = ['targets', 'spans', 'target_sentiments',
'categories', 'category_sentiments']
self._list_arguments = [targets, spans, target_sentiments, categories,
category_sentiments]
names_arguments = zip(self._list_argument_names, self._list_arguments)
for argument_name, list_argument in names_arguments:
if list_argument is None:
continue
self._check_is_list(list_argument, argument_name)
# anonymised data will have no text
temp_dict = dict(text=text, text_id=text_id, targets=targets,
spans=spans, target_sentiments=target_sentiments,
categories=categories,
category_sentiments=category_sentiments)
if anonymised:
del temp_dict['text']
self._protected_keys = set(['text_id', 'targets', 'spans'])
else:
self._protected_keys = set(['text', 'text_id', 'targets', 'spans'])
self._storage = temp_dict
self._storage = {**self._storage, **additional_data}
self._anonymised = anonymised
self.sanitize()
@property
def anonymised(self) -> bool:
'''
:returns: True if the data within the TargetText has been anonymised.
Anonymised data means that there is no text associated with
the TargetText object but all of the metadata is there.
'''
return self._anonymised
@anonymised.setter
def anonymised(self, value: bool) -> None:
'''
Sets whether or not the `anonymised` attribute is True or False. Either
way, when set it performs the `sanitize` check to ensure that
the attribute can be set to this value, else it is reverted.
:param value: If True then the `text` key will be deleted. In all
cases the TargetText object is subjected to the
:py:meth:`sanitize` to ensure that the anonymised
process is correct.
:raises AnonymisedError: If the TargetText object cannot be set to the
`anonymised` value given. If this Error occurs
then the object will have kept the original
`anonymised` value.
'''
# If anonymising, remove the text as anonymised data has no text
if not self.anonymised and value:
del self._storage['text']
self._anonymised = value
try:
self.sanitize()
except Exception:
self._anonymised = not value
sanitize_err = traceback.format_exc()
raise AnonymisedError(f'Cannot set the `anonymised` attribute to '
f'{value} for this TargetText {self} as it '
'cannot pass the `sanitize` check, which '
f'raised the following error: {sanitize_err}')
def __getitem__(self, key: str) -> Any:
'''
:returns: One of the values from the self._storage dictionary. e.g.
if the key is `text` it will return the string representing
the text associated to this object.
'''
return self._storage[key]
def __iter__(self) -> Iterable[str]:
'''
Returns an iterator over the keys in self._storage, which are the
following strings by default (additional keys can be added):
1. text
2. text_id
3. targets
4. spans
5. target_sentiments
6. categories
7. category_sentiments
:returns: The keys in self._storage
'''
return iter(self._storage)
def __len__(self) -> int:
'''
:returns: The number of items in self._storage.
'''
return len(self._storage)
def __repr__(self) -> str:
'''
:returns: The string a user sees when the instance is printed or
displayed within an interpreter.
'''
return f'TargetText({self._storage})'
def __eq__(self, other: 'TargetText') -> bool:
'''
Two TargetText instances are equal if they both have the same `text_id`
value.
:param other: Another TargetText object that is being compared to this
TargetText object.
:returns: True if they have the same `text_id` value else False.
'''
if not isinstance(other, TargetText):
return False
elif self['text_id'] != other['text_id']:
return False
return True
def __delitem__(self, key: str) -> None:
'''
Given a key that matches a key within self._storage or self.keys()
it will delete that key and value from this object.
NOTE: Currently 'text', 'text_id', 'spans', and 'targets' are keys
that cannot be deleted.
:param key: Key and its respective value to delete from this object.
'''
if key in self._protected_keys:
raise KeyError('Cannot delete a key that is protected, list of '
f' protected keys: {self._protected_keys}')
del self._storage[key]
def __setitem__(self, key: str, value: Any) -> None:
'''
Given a key and a respective value it will either change the current
key's value to the one given here or create a new key with that value.
NOTE: Currently 'text', 'text_id', 'spans', and 'targets' are keys
that cannot be changed.
:param key: Key to be added or changed
:param value: Value associated to the given key.
'''
if key in self._protected_keys:
raise KeyError('Cannot change a key that is protected, list of '
f' protected keys: {self._protected_keys}')
# If the key value should be a list ensure that the new value is a
# list as well.
if key in self._list_argument_names:
self._check_is_list(value, key)
self._storage[key] = value
self.sanitize()
def to_json(self) -> str:
'''
Required as TargetText is not json serializable due to the 'spans'.
:returns: The object as a dictionary and then encoded using json.dumps
'''
return json.dumps(self._storage)
@check_anonymised
def to_conll(self, gold_label_key: str,
prediction_key: Optional[str] = None) -> str:
'''
:param gold_label_key: A key that contains a sequence of labels e.g.
[`B`, `I`, `O`]. This can come from the return
of the :py:meth:`sequence_labels`
:param prediction_key: Key to the predicted labels of the `gold_label_key`.
The prediction key's value is a list of
lists of predicted labels. Each inner list is
therefore a different model run, hence creating
`PREDICTION 1`, `PREDICTION 2`, etc. Thus the
value of `prediction_key` must be of shape
(number runs, number tokens)
:returns: A CONLL formatted string where the format will be the
following: `TOKEN#GOLD LABEL#PREDICTION 1# PREDICTION 2`
Where each token and relevant labels are on separate new
lines. The first line will always contain the following:
`# {text_id: `value`}` where the text_id represents the
`text_id` of this TargetText, this will allow the CONLL
string to be uniquely identified back to this TargetText
object.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises KeyError: If the object has not been tokenized using
:py:meth:`tokenize`
:raises KeyError: If the `prediction_key` or `gold_label_key` do not
exist.
:raises ValueError: If the `gold_label_key` or `prediction_key` values
are not of the same length as the tokens, as the
labels will not be able to match tokens etc.
:raises ValueError: If the values in `prediction_key` are not of shape
(number runs, number tokens)
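:Example: An illustrative sketch (uses `str.split` as a stand-in
whitespace tokenizer):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='great food', text_id='1',
...                targets=['food'], spans=[Span(6, 10)])
>>> t.tokenize(str.split)
>>> t.sequence_labels()
>>> print(t.to_conll('sequence_labels'))
# {"text_id": "1"}
great O
food B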
'''
keys_to_check = ['tokenized_text', gold_label_key]
if prediction_key is not None:
keys_to_check.append(prediction_key)
for key in keys_to_check:
self._key_error(key)
number_tokens = len(self['tokenized_text'])
# ensure number of labels same as number of tokens
value_err = (f'Number of tokens {number_tokens} does not match the '
f'number of labels ')
number_gold_labels = len(self[gold_label_key])
if number_tokens != number_gold_labels:
gold_err = (f'{number_gold_labels} for gold label '
f'{gold_label_key} in {self}')
gold_err = f'{value_err} {gold_err}'
raise ValueError(gold_err)
if prediction_key is not None:
for prediction_labels in self[prediction_key]:
if not isinstance(prediction_labels, list):
pred_list_shape_error = ('The predictions should be a list '
'of a list of labels of shape '
'(number runs, number tokens) not '
f'{self[prediction_key]}')
raise ValueError(pred_list_shape_error)
number_labels = len(prediction_labels)
pred_err = (f'{number_labels} for prediction '
f'label {prediction_key} in {self}')
pred_err = f'{value_err} {pred_err}'
if number_tokens != number_labels:
raise ValueError(pred_err)
# End of checks now creating the CONLL string
text_id_str = json.dumps({'text_id': self['text_id']})
gold_labels = self[gold_label_key]
conll_string = f'# {text_id_str}'
for token_index, token in enumerate(self['tokenized_text']):
gold_label = gold_labels[token_index]
token_string = f'{token} {gold_label} '
if prediction_key is not None:
for prediction_labels in self[prediction_key]:
prediction_label = prediction_labels[token_index]
token_string += f'{prediction_label} '
token_string = token_string.strip(' ')
token_string = f'\n{token_string}'
conll_string += token_string
return conll_string
@check_anonymised
def from_conll(self, conll_str: str, tokens_key: str = 'tokenized_text',
gold_label_key: Optional[str] = None,
prediction_key: Optional[str] = None) -> None:
'''
:param conll_str: CONLL formatted string formatted like so:
`TOKEN#GOLD LABEL#PREDICTION 1# PREDICTION 2`
:param tokens_key: Key to save the CONLL tokens to.
:param gold_label_key: Key to save the gold labels to. At least one of
`gold_label_key` and `prediction_key` must not be
`None`; both can be given.
:param prediction_key: Key to save the prediction labels to. The value
will be of shape (number runs, number tokens)
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises ValueError: If both `gold_label_key` and `prediction_key` are
`None`.
:raises ValueError: If the number of labels are not consistent in the
CONLL string e.g. the first token has 3 predicted
labels and the second token has 2 predicted labels.
:raises ValueError: If the text within this TargetText does not match
the tokens in the CONLL string. (CASE SENSITIVE)
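:Example: An illustrative sketch (assumes the `# {text_id: ...}`
comment line has already been stripped from the CONLL string):
>>> from target_extraction.data_types import TargetText
>>> t = TargetText(text='great food', text_id='1')
>>> t.from_conll('great O\nfood B', gold_label_key='gold_labels')
>>> t['gold_labels']
['O', 'B']
>>> t['tokenized_text']
['great', 'food']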
'''
if prediction_key is None and gold_label_key is None:
raise ValueError('Either `prediction_key` or `gold_label_key` or '
'both need to be a String not None')
# Predicted labels is of shape (number runs, number tokens)
predicted_labels: List[List[str]] = []
gold_labels = []
tokens = []
conll_token_labels = conll_str.split('\n')
conll_string_length = 0
for index, conll_token_label in enumerate(conll_token_labels):
token_labels = conll_token_label.split(' ')
number_token_labels = len(token_labels)
# Ensure the number of labels per line is the same throughout the CONLL string
if index == 0:
conll_string_length = number_token_labels
else:
if conll_string_length != number_token_labels:
raise ValueError('Number of labels are not consistent. '
f'Index {index}. CONLL String: '
f'{conll_token_labels}\n Self {self}')
if number_token_labels < 2:
raise ValueError('CONLL String does not contain any labels '
f'{conll_token_labels}')
tokens.append(token_labels[0])
gold_labels.append(token_labels[1])
if number_token_labels < 3:
continue
predicted_values = token_labels[2:]
for pred_index, predicted_label in enumerate(predicted_values):
if index == 0:
predicted_labels.append([])
predicted_labels[pred_index].append(predicted_label)
# Ensure that the tokens match the text
text = self['text']
if not is_character_preserving(text, tokens):
raise ValueError(f'The tokens {tokens} do not match the text {text}'
f' for {self}')
self[tokens_key] = tokens
if gold_label_key is not None:
self[gold_label_key] = gold_labels
if prediction_key is not None:
self[prediction_key] = predicted_labels
def _shift_spans(self, target_span: Span, prefix: bool,
suffix: bool) -> None:
'''
This only affects the current state of the TargetText attributes.
The only attribute this affects is the `spans` attribute.
NOTE: This is only used within the self.force_targets method.
:param target_span: The span of the target that is having extra
whitespace added around it (prefix and/or suffix).
:param prefix: Whether it affects the prefix of the target_span
:param suffix: Whether it affects the suffix of the target_span
'''
target_span_start = target_span.start
target_span_end = target_span.end
for span_index, other_target_span in enumerate(self['spans']):
if other_target_span == target_span:
continue
start, end = self['spans'][span_index]
if prefix:
if other_target_span.start >= target_span_start:
start += 1
if other_target_span.end >= target_span_start:
end += 1
if suffix:
if other_target_span.start >= target_span_end:
start += 1
if other_target_span.end >= target_span_end:
end += 1
self._storage['spans'][span_index] = Span(start, end)
@check_anonymised
def force_targets(self) -> None:
'''
:NOTE: As this affects the following attributes `spans`, `text`, and `targets`
it therefore has to modify these through self._storage as these
attributes are within self._protected_keys.
Does not return anything but modifies the `spans` and `text` values
as whitespace is prefixed and suffixed to the target unless the prefix
or suffix is already whitespace.
Motivation:
Ensure that the target tokens are not within another separate String
e.g. target = `priced` but the sentence is `the laptop;priced is high`.
If the tokenizer splits on whitespace it will not have `priced` separated,
therefore the BIO tagging is not deterministic, thus force will add
whitespace around the target word e.g. `the laptop; priced`. This was
mainly added for the TargetText.sequence_labels method.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
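:Example: An illustrative sketch:
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='the laptop;priced is high', text_id='1',
...                targets=['priced'], spans=[Span(11, 17)])
>>> t.force_targets()
>>> t['text']
'the laptop; priced is high'
>>> t['spans']
[Span(start=12, end=18)]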
'''
for span_index in range(len(self['spans'])):
text = self._storage['text']
last_token_index = len(text)
span = self._storage['spans'][span_index]
prefix = False
suffix = False
start, end = span
if start != 0:
if text[start - 1] != ' ':
prefix = True
if end < last_token_index:
if text[end] != ' ':
suffix = True
text_before = text[:start]
text_after = text[end:]
target = text[start:end]
if prefix and suffix:
self._storage['text'] = f'{text_before} {target} {text_after}'
self._shift_spans(span, prefix=True, suffix=True)
self._storage['spans'][span_index] = Span(start + 1, end + 1)
elif prefix:
self._storage['text'] = f'{text_before} {target}{text_after}'
self._shift_spans(span, prefix=True, suffix=False)
self._storage['spans'][span_index] = Span(start + 1, end + 1)
elif suffix:
self._storage['text'] = f'{text_before}{target} {text_after}'
self._shift_spans(span, prefix=False, suffix=True)
# Get the targets from the re-aligned spans
updated_targets = []
text = self._storage['text']
for span in self._storage['spans']:
target = text[span.start: span.end]
updated_targets.append(target)
self._storage['targets'] = updated_targets
@check_anonymised
def tokenize(self, tokenizer: Callable[[str], List[str]],
perform_type_checks: bool = False) -> None:
'''
This will add a new key `tokenized_text` to this TargetText instance
that will store the tokens of the text that is associated to this
TargetText instance.
For a set of tokenizers that are definitely compatible see the
target_extraction.tokenizers module.
Ensures that the tokenization is character preserving.
:param tokenizer: The tokenizer to use to tokenize the text of this
TargetText instance
:param perform_type_checks: Whether or not to perform type checks
to ensure the tokenizer returns a List of
Strings
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises TypeError: If the tokenizer given does not return a List of
Strings.
:raises ValueError: This is raised if the TargetText instance contains
empty text.
:raises ValueError: If the tokenization is not character preserving.
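:Example: An illustrative sketch (uses `str.split` as a stand-in
whitespace tokenizer; see target_extraction.tokenizers for the
tokenizers this was designed around):
>>> from target_extraction.data_types import TargetText
>>> t = TargetText(text='The laptop case was great', text_id='1')
>>> t.tokenize(str.split)
>>> t['tokenized_text']
['The', 'laptop', 'case', 'was', 'great']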
'''
text = self['text']
tokenized_text = tokenizer(text)
if perform_type_checks:
if not isinstance(tokenized_text, list):
raise TypeError('The return type of the tokenizer function ',
f'{tokenizer} should be a list and not '
f'{type(tokenized_text)}')
for token in tokenized_text:
if not isinstance(token, str):
raise TypeError('The return type of the tokenizer function ',
f'{tokenizer} should be a list of Strings'
f' and not a list of {type(token)}')
if len(tokenized_text) == 0:
raise ValueError('There are no tokens for this TargetText '
f'instance {self}')
if not is_character_preserving(text, tokenized_text):
raise ValueError('The tokenization method used is not character'
f' preserving. Original text `{text}`\n'
f'Tokenized text `{tokenized_text}`')
self['tokenized_text'] = tokenized_text
@check_anonymised
def pos_text(self, tagger: Callable[[str], Tuple[List[str], List[str]]],
perform_type_checks: bool = False) -> None:
'''
This will add a new key `pos_tags` to this TargetText instance.
This key will store the pos tags of the text that is associated to
this Target Text instance. NOTE: It will also replace the current
tokens in the `tokenized_text` key with the tokens produced
from the pos tagger.
For a set of pos taggers that are definitely compatible see the
target_extraction.pos_taggers module. The pos tagger will have to
produce both a list of tokens and pos tags.
:param tagger: POS tagger.
:param perform_type_checks: Whether or not to perform type checks
to ensure the POS tagger returns a
tuple containing two lists both containing
Strings.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises TypeError: If the POS tagger given does not return a Tuple
:raises TypeError: If the POS tagger given does not return a List of
Strings for both the tokens and the pos tags.
:raises TypeError: If the POS tagger tokens or pos tags are not lists
:raises ValueError: If the POS tagger return is not a tuple of length
2
:raises ValueError: This is raised if the Target Text text is empty
:raises ValueError: If the number of pos tags for this instance
does not have the same number of tokens that has
been generated by the tokenizer function.
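:Example: An illustrative sketch using a toy whitespace tagger as a
stand-in for the taggers in target_extraction.pos_taggers (the
`NOUN` tags are made up):
>>> from target_extraction.data_types import TargetText
>>> toy_tagger = lambda text: (text.split(),
...                            ['NOUN'] * len(text.split()))
>>> t = TargetText(text='great laptop', text_id='1')
>>> t.pos_text(toy_tagger)
>>> t['pos_tags']
['NOUN', 'NOUN']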
'''
text = self['text']
tokens_pos_tags = tagger(text)
if perform_type_checks:
if not isinstance(tokens_pos_tags, tuple):
raise TypeError('The return type for the pos tagger should be'
f' a tuple not {type(tokens_pos_tags)}')
if len(tokens_pos_tags) != 2:
raise ValueError('The return of the POS tagger should be a '
f'tuple of length 2 not {len(tokens_pos_tags)}')
if not isinstance(tokens_pos_tags[0], list):
raise TypeError('The return type of the tagger function ',
f'{tagger} should be a list and not '
f'{type(tokens_pos_tags[0])} for the tokens')
if not isinstance(tokens_pos_tags[1], list):
raise TypeError('The return type of the tagger function ',
f'{tagger} should be a list and not '
f'{type(tokens_pos_tags[1])} for the POS tags')
for name, tags in [('tokens', tokens_pos_tags[0]),
('pos_tags', tokens_pos_tags[1])]:
for tag in tags:
if not isinstance(tag, str):
raise TypeError('The return type of the tagger function ',
f'{tagger} should be a list of Strings'
f' and not a list of {type(tag)} for '
f'the {name}')
tokens, pos_tags = tokens_pos_tags
num_pos_tags = len(pos_tags)
if len(pos_tags) == 0:
raise ValueError('There are no tags for this TargetText '
f'instance {self}')
num_tokens = len(tokens)
if num_tokens != num_pos_tags:
raise ValueError(f'Number of POS tags {pos_tags} should be the '
f'same as the number of tokens {tokens}')
self['pos_tags'] = pos_tags
self['tokenized_text'] = tokens
@check_anonymised
def sequence_labels(self, per_target: bool = False,
label_key: Optional[str] = None) -> None:
'''
Adds the `sequence_labels` key to this TargetText instance which can
be used to train a machine learning algorithm to detect targets. The
value associated to the `sequence_labels` key will be a list of
`B`, `I`, or `O` labels, where each label is associated to a token.
The `force_targets` method might come in useful here for training
and validation data to ensure that fewer of the targets are
affected by tokenization error, as only tokens that are fully within
the target span are labelled with `B` or `I` tags. Another use for
`force_targets` is to ensure that targets are not affected by
tokenisation and therefore can be used to state where the targets are
in the sequence for sentiment classification e.g. in the case of
getting contextualised target tokens or to create [TD-BERT
Gao et al. 2019](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8864964).
Currently the only sequence labels supported are IOB-2 labels for the
targets only. Future plans are to look into different sequence label
schemes e.g. IOB; see the link below for more details of the difference
between the two schemes (of which there are more again).
https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
:param per_target: Whether the value associated to the
`sequence_labels` key should be one list for all
of the targets (False), or if True a list
of labels per target where the labels will only
be associated to the represented target.
:param label_key: Optional label key. The key represents a list of
values that are associated with each token. These
values are then the class labels to attach to
each `B`, `I`, `O` tag. E.g. if the label key is
`target_sentiments` it would make the `B`, `I`, `O`
task one of joint target extraction and
sentiment prediction.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises KeyError: If the current TargetText has not been tokenized. Or
if `label_key` is not None then `label_key` must be
a key in self else KeyError.
:raises ValueError: If `label_key` not None. Raises if number of labels
does not match the number of targets that the labels
should be associated too.
:raises ValueError: If two targets overlap the same token(s) e.g
`Laptop cover was great` if `Laptop` and
`Laptop cover` are two separate targets this should
raise a ValueError as a token should only be
associated to one target.
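:Example: An illustrative sketch (uses `str.split` as a stand-in
whitespace tokenizer):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The laptop case was great', text_id='1',
...                targets=['laptop case'], spans=[Span(4, 15)])
>>> t.tokenize(str.split)
>>> t.sequence_labels()
>>> t['sequence_labels']
['O', 'B', 'I', 'O', 'O']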
'''
text = self['text']
if 'tokenized_text' not in self:
raise KeyError(f'Expect the current TargetText {self} to have '
'been tokenized using the self.tokenize method.')
self.sanitize()
tokens = self['tokenized_text']
sequence_labels = ['O' for _ in range(len(tokens))]
if per_target:
sequence_labels = [sequence_labels]
# This is the case where there are no targets thus all sequence labels
# are `O`
if self['spans'] is None or self['targets'] is None:
self['sequence_labels'] = sequence_labels
return
if per_target:
sequence_labels = []
for _ in self['targets']:
sequence_labels.append(['O' for _ in range(len(tokens))])
# Setting up the labels that might be part of the sequence labels
target_spans: List[Span] = self['spans']
labels = None
if label_key is not None:
self._key_error(label_key)
labels = self[label_key]
number_targets = len(target_spans)
number_labels = len(labels)
if number_labels != number_targets:
raise ValueError(f'The number of labels {number_labels} does '
f'not match the number of targets {number_targets}.'
f' Labels {labels}, target spans {target_spans}.'
f' For {self}')
tokens_index = token_index_alignment(text, tokens)
for target_index, target_span in enumerate(target_spans):
target_span_range = list(range(*target_span))
same_target = False
current_sequence_labels = sequence_labels
if per_target:
current_sequence_labels = sequence_labels[target_index]
for sequence_index, token_index in enumerate(tokens_index):
token_start, token_end = token_index
token_end = token_end - 1
if (token_start in target_span_range and
token_end in target_span_range):
if current_sequence_labels[sequence_index] != 'O':
err_msg = ('Cannot have two sequence labels for one '
f'token, text {text}\ntokens {tokens}\n'
f'token indexs {tokens_index}\nTarget '
f'spans {target_spans}')
raise ValueError(err_msg)
if same_target:
current_sequence_labels[sequence_index] = 'I'
if label_key is not None:
label = labels[target_index]
current_sequence_labels[sequence_index] = f'I-{label}'
else:
current_sequence_labels[sequence_index] = 'B'
if label_key is not None:
label = labels[target_index]
current_sequence_labels[sequence_index] = f'B-{label}'
same_target = True
self['sequence_labels'] = sequence_labels
def _key_error(self, key: str) -> None:
'''
:param key: The key to check for within this TargetText instance.
:raises KeyError: If the key given does not exist within this
TargetText instance.
'''
if key not in self:
raise KeyError(f'Requires that this TargetText contains {key} for '
f'instance {self}')
@check_anonymised
def get_sequence_indexs(self, sequence_key: str) -> List[List[int]]:
'''
The following sequence label tags are supported: IOB-2. These are the
tags that are currently generated by `sequence_labels`.
:param sequence_key: Key to sequence labels such as a BIO sequence
labels. Example key name would be `sequence_labels`
after `sequence_labels` function has been called
or more appropriately `predicted_sequence_labels`
when you have predicted sequence labels.
:returns: A list of lists of integers where each list of integers
represents the token/pos tag/sequence label indexes of each
sequence label span.
:Example: These sequence labels [`O`, `B`, `I`, `O`, `B`]
would return the following integers list [[1, 2], [4]]
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises ValueError: If the sequence labels that are contained in the
sequence key value contain values other than
`B`, `I`, or `O`.
:raises ValueError: If the number of tokens in the current TargetText
object is not the same as the number of sequence
labels.
'''
# number of tokens and sequence labels, should
# all be the same, it is if the `sequence_labels` function is used
tokens = self['tokenized_text']
sequence_labels = self[sequence_key]
if len(tokens) != len(sequence_labels):
raise ValueError(f'The number of tokens in the TargetText object {self}'
f' is not the same as the number of sequence labels')
same_target = False
start_index = 0
end_index = 0
sequence_indexs: List[List[int]] = []
for label_index, sequence_label in enumerate(sequence_labels):
if sequence_label == 'B':
if same_target:
sequence_index = list(range(start_index, end_index))
sequence_indexs.append(sequence_index)
same_target = False
start_index = 0
end_index = 0
same_target = True
start_index = label_index
end_index = label_index + 1
elif sequence_label == 'I':
end_index = label_index + 1
elif sequence_label == 'O':
if same_target:
sequence_index = list(range(start_index, end_index))
sequence_indexs.append(sequence_index)
same_target = False
start_index = 0
end_index = 0
else:
raise ValueError('Sequence labels should be `B` `I` or `O` '
f'and not {sequence_label}. Sequence label '
f'key used {sequence_key}\nTargetText {self}')
if end_index != 0:
sequence_index = list(range(start_index, end_index))
sequence_indexs.append(sequence_index)
return sequence_indexs
@check_anonymised
def get_sequence_spans(self, sequence_key: str,
confidence: Optional[float] = None) -> List[Span]:
'''
The following sequence label tags are supported: IOB-2. These are the
tags that are currently generated by `sequence_labels`
:param sequence_key: Key to sequence labels such as a BIO sequence
labels. Example key name would be `sequence_labels`
after `sequence_labels` function has been called
or more appropriately `predicted_sequence_labels`
when you have predicted sequence labels.
:param confidence: Optional argument that will return only spans
that have been predicted with a confidence
higher than this.
:NOTE: As it is BIO labelling, if all but one of
the B and I labels in a span are greater than
the threshold, that span would still not be
returned, as one of the words in the
multi-word target is less than the threshold.
:returns: The span indexes from the sequence labels given, assuming that
the sequence labels are in BIO format.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises KeyError: If no `confidence` key is found. However `confidence`
is only required if the confidence argument is set.
:raises ValueError: If the sequence labels that are contained in the
sequence key value contain values other than
`B`, `I`, or `O`.
:raises ValueError: If the confidence value is not between 0 and 1
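:Example: An illustrative sketch (labels produced by
:py:meth:`sequence_labels`; `str.split` is a stand-in tokenizer):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The laptop case was great', text_id='1',
...                targets=['laptop case'], spans=[Span(4, 15)])
>>> t.tokenize(str.split)
>>> t.sequence_labels()
>>> t.get_sequence_spans('sequence_labels')
[Span(start=4, end=15)]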
'''
# number of tokens, sequence labels, and token text indexs should
# all be the same, it is if the `sequence_labels` function is used
if confidence is not None:
self._key_error('confidence')
if confidence > 1.0 or confidence < 0.0:
raise ValueError('Confidence value has to be bounded between '
f'0 and 1 and not {confidence}')
sequence_indexs: List[List[int]] = self.get_sequence_indexs(sequence_key)
if not sequence_indexs:
return []
tokens = self['tokenized_text']
token_text_indexs = token_index_alignment(self['text'], tokens)
sequence_spans: List[Span] = []
confidences = None
if confidence is not None:
confidences = self['confidence']
for span_sequence_index in sequence_indexs:
# Test that each sequence label was predicted with enough confidence
if confidence is not None:
next_span = False
for index in span_sequence_index:
if confidences[index] <= confidence:
next_span = True
if next_span:
continue
start_index = span_sequence_index[0]
start_span = token_text_indexs[start_index][0]
end_index = span_sequence_index[-1]
end_span = token_text_indexs[end_index][1]
sequence_spans.append(Span(start_span, end_span))
return sequence_spans
@check_anonymised
def get_targets_from_sequence_labels(self, sequence_key: str,
confidence: Optional[float] = None
) -> List[str]:
'''
This function's main use is when the sequence labels have been
predicted on a piece of text that has no gold annotations.
:param sequence_key: Key to sequence labels such as a BIO sequence
labels. Example key name would be `sequence_labels`
after `sequence_labels` function has been called
or more appropriately `predicted_sequence_labels`
when you have predicted sequence labels.
:param confidence: Optional argument that will return only target
texts that have been predicted with a confidence
higher than this.
:NOTE: As it is BIO labelling, if all but one of
the B and I labels are greater than
the threshold, that target word would still
not be returned, as one of the words in the
multi-word target is less than the threshold.
:returns: The target texts that the sequence labels have predicted.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:raises KeyError: If no `tokenized_text` or `confidence` key are found.
However `confidence` is only required if the
confidence argument is set.
:raises ValueError: If the confidence value is not between 0 and 1
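:Example: An illustrative sketch (`str.split` is a stand-in tokenizer):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The laptop case was great', text_id='1',
...                targets=['laptop case'], spans=[Span(4, 15)])
>>> t.tokenize(str.split)
>>> t.sequence_labels()
>>> t.get_targets_from_sequence_labels('sequence_labels')
['laptop case']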
'''
if confidence is not None:
self._key_error('confidence')
if confidence > 1.0 or confidence < 0.0:
raise ValueError('Confidence value has to be bounded between '
f'0 and 1 and not {confidence}')
self._key_error('tokenized_text')
sequence_indexs: List[List[int]] = self.get_sequence_indexs(sequence_key)
# No targets to extract
if not sequence_indexs:
return []
tokens = self['tokenized_text']
confidences = None
if confidence is not None:
confidences = self['confidence']
targets = []
for span_sequence_index in sequence_indexs:
start_index = span_sequence_index[0]
end_index = span_sequence_index[-1] + 1
target_tokens = tokens[start_index: end_index]
# Test that each token in target tokens was predicted with a
# great enough confidence
if confidence is not None:
next_span = False
for index in span_sequence_index:
if confidences[index] <= confidence:
next_span = True
if next_span:
continue
target = ' '.join(target_tokens)
targets.append(target)
return targets
@check_anonymised
def one_sample_per_span(self, remove_empty: bool = False) -> 'TargetText':
'''
This returns a similar TargetText instance where the new instance
will only contain one target per span.
This is for the cases where you can have a target e.g. `food` that has
a different related category attached to it e.g.
TargetText(text=`$8 and there is much nicer, food, all of it great and
continually refilled.`, text_id=`1`,
targets=[`food`, `food`, `food`],
categories=[`style`, `quality`, `price`],
target_sentiments=[`pos`,`pos`,`pos`],
spans=[Span(27, 31),Span(27, 31),Span(27, 31)])
As we can see the targets and the categories are linked, this is only
really the case in SemEval 2016 datasets from what I know currently.
In the example case above it will transform it to the following:
TargetText(text=`$8 and there is much nicer, food, all of it great and
continually refilled.`, text_id=`1`,
targets=[`food`],spans=[Span(27,31)])
This type of pre-processing is perfect for the Target Extraction
task.
:param remove_empty: If the TargetText instance contains any None
targets then these will be removed along with
their respective Spans.
:returns: This returns a similar TargetText instance where the new
instance will only contain one target per span.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
'''
text = self['text']
text_id = self['text_id']
targets: List[str] = []
spans: List[Span] = []
if self['spans'] is None:
return TargetText(text=text, text_id=text_id)
current_spans = self['spans']
unique_spans = set(current_spans)
spans = sorted(unique_spans, key=lambda x: x[0])
temp_spans: List[Span] = []
for span in spans:
targets_text = text[span.start: span.end]
if span.start == 0 and span.end == 0 and remove_empty:
continue
else:
temp_spans.append(span)
targets.append(targets_text)
spans = temp_spans
return TargetText(text=text, text_id=text_id,
targets=targets, spans=spans)
@check_anonymised
def left_right_target_contexts(self, incl_target: bool
) -> List[Tuple[List[str], List[str], List[str]]]:
'''
:param incl_target: Whether or not the left and right sentences should
also include the target word.
:returns: The sentence that is left and right of the target as well as
the words in the target for each target in the sentence.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
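:Example: An illustrative sketch:
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The food was great', text_id='1',
...                targets=['food'], spans=[Span(4, 8)])
>>> t.left_right_target_contexts(incl_target=False)
[('The ', ' was great', 'food')]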
'''
left_right_target_list = []
text = self['text']
if self['spans'] is not None:
for span in self['spans']:
span: Span
span_start = span.start
span_end = span.end
if incl_target:
left_context = text[:span_end]
right_context = text[span_start:]
else:
left_context = text[:span_start]
right_context = text[span_end:]
target_context = text[span_start:span_end]
contexts = (left_context, right_context, target_context)
left_right_target_list.append(contexts)
return left_right_target_list
@check_anonymised
def replace_target(self, target_index: int, replacement_target_word: str
) -> 'TargetText':
'''
:param target_index: The target index of the target word to replace
:param replacement_target_word: The target word to replace the target
word at the given index
:returns: Given the target index and replacement target word it will
replace the target at the index with the new target word and
return a new TargetText object with everything the same apart
from this new target.
:raises ValueError: If the target_index is less than 0 or an index
number that does not exist.
:raises OverLappingTargetsError: If the target to replace is contained
within another target e.g.
`what a great day` if this has two
targets `great` and `great day` then
it will raise this error if you
replace either word as each is
within the other.
:raises AnonymisedError: If the object has been anonymised then this
method cannot be used.
:Example: Given TargetText(text=`The laptop was great`, text_id=`1`,
targets=[`laptop`], spans=[Span(4, 10)]), calling
`replace_target(0, 'computer')` would return a new
TargetText(text=`The computer was great`, text_id=`1`,
targets=[`computer`], spans=[Span(4, 12)])
'''
self_dict = copy.deepcopy(dict(self))
number_targets = len(self_dict['targets'])
if target_index < 0 or target_index >= number_targets:
raise ValueError('Not a valid target_index number. Number of targets '
f'in the current object: {number_targets}')
# Change the target word
targets = self_dict['targets']
target_to_be_replaced = targets[target_index]
targets[target_index] = replacement_target_word
# Change the target spans
spans = self_dict['spans']
span_to_change = spans[target_index]
spans_to_change: List[int] = []
for span_index, span in enumerate(spans):
if span_index == target_index:
continue
span: Span
# Check that there are no overlapping targets
raise_in_target_error = False
if span.start >= span_to_change.start and span.start < span_to_change.end:
raise_in_target_error = True
elif span.end > span_to_change.start and span.end <= span_to_change.end:
raise_in_target_error = True
if raise_in_target_error:
raise OverLappingTargetsError('There are targets that share '
f'the same context {self}')
if span.start >= span_to_change.end:
spans_to_change.append(span_index)
difference_in_length = len(replacement_target_word) - len(target_to_be_replaced)
# Change all of the spans
for span_index in spans_to_change:
span = spans[span_index]
new_start = span.start + difference_in_length
new_end = span.end + difference_in_length
spans[span_index] = Span(new_start, new_end)
# Change the target that is being replaced span by only the end
new_end = span_to_change.end + difference_in_length
spans[target_index] = Span(span_to_change.start, new_end)
# Change the text
text = self_dict['text']
span_to_change_start = span_to_change.start
span_to_change_end = span_to_change.end
start_text = text[:span_to_change_start]
end_text = text[span_to_change_end:]
text = f'{start_text}{replacement_target_word}{end_text}'
self_dict['targets'] = targets
self_dict['spans'] = spans
self_dict['text'] = text
return TargetText(**self_dict)
def de_anonymise(self, text_dict: Dict[str, str]) -> None:
'''
This will set the `anonymised` attribute to False from True and
set the `text` key value to the value in the `text` key within the
`text_dict` argument.
:param text_dict: A dictionary that contain the following two keys:
1. `text` and 2. `text_id` where the `text_id` has
to match the current TargetText object `text_id` and
the `text` value will become the new value in the
`text` key for this TargetText object.
:raises ValueError: If the TargetText object `text_id` does not match
the `text_id` within `text_dict` argument.
:raises AnonymisedError: If the `text` given does not pass the
:py:meth:`sanitize` test.
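:Example: An illustrative sketch:
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The food was great', text_id='1',
...                targets=['food'], spans=[Span(4, 8)],
...                anonymised=True)
>>> 'text' in t
False
>>> t.de_anonymise({'text': 'The food was great', 'text_id': '1'})
>>> t['text']
'The food was great'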
'''
current_text_id = self['text_id']
other_text_id = text_dict['text_id']
if current_text_id != other_text_id:
raise ValueError(f"The current `text_id` {current_text_id} "
"does not match that of the argument's `text_id`"
f" {other_text_id}. For TargetText {self}")
text = text_dict['text']
self._storage['text'] = text
try:
self.anonymised = False
except AnonymisedError:
del self._storage['text']
sanitize_err = traceback.format_exc()
raise AnonymisedError('Cannot de-anonymise this TargetText '
f'{self} as it cannot pass the `sanitize`'
' check of which the following is the '
f'error from said check {sanitize_err}')
def in_order(self) -> bool:
'''
:returns: True if all the `targets` within this TargetText
are in sequential left to right order within the text.
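:Example: An illustrative sketch (targets given right to left):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The food was great but the staff were rude',
...                text_id='1', targets=['staff', 'food'],
...                spans=[Span(27, 32), Span(4, 8)])
>>> t.in_order()
False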
'''
spans = self['spans']
ordered_spans = sorted(spans)
if ordered_spans != spans:
return False
return True
def re_order(self, keys_not_to_order: Optional[List[str]] = None) -> None:
'''
Re-orders the TargetText object so that the targets are in a left to
right order within the text, this will then re-order all values within
this object that are in a list format into this order. Once the
TargetText has been re-ordered it will return True when
:py:meth:`target_extraction.data_types.TargetText.in_order` is called.
:param keys_not_to_order: Any key values not to re-order using this
function e.g. `pos_tags`, `tokenized_text`,
etc
:raises AssertionError: If running :py:meth:`target_extraction.data_types.TargetText.in_order`
after being re-ordered does not return True.
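:Example: An illustrative sketch (continuing the out-of-order example
from :py:meth:`in_order`):
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The food was great but the staff were rude',
...                text_id='1', targets=['staff', 'food'],
...                spans=[Span(27, 32), Span(4, 8)])
>>> t.re_order()
>>> t['targets']
['food', 'staff']
>>> t.in_order()
True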
'''
def sorting_by_index(index_order: List[int],
value_to_sort: List[Any]) -> List[Any]:
sorted_value = []
for index in index_order:
sorted_value.append(value_to_sort[index])
return sorted_value
if keys_not_to_order is None:
keys_not_to_order = []
spans: List[Span] = self['spans']
index_order = sorted(range(len(spans)), key=lambda k: spans[k],
reverse=False)
new_key_values = {}
for key, value in self._storage.items():
try:
if isinstance(value, list) and key not in keys_not_to_order:
# Edge case where the list can be just an empty list
if not value:
continue
# Need to check if the first instance of the value is a
# list and if so then that needs to be sorted and not the
# outer list
sorted_value = []
if isinstance(value[0], list):
for inner_value in value:
sorted_inner_value = sorting_by_index(index_order,
inner_value)
sorted_value.append(sorted_inner_value)
else:
sorted_value = sorting_by_index(index_order, value)
assert sorted_value
new_key_values[key] = sorted_value
except Exception:
real_err = traceback.format_exc()
err_msg = (f'The following error {real_err} has occurred on the '
f'following key {key} and value {value} for this '
f'TargetText {self}')
raise Exception(err_msg)
# Covers the rollback problem
for key, value in new_key_values.items():
self._storage[key] = value
self.sanitize()
assert self.in_order(), (f'After re-ordering the object is '
f'still not in order: {self}')
def add_unique_key(self, id_key: str, id_key_name: str,
id_delimiter: str = '::') -> None:
'''
:param id_key: The name of the key within this TargetText that requires
unique ids that will be stored in `id_key_name`.
:param id_key_name: The name of the key to associate to these new
unique ids.
:param id_delimiter: The delimiter to separate the `text_id` and the
index of the `id_key` that is being represented
by this unique id.
:raises KeyError: If the `id_key_name` already exists within the
TargetText.
:raises TypeError: If the value of `id_key` is not of type List.
:Example: self.add_unique_key(`targets`, `targets_id`) where
`targets`=[`food`, `service`] and `text_id`=`12a5` the
following key will be added to self `targets_id` with the
following value = `[`12a5::0`, `12a5::1`]`
'''
self._key_error(id_key)
text_id = self['text_id']
if id_key_name in self:
raise KeyError(f'The new id_key_name {id_key_name} '
f'already exists within {self}')
if not isinstance(self[id_key], list):
raise TypeError(f'The value of `id_key` {self[id_key]} in {self} '
f'has to be of type List and not {type(self[id_key])}')
new_ids = []
for index in range(len(self[id_key])):
new_ids.append(f'{text_id}{id_delimiter}{index}')
self[id_key_name] = new_ids
@staticmethod
def from_json(json_text: str, anonymised: bool = False) -> 'TargetText':
'''
This is required as the 'spans' are Span objects which are not json
serializable and are required for TargetText, therefore this handles
that special case.
This function is also required as we have had to avoid using the
__setitem__ function and add objects via the _storage dictionary
underneath so that we could add values to this object that are not
within the constructor like `tokenized_text`. To ensure that it is
compatible with the TargetText concept we call the `TargetText.sanitize`
method at the end.
:param json_text: JSON representation of TargetText
(can be from TargetText.to_json)
:param anonymised: Whether or not the TargetText object being loaded
is an anonymised version.
:returns: A TargetText object
:raises KeyError: If within the JSON representation there is no
`text_id` key. Or if anonymised is False raises a
KeyError if there is no `text` key in the JSON
representation.
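:Example: An illustrative sketch of a round trip:
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> t = TargetText(text='The food was great', text_id='1',
...                targets=['food'], spans=[Span(4, 8)])
>>> TargetText.from_json(t.to_json()) == t
True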
'''
json_target_text = json.loads(json_text)
text = None
if 'text_id' not in json_target_text:
raise KeyError('The JSON text given does not contain a '
f'`text_id` field: {json_target_text}')
if not anonymised:
if 'text' not in json_target_text:
raise KeyError('The JSON text given does not contain a `text` '
f'field: {json_target_text}')
text = json_target_text['text']
target_text = TargetText(text=text, anonymised=anonymised,
text_id=json_target_text['text_id'])
for key, value in json_target_text.items():
if key == 'text' or key == 'text_id':
continue
if key == 'spans':
if value is None:
target_text._storage[key] = None
else:
all_spans = []
for span in value:
all_spans.append(Span(*span))
target_text._storage[key] = all_spans
else:
target_text._storage[key] = value
target_text.sanitize()
return target_text
@staticmethod
def targets_from_spans(text: str, spans: List[Span]) -> List[str]:
'''
:param text: The text that the spans are associated to.
:param spans: A list of Span values that represent the character index
of the target words to be returned.
:returns: The target words that are associated to the spans and text
given.
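:Example: An illustrative sketch:
>>> from target_extraction.data_types import TargetText
>>> from target_extraction.data_types_util import Span
>>> TargetText.targets_from_spans('The food was great', [Span(4, 8)])
['food']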
'''
targets = []
if not spans:
return targets
for span in spans:
target = text[span.start: span.end]
targets.append(target)
return targets
@staticmethod
def target_text_from_prediction(text: str, text_id: str,
sequence_labels: List[str],
tokenized_text: List[str],
confidence: Optional[float] = None,
confidences: Optional[List[float]] = None,
**additional_data) -> 'TargetText':
'''
Creates a TargetText object from data that has come from predictions
of a Target Extract tagger e.g. the dictionaries that are returned
from :meth:`target_extraction.allen.allennlp_model.predict_sequences`
:param text: Text to give to the TargetText object
:param text_id: Text ID to give to the TargetText object
:param sequence_labels: The predicted sequence labels
:param tokenized_text: The tokens that were used to produce the
predicted sequence labels (should be returned
by the Target Extract tagger predictor).
:param confidence: The level of confidence from the tagger that is
required for a target to be a target e.g. 0.9
:param confidences: The list of confidence values produced
by the Target Extract tagger predictor to be used
with the confidence argument. The list of confidence
values should be the same size as the sequence labels
list and tokenized text.
:param additional_data: Any other keyword arguments to provide to the
TargetText object
:returns: A TargetText object with spans and targets values
:raises ValueError: If sequence labels, tokenized text and confidences
are not of the same length
:raises ValueError: If the following keys are in the additional data:
1. confidence, 2. text, 3. text_id, 4. tokenized_text,
5. sequence_labels, 6. targets, 7. spans. As these
keys will be populated within the TargetText
object automatically.
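:Example: An illustrative sketch (the labels and tokens are made up;
they would normally come from a trained tagger):
>>> from target_extraction.data_types import TargetText
>>> t = TargetText.target_text_from_prediction(
...     text='The food was great', text_id='1',
...     sequence_labels=['O', 'B', 'O', 'O'],
...     tokenized_text=['The', 'food', 'was', 'great'])
>>> t['targets'], t['spans']
(['food'], [Span(start=4, end=8)])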
'''
if len(sequence_labels) != len(tokenized_text):
raise ValueError('Sequence labels and tokenized texts are not of '
f'the same length:\nSequence labels {sequence_labels}'
f'\nTokenized text: {tokenized_text}')
if confidences is not None and len(sequence_labels) != len(confidences):
raise ValueError('Sequence labels and confidences are not of '
f'the same length:\nSequence labels {sequence_labels}'
f'\nconfidences: {confidences}')
not_allowed_additional_keys = {'confidence', 'text', 'text_id',
'tokenized_text', 'sequence_labels',
'targets', 'spans'}
for key in additional_data:
if key in not_allowed_additional_keys:
raise ValueError("The following keys are not allowd in the "
f"additional data:\n{not_allowed_additional_keys}")
temp_target_text = TargetText(text_id=text_id, text=text,
tokenized_text=tokenized_text,
sequence_labels=sequence_labels,
confidence=confidences)
target_spans = temp_target_text.get_sequence_spans('sequence_labels',
confidence=confidence)
targets = TargetText.targets_from_spans(text, target_spans)
return TargetText(text_id=text_id, text=text, confidence=confidences,
tokenized_text=tokenized_text, targets=targets,
spans=target_spans, sequence_labels=sequence_labels,
**additional_data)
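# Hedged sketch of building a TargetText from tagger output; the tokens,
# BIO labels and text below are illustrative rather than real predictions
# from :meth:`target_extraction.allen.allennlp_model.predict_sequences`.
# >>> pred = TargetText.target_text_from_prediction(
# ...     text='The camera is great', text_id='1',
# ...     sequence_labels=['O', 'B', 'O', 'O'],
# ...     tokenized_text=['The', 'camera', 'is', 'great'])
# >>> pred['targets']
# ['camera']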
class TargetTextCollection(MutableMapping):
'''
This is a data structure that inherits from MutableMapping which is
essentially a python dictionary, however the underlying storage is a
OrderedDict therefore if you iterate over it, the iteration will always be
in the same order.
This structure only contains TargetText instances.
Attributes:
1. name -- Name associated to the TargetTextCollection.
2. metadata -- Any metadata to associate to the object e.g. domain of the
dataset, all metadata is stored in a dictionary. By default the
metadata will always have the `name` attribute within
the metadata under the key `name`. If `anonymised` is also True then
this will also be in the metadata under the key `anonymised`
3. anonymised -- If True then the data within the TargetText objects have
no text but the rest of the metadata should exist.
Methods:
1. to_json -- Writes each TargetText instance as a dictionary using its
own to_json function on a new line within the returned String. The
returned String is not json compatible but if split by new line it is and
is also compatible with the from_json method of TargetText.
2. to_conll -- A CONLL formatted string where the format will be the
following: `TOKEN#GOLD LABEL#PREDICTION 1# PREDICTION 2` Where each
token and relevant labels are on separate new lines. The first line
will always contain the following: `# {text_id: `value`}` where the
text_id represents the `text_id` of this TargetText, this will allow
the CONLL string to be uniquely identified back to this TargetText
object. Also each TargetText CONLL string will be separated
by a new line.
3. to_conll_file -- Saves the TargetTextCollection to CONLL format. Useful
for Sequence Labelling tasks.
4. load_conll -- Loads the CONLL information into the collection.
5. add -- Wrapper around __setitem__. Given as an argument a TargetText
instance it will be added to the collection.
6. to_json_file -- Saves the current TargetTextCollection to a json file
which won't be strictly json but each line in the file will be and each
line in the file can be loaded in from String via TargetText.from_json.
Also the file can be reloaded into a TargetTextCollection using
TargetTextCollection.load_json.
7. tokenize -- This applies the TargetText.tokenize method across all
of the TargetText instances within the collection.
8. pos_text -- This applies the TargetText.pos_text method across all of
the TargetText instances within the collection.
9. sequence_labels -- This applies the TargetText.sequence_labels
method across all of the TargetText instances within the collection.
10. force_targets -- This applies the TargetText.force_targets method
across all of the TargetText instances within the collection.
11. exact_match_score -- Recall, Precision, and F1 score in a Tuple.
All of these measures are based on exact span matching rather than the
matching of the sequence label tags, this is due to the annotation spans
not always matching tokenization therefore this removes the tokenization
error that can come from the sequence label measures.
12. samples_with_targets -- Returns all of the samples that have target
spans as a TargetTextCollection.
13. target_count -- A dictionary of target text as key and values as the
number of times the target text occurs in this TargetTextCollection
14. one_sample_per_span -- This applies the TargetText.one_sample_per_span
method across all of the TargetText instances within the collection to
create a new collection with those new TargetText instances within it.
15. number_targets -- Returns the total number of targets.
16. number_categories -- Returns the total number of categories.
17. category_count -- Returns a dictionary of categories as keys and
values as the number of times the category occurs.
18. target_sentiments -- A dictionary where the keys are target texts and
the values are a List of sentiment values that have been associated to
that target.
19. dict_iterator -- Returns an iterator of all of the TargetText objects
within the collection as dictionaries.
20. unique_distinct_sentiments -- A set of the distinct sentiments within
the collection. The length of the set represents the number of distinct
sentiments within the collection.
21. de_anonymise -- This will set the `anonymised` attribute to False
from True and set the `text` key value to the value in the `text`
key within the `text_dict` argument for each of the TargetTexts in
the collection. If any Error is raised this collection will revert back
fully to being anonymised.
22. sanitize -- This applies the TargetText.sanitize function to all of
the TargetText instances within this collection, effectively ensures
that all of the instances follow the specified rules that TargetText
instances should follow.
23. in_order -- This returns True if all TargetText objects within the
collection contains a list of targets that are in order of appearance
within the text from left to right e.g. if the only TargetText in the
collection contains two targets where the first target in the `targets`
list is the first (left most) target in the text then this method would
return True.
24. re_order -- This will apply :py:meth:`target_extraction.data_types.TargetText.re_order`
to each TargetText within the collection.
25. add_unique_key -- Applies the following
:py:meth:`target_extraction.data_types.TargetText.add_unique_key`
to each TargetText within this collection
26. key_difference -- Given this collection and another it will return all
of the keys that the other collection contains which this does not.
27. combine_data_on_id -- Given this collection and another it will add all
of the data from the other collection into this collection based on the
unique key given.
28. one_sentiment_text -- Adds the `text_sentiment_key` to each TargetText
within the collection where the value will represent the sentiment value
for the text based on the `sentiment_key` values and `average_sentiment`
determining how to handle multiple sentiments. This will allow text level
classifiers to be trained on target/aspect/category data.
Static Functions:
1. from_json -- Returns a TargetTextCollection object given the json like
String from to_json. For example the json string can be the return of
TargetTextCollection.to_json.
2. load_json -- Returns a TargetTextCollection based on each new line in
the given json file.
3. combine -- Returns a TargetTextCollection that is the combination of all
of those given.
4. same_data -- Given a List of TargetTextCollections it will return a list
of tuples specifying the overlap between the collections based on the
samples `text_id` and `text` key values. If it returns an empty list
then there is no overlap between the collections. This is useful for
finding duplicates beyond the `text_id` as it checks the `text` value as well.
'''
def __init__(self, target_texts: Optional[List['TargetText']] = None,
name: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
anonymised: bool = False) -> None:
'''
:param target_texts: A list of TargetText instances to add to the
collection.
:param name: Name to call the collection, this is added to the metadata
automatically and overrides the name key value in the
metadata if exists.
:param metadata: Any data that you would like to associate to this
TargetTextCollection.
:param anonymised: Whether or not the TargetText objects should be loaded
in and anonymised, as well as stating whether or not
the whole collection should be anonymised when
loading in new TargetText objects.
'''
self._storage = OrderedDict()
self._anonymised = anonymised
if target_texts is not None:
for target_text in target_texts:
target_text.sanitize()
self.add(target_text)
self.metadata = None
if metadata is not None:
self.metadata = metadata
if anonymised:
self.metadata = {} if metadata is None else metadata
self.metadata['anonymised'] = anonymised
if name is not None:
# The name setter also records the name in the metadata, so the
# metadata (including any `anonymised` key) is not reset here.
self.name = name
else:
self.name = ''
@property
def name(self) -> str:
'''
:returns: The name attribute.
'''
return self._name
@name.setter
def name(self, name_string: str) -> None:
'''
Sets the value of the name attribute, and also updates the `name` key
value in the `metadata` attribute.
:param name_string: New name to give to the name attribute.
'''
self._name = name_string
self.metadata = {} if self.metadata is None else self.metadata
self.metadata['name'] = self._name
@property
def anonymised(self) -> bool:
'''
:returns: True if the data within the TargetTextCollection has been
anonymised. Anonymised data means that there is no text
associated with any of the TargetText objects within the
collection, but all of the metadata is there.
'''
return self._anonymised
@anonymised.setter
def anonymised(self, value: bool) -> None:
'''
Sets whether or not `anonymised` attribute is True or False. This in
effect performs the
:py:meth:`target_extraction.data_types.TargetText.anonymised`
on each TargetText object within the collection if True. When you
want to set this to False you need to perform
:py:meth:`target_extraction.data_types.TargetTextCollection.de_anonymise`.
:param value: True for anonymised, else False. If True this will
enforce that all the TargetText objects do not have a
`text` key/value and the attribute `anonymised` is True.
:raises AnonymisedError: If the TargetText object within the collection
cannot be set to the
`anonymised` value given. If this Error occurs
then the object will have kept the original
`anonymised` value.
'''
for target_text in self.values():
target_text.anonymised = value
self.metadata = {} if self.metadata is None else self.metadata
self.metadata['anonymised'] = value
self._anonymised = value
def add(self, value: 'TargetText') -> None:
'''
Wrapper around __setitem__. Instead of having to find the instance's
'text_id' yourself and set this container's key to that value, this
does it for you.
e.g. performs self[value['text_id']] = value
:param value: The TargetText instance to store in the collection. Will
anonymise the TargetText object if the collection's
anonymised attribute is True.
'''
value.anonymised = self.anonymised
self[value['text_id']] = value
def to_json(self) -> str:
'''
Required as TargetTextCollection is not json serializable due to the
'spans' in the TargetText instances.
:returns: The object as a list of dictionaries where each TargetText
instance is a dictionary. It will also JSON serialize any
metadata as well.
'''
json_text = ''
for index, target_text_instance in enumerate(self.values()):
if index != 0:
json_text += '\n'
target_text_instance: TargetText
json_text += target_text_instance.to_json()
if self.metadata is not None:
if json_text != '':
json_text += '\n'
json_text += json.dumps({'metadata': self.metadata})
return json_text
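# Usage sketch (assumed sample values): serialising a collection to a
# newline-delimited JSON string and parsing it back with `from_json`.
# >>> sample = TargetText(text='The camera is great', text_id='1')
# >>> collection = TargetTextCollection([sample], name='example')
# >>> TargetTextCollection.from_json(collection.to_json()) == collection
# True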
def to_conll(self, gold_label_key: str,
prediction_key: Optional[str] = None) -> str:
'''
This in effect performs the `to_conll` function for each TargetText
within the collection and separates each of the CONLL strings with a
new line.
:param gold_label_key: A key that contains a sequence of labels e.g.
[`B`, `I`, `O`]. This can come from the return
of the :py:meth:`sequence_labels`
:param prediction_key: Key to the predicted labels of the `gold_label`.
Where the prediction key values is a list of a
list of predicted labels. Each list is therefore
a different model run hence creating the
`PREDICTION 1`, `PREDICTION 2` etc. Thus the
values of `prediction_key` must be of shape
(number runs, number tokens)
:returns: A CONLL formatted string where the format will be the
following: `TOKEN#GOLD LABEL#PREDICTION 1# PREDICTION 2`
Where each token and relevant labels are on separate new
lines. The first line will always contain the following:
`# {text_id: `value`}` where the text_id represents the
`text_id` of this TargetText, this will allow the CONLL
string to be uniquely identified back to this TargetText
object. Also each TargetText CONLL string will be separated
by a new line.
'''
conll_string = ''
for target_text in self.values():
target_conll = target_text.to_conll(gold_label_key=gold_label_key,
prediction_key=prediction_key)
conll_string += f'{target_conll}\n\n'
return conll_string
def to_conll_file(self, conll_fp: Path, gold_label_key: str,
prediction_key: Optional[str] = None) -> None:
'''
Writes the output of `to_conll` to the `conll_fp` file.
:param conll_fp: Write the CONLL string to this file path.
:param gold_label_key: A key that contains a sequence of labels e.g.
[`B`, `I`, `O`]. This can come from the return
of the :py:meth:`sequence_labels`
:param prediction_key: Key to the predicted labels of the `gold_label`.
Where the prediction key values is a list of a
list of predicted labels. Each list is therefore
a different model run hence creating the
`PREDICTION 1`, `PREDICTION 2` etc. Thus the
values of `prediction_key` must be of shape
(number runs, number tokens)
'''
conll_string = self.to_conll(gold_label_key=gold_label_key,
prediction_key=prediction_key)
with conll_fp.open('w+') as conll_file:
conll_file.write(conll_string)
def load_conll(self, conll_fp: Path, tokens_key: str = 'tokenized_text',
gold_label_key: Optional[str] = None,
prediction_key: Optional[str] = None) -> None:
'''
This takes the `conll_fp` and loads the CONLL data into the relevant
TargetText samples in this collection using the TargetText `from_conll`
function. The matching of TargetText with CONLL data is through the CONLL
string containing `# {text_id: _id}` for each CONLL sentence/text.
:param tokens_key: Key to save the CONLL tokens to, for the TargetText.
:param gold_label_key: Key to save the gold labels to. At least one of
`gold_label_key` and `prediction_key` must not be
`None`, for the TargetText.
:param prediction_key: Key to save the prediction labels to. The value
will be of shape (number runs, number tokens),
for the TargetText.
'''
def _line_divider(line: str) -> bool:
return line.strip() == ''
with conll_fp.open('r') as conll_file:
# Group into alternative divider / sentence chunks.
for is_divider, lines in itertools.groupby(conll_file, _line_divider):
if is_divider:
continue
lines = list(lines)
_id_line = lines[0]
_id_line = _id_line.lstrip('#').strip()
text_id = json.loads(_id_line)['text_id']
# Find relevant TargetText
lines = [line.strip() for line in lines if line.strip()!='']
conll_line = '\n'.join(lines[1:])
self[text_id].from_conll(conll_line, tokens_key=tokens_key,
gold_label_key=gold_label_key,
prediction_key=prediction_key)
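# Hedged CONLL round-trip sketch. Assumes the collection has already been
# tokenized and sequence labelled; the file path is illustrative.
# >>> _ = collection.sequence_labels()
# >>> collection.to_conll_file(Path('example.conll'),
# ...                          gold_label_key='sequence_labels')
# >>> collection.load_conll(Path('example.conll'),
# ...                       gold_label_key='loaded_labels')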
@staticmethod
def _get_metadata(json_iterable: Iterable[str]) -> Tuple[Union[Dict[str, Any], None],
Union[str, None], bool]:
'''
:param json_iterable: An iterable that generates a JSON string, of
which the last string contains the metadata if
it exists.
:returns: The metadata for the collection being loaded, as a Tuple of
length 3 where the 3 items are: 1. The metadata,
2. The name of the collection, and 3. Whether it has been
anonymised. The first two default to None and the third
defaults to False.
'''
metadata = None
name = None
anonymised = False
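# deque with a maxlen of 1 keeps only the final line of the iterable,
# which is where any metadata is written by to_json/to_json_file.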
for line in deque(json_iterable, 1):
if line.strip():
json_line = json.loads(line)
if 'metadata' in json_line:
metadata = json_line['metadata']
if 'name' in metadata:
name = metadata['name']
if 'anonymised' in metadata:
anonymised = metadata['anonymised']
return metadata, name, anonymised
@staticmethod
def from_json(json_text: str, **target_text_collection_kwargs
) -> 'TargetTextCollection':
'''
Required as the json text is expected to be the return from the
self.to_json method. This string is not parsable by a standard json
decoder.
:param json_text: This is expected to be a dictionary like object for
each new line in this text
:param target_text_collection_kwargs: Keyword arguments to give to
the TargetTextCollection
constructor.
:returns: A TargetTextCollection based on each new line in the given
text, where each line is parsable by the TargetText.from_json
method.
:raises AnonymisedError: If the `TargetText` object that it is loading
is anonymised but the `target_text_collection_kwargs`
argument contains `anonymised` False, as
you cannot de-anonymise without performing
:py:meth:`target_extraction.data_types.TargetTextCollection.de_anonymise`.
'''
if json_text.strip() == '':
return TargetTextCollection(**target_text_collection_kwargs)
target_text_instances = []
metadata, name, anonymised = TargetTextCollection._get_metadata(json_text.split('\n'))
for line in json_text.split('\n'):
json_line = json.loads(line)
if 'metadata' not in json_line:
target_text_instance = TargetText.from_json(line, anonymised=anonymised)
target_text_instances.append(target_text_instance)
# Keyword arguments override the metadata
if 'name' in target_text_collection_kwargs:
name = target_text_collection_kwargs['name']
if 'metadata' in target_text_collection_kwargs:
metadata = target_text_collection_kwargs['metadata']
if 'anonymised' in target_text_collection_kwargs:
anonymised = target_text_collection_kwargs['anonymised']
return TargetTextCollection(target_text_instances, name=name,
metadata=metadata, anonymised=anonymised)
@staticmethod
def load_json(json_fp: Path, **target_text_collection_kwargs
) -> 'TargetTextCollection':
'''
Allows loading a dataset from json. Where the json file is expected to
be output from TargetTextCollection.to_json_file as the file will be
a json String on each line generated from TargetText.to_json. This
will also load any metadata that was stored within the TargetTextCollection.
:param json_fp: File that contains json strings generated from
TargetTextCollection.to_json_file
:param target_text_collection_kwargs: Keyword arguments to give to
the TargetTextCollection
constructor. If any metadata was
stored within the loaded json then
these keyword arguments override it.
:returns: A TargetTextCollection based on each new line in the given
json file, and the optional meta data on the last line.
'''
target_text_instances = []
with json_fp.open('r') as json_file:
metadata, name, anonymised = TargetTextCollection._get_metadata(json_file)
with json_fp.open('r') as json_file:
for line in json_file:
if line.strip():
json_line = json.loads(line)
if 'metadata' not in json_line:
target_text_instance = TargetText.from_json(line, anonymised=anonymised)
target_text_instances.append(target_text_instance)
# Keyword arguments override the metadata
if 'name' in target_text_collection_kwargs:
name = target_text_collection_kwargs['name']
if 'metadata' in target_text_collection_kwargs:
metadata = target_text_collection_kwargs['metadata']
if 'anonymised' in target_text_collection_kwargs:
anonymised = target_text_collection_kwargs['anonymised']
return TargetTextCollection(target_text_instances, name=name,
metadata=metadata, anonymised=anonymised)
def to_json_file(self, json_fp: Path,
include_metadata: bool = False) -> None:
'''
Saves the current TargetTextCollection to a json file which won't be
strictly json but each line in the file will be and each line in the
file can be loaded in from String via TargetText.from_json. Also the
file can be reloaded into a TargetTextCollection using
TargetTextCollection.load_json.
:param json_fp: File path to the json file to save the current data to.
:param include_metadata: Whether or not to include the metadata when
writing to file.
'''
with json_fp.open('w+') as json_file:
for index, target_text_instance in enumerate(self.values()):
target_text_instance: TargetText
target_text_string = target_text_instance.to_json()
if index != 0:
target_text_string = f'\n{target_text_string}'
json_file.write(target_text_string)
if self.metadata is not None and include_metadata:
metadata_to_write = {'metadata': self.metadata}
json_file.write(f'\n{json.dumps(metadata_to_write)}')
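# Usage sketch (illustrative path): persisting a collection with its
# metadata and reloading it.
# >>> collection = TargetTextCollection(
# ...     [TargetText(text='The camera is great', text_id='1')],
# ...     name='example')
# >>> collection.to_json_file(Path('example.json'), include_metadata=True)
# >>> TargetTextCollection.load_json(Path('example.json')).name
# 'example'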
def tokenize(self, tokenizer: Callable[[str], List[str]]) -> None:
'''
This applies the TargetText.tokenize method across all of
the TargetText instances within the collection.
For a set of tokenizers that are definitely compatible see the
target_extraction.tokenizers module.
Ensures that the tokenization is character preserving.
:param tokenizer: The tokenizer to use to tokenize the text for each
TargetText instance in the current collection
:raises TypeError: If the tokenizer given does not return a List of
Strings.
:raises ValueError: This is raised if any of the TargetText instances
in the collection contain an empty string.
:raises ValueError: If the tokenization is not character preserving.
'''
for index, target_text_instance in enumerate(self.values()):
if index == 0:
target_text_instance.tokenize(tokenizer, True)
else:
target_text_instance.tokenize(tokenizer, False)
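# Minimal sketch: `str.split` is a character preserving tokenizer for
# simple whitespace separated text, so it can be used directly here.
# >>> collection = TargetTextCollection(
# ...     [TargetText(text='The camera is great', text_id='1')])
# >>> collection.tokenize(str.split)
# >>> collection['1']['tokenized_text']
# ['The', 'camera', 'is', 'great']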
def pos_text(self, tagger: Callable[[str], List[str]]) -> None:
'''
This applies the TargetText.pos_text method across all of
the TargetText instances within the collection.
For a set of pos taggers that are definitely compatible see the
target_extraction.pos_taggers module.
:param tagger: POS tagger.
:raises TypeError: If the POS tagger given does not return a List of
Strings.
:raises ValueError: This is raised if any of the TargetText instances
in the collection contain an empty string.
:raises ValueError: If the Target Text instance has not been tokenized.
:raises ValueError: If the number of pos tags for a Target Text instance
does not match the number of tokens that have
been generated by the tokenizer function.
'''
for index, target_text_instance in enumerate(self.values()):
if index == 0:
target_text_instance.pos_text(tagger, True)
else:
target_text_instance.pos_text(tagger, False)
def force_targets(self) -> None:
'''
This applies the TargetText.force_targets method across all of the
TargetText instances within the collection.
'''
for target_text_instance in self.values():
target_text_instance.force_targets()
def sequence_labels(self, return_errors: bool = False,
**target_sequence_label_kwargs
) -> List['TargetText']:
'''
This applies the TargetText.sequence_labels method across all of
the TargetText instances within the collection.
:param return_errors: Returns TargetText objects that have caused
the ValueError to be raised.
:param target_sequence_label_kwargs: Any Keyword arguments to give to
the TargetText `sequence_labels`
function.
:returns: A list of TargetText objects that have caused the ValueError
to be raised if `return_errors` is True else an empty list
will be returned.
:raises KeyError: If the current TargetText has not been tokenized.
:raises ValueError: If two targets overlap the same token(s) e.g.
`Laptop cover was great` if `Laptop` and
`Laptop cover` are two separate targets this should
raise a ValueError as a token should only be
associated to one target.
'''
errored_targets = []
for target_text_instance in self.values():
if return_errors:
try:
target_text_instance.sequence_labels(**target_sequence_label_kwargs)
except ValueError:
errored_targets.append(target_text_instance)
else:
target_text_instance.sequence_labels(**target_sequence_label_kwargs)
return errored_targets
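# Sketch of the error collecting behaviour, assuming a collection that
# has already been tokenized: instead of raising on overlapping targets,
# the offending TargetText objects are returned.
# >>> errored = collection.sequence_labels(return_errors=True)
# >>> isinstance(errored, list)
# True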
def exact_match_score(self,
predicted_sequence_key: str = 'predicted_sequence_labels'
) -> Tuple[float, float, float,
Dict[str, List[Tuple[str, Span]]]]:
'''
Just for clarification we use the sequence label tags to find the
predicted spans. However even if you have a perfect sequence label
score it does not mean you will have a perfect exact span score,
as the tokenizer used for the sequence labelling might not align
perfectly with the annotated spans.
The False Positive mistakes, False Negative mistakes, and correct
True Positive Dictionary keys are those names with the values being a
List of Tuples where the Tuple is made up of the TargetText instance ID
and the Span that was incorrect (FP) or not tagged (FN) or correct (TP).
Example of this is as follows:
{`FP`: [('1', Span(0, 4))], 'FN': [], 'TP': []}
:param predicted_sequence_key: Key of the predicted sequence labels
within this TargetText instance.
:returns: Recall, Precision, and F1 score, False Positive mistakes,
False Negative mistakes, and correct True Positives in a
Dict. All of these measures are based on exact span matching
rather than the matching of the sequence label tags,
this is due to the annotation spans not always matching
tokenization therefore this removes the tokenization
error that can come from the sequence label measures.
:raises KeyError: If there are no predicted sequence label key
within this TargetText.
:raises ValueError: If the predicted or true spans contain multiple
spans that have the same span e.g.
[Span(4, 15), Span(4, 15)]
'''
# tp = True Positive count
tp = 0.0
num_pred_true = 0.0
num_actually_true = 0.0
fp_mistakes: List[Tuple[str, Span]] = []
fn_mistakes: List[Tuple[str, Span]] = []
correct_tp: List[Tuple[str, Span]] = []
for target_text_index, target_text_instance in enumerate(self.values()):
if target_text_index == 0:
keys_to_check = ['spans', predicted_sequence_key]
for key in keys_to_check:
target_text_instance._key_error(key)
predicted_spans = target_text_instance.get_sequence_spans(predicted_sequence_key)
# Add to the number of predicted true and actually true
predicted_spans: List[Span]
num_pred_true += len(predicted_spans)
true_spans: List[Span] = target_text_instance['spans']
if true_spans is None:
true_spans = []
num_actually_true += len(true_spans)
# This should be impossible to get to
if len(predicted_spans) != len(set(predicted_spans)):
raise ValueError(f'Predicted spans {predicted_spans} contain'
f' multiple of the same predicted span. '
f'TargetText: {target_text_instance}')
# This is possible
if len(true_spans) != len(set(true_spans)):
raise ValueError(f'True spans {true_spans} contain'
f' multiple of the same true span. '
f'TargetText: {target_text_instance}')
text_id = target_text_instance['text_id']
true_spans = set(true_spans)
for predicted_span in predicted_spans:
if predicted_span in true_spans:
tp += 1
correct_tp.append((text_id, predicted_span))
else:
fp_mistakes.append((text_id, predicted_span))
for true_span in true_spans:
if true_span not in predicted_spans:
fn_mistakes.append((text_id, true_span))
error_analysis_dict = {'FP': fp_mistakes, 'FN': fn_mistakes,
'TP': correct_tp}
if tp == 0.0:
return 0.0, 0.0, 0.0, error_analysis_dict
recall = tp / num_actually_true
precision = tp / num_pred_true
f1 = (2 * precision * recall) / (precision + recall)
return recall, precision, f1, error_analysis_dict
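# Hedged evaluation sketch. Assumes every sample carries a
# `predicted_sequence_labels` key, e.g. added by a trained tagger.
# >>> recall, precision, f1, errors = collection.exact_match_score(
# ...     'predicted_sequence_labels')
# >>> sorted(errors)
# ['FN', 'FP', 'TP']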
def samples_with_targets(self) -> 'TargetTextCollection':
'''
:returns: All of the samples that have targets as a
TargetTextCollection for this TargetTextCollection.
:raises KeyError: If either `spans` or `targets` does not exist in
one or more of the TargetText instances within this
collection. These keys are protected keys thus they
should always exist, but this is just a warning in case
you have got around the protected keys.
'''
sub_collection = TargetTextCollection()
sub_collection.anonymised = self.anonymised
for target_text in self.values():
if target_text['spans'] and target_text['targets']:
sub_collection.add(target_text)
return sub_collection
def target_count(self, lower: bool = False,
target_key: str = 'targets') -> Dict[str, int]:
'''
:Note: The target may not exist e.g. be a `None` target, as the target
can be combined with the category like in the SemEval 2016
Restaurant dataset. In these cases we do not include these
in the target_count.
:param lower: Whether or not to lower the target text.
:param target_key: The key in each TargetText sample that contains the
list of target words.
:returns: A dictionary of target text as key and values as the number
of times the target text occurs in this TargetTextCollection
'''
target_count: Dict[str, int] = Counter()
for target_dict in self.values():
if target_dict[target_key]:
for target in target_dict[target_key]:
if target is None:
continue
if lower:
target = target.lower()
target_count.update([target])
return dict(target_count)
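# Usage sketch with assumed data: counting lower-cased targets.
# >>> sample = TargetText(text='Camera good, camera cheap', text_id='1',
# ...                     targets=['Camera', 'camera'],
# ...                     spans=[Span(0, 6), Span(13, 19)])
# >>> TargetTextCollection([sample]).target_count(lower=True)
# {'camera': 2}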
def target_sentiments(self, lower: bool = False,
unique_sentiment: bool = False
) -> Dict[str, Union[List[str], Set[str]]]:
'''
:Note: The target may not exist e.g. be a `None` target, as the target
can be combined with the category like in the SemEval 2016
Restaurant dataset. In these cases we do not include these
in the returned dictionary.
:param lower: Whether or not to lower the target text.
:param unique_sentiment: Whether or not the return is a dictionary
whose values are a List of Strings or if
True a Set of Strings.
:returns: A dictionary where the keys are target texts and the values
are a List of sentiment values that have been associated to
that target. The sentiment value can occur more than once
indicating the number of times that target has been associated
with that sentiment unless unique_sentiment is True then
instead of a List of sentiment values a Set is used instead.
:Explanation: If the target `camera` has occurred with the sentiment
`positive` twice and `negative` once then it will return
{`camera`: [`positive`, `positive`, `negative`]}. However
if `unique_sentiment` is True then it will return:
{`camera`: {`positive`, `negative`}}.
'''
target_sentiment_values: Dict[str, List[str]] = defaultdict(list)
if unique_sentiment:
target_sentiment_values: Dict[str, Set[str]] = defaultdict(set)
for target_dict in self.values():
if target_dict['targets'] and target_dict['target_sentiments']:
for target, sentiment in zip(target_dict['targets'],
target_dict['target_sentiments']):
if target is None:
continue
if lower:
target = target.lower()
if unique_sentiment:
target_sentiment_values[target].add(sentiment)
else:
target_sentiment_values[target].append(sentiment)
return dict(target_sentiment_values)
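# Usage sketch (assumed data): mapping each target to the sentiments it
# has been annotated with across the collection.
# >>> sample = TargetText(text='The camera is great', text_id='1',
# ...                     targets=['camera'], spans=[Span(4, 10)],
# ...                     target_sentiments=['positive'])
# >>> TargetTextCollection([sample]).target_sentiments()
# {'camera': ['positive']}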
def number_targets(self, incl_none_targets: bool = False) -> int:
'''
:param incl_none_targets: Whether to include targets that are `None`
and are therefore associated to the categories
in the count.
:returns: The total number of targets in the collection.
'''
target_count = 0
for target_dict in self.values():
if target_dict['targets']:
for target in target_dict['targets']:
if not incl_none_targets and target is None:
continue
target_count += 1
return target_count
def number_categories(self) -> int:
'''
:returns: The total number of categories in the collection
:raises ValueError: If one of the category values in the list is of
value None
'''
return sum(self.category_count().values())
def category_count(self) -> Dict[str, int]:
'''
:returns: A dictionary of categories as keys and values as the number
of times the category occurs in this TargetTextCollection
:raises ValueError: If any category has the value of None.
'''
categories_count = Counter()
for target_dict in self.values():
if target_dict['categories']:
for category in target_dict['categories']:
if category is None:
raise ValueError('One of the category values is None, '
f'within {target_dict}')
categories_count.update([category])
return dict(categories_count)
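# Usage sketch (assumed data): counting category occurrences.
# >>> sample = TargetText(text='The food was great', text_id='1',
# ...                     categories=['food'])
# >>> TargetTextCollection([sample]).category_count()
# {'food': 1}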
def one_sample_per_span(self, remove_empty: bool = False
) -> 'TargetTextCollection':
'''
This applies the TargetText.one_sample_per_span method across all of the
TargetText instances within the collection to create a new collection
with those new TargetText instances within it.
:param remove_empty: If the TargetText instance contains any None
targets then these will be removed along with
their respective Spans.
:returns: A new TargetTextCollection that has samples that come
from this collection but has had the
TargetText.one_sample_per_span method applied to it.
'''
new_collection = TargetTextCollection()
for target_text in self.values():
new_collection.add(target_text.one_sample_per_span(remove_empty=remove_empty))
return new_collection
def dict_iterator(self) -> Iterable[Dict[str, Any]]:
'''
:returns: An iterator of all of the TargetText objects
within the collection as dictionaries.
'''
for target_text in self.values():
target_text: TargetText
yield dict(target_text)
def unique_distinct_sentiments(self,
sentiment_key: str = 'target_sentiments'
) -> Set[int]:
'''
:param sentiment_key: The key that represents the sentiment value
for each TargetText object
:returns: A set of the distinct sentiments within the collection.
The length of the set represents the number of distinct
sentiments within the collection.
:raises TypeError: If the value in the sentiment_key is not of type list
'''
unique_ds = set()
for target_object in self.values():
sentiment_value = target_object[sentiment_key]
if not isinstance(sentiment_value, list):
raise TypeError(f'The sentiment key {sentiment_key} contains a'
f' value that is not of type List: '
f'{sentiment_value}. TargetText object: '
f'{target_object}')
unique_ds.add(len(set(sentiment_value)))
# Need to remove 0's which come about because an empty list is of
# length 0
if 0 in unique_ds:
unique_ds.remove(0)
return unique_ds
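# Sketch (assumed data): a sample annotated with two distinct sentiment
# values contributes 2 to the returned set.
# >>> sample = TargetText(text='Good camera, bad lens', text_id='1',
# ...                     targets=['camera', 'lens'],
# ...                     spans=[Span(5, 11), Span(17, 21)],
# ...                     target_sentiments=['positive', 'negative'])
# >>> TargetTextCollection([sample]).unique_distinct_sentiments()
# {2}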
def de_anonymise(self, text_dicts: Iterable[Dict[str, str]]) -> None:
'''
This will set the `anonymised` attribute to False
from True and set the `text` key value to the value in the `text`
key within the `text_dict` argument for each of the TargetTexts in
the collection. If any Error is raised this collection will revert back
fully to being anonymised.
:param text_dicts: An iterable of dictionaries that contain the following
two keys: 1. `text` and 2. `text_id` where
the `text_id` has to be a key within the current
collection. The `text` associated to that id will
become that TargetText object's text value.
:raises ValueError: If the length of the `text_dicts` does not match
that of the collection.
:raises KeyError: If any of the `text_id`s in the `text_dicts` do not
match those within this collection.
'''
try:
self_len = len(self)
text_dict_len = {}
for text_dict in text_dicts:
text_dict_id = text_dict['text_id']
text_dict_len[text_dict_id] = 1
if text_dict_id not in self:
raise KeyError(f"The key {text_dict_id} from `text_dicts`"
f" is not in this collection.")
self[text_dict_id].de_anonymise(text_dict)
text_dict_len = len(text_dict_len)
if self_len != text_dict_len:
raise ValueError(f'The length of collection {self_len} is not '
'equal to the length of the `text_dicts` '
f'{text_dict_len}.')
except Exception as e:
# Cleans up after the exception as we have to preserve the case
# that it is still anonymised
for target_text in self.values():
if not target_text.anonymised:
target_text.anonymised = True
raise e
self.anonymised = False
def sanitize(self) -> None:
'''
This applies the TargetText.sanitize function to all of
the TargetText instances within this collection, effectively ensuring
that all of the instances follow the specified rules that TargetText
instances should follow.
'''
for target_text in self.values():
target_text.sanitize()
def in_order(self) -> bool:
'''
This returns True if all TargetText objects within the
collection contains a list of targets that are in order of appearance
within the text from left to right e.g. if the only TargetText in the
collection contains two targets where the first target in the `targets`
list is the first (left most) target in the text then this method would
return True.
:returns: True if all the `targets` within all the TargetText objects
in this collection are in sequential left to right order
within the text.
'''
for target_text in self.values():
if not target_text.in_order():
return False
return True
def re_order(self, keys_not_to_order: Optional[List[str]] = None) -> None:
'''
This will apply :py:meth:`target_extraction.data_types.TargetText.re_order`
to each TargetText within the collection.
:param keys_not_to_order: Any keys within the TargetTexts that do not
need re-ordering
'''
# This takes into account the rollback problem where an error occurs
# halfway through performing the function and half the collection has
# been re-ordered whereas the other half has not. This will bring it
# back into a stable state.
self_copy = copy.deepcopy(self._storage)
try:
for target_text in self.values():
target_text.re_order(keys_not_to_order)
except Exception as e:
self._storage = self_copy
raise e
def add_unique_key(self, id_key: str, id_key_name: str,
id_delimiter: str = '::') -> None:
'''
Applies the following
:py:meth:`target_extraction.data_types.TargetText.add_unique_key`
to each TargetText within this collection
:param id_key: The name of the key within this TargetText that requires
unique ids that will be stored in `id_key_name`.
:param id_key_name: The name of the key to associate to these new
unique ids.
:param id_delimiter: The delimiter to separate the `text_id` and the
index of the `id_key` that is being represented
by this unique id.
'''
for value in self.values():
value.add_unique_key(id_key, id_key_name, id_delimiter=id_delimiter)
def key_difference(self, other_collection: 'TargetTextCollection'
) -> List[str]:
'''
:param other_collection: The collection that is being compared to this.
:returns: A list of keys that represent all of the keys that are in the
other (compared) collection and not in this collection.
'''
this_keys = {key for value in self.values() for key in value.keys()}
other_keys = {key for value in other_collection.values() for key in value.keys()}
return list(other_keys.difference(this_keys))
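# Sketch comparing keys across two collections; `extra_key` below is a
# hypothetical additional key, not one the library defines.
# >>> this = TargetTextCollection([TargetText(text='a', text_id='1')])
# >>> other = TargetTextCollection(
# ...     [TargetText(text='a', text_id='1', extra_key=[0])])
# >>> this.key_difference(other)
# ['extra_key']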
def combine_data_on_id(self, other_collection: 'TargetTextCollection',
id_key: str, data_keys: List[str],
raise_on_overwrite: bool = True,
check_same_ids: bool = True) -> None:
'''
:param other_collection: The collection that contains the data
that is to be copied to this collection.
:param id_key: The key that indicates in each TargetText within
this and the `other_collection` how the values are
to be copied from the `other_collection` to this
collection.
:param data_keys: The keys of the values in each TargetText within the
`other_collection` that is be copied to the relevant
TargetTexts within this collection. It assumes that if
any of the key/values are a list of lists then the inner
lists relate to the targets and the outer list is
not related to the targets.
:param raise_on_overwrite: If True will raise the
:py:class:`target_extraction.data_types_util.OverwriteError`
if any of the `data_keys` exist in any
of the TargetTexts within this collection.
:param check_same_ids: If True will ensure that this collection and the
other collection are of same length and check
if each have the same unique ids
:raises AssertionError: If the number of IDs from the `id_key` does not
match the number of data to be added to a data key
:raises ValueError: If `check_same_ids` is True and the two collections
are either of not the same length or have
different unique ids according to `id_key` within
the TargetText objects.
:raises OverwriteError: If `raise_on_overwrite` is True and any of
the `data_keys` exist in any of the TargetTexts
within this collection.
'''
def sort_data_by_key(key: str, self_target_text: TargetText,
other_target_text: TargetText,
data_to_sort: List[Any]) -> List[Any]:
'''
:param key: A key that appears in both `self_target_text` and
`other_target_text`, where the key for both represents
values that appear in both and are unique.
:param self_target_text: A TargetText object where the values in
`key` will determine the sorting performed
to `data_to_sort`.
:param other_target_text: The TargetText that represents the `data_to_sort`
and is in this TargetText's sort order
based on values in `key`
:param data_to_sort: Data that has come from `other_target_text` that
is to be sorted based on `key` values from
`self_target_text`
:returns: The `data_to_sort` ordered by the values in `self_target_text`
key `key`
:raises AssertionError: If the number of IDs from the `key` does not
match the number of data_to_sort
'''
self_data_values = []
num_ids = len(other_target_text[key])
num_data = len(data_to_sort)
assert_err = (f'The ID key {key} contains {num_ids} values, however the '
'number of values/data to be added from the other '
f'TargetText is {num_data} which is {data_to_sort} '
f'OtherTargetText {other_target_text}\n'
f'SelfTargetText {self_target_text}')
assert num_ids == num_data, assert_err
for self_id_value in self_target_text[key]:
index_other_id_value = other_target_text[key].index(self_id_value)
self_data_values.append(data_to_sort[index_other_id_value])
return self_data_values
if check_same_ids:
len_self = len(self)
len_other = len(other_collection)
if len_self != len_other:
raise ValueError('The two collections are not the same length. '
f'This length {len_self} other {len_other}')
self_ids = {_id for value in self.values() for _id in value[id_key]}
other_ids = {_id for value in other_collection.values()
for _id in value[id_key]}
self_differences = self_ids.difference(other_ids)
other_differences = other_ids.difference(self_ids)
all_differences = self_differences.union(other_differences)
if len(all_differences):
raise ValueError(f'The two collections do not contain the same'
f' ids. The difference between this and the '
f'other are the following ids {self_differences}'
f'\nThe difference between the other and this '
f'is the following {other_differences}')
# If an error occurs, roll this collection back to its original state
# using the deep copy below.
self_copy = copy.deepcopy(self._storage)
try:
for text_id, self_target_text in self.items():
other_target_text = other_collection[text_id]
# Cannot assume that the unique ids will be in the same order.
for data_key in data_keys:
if data_key in self_target_text and raise_on_overwrite:
raise OverwriteError(f'The following data key {data_key}'
' exists in the following TargetText'
f' {self_target_text} within this collection. '
'The other TargetText that contains '
'this data key to copy the data from '
f'is {other_target_text}')
self_data_values = []
other_data_values = other_target_text[data_key]
# If the other_data_values is a list of a list, need to
# take into account the sorting of the targets should only
# be applied to the inner list.
is_inner_list = False
if isinstance(other_data_values, list):
if other_data_values:
if isinstance(other_data_values[0], list):
is_inner_list = True
if is_inner_list:
for other_inner_list_data in other_data_values:
self_inner_list_data = sort_data_by_key(id_key, self_target_text,
other_target_text,
other_inner_list_data)
self_data_values.append(self_inner_list_data)
else:
self_data_values = sort_data_by_key(id_key, self_target_text,
other_target_text,
other_data_values)
self_target_text[data_key] = self_data_values
except Exception as e:
self._storage = self_copy
raise e
def one_sentiment_text(self, sentiment_key: str,
average_sentiment: bool = False,
text_sentiment_key: str = 'text_sentiment'
) -> None:
'''
Adds the `text_sentiment_key` to each TargetText within the collection
where the value will represent the sentiment value for the text based
on the `sentiment_key` values and `average_sentiment` determining how
to handle multiple sentiments. This will allow text level classifiers
to be trained on target/aspect/category data.
:param sentiment_key: The key in the TargetTexts that represent the
sentiment for the TargetTexts sentence.
:param average_sentiment: If False it will only add the `text_sentiment_key`
to TargetTexts that have one unique sentiment in the
`sentiment_key` e.g. can have more than one sentiment
value in the `sentiment_key` but each one of
those values has to be the same value. If True
it will choose the most frequent sentiment;
ties are decided by random choice. If there
are no values in `sentiment_key` then
`text_sentiment_key` will not be added to
the TargetText.
:param text_sentiment_key: The key to add the text level sentiment value
to.
'''
for target_text in self.values():
target_text: TargetText
target_text._key_error(sentiment_key)
sentiments = target_text[sentiment_key]
if average_sentiment:
if len(sentiments) == 1:
target_text[text_sentiment_key] = sentiments[0]
elif len(sentiments) == 0:
continue
else:
sentiment_counts = Counter(sentiments)
sorted_counts = sorted(sentiment_counts.items(),
key=lambda x: x[1], reverse=True)
highest_count = sorted_counts[0][1]
highest_sentiment_values = []
for sentiment_value, count in sorted_counts:
if count == highest_count:
highest_sentiment_values.append(sentiment_value)
assert highest_sentiment_values
random_sentiment_value = random.choice(highest_sentiment_values)
target_text[text_sentiment_key] = random_sentiment_value
else:
if len(sentiments) == 1:
target_text[text_sentiment_key] = sentiments[0]
elif len(sentiments) > 1 and len(set(sentiments)) == 1:
target_text[text_sentiment_key] = sentiments[0]
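# Hedged sketch: deriving a text level label when every target level
# label agrees (illustrative data).
# >>> sample = TargetText(text='Good camera, good lens', text_id='1',
# ...                     targets=['camera', 'lens'],
# ...                     spans=[Span(5, 11), Span(18, 22)],
# ...                     target_sentiments=['positive', 'positive'])
# >>> collection = TargetTextCollection([sample])
# >>> collection.one_sentiment_text('target_sentiments')
# >>> collection['1']['text_sentiment']
# 'positive'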
@staticmethod
def combine(*collections) -> 'TargetTextCollection':
'''
:param collections: An iterator containing one or more
TargetTextCollections
:returns: A TargetTextCollection that is the combination of all of
those given.
:NOTE: If any of the collections are anonymised then the returned
collection will also be anonymised, even if only one of the
collections has been anonymised.
'''
target_objects: List['TargetText'] = []
is_anonymised = False
for collection in collections:
if collection.anonymised:
is_anonymised = True
for target in collection.values():
target_objects.append(target)
return TargetTextCollection(target_objects, anonymised=is_anonymised)
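# Usage sketch: merging two collections. Samples sharing a `text_id`
# would overwrite each other as storage is keyed by `text_id`.
# >>> first = TargetTextCollection([TargetText(text='a', text_id='1')])
# >>> second = TargetTextCollection([TargetText(text='b', text_id='2')])
# >>> len(TargetTextCollection.combine(first, second))
# 2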
@staticmethod
def same_data(collections: List['TargetTextCollection']
) -> List[Tuple[List[Tuple['TargetText', 'TargetText']], Tuple[str, str]]]:
'''
:param collections: A list of TargetTextCollections to test if there are
any duplicates based on `text_id` and `text` key
values.
:returns: If the list is empty then there are no duplicates. Else a list of
tuples containing 1. A list of tuples of duplicate TargetText instances,
and 2. A tuple of the names of the collections that those
duplicate TargetText instances came from.
'''
all_matches = []
for collection_index, collection in enumerate(collections):
for other_collection in collections[collection_index + 1:]:
same_targets = []
for target_text in collection.values():
for other_target_text in other_collection.values():
if target_text['text_id'] == other_target_text['text_id']:
same_targets.append((target_text, other_target_text))
elif target_text['text'] == other_target_text['text']:
same_targets.append((target_text, other_target_text))
if same_targets:
all_matches.append((same_targets, (collection.name, other_collection.name)))
return all_matches
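# Sketch: two collections sharing a sample are reported together with
# the names of the collections involved (illustrative data).
# >>> shared = TargetText(text='a', text_id='1')
# >>> overlap = TargetTextCollection.same_data(
# ...     [TargetTextCollection([shared], name='first'),
# ...      TargetTextCollection([shared], name='second')])
# >>> overlap[0][1]
# ('first', 'second')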
def __setitem__(self, key: str, value: 'TargetText') -> None:
'''
Will add the TargetText instance to the collection where the key
should be the same as the TargetText instance 'text_id'.
:param key: Key to be added or changed
:param value: TargetText instance associated to this key. Where the
key should be the same value as the TargetText instance
'text_id' value. Furthermore if the TargetTextCollection's
`anonymised` attribute is True then the TargetText object
being added will also be anonymised.
'''
if not isinstance(value, TargetText):
raise TypeError('The value should be of type TargetText and not '
f'{type(value)}')
text_id = value['text_id']
if text_id != key:
raise ValueError(f'The value `text_id`: {text_id} should be the '
f'same value as the key: {key}')
# We copy it to stop any mutable objects from changing outside of the
# collection
value_copy = copy.deepcopy(value)
value_copy.anonymised = self.anonymised
self._storage[key] = value_copy
def __delitem__(self, key: str) -> None:
'''
Given a key that matches a key within self._storage or self.keys()
it will delete that key and value from this object.
:param key: Key and its respective value to delete from this object.
'''
del self._storage[key]
def __eq__(self, other: 'TargetTextCollection') -> bool:
'''
Two TargetTextCollection instances are equal if they both have
the same TargetText instances within it.
:param other: Another TargetTextCollection object that is being
compared to this TargetTextCollection object.
:returns: True if they have the same TargetText instances within it.
'''
if not isinstance(other, TargetTextCollection):
return False
if len(self) != len(other):
return False
for key in self.keys():
if key not in other:
return False
return True
def __repr__(self) -> str:
'''
:returns: The String the user sees when the instance is
printed or displayed within an interpreter.
'''
rep_text = 'TargetTextCollection('
for key, value in self.items():
rep_text += f'key: {key}, value: {value}'
break
if len(self) > 1:
rep_text += '...)'
else:
rep_text += ')'
return rep_text
def __len__(self) -> int:
'''
:returns: The number of TargetText instances in the collection.
'''
return len(self._storage)
def __iter__(self) -> Iterable[str]:
'''
Returns an iterator over the TargetText instances 'text_id''s that
are stored in this collection. This is an ordered iterator as the
underlying dictionary used to store the TargetText instances is an
OrderedDict in self._storage.
:returns: TargetText instances 'text_id''s that are stored in this
collection
'''
return iter(self._storage)
def __getitem__(self, key: str) -> 'TargetText':
'''
:returns: A TargetText instance that is stored within this collection.
'''
return self._storage[key]