import logging
import json
from typing import Dict, Any, Optional, List, Union, NamedTuple
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from allennlp.data.fields import TextField, ListField, MetadataField, Field
from allennlp.data.fields import SequenceLabelField, ArrayField
from overrides import overrides
import numpy as np
from target_extraction.data_types import TargetText
from target_extraction.data_types_util import Span
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("target_sentiment")
class TargetSentimentDatasetReader(DatasetReader):
'''
Dataset reader designed to read a list of JSON like objects of the
following type:
{`text`: `This Camera lens is great`,
`targets`: [`Camera`],
`target_sentiments`: [`positive`]}
or
{`text`: `This Camera lens is great`,
`categories`: [`CAMERA`],
`category_sentiments`: [`positive`]}
or
{`text`: `This Camera lens is great`,
     `targets`: [`Camera`],
`categories`: [`CAMERA`],
`target_sentiments`: [`positive`]}
or
{`text`: `This Camera lens is great`,
`targets`: [`Camera`],
`target_sentiments`: [`positive`],
`spans`: [[5,11]]}
This type of JSON can be created from exporting a
`target_extraction.data_types.TargetTextCollection` using the
`to_json_file` method.
    The difference between the four objects depends on the objective of the
    model being trained:
    1. The first version is for a purely Target based sentiment classifier.
    2. The second version is for a purely Aspect or latent based sentiment
       classifier.
    3. The third version is for when you want to make use of the relationship
       between the Target and Aspect in the sentiment classifier.
    4. The fourth version is for when the Target based sentiment classifier
       requires knowledge of where the target is.
    A minimal sketch of writing this JSON lines format is shown below.
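    :Example of writing the JSON lines format: A minimal sketch, not the
        library's own export code, that writes one JSON object per line using
        only the standard library; the file name `train.json` is purely
        illustrative and `TargetTextCollection.to_json_file` produces the
        same format.
        >>> import json
        >>> example = {'text': 'This Camera lens is great',
        ...            'targets': ['Camera'],
        ...            'target_sentiments': ['positive'],
        ...            'spans': [[5, 11]]}
        >>> with open('train.json', 'w') as json_file:
        ...     print(json.dumps(example), file=json_file)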
:param lazy: Whether or not instances can be read lazily.
:param token_indexers: We use this to define the input representation
for the text. See
:class:`allennlp.data.token_indexers.TokenIndexer`.
:param tokenizer: Tokenizer to use to split the sentence text as well as
the text of the target.
    :param left_right_contexts: If True, the instance returned by
                                 `text_to_instance` will also contain the
                                 sentence context to the left and right of
                                 the target.
    :param reverse_right_context: If True this will reverse the text that is
                                  in the right context. NOTE: `left_right_contexts`
                                  has to be True.
    :param incl_target: If `left_right_contexts` is True and this is also True,
                        the left and right contexts will include the target
                        word(s) as well.
    :param use_categories: Whether or not to return the categories in the
                           instances when they occur in the dataset.
                           This is a temporary solution to the following
                           `issue <https://github.com/apmoore1/target-extraction/issues/5>`_.
                           The number of categories does not have to match the
                           number of targets; there just has to be at least one
                           category per sentence.
    :param target_sequences: Whether or not to generate `target_sequences`,
                             which are a sequence of masks, one per target,
                             for all target texts. This allows the model to
                             know which tokens in the context relate to each
                             target. An example of this is shown below (for
                             this to work the `span` of each target is
                             required).
    :param position_embeddings: Whether or not to create distance values
                                that can be converted to embeddings, similar
                                to `position_weights`, but instead of the
                                model later using them as weights it uses
                                the distances to learn position embeddings
                                (for this to work the `span` of each target
                                is required). See `A Position-aware
                                Bidirectional Attention Network for
                                Aspect-level Sentiment Analysis
                                <https://www.aclweb.org/anthology/C18-1066.pdf>`_
    :param position_weights: If True the instances will contain an extra key,
                             `position_weights`, which will be an array of
                             integers representing the linear distance between
                             each token and its target, e.g. if the text
                             contains two targets and the target tokens are
                             marked with 1's as [[0,0,0,1], [1,1,0,0]], then
                             the `position_weights` will be
                             [[4,3,2,1], [1,1,2,3]] (for this to work the
                             `span` of each target is required). An example
                             of position weighting is in section 3.3 of
                             `Modeling Sentiment Dependencies with Graph
                             Convolutional Networks for Aspect-level Sentiment
                             Classification <https://arxiv.org/pdf/1906.04501.pdf>`_
    :param max_position_distance: The maximum position distance given to a
                                  token from the target, e.g. for
                                  [0,0,0,0,0,1,0,0], where each value
                                  represents a token and 1's represent target
                                  tokens, the distance array would be
                                  [6,5,4,3,2,1,2,3]; if `max_position_distance`
                                  is 5 then the distance array will be
                                  [5,5,4,3,2,1,2,3] (for this to work either
                                  `position_embeddings` or `position_weights`
                                  has to be True).
    :raises ValueError: If `left_right_contexts` is not True while either the
                        `incl_target` or `reverse_right_context` arguments are
                        True.
    :raises ValueError: If `left_right_contexts` and `target_sequences` are
                        True at the same time.
    :raises ValueError: If `max_position_distance`, when set, is less than 2.
:raises ValueError: If `max_position_distance` is set but neither
`position_embeddings` nor `position_weights` are
`True`.
:Example of target_sequences: {`text`: `This Camera lens is great but the
screen is rubbish`,
`targets`: [`Camera`, `screen`],
`target_sentiments`: [`positive`, `negative`],
`target_sequences`: [[0,1,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,1,0,0]],
                                   `spans`: [[5,11], [34,40]]}
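    :Example of usage: A minimal sketch of reading a JSON lines file with
        this reader; the file path `data/train.json` is purely illustrative
        and `read` is the inherited AllenNLP `DatasetReader` method that
        calls `_read` and then `text_to_instance` for each line.
        >>> reader = TargetSentimentDatasetReader()
        >>> instances = list(reader.read('data/train.json'))
        >>> # each instance contains `tokens`, `targets`, `target_sentiments`
        >>> # (when labelled) and `metadata` fields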
'''
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
tokenizer: Tokenizer = None,
left_right_contexts: bool = False,
reverse_right_context: bool = False,
incl_target: bool = False,
use_categories: bool = False,
target_sequences: bool = False,
position_embeddings: bool = False,
position_weights: bool = False,
max_position_distance: Optional[int] = None,
**kwargs) -> None:
super().__init__(**kwargs)
self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or \
{"tokens": SingleIdTokenIndexer()}
if incl_target and not left_right_contexts:
raise ValueError('If `incl_target` is True then `left_right_contexts`'
' argument also has to be True')
if reverse_right_context and not left_right_contexts:
raise ValueError('If `reverse_right_context` is True then '
'`left_right_contexts` argument also has to be True')
self._incl_target = incl_target
self._reverse_right_context = reverse_right_context
self._left_right_contexts = left_right_contexts
self._use_categories = use_categories
self._target_sequences = target_sequences
if self._left_right_contexts and self._target_sequences:
raise ValueError('Cannot have both `left_right_contexts` and '
'`target_sequences` True at the same time either'
' one or the other or None.')
if (not position_embeddings and not position_weights and
max_position_distance is not None):
raise ValueError('`max_position_distance` contains a value '
                             f'{max_position_distance} when neither `position'
'_embeddings` nor `position_weights` are True')
self._position_embeddings = position_embeddings
if position_embeddings:
            # the `position_tokens` namespace forces the position distances to
            # have their own vocabulary within self.vocab
self._position_indexers = {"position_tokens":
SingleIdTokenIndexer(namespace="position_tokens")}
self._position_weights = position_weights
self._max_position_distance = max_position_distance
@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, 'r') as te_file:
logger.info("Reading Target Sentiment instances from jsonl "
"dataset at: %s", file_path)
for line in te_file:
example = json.loads(line)
example_instance: Dict[str, Any] = {}
example_instance["text"] = example["text"]
if 'target_sentiments' in example and 'targets' in example:
example_instance['targets'] = example['targets']
example_instance['target_sentiments'] = example['target_sentiments']
if 'categories' in example:
example_instance['categories'] = example['categories']
if 'category_sentiments' in example:
example_instance['category_sentiments'] = example['category_sentiments']
if 'spans' in example:
example_instance['spans'] = example['spans']
yield self.text_to_instance(**example_instance)
def _add_context_field(self, sentence_contexts: List[str]) -> ListField:
context_fields = []
for context in sentence_contexts:
tokens = self._tokenizer.tokenize(context)
context_field = TextField(tokens, self._token_indexers)
context_fields.append(context_field)
return ListField(context_fields)
@staticmethod
def _target_indicators_to_distances(target_indicators: List[List[int]],
max_distance: Optional[int] = None,
as_string: bool = False
) -> List[List[Union[int,str]]]:
'''
        :param target_indicators: For a text, the outer list represents the
                                   number of targets in the sentence and each
                                   inner list contains 0's for non-target
                                   tokens and 1's for the tokens of one
                                   (potentially multi word) target in that
                                   text, e.g. [[0,0,1,1,0], [1,0,0,0,0]] would
                                   mean the text has two targets where the
                                   first is a multi word target and the second
                                   is a single word target.
:param max_distance: The maximum distance that can be given.
        :param as_string: Whether the integers should become string values.
                          Required if you want to use these as position
                          embeddings.
        :returns: A list of lists where the outer list represents the number
                  of targets in the text and each inner list gives the
                  distance of every token to that target, e.g. using the
                  example in `target_indicators` the return would be
                  [[3,2,1,1,2], [1,2,3,4,5]]
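        :Example: A small illustrative call using the indicator lists from
            above; with `max_distance=3` any distance above 3 is clipped.
            >>> to_distances = TargetSentimentDatasetReader._target_indicators_to_distances
            >>> to_distances([[0,0,1,1,0], [1,0,0,0,0]])
            [[3, 2, 1, 1, 2], [1, 2, 3, 4, 5]]
            >>> to_distances([[0,0,1,1,0], [1,0,0,0,0]], max_distance=3)
            [[3, 2, 1, 1, 2], [1, 2, 3, 3, 3]]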
'''
if max_distance is not None:
if max_distance < 2:
distance_error = ('Max distance has to be greater than 1. '
f'Currently max distance is {max_distance}')
raise ValueError(distance_error)
target_indicator_distances: List[List[int]] = []
for target_indicator_list in target_indicators:
target_indicator_distance: List[int] = []
first_one = target_indicator_list.index(1)
# tokens up to the target
            if first_one != 0:
                for distance in reversed(range(first_one)):
distance = distance + 2
if max_distance is not None:
if distance > max_distance:
distance = max_distance
target_indicator_distance.append(distance)
# https://stackoverflow.com/questions/522372/finding-first-and-last-index-of-some-value-in-a-list-in-python
last_one = len(target_indicator_list) - 1 - target_indicator_list[::-1].index(1)
length_of_target = (last_one - first_one) + 1
# tokens in the target
for _ in range(length_of_target):
target_indicator_distance.append(1)
# tokens after the target
number_tokens_left = (len(target_indicator_list) - last_one) - 1
for distance in range(number_tokens_left):
distance = distance + 2
if max_distance is not None:
if distance > max_distance:
distance = max_distance
target_indicator_distance.append(distance)
assert len(target_indicator_list) == len(target_indicator_distance)
# to string
if as_string:
target_indicator_distance = [str(distance) for distance in target_indicator_distance]
target_indicator_distances.append(target_indicator_distance)
return target_indicator_distances
    def text_to_instance(self, text: str,
targets: Optional[List[str]] = None,
target_sentiments: Optional[List[Union[str, int]]] = None,
spans: Optional[List[List[int]]] = None,
categories: Optional[List[str]] = None,
category_sentiments: Optional[List[Union[str, int]]] = None,
**kwargs) -> Instance:
'''
The original text, text tokens as well as the targets and target
tokens are stored in the MetadataField.
        :NOTE: At least one of `targets` or `categories` must be present.
        :NOTE: The left and right contexts returned in the instance are a
               List of a List of tokens, one list per target.
:param text: The text that contains the target(s) and/or categories.
:param targets: The targets that are within the text
:param target_sentiments: The sentiment of the targets. To be used if
training the classifier
:param spans: The spans that represent the character offsets for each
of the targets given in the targets list.
:param categories: The categories that are within the text
:param category_sentiments: The sentiment of the categories
:returns: An Instance object with all of the above encoded for a
PyTorch model.
        :raises ValueError: If both targets and categories are None.
        :raises ValueError: If `self._target_sequences`, `self._position_embeddings`,
                            or `self._position_weights` is True and the passed
                            `spans` argument is None.
:raises ValueError: If `self._left_right_contexts` is True and the
passed `spans` argument is None.
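        :Example: A minimal sketch of a direct call (normally `_read` calls
            this method for every JSON line); the values mirror the first
            example in the class docstring.
            >>> reader = TargetSentimentDatasetReader()
            >>> instance = reader.text_to_instance(
            ...     text='This Camera lens is great', targets=['Camera'],
            ...     target_sentiments=['positive'])
            >>> sorted(instance.fields.keys())
            ['metadata', 'target_sentiments', 'targets', 'tokens']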
'''
if targets is None and categories is None:
raise ValueError('Either targets or categories must be given if you '
                             'want to predict the sentiment of a target '
'or a category')
instance_fields: Dict[str, Field] = {}
# Metadata field
metadata_dict = {}
if targets is not None:
# need to change this so that it takes into account the case where
# the positions are True but not the target sequences.
if self._target_sequences or self._position_embeddings or self._position_weights:
if spans is None:
raise ValueError('To create target sequences requires `spans`')
spans = [Span(span[0], span[1]) for span in spans]
target_text_object = TargetText(text=text, spans=spans,
targets=targets, text_id='anything')
target_text_object.force_targets()
text = target_text_object['text']
allen_tokens = self._tokenizer.tokenize(text)
tokens = [x.text for x in allen_tokens]
target_text_object['tokenized_text'] = tokens
target_text_object.sequence_labels(per_target=True)
target_sequences = target_text_object['sequence_labels']
# Need to add the target sequences to the instances
in_label = {'B', 'I'}
number_targets = len(targets)
all_target_tokens: List[List[Token]] = [[] for _ in range(number_targets)]
target_sequence_fields = []
target_indicators: List[List[int]] = []
for target_index in range(number_targets):
one_values = []
target_ones = [0] * len(allen_tokens)
for token_index, token in enumerate(allen_tokens):
target_sequence_value = target_sequences[target_index][token_index]
in_target = 1 if target_sequence_value in in_label else 0
if in_target:
all_target_tokens[target_index].append(allen_tokens[token_index])
one_value_list = [0] * len(allen_tokens)
one_value_list[token_index] = 1
one_values.append(one_value_list)
target_ones[token_index] = 1
one_values = np.array(one_values)
target_sequence_fields.append(ArrayField(one_values, dtype=np.int32))
target_indicators.append(target_ones)
if self._position_embeddings:
target_distances = self._target_indicators_to_distances(target_indicators,
max_distance=self._max_position_distance,
as_string=True)
target_text_distances = []
for target_distance in target_distances:
token_distances = [Token(distance) for distance in target_distance]
token_distances = TextField(token_distances, self._position_indexers)
target_text_distances.append(token_distances)
instance_fields['position_embeddings'] = ListField(target_text_distances)
if self._position_weights:
target_distances = self._target_indicators_to_distances(target_indicators,
max_distance=self._max_position_distance,
as_string=False)
target_distances = np.array(target_distances)
instance_fields['position_weights'] = ArrayField(target_distances,
dtype=np.int32)
if self._target_sequences:
instance_fields['target_sequences'] = ListField(target_sequence_fields)
instance_fields['tokens'] = TextField(allen_tokens, self._token_indexers)
metadata_dict['text words'] = tokens
metadata_dict['text'] = text
# update target variable as the targets could have changed due
# to the force_targets function
targets = target_text_object['targets']
else:
all_target_tokens = [self._tokenizer.tokenize(target)
for target in targets]
target_fields = [TextField(target_tokens, self._token_indexers)
for target_tokens in all_target_tokens]
target_fields = ListField(target_fields)
instance_fields['targets'] = target_fields
# Add the targets and the tokenised targets to the metadata
metadata_dict['targets'] = [target for target in targets]
metadata_dict['target words'] = [[x.text for x in target_tokens]
for target_tokens in all_target_tokens]
# Target sentiment if it exists
if target_sentiments is not None:
target_sentiments_field = SequenceLabelField(target_sentiments,
target_fields,
label_namespace='target-sentiment-labels')
instance_fields['target_sentiments'] = target_sentiments_field
if categories is not None and self._use_categories:
category_fields = TextField([Token(category) for category in categories],
self._token_indexers)
instance_fields['categories'] = category_fields
# Category sentiment if it exists
if category_sentiments is not None:
category_sentiments_field = SequenceLabelField(category_sentiments,
category_fields,
label_namespace='category-sentiment-labels')
instance_fields['category_sentiments'] = category_sentiments_field
# Add the categories to the metadata
metadata_dict['categories'] = [category for category in categories]
if 'tokens' not in instance_fields:
tokens = self._tokenizer.tokenize(text)
instance_fields['tokens'] = TextField(tokens, self._token_indexers)
metadata_dict['text'] = text
metadata_dict['text words'] = [x.text for x in tokens]
# If required processes the left and right contexts
left_contexts = None
right_contexts = None
if self._left_right_contexts:
if spans is None:
raise ValueError('To create left, right, target contexts requires'
' the `spans` of the targets which is None')
spans = [Span(span[0], span[1]) for span in spans]
target_text_object = TargetText(text=text, spans=spans,
targets=targets, text_id='anything')
            # left, right, and target contexts for each target in the text
left_right_targets = target_text_object.left_right_target_contexts(incl_target=self._incl_target)
left_contexts: List[str] = []
right_contexts: List[str] = []
for left_right_target in left_right_targets:
left, right, _ = left_right_target
left_contexts.append(left)
if self._reverse_right_context:
right_tokens = self._tokenizer.tokenize(right)
reversed_right_tokens = []
for token in reversed(right_tokens):
reversed_right_tokens.append(token.text)
right = ' '.join(reversed_right_tokens)
right_contexts.append(right)
if left_contexts is not None:
left_field = self._add_context_field(left_contexts)
instance_fields["left_contexts"] = left_field
if right_contexts is not None:
right_field = self._add_context_field(right_contexts)
instance_fields["right_contexts"] = right_field
instance_fields["metadata"] = MetadataField(metadata_dict)
return Instance(instance_fields)