Source code for target_extraction.allen.dataset_readers.target_sentiment

import logging
import json
from typing import Dict, Any, Optional, List, Union, NamedTuple

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from allennlp.data.fields import TextField, ListField, MetadataField, Field
from allennlp.data.fields import SequenceLabelField, ArrayField
from overrides import overrides
import numpy as np

from target_extraction.data_types import TargetText
from target_extraction.data_types_util import Span

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

@DatasetReader.register("target_sentiment")
class TargetSentimentDatasetReader(DatasetReader):
    '''
    Dataset reader designed to read a list of JSON like objects of the
    following type:

    {`text`: `This Camera lens is great`, `targets`: [`Camera`],
     `target_sentiments`: [`positive`]}

    or

    {`text`: `This Camera lens is great`, `categories`: [`CAMERA`],
     `category_sentiments`: [`positive`]}

    or

    {`text`: `This Camera lens is great`, `targets`: [`Camera`],
     `categories`: [`CAMERA`], `target_sentiments`: [`positive`]}

    or

    {`text`: `This Camera lens is great`, `targets`: [`Camera`],
     `target_sentiments`: [`positive`], `spans`: [[5,11]]}

    This type of JSON can be created by exporting a
    `target_extraction.data_types.TargetTextCollection` using the
    `to_json_file` method.

    The difference between the four objects depends on the objective of the
    model being trained:

    1. The first version is for a purely Target based sentiment classifier.
    2. The second version is for a purely Aspect or latent based sentiment
       classifier.
    3. The third version is if you want to make use of the relationship
       between the Target and Aspect in the sentiment classifier.
    4. The fourth version is if the Target based sentiment classifier
       requires knowledge of where the target is in the text.

    :param lazy: Whether or not instances can be read lazily.
    :param token_indexers: We use this to define the input representation for
                           the text. See
                           :class:`allennlp.data.token_indexers.TokenIndexer`.
    :param tokenizer: Tokenizer to use to split the sentence text as well as
                      the text of the target.
    :param left_right_contexts: If True it will return within the instance
                                for `text_to_instance` the sentence context
                                left and right of the target.
    :param reverse_right_context: If True this will reverse the text that is
                                  in the right context. NOTE:
                                  `left_right_contexts` has to be True.
    :param incl_target: If `left_right_contexts` is True and this is also
                        True, the left and right contexts will include the
                        target word(s) as well.
    :param use_categories: Whether or not to include the categories in the
                           instances when they occur in the dataset. This is
                           a temporary solution to the following
                           `issue <https://github.com/apmoore1/target-extraction/issues/5>`_.
                           The number of categories does not have to match
                           the number of targets, there just has to be at
                           least one category per sentence.
    :param target_sequences: Whether or not to generate `target_sequences`,
                             which are a sequence of masks per target for all
                             target texts. This allows the model to know
                             which tokens in the context relate to the
                             target. An example of this is shown below (for
                             this to work the `span` of each target is
                             required).
    :param position_embeddings: Whether or not to create distance values that
                                can be converted to embeddings, similar to
                                the `position_weights`, but instead of the
                                model later using them as weights it uses the
                                distances to learn position embeddings (for
                                this to work the `span` of each target is
                                required). See `A Position-aware
                                Bidirectional Attention Network for
                                Aspect-level Sentiment Analysis
                                <https://www.aclweb.org/anthology/C18-1066.pdf>`_.
    :param position_weights: In the instances there will be an extra key
                             `position_weights` which will be an array of
                             integers representing the linear distance
                             between each token and its target e.g. if the
                             text contains two targets, where each token is
                             represented by a number and 1's represent target
                             tokens, [[0,0,0,1], [1,1,0,0]], then the
                             `position_weights` will be
                             [[4,3,2,1], [1,1,2,3]] (for this to work the
                             `span` of each target is required). An example
                             of position weighting is in section 3.3 of
                             `Modeling Sentiment Dependencies with Graph
                             Convolutional Networks for Aspect-level
                             Sentiment Classification
                             <https://arxiv.org/pdf/1906.04501.pdf>`_.
    :param max_position_distance: The maximum position distance given to a
                                  token from the target e.g. for
                                  [0,0,0,0,0,1,0,0], where each value
                                  represents a token and 1's represent target
                                  tokens, the distance array would be
                                  [6,5,4,3,2,1,2,3]; if
                                  `max_position_distance` is 5 then the
                                  distance array will be [5,5,4,3,2,1,2,3]
                                  (for this to work either
                                  `position_embeddings` or `position_weights`
                                  has to be True).
    :raises ValueError: If `left_right_contexts` is not True while either the
                        `incl_target` or `reverse_right_context` arguments
                        are True.
    :raises ValueError: If `left_right_contexts` and `target_sequences` are
                        True at the same time.
    :raises ValueError: If `max_position_distance`, when set, is less than 2.
    :raises ValueError: If `max_position_distance` is set but neither
                        `position_embeddings` nor `position_weights` are
                        True.

    :Example of target_sequences:

    {`text`: `This Camera lens is great but the screen is rubbish`,
     `targets`: [`Camera`, `screen`],
     `target_sentiments`: [`positive`, `negative`],
     `target_sequences`: [[0,1,0,0,0,0,0,0,0,0], [0,0,0,0,0,0,0,1,0,0]],
     `spans`: [[5,11], [34,40]]}
    '''
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 left_right_contexts: bool = False,
                 reverse_right_context: bool = False,
                 incl_target: bool = False,
                 use_categories: bool = False,
                 target_sequences: bool = False,
                 position_embeddings: bool = False,
                 position_weights: bool = False,
                 max_position_distance: Optional[int] = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or \
            {"tokens": SingleIdTokenIndexer()}
        if incl_target and not left_right_contexts:
            raise ValueError('If `incl_target` is True then `left_right_contexts`'
                             ' argument also has to be True')
        if reverse_right_context and not left_right_contexts:
            raise ValueError('If `reverse_right_context` is True then '
                             '`left_right_contexts` argument also has to be True')
        self._incl_target = incl_target
        self._reverse_right_context = reverse_right_context
        self._left_right_contexts = left_right_contexts
        self._use_categories = use_categories
        self._target_sequences = target_sequences
        if self._left_right_contexts and self._target_sequences:
            raise ValueError('Cannot have both `left_right_contexts` and '
                             '`target_sequences` True at the same time, either'
                             ' one or the other or neither.')
        if (not position_embeddings and not position_weights and
                max_position_distance is not None):
            raise ValueError('`max_position_distance` contains a value '
                             f'{max_position_distance} when neither `position'
                             '_embeddings` nor `position_weights` are True')
        self._position_embeddings = position_embeddings
        if position_embeddings:
            # position_tokens in the namespace forces it to have a different
            # vocab in the self.vocab
            self._position_indexers = {"position_tokens": SingleIdTokenIndexer(namespace="position_tokens")}
        self._position_weights = position_weights
        self._max_position_distance = max_position_distance

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as te_file:
            logger.info("Reading Target Sentiment instances from jsonl "
                        "dataset at: %s", file_path)
            for line in te_file:
                example = json.loads(line)
                example_instance: Dict[str, Any] = {}

                example_instance["text"] = example["text"]
                if 'target_sentiments' in example and 'targets' in example:
                    example_instance['targets'] = example['targets']
                    example_instance['target_sentiments'] = example['target_sentiments']
                if 'categories' in example:
                    example_instance['categories'] = example['categories']
                if 'category_sentiments' in example:
                    example_instance['category_sentiments'] = example['category_sentiments']
                if 'spans' in example:
                    example_instance['spans'] = example['spans']
                yield self.text_to_instance(**example_instance)

    def _add_context_field(self, sentence_contexts: List[str]) -> ListField:
        context_fields = []
        for context in sentence_contexts:
            tokens = self._tokenizer.tokenize(context)
            context_field = TextField(tokens, self._token_indexers)
            context_fields.append(context_field)
        return ListField(context_fields)

    @staticmethod
    def _target_indicators_to_distances(target_indicators: List[List[int]],
                                        max_distance: Optional[int] = None,
                                        as_string: bool = False
                                        ) -> List[List[Union[int, str]]]:
        '''
        :param target_indicators: For a text the outer list represents the
                                  number of targets in the sentence and each
                                  inner list contains 0's representing
                                  non-target tokens and 1's representing the
                                  tokens of one potential multi word target
                                  in that text. e.g. [[0,0,1,1,0], [1,0,0,0,0]]
                                  would mean the text has two targets where
                                  the first is a multi word target and the
                                  second is a single word target.
        :param max_distance: The maximum distance that can be given.
        :param as_string: Whether the integers should become string values.
                          Required if you want to use these as position
                          embeddings.
        :returns: A list of lists where the outer list represents the number
                  of targets in the text and each inner list the distance of
                  the tokens to that target e.g. using the example in
                  `target_indicators` the return would be
                  [[3,2,1,1,2], [1,2,3,4,5]]
        '''
        if max_distance is not None:
            if max_distance < 2:
                distance_error = ('Max distance has to be greater than 1. '
                                  f'Currently max distance is {max_distance}')
                raise ValueError(distance_error)
        target_indicator_distances: List[List[int]] = []
        for target_indicator_list in target_indicators:
            target_indicator_distance: List[int] = []
            first_one = target_indicator_list.index(1)
            # tokens up to the target
            if first_one == 0:
                pass
            else:
                for distance in reversed(range(first_one)):
                    distance = distance + 2
                    if max_distance is not None:
                        if distance > max_distance:
                            distance = max_distance
                    target_indicator_distance.append(distance)
            # https://stackoverflow.com/questions/522372/finding-first-and-last-index-of-some-value-in-a-list-in-python
            last_one = len(target_indicator_list) - 1 - target_indicator_list[::-1].index(1)
            length_of_target = (last_one - first_one) + 1
            # tokens in the target
            for _ in range(length_of_target):
                target_indicator_distance.append(1)
            # tokens after the target
            number_tokens_left = (len(target_indicator_list) - last_one) - 1
            for distance in range(number_tokens_left):
                distance = distance + 2
                if max_distance is not None:
                    if distance > max_distance:
                        distance = max_distance
                target_indicator_distance.append(distance)
            assert len(target_indicator_list) == len(target_indicator_distance)
            # to string
            if as_string:
                target_indicator_distance = [str(distance) for distance in
                                             target_indicator_distance]
            target_indicator_distances.append(target_indicator_distance)
        return target_indicator_distances
    def text_to_instance(self, text: str,
                         targets: Optional[List[str]] = None,
                         target_sentiments: Optional[List[Union[str, int]]] = None,
                         spans: Optional[List[List[int]]] = None,
                         categories: Optional[List[str]] = None,
                         category_sentiments: Optional[List[Union[str, int]]] = None,
                         **kwargs) -> Instance:
        '''
        The original text, text tokens as well as the targets and target
        tokens are stored in the MetadataField.

        :NOTE: At least targets and/or categories must be present.
        :NOTE: The left and right contexts returned in the instance are a
               List of a List of tokens, one list for each Target.

        :param text: The text that contains the target(s) and/or categories.
        :param targets: The targets that are within the text
        :param target_sentiments: The sentiment of the targets. To be used if
                                  training the classifier
        :param spans: The spans that represent the character offsets for each
                      of the targets given in the targets list.
        :param categories: The categories that are within the text
        :param category_sentiments: The sentiment of the categories
        :returns: An Instance object with all of the above encoded for a
                  PyTorch model.
        :raises ValueError: If targets and categories are both None
        :raises ValueError: If `self._target_sequences` is True and the
                            passed `spans` argument is None.
        :raises ValueError: If `self._left_right_contexts` is True and the
                            passed `spans` argument is None.
        '''
        if targets is None and categories is None:
            raise ValueError('Either targets or categories must be given if you '
                             'want to predict the sentiment of a target '
                             'or a category')

        instance_fields: Dict[str, Field] = {}

        # Metadata field
        metadata_dict = {}

        if targets is not None:
            # need to change this so that it takes into account the case where
            # the positions are True but not the target sequences.
            if self._target_sequences or self._position_embeddings or self._position_weights:
                if spans is None:
                    raise ValueError('To create target sequences requires `spans`')
                spans = [Span(span[0], span[1]) for span in spans]
                target_text_object = TargetText(text=text, spans=spans,
                                                targets=targets, text_id='anything')
                target_text_object.force_targets()
                text = target_text_object['text']
                allen_tokens = self._tokenizer.tokenize(text)
                tokens = [x.text for x in allen_tokens]
                target_text_object['tokenized_text'] = tokens
                target_text_object.sequence_labels(per_target=True)
                target_sequences = target_text_object['sequence_labels']
                # Need to add the target sequences to the instances
                in_label = {'B', 'I'}
                number_targets = len(targets)
                all_target_tokens: List[List[Token]] = [[] for _ in range(number_targets)]
                target_sequence_fields = []
                target_indicators: List[List[int]] = []
                for target_index in range(number_targets):
                    one_values = []
                    target_ones = [0] * len(allen_tokens)
                    for token_index, token in enumerate(allen_tokens):
                        target_sequence_value = target_sequences[target_index][token_index]
                        in_target = 1 if target_sequence_value in in_label else 0
                        if in_target:
                            all_target_tokens[target_index].append(allen_tokens[token_index])
                            one_value_list = [0] * len(allen_tokens)
                            one_value_list[token_index] = 1
                            one_values.append(one_value_list)
                            target_ones[token_index] = 1
                    one_values = np.array(one_values)
                    target_sequence_fields.append(ArrayField(one_values, dtype=np.int32))
                    target_indicators.append(target_ones)
                if self._position_embeddings:
                    target_distances = self._target_indicators_to_distances(target_indicators,
                                                                            max_distance=self._max_position_distance,
                                                                            as_string=True)
                    target_text_distances = []
                    for target_distance in target_distances:
                        token_distances = [Token(distance) for distance in target_distance]
                        token_distances = TextField(token_distances, self._position_indexers)
                        target_text_distances.append(token_distances)
                    instance_fields['position_embeddings'] = ListField(target_text_distances)
                if self._position_weights:
                    target_distances = self._target_indicators_to_distances(target_indicators,
                                                                            max_distance=self._max_position_distance,
                                                                            as_string=False)
                    target_distances = np.array(target_distances)
                    instance_fields['position_weights'] = ArrayField(target_distances,
                                                                     dtype=np.int32)
                if self._target_sequences:
                    instance_fields['target_sequences'] = ListField(target_sequence_fields)
                instance_fields['tokens'] = TextField(allen_tokens, self._token_indexers)
                metadata_dict['text words'] = tokens
                metadata_dict['text'] = text
                # update target variable as the targets could have changed due
                # to the force_targets function
                targets = target_text_object['targets']
            else:
                all_target_tokens = [self._tokenizer.tokenize(target)
                                     for target in targets]
            target_fields = [TextField(target_tokens, self._token_indexers)
                             for target_tokens in all_target_tokens]
            target_fields = ListField(target_fields)
            instance_fields['targets'] = target_fields
            # Add the targets and the tokenised targets to the metadata
            metadata_dict['targets'] = [target for target in targets]
            metadata_dict['target words'] = [[x.text for x in target_tokens]
                                             for target_tokens in all_target_tokens]

            # Target sentiment if it exists
            if target_sentiments is not None:
                target_sentiments_field = SequenceLabelField(target_sentiments,
                                                             target_fields,
                                                             label_namespace='target-sentiment-labels')
                instance_fields['target_sentiments'] = target_sentiments_field

        if categories is not None and self._use_categories:
            category_fields = TextField([Token(category) for category in categories],
                                        self._token_indexers)
            instance_fields['categories'] = category_fields
            # Category sentiment if it exists
            if category_sentiments is not None:
                category_sentiments_field = SequenceLabelField(category_sentiments,
                                                               category_fields,
                                                               label_namespace='category-sentiment-labels')
                instance_fields['category_sentiments'] = category_sentiments_field
            # Add the categories to the metadata
            metadata_dict['categories'] = [category for category in categories]

        if 'tokens' not in instance_fields:
            tokens = self._tokenizer.tokenize(text)
            instance_fields['tokens'] = TextField(tokens, self._token_indexers)
            metadata_dict['text'] = text
            metadata_dict['text words'] = [x.text for x in tokens]

        # If required, process the left and right contexts
        left_contexts = None
        right_contexts = None
        if self._left_right_contexts:
            if spans is None:
                raise ValueError('To create left, right, target contexts requires'
                                 ' the `spans` of the targets which is None')
            spans = [Span(span[0], span[1]) for span in spans]
            target_text_object = TargetText(text=text, spans=spans,
                                            targets=targets, text_id='anything')
            # left, right, and target contexts for each target in the text
            left_right_targets = target_text_object.left_right_target_contexts(incl_target=self._incl_target)
            left_contexts: List[str] = []
            right_contexts: List[str] = []
            for left_right_target in left_right_targets:
                left, right, _ = left_right_target
                left_contexts.append(left)
                if self._reverse_right_context:
                    right_tokens = self._tokenizer.tokenize(right)
                    reversed_right_tokens = []
                    for token in reversed(right_tokens):
                        reversed_right_tokens.append(token.text)
                    right = ' '.join(reversed_right_tokens)
                right_contexts.append(right)

        if left_contexts is not None:
            left_field = self._add_context_field(left_contexts)
            instance_fields["left_contexts"] = left_field
        if right_contexts is not None:
            right_field = self._add_context_field(right_contexts)
            instance_fields["right_contexts"] = right_field

        instance_fields["metadata"] = MetadataField(metadata_dict)
        return Instance(instance_fields)
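

if __name__ == '__main__':
    # Illustrative usage sketch, not part of the original module: one way the
    # reader above could be exercised directly. The sentence, target, sentiment
    # and span values are made up for demonstration and mirror the examples in
    # the class docstring.
    reader = TargetSentimentDatasetReader(left_right_contexts=True)
    example_instance = reader.text_to_instance(text='This Camera lens is great',
                                               targets=['Camera'],
                                               target_sentiments=['positive'],
                                               spans=[[5, 11]])
    # Expected fields: `tokens`, `targets`, `target_sentiments`,
    # `left_contexts`, `right_contexts` and `metadata`.
    print(example_instance.fields.keys())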