Source code for target_extraction.allen.dataset_readers.text_sentiment

import logging
import json
from typing import Dict, Union

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Tokenizer
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.instance import Instance
from overrides import overrides

logger = logging.getLogger(__name__)

@DatasetReader.register("text_sentiment")
class TextSentimentReader(TextClassificationJsonReader):
    """
    Text classification reader with a configurable label field name.

    Subclasses
    :py:class:`allennlp.data.dataset_readers.TextClassificationJsonReader`;
    the only difference is the `label_name` construction parameter, which is
    explained in the Parameters section below.

    Reads tokens and their labels from a labeled text classification
    dataset. Expects one JSON object per line, each with a "text" field and
    a label field (named "label" unless `label_name` says otherwise).

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``TextField`` and
        label: ``LabelField``

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional
        (default = ``{"tokens": SpacyTokenizer()}``)
        Tokenizer to use to split the input text into words or other kinds
        of tokens.
    segment_sentences: ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy
        and then tokenize words. Necessary for some models that require
        pre-segmentation of sentences, like the Hierarchical Attention
        Network
        (https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf).
    max_sequence_length: ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing: ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label
        indexing if your labels are numbers, so the dataset reader doesn't
        re-number them starting from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    label_name: ``str``, optional, (default = ``label``)
        The name of the label field in the JSON objects that are read.

    All parameters other than `label_name` are forwarded unchanged to the
    parent class constructor.
    """

    def __init__(self, label_name: str = 'label', **kwargs) -> None:
        super().__init__(**kwargs)
        # Key looked up in every JSON object to find that example's label.
        self._label_field = label_name

    @overrides
    def _read(self, file_path):
        """
        Yield one ``Instance`` per non-blank line of a JSON-lines file.

        Each line is parsed as a JSON object; its "text" value and the value
        stored under the configured label field (``None`` when the field is
        absent) are handed to :meth:`text_to_instance`.

        Raises
        ------
        ValueError
            If label indexing is skipped and a label cannot be converted to
            an integer.
        """
        with open(cached_path(file_path), "r") as data_file:
            # Iterate the file lazily instead of loading every line up front.
            for line in data_file:
                # Lines keep their trailing newline, so strip before testing
                # for emptiness; otherwise blank lines reach json.loads and
                # raise a decode error.
                line = line.strip()
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get(self._label_field)
                if label is not None:
                    # `_skip_label_indexing` is not set in this class;
                    # presumably the parent constructor sets it.
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError as err:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            ) from err
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(self, text: str, label: Union[str, int] = None, **kwargs) -> Instance:
        """
        Build an ``Instance`` from raw text and an optional label.

        Delegates to the parent implementation. Any extra keyword arguments
        are accepted but ignored; they are not passed on to the parent.
        """
        return super().text_to_instance(text=text, label=label)