Source code for target_extraction.allen.dataset_readers.target_extraction
import logging
import json
from typing import Dict, Any, Optional, List
from allennlp.common.file_utils import cached_path
from allennlp.common.checks import ConfigurationError
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from overrides import overrides
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("target_extraction")
class TargetExtractionDatasetReader(DatasetReader):
    '''
    Dataset reader designed to read a list of JSON-like objects of the
    following type:

    {`tokenized_text`: [`This`, `Camera`, `lens`, `is`, `great`],
     `text`: `This Camera lens is great`,
     `sequence_labels`: [`O`, `B`, `I`, `O`, `O`],
     `pos_tags`: [`DET`, `NOUN`, `NOUN`, `AUX`, `ADJ`]}

    The `pos_tags` are optional. This type of JSON can be created by
    exporting a `target_extraction.data_types.TargetTextCollection`
    using its `to_json_file` method.

    If `pos_tags` are given, they can be used either as features or for
    joint learning.

    The only sequence label encoding we currently support is BIO, also
    known as IOB-2.

    :param token_indexers: Indexers for the tokens. Defaults to a single
                           ``SingleIdTokenIndexer`` under the ``tokens``
                           key.
    :param pos_tags: Whether or not to extract POS tags, if available.
    :returns: A ``Dataset`` of ``Instances`` for Target Extraction.
    '''
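    # A minimal sketch of writing one line of the expected JSON file by
    # hand (hypothetical values and file name; exporting a
    # TargetTextCollection via `to_json_file` is the supported route):
    #
    #     import json
    #     line = {'text': 'This Camera lens is great',
    #             'tokenized_text': ['This', 'Camera', 'lens', 'is', 'great'],
    #             'sequence_labels': ['O', 'B', 'I', 'O', 'O']}
    #     with open('train.json', 'a') as te_file:
    #         te_file.write(json.dumps(line) + '\n')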
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
pos_tags: bool = False, **kwargs) -> None:
super().__init__(**kwargs)
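        # Default to a single-ID indexer under the 'tokens' key when no
        # indexers are supplied.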
self._token_indexers = token_indexers or \
{"tokens": SingleIdTokenIndexer()}
self._pos_tags = pos_tags
@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, 'r') as te_file:
logger.info("Reading Target Extraction instances from jsonl "
"dataset at: %s", file_path)
for line in te_file:
example = json.loads(line)
example_instance: Dict[str, Any] = {}
sequence_labels = example["sequence_labels"]
tokens_ = example["tokenized_text"]
# TextField requires ``Token`` objects
tokens = [Token(token) for token in tokens_]
if self._pos_tags:
if 'pos_tags' not in example:
pos_err = (f"The POS tags are within the data: {example}"
"\nPlease add them in manually or automatically"
" to this dataset if you wish to use them.")
raise ConfigurationError(pos_err)
example_instance['pos_tags'] = example['pos_tags']
example_instance['tags'] = sequence_labels
example_instance['tokens'] = tokens
example_instance['text'] = example['text']
yield self.text_to_instance(**example_instance)
    def text_to_instance(self, tokens: List[Token],
text: str,
tags: Optional[List[str]] = None,
pos_tags: Optional[List[str]] = None) -> Instance:
        '''
        The tokens are expected to be pre-tokenised.

        The original token text and the text itself are stored in a
        MetadataField.

        :param tokens: Tokenised text that either has target extraction
                       labels or is to be tagged.
        :param text: The text that the tokenised text has come from.
        :param tags: The target extraction BIO labels.
        :param pos_tags: POS tags to be used either as features or for
                         joint learning.
        :returns: An Instance object with all of the above encoded for a
                  PyTorch model.
        '''
sequence = TextField(tokens, self._token_indexers)
instance_fields: Dict[str, Field] = {'tokens': sequence}
# Metadata field
metadata_dict = {"words": [x.text for x in tokens]}
metadata_dict['text'] = text
instance_fields["metadata"] = MetadataField(metadata_dict)
if tags is not None:
instance_fields['tags'] = SequenceLabelField(tags, sequence, "labels")
if pos_tags is not None:
instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
return Instance(instance_fields)
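
# A minimal usage sketch, not part of the original reader: it assumes a
# local 'train.json' file in the format described in the class docstring
# and a working AllenNLP installation. `DatasetReader.read` is the public
# entry point that wraps `_read` above.
if __name__ == '__main__':
    reader = TargetExtractionDatasetReader()
    for instance in reader.read('train.json'):
        # Each instance carries 'tokens', 'metadata' and, when the data is
        # labelled, a 'tags' SequenceLabelField.
        print(instance.fields['tokens'])
        break

    # `text_to_instance` can also be called directly on pre-tokenised text:
    tokens = [Token(word) for word in ['This', 'Camera', 'lens', 'is', 'great']]
    instance = reader.text_to_instance(tokens=tokens,
                                       text='This Camera lens is great',
                                       tags=['O', 'B', 'I', 'O', 'O'])
    print(instance.fields['metadata']['words'])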