# Source code for target_extraction.allen.dataset_readers.target_conll

import logging
from typing import Dict, Optional, List, Iterable
import re
import itertools

from allennlp.common.file_utils import cached_path
from allennlp.common.checks import ConfigurationError
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from overrides import overrides

logger = logging.getLogger(__name__)

def _is_divider(line: str) -> bool:
    if line.strip() == '':
        return True
    elif re.search('^#', line):
        return True
    else:
        return False

@DatasetReader.register("target_conll")
class TargetConllDatasetReader(DatasetReader):
    '''
    Dataset reader designed to read a CONLL formatted file that is produced
    from `target_extraction.data_types.TargetTextCollection.to_conll`.

    The CONLL file should have the following structure:
    `TOKEN#GOLD LABEL` Where each text is separated by a blank new line and
    that each text has an associated `# {text_id: 'value'}` line at the
    start of the text. An example of the file is below:

    `
    # {"text_id": "0"}
    The O
    laptop B-0
    case I-0
    was O
    great O
    and O
    cover O
    was O
    rubbish O

    # {"text_id": "2"}
    The O
    laptop B-0
    case I-0
    was O
    great O
    and O
    cover B-1
    was O
    rubbish O
    `

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    coding_scheme: ``str``, optional (default=``BIO``)
        Specifies the coding scheme for. Valid options are ``BIO`` and
        ``BIOUL``. The ``BIO`` default maintains the original BIO scheme
        in the data. In the BIO scheme, B is a token starting a span, I is
        a token continuing a span, and O is a token outside of a span.
    label_namespace: ``str``, optional (default=``labels``)
        Specifies the namespace for the sequence labels.
    '''

    def __init__(self, token_indexers: Optional[Dict[str, TokenIndexer]] = None,
                 coding_scheme: str = "BIO",
                 label_namespace: str = "labels",
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or \
                               {"tokens": SingleIdTokenIndexer()}
        self.label_namespace = label_namespace
        # Only the BIO and BIOUL tagging schemes are supported; fail fast
        # on anything else rather than producing mis-labelled instances.
        if coding_scheme not in ("BIO", "BIOUL"):
            raise ConfigurationError(f"unknown coding_scheme: {coding_scheme}")
        self.coding_scheme = coding_scheme
        # The on-disk data is always BIO; BIOUL is produced by conversion.
        self._original_coding_scheme = "BIO"

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        # If the path is a URL, `cached_path` downloads and caches it,
        # returning a local path; a local path is returned unchanged.
        file_path = cached_path(file_path)
        with open(file_path, 'r') as conll_file:
            logger.info("Reading Target CONLL instances from CONLL "
                        "dataset at: %s", file_path)
            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(conll_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds
                # to the words of a single sentence.
                if is_divider:
                    continue
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_ = fields[0]
                tags = fields[1]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, tags)

    def text_to_instance(self, tokens: List[Token],
                         tags: Optional[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a
        tokenizer in this class.

        Returns an :class:`Instance` with a ``tokens`` ``TextField``, a
        ``metadata`` field holding the raw words, and — when ``tags`` is
        given — a ``tags`` ``SequenceLabelField`` (converted to BIOUL if
        ``self.coding_scheme`` requires it).
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        # Metadata field
        metadata_dict = {"words": [x.text for x in tokens]}
        instance_fields["metadata"] = MetadataField(metadata_dict)

        if tags is not None:
            if self.coding_scheme == "BIOUL":
                # Data on disk is BIO; convert spans to the BIOUL scheme.
                tags = to_bioul(tag_sequence=tags,
                                encoding=self._original_coding_scheme)
            instance_fields['tags'] = SequenceLabelField(tags, sequence,
                                                         self.label_namespace)
        return Instance(instance_fields)