# Source code for target_extraction.allen.dataset_readers.target_conll

import logging
from typing import Dict, Optional, List, Iterable
import re
import itertools

from allennlp.common.file_utils import cached_path
from allennlp.common.checks import ConfigurationError
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from overrides import overrides

logger = logging.getLogger(__name__)

def _is_divider(line: str) -> bool:
    if line.strip() == '':
        return True
    elif re.search('^#', line):
        return True
    else:
        return False

@DatasetReader.register("target_conll")
class TargetConllDatasetReader(DatasetReader):
    '''
    Dataset reader designed to read a CONLL formatted file that is produced
    from `target_extraction.data_types.TargetTextCollection.to_conll`.

    The CONLL file should have the following structure:
    `TOKEN#GOLD LABEL` Where each text is separated by a blank new line and
    that each text has an associated `# {text_id: 'value'}` line at the
    start of the text. An example of the file is below:

    `
    # {"text_id": "0"}
    The O
    laptop B-0
    case I-0
    was O
    great O
    and O
    cover O
    was O
    rubbish O

    # {"text_id": "2"}
    The O
    laptop B-0
    case I-0
    was O
    great O
    and O
    cover B-1
    was O
    rubbish O
    `

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    coding_scheme: ``str``, optional (default=``BIO``)
        Specifies the coding scheme for. Valid options are ``BIO`` and
        ``BIOUL``. The ``BIO`` default maintains the original BIO scheme
        in the data. In the BIO scheme, B is a token starting a span, I is
        a token continuing a span, and O is a token outside of a span.
    label_namespace: ``str``, optional (default=``labels``)
        Specifies the namespace for the sequence labels.
    '''

    def __init__(self, token_indexers: Optional[Dict[str, TokenIndexer]] = None,
                 coding_scheme: str = "BIO",
                 label_namespace: str = "labels",
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or \
                               {"tokens": SingleIdTokenIndexer()}
        self.label_namespace = label_namespace
        # Only the BIO and BIOUL tagging schemes are supported; fail fast
        # on anything else rather than producing mis-labelled instances.
        if coding_scheme not in ("BIO", "BIOUL"):
            raise ConfigurationError(f"unknown coding_scheme: {coding_scheme}")
        self.coding_scheme = coding_scheme
        # The on-disk data is always BIO; BIOUL is produced by conversion.
        self._original_coding_scheme = "BIO"

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        # If the path is a URL, `cached_path` downloads and caches it,
        # returning a local path; a local path is returned unchanged.
        file_path = cached_path(file_path)
        with open(file_path, 'r') as conll_file:
            logger.info("Reading Target CONLL instances from CONLL "
                        "dataset at: %s", file_path)
            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(conll_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds
                # to the words of a single sentence.
                if is_divider:
                    continue
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_ = fields[0]
                tags = fields[1]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, tags)

    def text_to_instance(self, tokens: List[Token],
                         tags: Optional[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a
        tokenizer in this class.

        Returns an :class:`Instance` with a ``tokens`` ``TextField``, a
        ``metadata`` field holding the raw words, and — when ``tags`` is
        given — a ``tags`` ``SequenceLabelField`` (converted to BIOUL if
        ``self.coding_scheme`` requires it).
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        # Metadata field
        metadata_dict = {"words": [x.text for x in tokens]}
        instance_fields["metadata"] = MetadataField(metadata_dict)

        if tags is not None:
            if self.coding_scheme == "BIOUL":
                # Data on disk is BIO; convert spans to the BIOUL scheme.
                tags = to_bioul(tag_sequence=tags,
                                encoding=self._original_coding_scheme)
            instance_fields['tags'] = SequenceLabelField(tags, sequence,
                                                         self.label_namespace)
        return Instance(instance_fields)