# Source code for target_extraction.allen.allennlp_model

import collections
import collections.abc
import itertools
from pathlib import Path
import random
import tempfile
from typing import Optional, List, Any, Iterable, Dict, Union, Tuple

from allennlp.commands.train import train_model_from_file
from allennlp.common.params import Params
from allennlp.data.dataset_readers import DatasetReader
from allennlp.models import Model
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

import target_extraction
from target_extraction.data_types import TargetTextCollection

class AllenNLPModel():
    '''
    This is a wrapper for the AllenNLP dataset readers, models, and predictors
    so that the input to functions can be
    :class:`target_extraction.data_types.TargetTextCollection` objects
    and they return a metric or metrics as well as predictions within the
    :class:`target_extraction.data_types.TargetTextCollection` objects. This
    is instead of running everything through multiple bash files calling
    ``allennlp train`` etc.
    '''

    def __init__(self, name: str, model_param_fp: Path, predictor_name: str,
                 save_dir: Optional[Path] = None) -> None:
        '''
        :param name: Name of the model e.g. ELMO-Target-Extraction
        :param model_param_fp: File path to the model parameters that will
                               define the AllenNLP model and how to train it.
        :param predictor_name: Name of the predictor to be used with the
                               AllenNLP model e.g. for a target_tagger model
                               the predictor should probably be `target-tagger`
        :param save_dir: Directory to save the model to. This has to be set
                         up front as the fit function saves the model each
                         epoch.
        '''

        self.name = name
        # Populated by `fit` or `load`; None until one of them has been called.
        self.model: Optional[Model] = None
        self.save_dir = save_dir
        self._param_fp = model_param_fp.resolve()
        self._predictor_name = predictor_name

    def _assert_model_exists(self) -> None:
        '''
        :raises AssertionError: If the `model` attribute is None. This can be
                                overcome by either fitting or loading a model.
        '''
        # Raised explicitly (rather than through an `assert` statement) so
        # that the check is still performed when Python is run with `-O`.
        if self.model is None:
            raise AssertionError('There is no model to make predictions, '
                                 'either fit or load a model to resolve this.')

    @staticmethod
    def _validate_batch_size(batch_size: int) -> None:
        '''
        :param batch_size: The batch size that predictions will be made with.
        :raises ValueError: If the batch size is less than 1, as no batch
                            could ever be created.
        '''
        if batch_size < 1:
            raise ValueError('The batch size has to be at least 1 and not '
                             f'{batch_size}')

    def fit(self, train_data: TargetTextCollection,
            val_data: TargetTextCollection,
            test_data: Optional[TargetTextCollection] = None) -> None:
        '''
        Given the training, validation, and optionally the test data it will
        train the model that is defined in the model params file provided as
        argument to the constructor of the class. Once trained the model can
        be accessed through the `model` attribute.

        NOTE: If the test data is given the model only uses it to fit to the
        vocabulary that is within the test data, the model NEVER trains on
        the test data.

        :param train_data: Training data.
        :param val_data: Validation data.
        :param test_data: Optional, test data.
        '''

        model_params = self._preprocess_and_load_param_file(self._param_fp)
        # Ensures that a different random seed is used each time
        self._set_random_seeds(model_params)
        # Everything is written to a temporary directory as AllenNLP trains
        # from file paths and not from in-memory objects.
        with tempfile.TemporaryDirectory() as temp_dir:
            train_fp = Path(temp_dir, 'train_data.json')
            val_fp = Path(temp_dir, 'val_data.json')

            # Write the training and validation data to json Optionally test as
            # well
            train_data.to_json_file(train_fp)
            val_data.to_json_file(val_fp)
            # NOTE(review): this is a truthiness test, so a test collection
            # that evaluates to False is treated the same as no test data.
            if test_data:
                test_fp = Path(temp_dir, 'test_data.json')
                test_data.to_json_file(test_fp)
                self._add_dataset_paths(model_params, train_fp, val_fp, test_fp)
                model_params["evaluate_on_test"] = True
            else:
                self._add_dataset_paths(model_params, train_fp, val_fp)

            # When no save directory was given the model is saved within the
            # temporary directory and is therefore only kept in memory once
            # this method returns.
            save_dir = self.save_dir
            if save_dir is None:
                save_dir = Path(temp_dir, 'temp_save_dir')

            temp_param_fp = Path(temp_dir, 'temp_param_file.json')
            model_params.to_file(temp_param_fp.resolve())
            self.model = train_model_from_file(temp_param_fp, save_dir)

    def _predict_iter(self, data: Union[Iterable[Dict[str, Any]],
                                        List[Dict[str, Any]]],
                      batch_size: Optional[int] = None,
                      yield_original_target: bool = False
                      ) -> Iterable[Union[Dict[str, Any],
                                          Tuple[Dict[str, Any], Dict[str, Any]]
                                         ]]:
        '''
        Iterates over the predictions and yields one prediction at a time.
        This is a useful wrapper as it performs the data pre-processing and
        assertion checks.

        The predictions are predicted in batches so that the model does not
        load in lots of data at once and thus have memory issues.

        As this is a generator none of the checks below are performed until
        the first prediction is requested.

        :param data: Iterable or list of dictionaries that the predictor can
                     take as input e.g. `target-tagger` predictor expects at
                     most a `text` key and value.
        :param batch_size: Specify the batch size to predict on. If left None
                           defaults to 64 unless it is specified in the
                           `model_param_fp` within the constructor then
                           the batch size from the param file is used.
        :param yield_original_target: If True it will then yield the
                                      dictionary that has been predicted on.
        :yields: A dictionary containing all the values the model outputs e.g.
                 For the `target_tagger` model it would return `logits`,
                 `class_probabilities`, `mask`, `tags`, `words`, and `text`.
                 If `yield_original_target` is True it will then yield a Tuple
                 of 2 dictionaries the first being what has already been stated
                 and the second being the dictionary that is being predicted on.
        :raises AssertionError: If the `model` attribute is None. This can be
                                overcome by either fitting or loading a model.
        :raises TypeError: If the data given is not of Type List or Iterable.
        :raises ValueError: If the batch size (given or from the param file)
                            is less than 1.
        '''
        self._assert_model_exists()

        # Cheap argument checks are performed before anything is loaded.
        # `collections.abc.Iterable` is used as the `collections.Iterable`
        # alias was removed in Python 3.10. A list is also an Iterable.
        if not isinstance(data, collections.abc.Iterable):
            raise TypeError(f'Data given has to be of type {collections.abc.Iterable}'
                            f' and not {type(data)}')
        if batch_size is not None:
            self._validate_batch_size(batch_size)

        self.model.eval()

        all_model_params = Params.from_file(self._param_fp)

        reader_params = all_model_params.get("dataset_reader")
        dataset_reader = DatasetReader.from_params(reader_params)
        predictor = Predictor.by_name(self._predictor_name)(self.model, dataset_reader)

        # Argument batch size first then model param file and then default 64
        if batch_size is None:
            if 'iterator' in all_model_params:
                iter_params = all_model_params.get("iterator")
                if 'batch_size' in iter_params:
                    batch_size = iter_params['batch_size']
            batch_size = batch_size or 64
            self._validate_batch_size(batch_size)

        # Data has to be an iterator so that each batch carries on from where
        # the previous batch finished.
        data_iterator = iter(data)
        while True:
            data_batch = list(itertools.islice(data_iterator, batch_size))
            if not data_batch:
                break
            predictions = predictor.predict_batch_json(data_batch)
            for prediction_index, prediction in enumerate(predictions):
                if yield_original_target:
                    yield (prediction, data_batch[prediction_index])
                else:
                    yield prediction

    def predict_into_collection(self, collection: TargetTextCollection,
                                key_mapping: Dict[str, str],
                                batch_size: Optional[int] = None,
                                append_if_exists: bool = True
                                ) -> TargetTextCollection:
        '''
        Predicts on each TargetText within the collection and stores the
        predictions within that same collection.

        :param collection: The TargetTextCollection that is to be predicted on
                           and to be the store of the predicted data.
        :param key_mapping: Dictionary mapping the prediction keys that contain
                            the prediction values to the keys that will store
                            those prediction values within the collection that
                            has been predicted on.
        :param batch_size: Specify the batch size to predict on. If left None
                           defaults to 64 unless it is specified in the
                           `model_param_fp` within the constructor then
                           the batch size from the param file is used.
        :param append_if_exists: If False and a TargetText within the collection
                                 already has a prediction within the given key
                                 based on the `key_mapping` then KeyError is
                                 raised.
        :returns: The collection that was predicted on with the new predictions
                  within the collection stored in keys that are the values of
                  the `key_mapping` argument. Note that all predictions are
                  stored within Lists within their respective keys in the
                  collection.
        :raises KeyError: If the keys from `key_mapping` are not within the
                          prediction dictionary.
        :raises KeyError: If `append_if_exists` is False and a TargetText
                          within the collection already has a prediction within
                          the given key based on the `key_mapping` then this
                          is raised.
        '''
        for prediction, original_target in self._predict_iter(collection.dict_iterator(),
                                                              batch_size=batch_size,
                                                              yield_original_target=True):
            text_id = original_target['text_id']
            target_text = collection[text_id]
            # All of the checks happen first as we want an error to be raised
            # before any data is added to this TargetText, so that it is never
            # left with only some of its predictions stored.
            for prediction_key, collection_key in key_mapping.items():
                if prediction_key not in prediction:
                    raise KeyError(f'The key {prediction_key} from `key_mapping`'
                                   f' {key_mapping} is not within the prediction'
                                   f' {prediction} for the follwoing TargeText'
                                   f' {original_target}')
                if not append_if_exists and collection_key in target_text:
                    raise KeyError(f'The key {collection_key} from `key_mapping`'
                                   f' {key_mapping} already exists within the'
                                   f' follwoing TargeText {original_target}')

            for prediction_key, collection_key in key_mapping.items():
                if collection_key not in target_text:
                    target_text[collection_key] = []
                target_text[collection_key].append(prediction[prediction_key])
        return collection

    def predict_sequences(self, data: Union[Iterable[Dict[str, Any]],
                                            List[Dict[str, Any]]],
                          batch_size: Optional[int] = None
                          ) -> Iterable[Dict[str, Any]]:
        '''
        Given the data it will predict the sequence labels and return the
        confidence scores in those labels as well as the words and text the
        prediction was predicting on.

        :param data: Iterable or list of dictionaries that contains at least
                     `text` key and value and if you do not want the
                     predictor to do the tokenization then provide `tokens`
                     as well. Some model may also expect `pos_tags` which the
                     predictor will provide if the `text` key is only provided.
        :param batch_size: Specify the batch size to predict on. If left None
                           defaults to 64 unless it is specified in the
                           `model_param_fp` within the constructor then
                           the batch size from the param file is used.
        :yields: A dictionary containing all the following keys and values:
                 1. `sequence_labels`: A list of predicted sequence labels.
                    This will be a List of Strings.
                 2. `confidence`: The confidence the model had in predicting
                    each sequence label, this comes from the softmax score.
                    This will be a List of floats.
                 3. `tokens`: The tokens that the confidence and sequence labels
                    are associated to
                 4. `text`: The text that the tokens/words relate to.
        :raises AssertionError: If the `model` attribute is None. This can be
                                overcome by either fitting or loading a model.
        '''
        # Checked here as the vocabulary is required before `_predict_iter`
        # gets the chance to perform the same check.
        self._assert_model_exists()
        label_to_index = self.model.vocab.get_token_to_index_vocabulary('labels')
        for prediction in self._predict_iter(data, batch_size):
            # Length of the text, anything after this is padding
            sequence_length = sum(prediction['mask'])

            sequence_labels = prediction['tags'][:sequence_length]
            class_probabilities = prediction['class_probabilities'][:sequence_length]
            # The confidence of a token is the probability that was assigned
            # to the label that was predicted for that token.
            label_confidence_scores = [scores[label_to_index[label]]
                                       for scores, label
                                       in zip(class_probabilities, sequence_labels)]

            yield {'sequence_labels': sequence_labels,
                   'confidence': label_confidence_scores,
                   'tokens': prediction['words'],
                   'text': prediction['text']}

    def load(self, cuda_device: int = -1) -> Model:
        '''
        Loads the model. This does not require you to train the model if the
        `save_dir` attribute is pointing to a folder containing a trained model.
        This is just a wrapper around the `load_archive` function.

        :param cuda_device: Whether the loaded model should be loaded on to the
                            CPU (-1) or the GPU (0). Default CPU.
        :returns: The model that was saved at `self.save_dir`
        :raises AssertionError: If the `save_dir` argument is None
        :raises FileNotFoundError: If the save directory does not exist.
        '''

        # Raised explicitly (rather than through an `assert` statement) so
        # that the check is still performed when Python is run with `-O`.
        if not self.save_dir:
            raise AssertionError('Save directory was not set in the '
                                 'constructor of the class')
        if not self.save_dir.exists():
            raise FileNotFoundError('There is nothing at the save dir:\n'
                                    f'{self.save_dir.resolve()}')
        archive = load_archive(self.save_dir / "model.tar.gz",
                               cuda_device=cuda_device)
        self.model = archive.model
        return self.model

    @staticmethod
    def _preprocess_and_load_param_file(model_param_fp: Path) -> Params:
        '''
        Given a model parameter file it will load it as a Params object and
        remove all data fields for the Param object so that these keys can be
        added with different values associated to them.
        fields (keys) that are removed:
        1. train_data_path
        2. validation_data_path
        3. test_data_path
        4. evaluate_on_test
        :param model_param_fp: File path to the model parameters that will
                               define the AllenNLP model and how to train it.
        :returns: The model parameter file as a Params object with the data
                  fields removed if they existed.
        '''

        fields_to_remove = ['train_data_path', 'validation_data_path',
                            'test_data_path', 'evaluate_on_test']
        model_params = Params.from_file(str(model_param_fp))
        for field in fields_to_remove:
            if field in model_params:
                model_params.pop(field)
        return model_params

    @staticmethod
    def _add_dataset_paths(model_params: Params, train_fp: Path, val_fp: Path,
                           test_fp: Optional[Path] = None) -> None:
        '''
        Given model parameters it will add the given train, validation and
        optional test dataset paths to the model parameters.
        Does not return anything as the model parameters object is mutable
        :param model_params: model parameters to add the dataset paths to
        :param train_fp: Path to the training dataset
        :param val_fp: Path to the validation dataset
        :param test_fp: Optional path to the test dataset
        '''

        model_params['train_data_path'] = str(train_fp.resolve())
        model_params['validation_data_path'] = str(val_fp.resolve())
        if test_fp:
            model_params['test_data_path'] = str(test_fp.resolve())

    @staticmethod
    def _set_random_seeds(model_params: Params) -> None:
        '''
        This ensures to some extent that the experiments are NOT reproducible
        so that we can take into account the random seed problem.
        Returns nothing as the model_params will be modified as they are a
        mutable object.
        :param model_params: The parameters of the model
        '''

        seed, numpy_seed, torch_seed = [random.randint(1, 99999)
                                        for _ in range(3)]
        model_params["random_seed"] = seed
        model_params["numpy_seed"] = numpy_seed
        model_params["pytorch_seed"] = torch_seed

    def __repr__(self) -> str:
        '''
        :returns: the name of the model e.g. TDLSTM or IAN
        '''
        return self.name