Source code for target_extraction.data_types

'''
Module that contains the two main data types
`target_extraction.data_types.TargetText` and
`target_extraction.data_types.TargetTextCollection`, where the latter is a
container for the former.

classes:

1. `target_extraction.data_types.TargetText`
2. `target_extraction.data_types.TargetTextCollection`
'''
from collections.abc import MutableMapping
from collections import OrderedDict, Counter, defaultdict, deque
import copy
import json
import itertools
import functools
from pathlib import Path
from typing import Optional, List, Tuple, Iterable, NamedTuple, Any, Callable
from typing import Union, Dict, Set
import traceback
import random

from target_extraction.tokenizers import is_character_preserving, token_index_alignment
from target_extraction.data_types_util import (Span, OverLappingTargetsError,
                                               AnonymisedError, OverwriteError)

def check_anonymised(func):
    '''
    Assumes the first argument of the given function is a TargetText
    object (i.e. `self`).

    :raises AnonymisedError: If the TargetText object given to `func` has
                             an `anonymised` attribute that is True.
    '''
    @functools.wraps(func)
    def wrapper_func(*args, **kwargs):
        target_text_object = args[0]
        if target_text_object.anonymised:
            anonymised_err = (f'Cannot perform this function as the Target '
                              f'{target_text_object} has been anonymised '
                              'and therefore has no `text`')
            raise AnonymisedError(anonymised_err)
        return func(*args, **kwargs)
    return wrapper_func
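
# A minimal sketch (not from the library's own tests) of what
# `check_anonymised` guards against: calling a text-dependent method on an
# anonymised TargetText raises AnonymisedError before the wrapped method
# body runs.
#
#   target = TargetText(text='The bread is top notch.', text_id='1',
#                       targets=['bread'], spans=[Span(4, 9)])
#   target.anonymised = True    # deletes the `text` key
#   target.tokenize(str.split)  # raises AnonymisedError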
class TargetText(MutableMapping):
    '''
    This is a data structure that inherits from MutableMapping, which is
    essentially a python dictionary.

    The following are the default keys that are in all `TargetText`
    objects; additional items can be added through __setitem__:

    1. text -- The text associated to all of the other items
    2. text_id -- The unique ID associated to this object
    3. targets -- List of all target words that occur in the text. A special
       placeholder of None (python None value) can exist where the target
       does not exist but a related category does; this would mean though
       that the related span is Span(0, 0). This type of special placeholder
       is in place for the SemEval 2016 Restaurant dataset where they link
       the categories to the targets but not all categories have related
       targets, thus None.
    4. spans -- List of Span NamedTuples where each one specifies the start
       and end of the respective targets within the text.
    5. target_sentiments -- List specifying the sentiment of the respective
       targets within the text.
    6. categories -- List of categories that exist in the data which may or
       may not link to the targets (this is dataset specific). NOTE:
       depending on the dataset and how it is parsed the category can exist
       but the target does not, as the category is a latent variable. In
       these cases the categories and category sentiments will be the same
       size, which could be a different size to the targets and target
       sentiments. E.g. this can happen where the dataset has targets and
       categories but they do not map to each other in a one-to-one manner;
       in the SemEval 2014 restaurant dataset there are some samples that
       contain categories but no targets. Another word for category is
       aspect.
    7. category_sentiments -- List of the sentiments associated to the
       categories. If the categories and targets map to each other then this
       will be empty and you will only use the target_sentiments.

    Attributes:

    1. anonymised -- If True then the data within the TargetText object has
       no text, but the rest of the metadata should exist.

    Methods:

    1. to_json -- Returns the object as a dictionary and then encoded using
       json.dumps
    2. to_conll -- Returns a CONLL formatted string where the format will be
       the following: `TOKEN#GOLD LABEL#PREDICTION 1#PREDICTION 2`, where
       each token and its relevant labels are on separate new lines. The
       first line will always contain the following: `# {text_id: `value`}`
       where the `text_id` represents the `text_id` of this TargetText; this
       allows the CONLL string to be uniquely identified back to this
       TargetText object.
    3. from_conll -- Adds the gold labels and/or predicted sequence labels
       from the CONLL formatted string.
    4. tokenize -- This will add a new key `tokenized_text` to this
       TargetText instance that will store the tokens of the text that is
       associated to this TargetText instance.
    5. pos_text -- This will add a new key `pos_tags` to this TargetText
       instance. This key will store the pos tags of the text that is
       associated to this TargetText instance.
    6. force_targets -- Does not return anything but modifies the `spans`
       and `text` values, as whitespace is prefixed and suffixed to the
       target unless the prefix or suffix is already whitespace. NOTE that
       this is the only method that currently can change the `spans` and
       `text` key values after they have been set.
    7. sequence_labels -- Adds the `sequence_labels` key to this TargetText
       instance which can be used to train a machine learning algorithm to
       detect targets.
    8. get_sequence_indexs -- The indexes related to the tokens, pos tags,
       etc. for each labelled sequence span.
    9. get_sequence_spans -- The span indexes from the sequence labels
       given, assuming that the sequence labels are in BIO format.
    10. get_targets_from_sequence_labels -- Retrieves the target words given
        the sequence labels.
    11. one_sample_per_span -- This returns a similar TargetText instance
        where the new instance will only contain one target per span.
    12. left_right_target_contexts -- This will return the text that is left
        and right of the target, as well as the words in the target, for
        each target in the sentence.
    13. replace_target -- Given an index and a new target word it will
        replace the target at the index with the new target word and return
        a new TargetText object with everything the same apart from this new
        target.
    14. de_anonymise -- This will set the `anonymised` attribute to False
        from True and set the `text` key value to the value in the `text`
        key within the `text_dict` argument.
    15. in_order -- True if all the `targets` within this TargetText are in
        sequential left to right order within the text.
    16. re_order -- Re-orders the TargetText object targets so that they are
        in a left to right order within the text; this will then re-order
        all values within this object that are in a list format into this
        order. Once the TargetText has been re-ordered it will return True
        when :py:meth:`target_extraction.data_types.TargetText.in_order` is
        called.
    17. add_unique_key -- Given a key e.g. `targets` it will create a new
        value in the TargetText object that is a list of strings which are
        unique IDs based on the `text_id` and the index the `targets` occur
        in, e.g. if the `targets` contain [`food`, `service`] and the
        `text_id` is `12a5` then the `target_id` created will contain
        [`12a5::0`, `12a5::1`]

    Static Functions:

    1. from_json -- Returns a TargetText object given a json string. For
       example the json string can be the return of TargetText.to_json.
    2. targets_from_spans -- Given a sequence of spans and the associated
       text it will return the targets that are within the text based on the
       spans.
    3. target_text_from_prediction -- Creates a TargetText object from data
       that has come from predictions of a Target Extract tagger.
    '''

    def _check_is_list(self, item: List[Any], item_name: str) -> None:
        '''
        This will check that the argument given is a List and if not will
        raise a TypeError.

        :param item: The argument that is going to be checked to ensure it
                     is a list.
        :param item_name: Name of the item. This is used within the raised
                          error message, if an error is raised.
        :raises TypeError: If any of the items are not of type List.
        '''
        type_err = f'{item_name} should be a list not {type(item)} {item}'
        if not isinstance(item, list):
            raise TypeError(type_err)
    def sanitize(self) -> None:
        '''
        This performs a check on all of the lists that can be given at
        object construction time to ensure that the following conditions are
        met:

        1. The targets, spans and target_sentiments lists are all of the
           same size if set.
        2. The categories and the category_sentiments lists are all of the
           same size if set.

        Furthermore it checks the following:

        1. If targets or spans are set then both have to exist.
        2. If targets and spans are set that the spans' text matches the
           associated target words, e.g. if the target is `barry davies` in
           `today barry davies went` then the spans should be [[6, 18]]
        3. If anonymised, ensures that the `text` key does not exist.

        The 2nd check is not performed if `self.anonymised` is True.

        :raises ValueError: If any of the above conditions are not True.
        '''
        def length_mis_match(lists_to_check: List[Any],
                             text_id_msg: str) -> None:
            length_mismatch_msg = 'The following lists do not match '\
                                  f'{lists_to_check}'
            list_lengths = [len(_list) for _list in lists_to_check
                            if _list is not None]
            current_list_size = -1
            for list_length in list_lengths:
                if current_list_size == -1:
                    current_list_size = list_length
                else:
                    if current_list_size != list_length:
                        raise ValueError(text_id_msg + length_mismatch_msg)

        targets = self._storage['targets']
        target_sentiments = self._storage['target_sentiments']
        spans = self._storage['spans']
        categories = self._storage['categories']
        category_sentiments = self._storage['category_sentiments']
        text_id = self._storage['text_id']
        text_id_msg = f'Text id that this error refers to {text_id}\n'

        # Checking the length mismatches for the two different lists
        length_mis_match([targets, target_sentiments, spans], text_id_msg)
        length_mis_match([categories, category_sentiments], text_id_msg)

        # Checking that if targets are set then so are spans
        if targets is not None and spans is None:
            spans_none_msg = f'If the targets are a list: {targets} then '\
                             f'spans should also be a list and not None: '\
                             f'{spans}'
            raise ValueError(text_id_msg + spans_none_msg)
        # Checking that the words the Spans reference in the text match the
        # respective target words. Edge case is the case of None targets
        # which should have a Span value of (0, 0)
        if targets is not None:
            for target, span in zip(targets, spans):
                if target is None:
                    target_span_msg = 'As the target value is None the span '\
                                      'it refers to should be of value '\
                                      f'Span(0, 0) and not {span}'
                    if span != Span(0, 0):
                        raise ValueError(text_id_msg + target_span_msg)
                else:
                    if span == Span(0, 0) and target != '':
                        target_span_msg = (f'The Span is {Span(0, 0)} and '
                                           f'the target is {target} '
                                           'therefore the span must be '
                                           f'incorrect for this target '
                                           f'{self}.')
                        raise ValueError(target_span_msg)
                    # Cannot check the text value when the data has been
                    # anonymised
                    if self.anonymised:
                        continue
                    text = self._storage['text']
                    start, end = span.start, span.end
                    text_target = text[start:end]
                    target_span_msg = 'The target the spans reference in '\
                                      f'the text: {text_target} does not '\
                                      'match the target in the targets '\
                                      f'list: {target}'
                    if text_target != target:
                        raise ValueError(text_id_msg + target_span_msg)
        if self.anonymised and 'text' in self._storage:
            raise ValueError('The TargetText object is anonymised and '
                             'therefore should not contain a `text` key. '
                             f'{self}')
    def __init__(self, text: Union[str, None], text_id: str,
                 targets: Optional[List[str]] = None,
                 spans: Optional[List[Span]] = None,
                 target_sentiments: Optional[List[Union[int, str]]] = None,
                 categories: Optional[List[str]] = None,
                 category_sentiments: Optional[List[Union[int, str]]] = None,
                 anonymised: bool = False,
                 **additional_data):
        '''
        :param additional_data: Any other data that is to be added to the
                                object at construction.
        '''
        # Ensure that the arguments that should be lists are lists.
        self._list_argument_names = ['targets', 'spans', 'target_sentiments',
                                     'categories', 'category_sentiments']
        self._list_arguments = [targets, spans, target_sentiments,
                                categories, category_sentiments]
        names_arguments = zip(self._list_argument_names, self._list_arguments)
        for argument_name, list_argument in names_arguments:
            if list_argument is None:
                continue
            self._check_is_list(list_argument, argument_name)

        # anonymised data will have no text
        temp_dict = dict(text=text, text_id=text_id, targets=targets,
                         spans=spans, target_sentiments=target_sentiments,
                         categories=categories,
                         category_sentiments=category_sentiments)
        if anonymised:
            del temp_dict['text']
            self._protected_keys = set(['text_id', 'targets', 'spans'])
        else:
            self._protected_keys = set(['text', 'text_id', 'targets',
                                        'spans'])
        self._storage = temp_dict
        self._storage = {**self._storage, **additional_data}
        self._anonymised = anonymised
        self.sanitize()

    @property
    def anonymised(self) -> bool:
        '''
        :returns: True if the data within the TargetText has been
                  anonymised. Anonymised data means that there is no text
                  associated with the TargetText object, but all of the
                  metadata is there.
        '''
        return self._anonymised

    @anonymised.setter
    def anonymised(self, value: bool) -> None:
        '''
        Sets whether or not the `anonymised` attribute is True or False.
        Either way, when set it performs the `sanitize` check to ensure that
        the attribute can be set to this value, else it is reverted.

        :param value: If True then the `text` key will be deleted. In all
                      cases the TargetText object is subjected to
                      :py:meth:`sanitize` to ensure that the anonymisation
                      process is correct.
        :raises AnonymisedError: If the TargetText object cannot be set to
                                 the `anonymised` value given. If this Error
                                 occurs then the object will have kept the
                                 original `anonymised` value.
        '''
        # If we are anonymising the data then the text has to be deleted
        if not self.anonymised and value:
            del self._storage['text']
        self._anonymised = value
        try:
            self.sanitize()
        except Exception:
            self._anonymised = not value
            sanitize_err = traceback.format_exc()
            raise AnonymisedError('Cannot change the `anonymised` value of '
                                  f'this TargetText {self} as it cannot '
                                  'pass the `sanitize` check of which the '
                                  'following is the error from said check '
                                  f'{sanitize_err}')

    def __getitem__(self, key: str) -> Any:
        '''
        :returns: One of the values from the self._storage dictionary, e.g.
                  if the key is `text` it will return the string
                  representing the text associated to this object.
        '''
        return self._storage[key]

    def __iter__(self) -> Iterable[str]:
        '''
        Returns an iterator over the keys in self._storage, which are the
        following Strings by default (additional keys can be added):

        1. text
        2. text_id
        3. targets
        4. spans
        5. target_sentiments
        6. categories
        7. category_sentiments

        :returns: The keys in self._storage
        '''
        return iter(self._storage)

    def __len__(self) -> int:
        '''
        :returns: The number of items in self._storage.
        '''
        return len(self._storage)

    def __repr__(self) -> str:
        '''
        :returns: The String the user sees when the instance is printed or
                  displayed within an interpreter.
        '''
        return f'TargetText({self._storage})'

    def __eq__(self, other: 'TargetText') -> bool:
        '''
        Two TargetText instances are equal if they both have the same
        `text_id` value.

        :param other: Another TargetText object that is being compared to
                      this TargetText object.
        :returns: True if they have the same `text_id` value, else False.
        '''
        if not isinstance(other, TargetText):
            return False
        elif self['text_id'] != other['text_id']:
            return False
        return True

    def __delitem__(self, key: str) -> None:
        '''
        Given a key that matches a key within self._storage or self.keys()
        it will delete that key and value from this object.

        NOTE: Currently 'text', 'text_id', 'spans', and 'targets' are keys
        that cannot be deleted.

        :param key: Key and its respective value to delete from this object.
        '''
        if key in self._protected_keys:
            raise KeyError('Cannot delete a key that is protected, list of '
                           f'protected keys: {self._protected_keys}')
        del self._storage[key]

    def __setitem__(self, key: str, value: Any) -> None:
        '''
        Given a key and a respective value it will either change the current
        key's value to the one given here or create a new key with that
        value.

        NOTE: Currently 'text', 'text_id', 'spans', and 'targets' are keys
        that cannot be changed.

        :param key: Key to be added or changed
        :param value: Value associated to the given key.
        '''
        if key in self._protected_keys:
            raise KeyError('Cannot change a key that is protected, list of '
                           f'protected keys: {self._protected_keys}')
        # If the key value should be a list ensure that the new value is a
        # list as well.
        if key in self._list_argument_names:
            self._check_is_list(value, key)
        self._storage[key] = value
        self.sanitize()
    def to_json(self) -> str:
        '''
        Required as TargetText is not json serializable due to the 'spans'.

        :returns: The object as a dictionary and then encoded using
                  json.dumps
        '''
        return json.dumps(self._storage)
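
    # A small round-trip sketch, assuming the hypothetical values below;
    # equality between TargetText objects is based solely on `text_id`.
    #
    #   target = TargetText(text='today barry davies went', text_id='0',
    #                       targets=['barry davies'], spans=[Span(6, 18)])
    #   json_str = target.to_json()
    #   assert TargetText.from_json(json_str) == target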
    @check_anonymised
    def to_conll(self, gold_label_key: str,
                 prediction_key: Optional[str] = None) -> str:
        '''
        :param gold_label_key: A key that contains a sequence of labels e.g.
                               [`B`, `I`, `O`]. This can come from the
                               return of :py:meth:`sequence_labels`
        :param prediction_key: Key to the predicted labels of the
                               `gold_label_key`. The prediction key value is
                               a list of lists of predicted labels. Each
                               list is therefore a different model run,
                               hence creating `PREDICTION 1`,
                               `PREDICTION 2`, etc. Thus the values of
                               `prediction_key` must be of shape
                               (number runs, number tokens)
        :returns: A CONLL formatted string where the format will be the
                  following: `TOKEN#GOLD LABEL#PREDICTION 1#PREDICTION 2`,
                  where each token and its relevant labels are on separate
                  new lines. The first line will always contain the
                  following: `# {text_id: `value`}` where the text_id
                  represents the `text_id` of this TargetText; this allows
                  the CONLL string to be uniquely identified back to this
                  TargetText object.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises KeyError: If the object has not been tokenized using
                          :py:meth:`tokenize`
        :raises KeyError: If the `prediction_key` or `gold_label_key` do not
                          exist.
        :raises ValueError: If the `gold_label_key` or `prediction_key`
                            values are not of the same length as the tokens,
                            as the labels would not be able to match the
                            tokens.
        :raises ValueError: If the values in `prediction_key` are not of
                            shape (number runs, number tokens)
        '''
        keys_to_check = ['tokenized_text', gold_label_key]
        if prediction_key is not None:
            keys_to_check.append(prediction_key)
        for key in keys_to_check:
            self._key_error(key)
        number_tokens = len(self['tokenized_text'])
        # ensure the number of labels is the same as the number of tokens
        value_err = (f'Number of tokens {number_tokens} does not match the '
                     f'number of labels ')
        number_gold_labels = len(self[gold_label_key])
        if number_tokens != number_gold_labels:
            gold_err = (f'{number_gold_labels} for gold label '
                        f'{gold_label_key} in {self}')
            gold_err = f'{value_err} {gold_err}'
            raise ValueError(gold_err)
        if prediction_key is not None:
            for prediction_labels in self[prediction_key]:
                if not isinstance(prediction_labels, list):
                    pred_list_shape_error = ('The predictions should be a '
                                             'list of a list of labels of '
                                             'shape (number runs, number '
                                             'tokens) not '
                                             f'{self[prediction_key]}')
                    raise ValueError(pred_list_shape_error)
                number_labels = len(prediction_labels)
                pred_err = (f'{number_labels} for prediction '
                            f'label {prediction_key} in {self}')
                pred_err = f'{value_err} {pred_err}'
                if number_tokens != number_labels:
                    raise ValueError(pred_err)
        # End of checks, now creating the CONLL string
        text_id_str = json.dumps({'text_id': self['text_id']})
        gold_labels = self[gold_label_key]
        conll_string = f'# {text_id_str}'
        for token_index, token in enumerate(self['tokenized_text']):
            gold_label = gold_labels[token_index]
            token_string = f'{token} {gold_label} '
            if prediction_key is not None:
                for prediction_labels in self[prediction_key]:
                    prediction_label = prediction_labels[token_index]
                    token_string += f'{prediction_label} '
            token_string = token_string.strip(' ')
            token_string = f'\n{token_string}'
            conll_string += token_string
        return conll_string
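
    # A hedged sketch of the CONLL output, assuming a whitespace tokenizer
    # and labels produced by `sequence_labels` (hypothetical values):
    #
    #   target = TargetText(text='The bread is top notch.', text_id='1',
    #                       targets=['bread'], spans=[Span(4, 9)])
    #   target.tokenize(str.split)
    #   target.sequence_labels()
    #   print(target.to_conll('sequence_labels'))
    #   # {"text_id": "1"}
    #   The O
    #   bread B
    #   is O
    #   top O
    #   notch. O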
    @check_anonymised
    def from_conll(self, conll_str: str,
                   tokens_key: str = 'tokenized_text',
                   gold_label_key: Optional[str] = None,
                   prediction_key: Optional[str] = None) -> None:
        '''
        :param conll_str: CONLL formatted string formatted like so:
                          `TOKEN#GOLD LABEL#PREDICTION 1#PREDICTION 2`
        :param tokens_key: Key to save the CONLL tokens to.
        :param gold_label_key: Key to save the gold labels to. At least one
                               of `gold_label_key` and `prediction_key` must
                               not be `None`.
        :param prediction_key: Key to save the prediction labels to. The
                               value will be of shape
                               (number runs, number tokens)
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises ValueError: If both `gold_label_key` and `prediction_key`
                            are `None`.
        :raises ValueError: If the number of labels is not consistent in the
                            CONLL string, e.g. the first token has 3
                            predicted labels and the second token has 2
                            predicted labels.
        :raises ValueError: If the text within this TargetText does not
                            match the tokens in the CONLL string.
                            (CASE SENSITIVE)
        '''
        if prediction_key is None and gold_label_key is None:
            raise ValueError('Either `prediction_key` or `gold_label_key` '
                             'or both need to be a String not None')
        # Predicted labels are of shape (number runs, number tokens)
        predicted_labels: List[List[str]] = []
        gold_labels = []
        tokens = []
        conll_token_labels = conll_str.split('\n')
        conll_string_length = 0
        for index, conll_token_label in enumerate(conll_token_labels):
            token_labels = conll_token_label.split(' ')
            number_token_labels = len(token_labels)
            # Ensure the number of CONLL labels is the same each time
            if index == 0:
                conll_string_length = number_token_labels
            else:
                if conll_string_length != number_token_labels:
                    raise ValueError('Number of labels are not consistent. '
                                     f'Index {index}. CONLL String: '
                                     f'{conll_token_labels}\n Self {self}')
            if number_token_labels < 2:
                raise ValueError('CONLL String does not contain any labels '
                                 f'{conll_token_labels}')
            tokens.append(token_labels[0])
            gold_labels.append(token_labels[1])
            if number_token_labels < 3:
                continue
            predicted_values = token_labels[2:]
            for pred_index, predicted_label in enumerate(predicted_values):
                if index == 0:
                    predicted_labels.append([])
                predicted_labels[pred_index].append(predicted_label)
        # Ensure that the tokens match the text
        text = self['text']
        if not is_character_preserving(text, tokens):
            raise ValueError(f'The tokens {tokens} do not match the text '
                             f'{text} for {self}')
        self[tokens_key] = tokens
        if gold_label_key is not None:
            self[gold_label_key] = gold_labels
        if prediction_key is not None:
            self[prediction_key] = predicted_labels
    def _shift_spans(self, target_span: Span, prefix: bool,
                     suffix: bool) -> None:
        '''
        This only affects the current state of the TargetText attributes.
        The attribute this affects is the `spans` attribute.

        NOTE: This is only used within the self.force_targets method.

        :param target_span: The target span that is having extra whitespace
                            added as either prefix or suffix.
        :param prefix: Whether it affects the prefix of the target_span
        :param suffix: Whether it affects the suffix of the target_span
        '''
        target_span_start = target_span.start
        target_span_end = target_span.end
        for span_index, other_target_span in enumerate(self['spans']):
            if other_target_span == target_span:
                continue
            start, end = self['spans'][span_index]
            if prefix:
                if other_target_span.start >= target_span_start:
                    start += 1
                if other_target_span.end >= target_span_start:
                    end += 1
            if suffix:
                if other_target_span.start >= target_span_end:
                    start += 1
                if other_target_span.end >= target_span_end:
                    end += 1
            self._storage['spans'][span_index] = Span(start, end)
    @check_anonymised
    def force_targets(self) -> None:
        '''
        :NOTE: As this affects the following attributes `spans`, `text`, and
               `targets`, it has to modify these through self._storage, as
               all of these attributes are within self._protected_keys.

        Does not return anything but modifies the `spans` and `text` values,
        as whitespace is prefixed and suffixed to the target unless the
        prefix or suffix is already whitespace.

        Motivation: Ensure that the target tokens are not within another
        separate String, e.g. target = `priced` but the sentence is
        `the laptop;priced is high`. If the tokenizer splits on whitespace
        it will not separate `priced`, therefore the BIO tagging is not
        deterministic. Thus force will add whitespace around the target
        word, e.g. `the laptop; priced`. This was mainly added for the
        TargetText.sequence_labels method.

        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        '''
        for span_index in range(len(self['spans'])):
            text = self._storage['text']
            last_token_index = len(text)
            span = self._storage['spans'][span_index]
            prefix = False
            suffix = False
            start, end = span
            if start != 0:
                if text[start - 1] != ' ':
                    prefix = True
            if end < last_token_index:
                if text[end] != ' ':
                    suffix = True
            text_before = text[:start]
            text_after = text[end:]
            target = text[start:end]
            if prefix and suffix:
                self._storage['text'] = f'{text_before} {target} {text_after}'
                self._shift_spans(span, prefix=True, suffix=True)
                self._storage['spans'][span_index] = Span(start + 1, end + 1)
            elif prefix:
                self._storage['text'] = f'{text_before} {target}{text_after}'
                self._shift_spans(span, prefix=True, suffix=False)
                self._storage['spans'][span_index] = Span(start + 1, end + 1)
            elif suffix:
                self._storage['text'] = f'{text_before}{target} {text_after}'
                self._shift_spans(span, prefix=False, suffix=True)
        # Get the targets from the re-aligned spans
        updated_targets = []
        text = self._storage['text']
        for span in self._storage['spans']:
            target = text[span.start: span.end]
            updated_targets.append(target)
        self._storage['targets'] = updated_targets
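
    # A sketch of the motivating case from the docstring, assuming these
    # hypothetical values:
    #
    #   target = TargetText(text='the laptop;priced is high', text_id='2',
    #                       targets=['priced'], spans=[Span(11, 17)])
    #   target.force_targets()
    #   target['text']   # 'the laptop; priced is high'
    #   target['spans']  # [Span(12, 18)]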
    @check_anonymised
    def tokenize(self, tokenizer: Callable[[str], List[str]],
                 perform_type_checks: bool = False) -> None:
        '''
        This will add a new key `tokenized_text` to this TargetText instance
        that will store the tokens of the text that is associated to this
        TargetText instance.

        For a set of tokenizers that are definitely compatible see the
        target_extraction.tokenizers module.

        Ensures that the tokenization is character preserving.

        :param tokenizer: The tokenizer to use to tokenize the text of this
                          TargetText instance
        :param perform_type_checks: Whether or not to perform type checks
                                    to ensure the tokenizer returns a List
                                    of Strings
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises TypeError: If the tokenizer given does not return a List of
                           Strings.
        :raises ValueError: This is raised if the TargetText instance
                            contains empty text.
        :raises ValueError: If the tokenization is not character preserving.
        '''
        text = self['text']
        tokenized_text = tokenizer(text)
        if perform_type_checks:
            if not isinstance(tokenized_text, list):
                raise TypeError('The return type of the tokenizer function '
                                f'{tokenizer} should be a list and not '
                                f'{type(tokenized_text)}')
            for token in tokenized_text:
                if not isinstance(token, str):
                    raise TypeError('The return type of the tokenizer '
                                    f'function {tokenizer} should be a list '
                                    'of Strings and not a list of '
                                    f'{type(token)}')
        if len(tokenized_text) == 0:
            raise ValueError('There are no tokens for this TargetText '
                             f'instance {self}')
        if not is_character_preserving(text, tokenized_text):
            raise ValueError('The tokenization method used is not character'
                             f' preserving. Original text `{text}`\n'
                             f'Tokenized text `{tokenized_text}`')
        self['tokenized_text'] = tokenized_text
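
    # Any character preserving Callable[[str], List[str]] works; a plain
    # whitespace tokenizer is the simplest sketch (hypothetical values):
    #
    #   target = TargetText(text='today barry davies went', text_id='0',
    #                       targets=['barry davies'], spans=[Span(6, 18)])
    #   target.tokenize(str.split)
    #   target['tokenized_text']  # ['today', 'barry', 'davies', 'went']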
    @check_anonymised
    def pos_text(self, tagger: Callable[[str], Tuple[List[str], List[str]]],
                 perform_type_checks: bool = False) -> None:
        '''
        This will add a new key `pos_tags` to this TargetText instance. This
        key will store the pos tags of the text that is associated to this
        TargetText instance.

        NOTE: It will also replace the current tokens in the
        `tokenized_text` key with the tokens produced from the pos tagger.

        For a set of pos taggers that are definitely compatible see the
        target_extraction.pos_taggers module. The pos tagger will have to
        produce both a list of tokens and a list of pos tags.

        :param tagger: POS tagger.
        :param perform_type_checks: Whether or not to perform type checks
                                    to ensure the POS tagger returns a tuple
                                    containing two lists, both containing
                                    Strings.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises TypeError: If the POS tagger given does not return a Tuple
        :raises TypeError: If the POS tagger given does not return a List of
                           Strings for both the tokens and the pos tags.
        :raises TypeError: If the POS tagger tokens or pos tags are not
                           lists
        :raises ValueError: If the POS tagger return is not a tuple of
                            length 2
        :raises ValueError: This is raised if the TargetText text is empty
        :raises ValueError: If the number of pos tags for this instance is
                            not the same as the number of tokens generated
                            by the tagger function.
        '''
        text = self['text']
        tokens_pos_tags = tagger(text)
        if perform_type_checks:
            if not isinstance(tokens_pos_tags, tuple):
                raise TypeError('The return type for the pos tagger should '
                                f'be a tuple not {type(tokens_pos_tags)}')
            if len(tokens_pos_tags) != 2:
                raise ValueError('The return of the POS tagger should be a '
                                 'tuple of length 2 not '
                                 f'{len(tokens_pos_tags)}')
            if not isinstance(tokens_pos_tags[0], list):
                raise TypeError('The return type of the tagger function '
                                f'{tagger} should be a list and not '
                                f'{type(tokens_pos_tags[0])} for the tokens')
            if not isinstance(tokens_pos_tags[1], list):
                raise TypeError('The return type of the tagger function '
                                f'{tagger} should be a list and not '
                                f'{type(tokens_pos_tags[1])} for the POS '
                                'tags')
            for name, tags in [('tokens', tokens_pos_tags[0]),
                               ('pos_tags', tokens_pos_tags[1])]:
                for tag in tags:
                    if not isinstance(tag, str):
                        raise TypeError('The return type of the tagger '
                                        f'function {tagger} should be a '
                                        'list of Strings and not a list of '
                                        f'{type(tag)} for the {name}')
        tokens, pos_tags = tokens_pos_tags
        num_pos_tags = len(pos_tags)
        if len(pos_tags) == 0:
            raise ValueError('There are no tags for this TargetText '
                             f'instance {self}')
        num_tokens = len(tokens)
        if num_tokens != num_pos_tags:
            raise ValueError(f'Number of POS tags {pos_tags} should be the '
                             f'same as the number of tokens {tokens}')
        self['pos_tags'] = pos_tags
        self['tokenized_text'] = tokens
    @check_anonymised
    def sequence_labels(self, per_target: bool = False,
                        label_key: Optional[str] = None) -> None:
        '''
        Adds the `sequence_labels` key to this TargetText instance which can
        be used to train a machine learning algorithm to detect targets. The
        value associated to the `sequence_labels` key will be a list of `B`,
        `I`, or `O` labels, where each label is associated to a token.

        The `force_targets` method might come in useful here for training
        and validation data, to ensure that more of the targets are not
        affected by tokenization error, as only tokens that are fully within
        the target span are labelled with `B` or `I` tags. Another use for
        `force_targets` is to ensure that targets are not affected by
        tokenisation and can therefore be used to state where the targets
        are in the sequence for sentiment classification, e.g. in the case
        of getting contextualised target tokens or to create
        [TD-BERT Gao et al. 2019](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8864964).

        Currently the only sequence labels supported are IOB-2 labels for
        the targets only. Future plans will look into different sequence
        label orders, e.g. IOB; see the link below for more details on the
        difference between the two orders, of which there are more again:
        https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)

        :param per_target: If False, the value associated to the
                           `sequence_labels` key is one list covering all of
                           the targets. If True, it is a list of label lists
                           per target, where each label list is only
                           associated to the represented target.
        :param label_key: Optional label key. The key represents a list of
                          values that are associated with each token. These
                          values are then the class labels to attach to each
                          `B`, `I`, `O` tag. E.g. if the label key is
                          `target_sentiments` it would make the `B`, `I`,
                          `O` task one of joint target extraction and
                          sentiment prediction.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises KeyError: If the current TargetText has not been tokenized.
                          Also, if `label_key` is not None then `label_key`
                          must be a key in self, else KeyError.
        :raises ValueError: If `label_key` is not None and the number of
                            labels does not match the number of targets that
                            the labels should be associated to.
        :raises ValueError: If two targets overlap the same token(s), e.g.
                            in `Laptop cover was great`, if `Laptop` and
                            `Laptop cover` are two separate targets this
                            should raise a ValueError as a token should only
                            be associated to one target.
        '''
        text = self['text']
        if 'tokenized_text' not in self:
            raise KeyError(f'Expect the current TargetText {self} to have '
                           'been tokenized using the self.tokenize method.')
        self.sanitize()
        tokens = self['tokenized_text']
        sequence_labels = ['O' for _ in range(len(tokens))]
        if per_target:
            sequence_labels = [sequence_labels]
        # This is the case where there are no targets, thus all sequence
        # labels are `O`
        if self['spans'] is None or self['targets'] is None:
            self['sequence_labels'] = sequence_labels
            return
        if per_target:
            sequence_labels = []
            for _ in self['targets']:
                sequence_labels.append(['O' for _ in range(len(tokens))])
        # Setting up the labels that might be part of the sequence labels
        target_spans: List[Span] = self['spans']
        labels = None
        if label_key is not None:
            self._key_error(label_key)
            labels = self[label_key]
            number_targets = len(target_spans)
            number_labels = len(labels)
            if number_labels != number_targets:
                raise ValueError(f'The number of labels {number_labels} '
                                 'does not match the number of targets '
                                 f'{number_targets}. Labels {labels}, '
                                 f'target spans {target_spans}. For {self}')
        tokens_index = token_index_alignment(text, tokens)
        for target_index, target_span in enumerate(target_spans):
            target_span_range = list(range(*target_span))
            same_target = False
            current_sequence_labels = sequence_labels
            if per_target:
                current_sequence_labels = sequence_labels[target_index]
            for sequence_index, token_index in enumerate(tokens_index):
                token_start, token_end = token_index
                token_end = token_end - 1
                if (token_start in target_span_range
                        and token_end in target_span_range):
                    if current_sequence_labels[sequence_index] != 'O':
                        err_msg = ('Cannot have two sequence labels for one '
                                   f'token, text {text}\ntokens {tokens}\n'
                                   f'token indexes {tokens_index}\nTarget '
                                   f'spans {target_spans}')
                        raise ValueError(err_msg)
                    if same_target:
                        current_sequence_labels[sequence_index] = 'I'
                        if label_key is not None:
                            label = labels[target_index]
                            current_sequence_labels[sequence_index] = f'I-{label}'
                    else:
                        current_sequence_labels[sequence_index] = 'B'
                        if label_key is not None:
                            label = labels[target_index]
                            current_sequence_labels[sequence_index] = f'B-{label}'
                    same_target = True
        self['sequence_labels'] = sequence_labels
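
    # A sketch of the BIO output, with and without a `label_key`, assuming
    # the hypothetical values below:
    #
    #   target = TargetText(text='today barry davies went', text_id='0',
    #                       targets=['barry davies'], spans=[Span(6, 18)],
    #                       target_sentiments=['pos'])
    #   target.tokenize(str.split)
    #   target.sequence_labels()
    #   target['sequence_labels']  # ['O', 'B', 'I', 'O']
    #   target.sequence_labels(label_key='target_sentiments')
    #   target['sequence_labels']  # ['O', 'B-pos', 'I-pos', 'O']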
    def _key_error(self, key: str) -> None:
        '''
        :param key: The key to check for within this TargetText instance.
        :raises KeyError: If the key given does not exist within this
                          TargetText instance.
        '''
        if key not in self:
            raise KeyError(f'Requires that this TargetText contains {key} '
                           f'for instance {self}')
    @check_anonymised
    def get_sequence_indexs(self, sequence_key: str) -> List[List[int]]:
        '''
        The following sequence label tags are supported: IOB-2. These are
        the tags that are currently generated by `sequence_labels`.

        :param sequence_key: Key to sequence labels such as BIO sequence
                             labels. An example key name would be
                             `sequence_labels` after the `sequence_labels`
                             function has been called, or more appropriately
                             `predicted_sequence_labels` when you have
                             predicted sequence labels.
        :returns: A list of lists of integers where each list of integers
                  represents the token/pos tag/sequence label indexes of
                  each sequence label span.
        :Example: The sequence labels [`O`, `B`, `I`, `O`, `B`] would return
                  the following list of integer lists [[1, 2], [4]]
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises ValueError: If the sequence labels that are contained in the
                            sequence key value contain values other than
                            `B`, `I`, or `O`.
        :raises ValueError: If the number of tokens in the current
                            TargetText object is not the same as the number
                            of sequence labels.
        '''
        # The number of tokens and sequence labels should be the same; they
        # are if the `sequence_labels` function is used
        tokens = self['tokenized_text']
        sequence_labels = self[sequence_key]
        if len(tokens) != len(sequence_labels):
            raise ValueError('The number of tokens in the TargetText object'
                             f' {self} is not the same as the number of '
                             'sequence labels')
        same_target = False
        start_index = 0
        end_index = 0
        sequence_indexs: List[List[int]] = []
        for label_index, sequence_label in enumerate(sequence_labels):
            if sequence_label == 'B':
                if same_target:
                    sequence_index = list(range(start_index, end_index))
                    sequence_indexs.append(sequence_index)
                    same_target = False
                    start_index = 0
                    end_index = 0
                same_target = True
                start_index = label_index
                end_index = label_index + 1
            elif sequence_label == 'I':
                end_index = label_index + 1
            elif sequence_label == 'O':
                if same_target:
                    sequence_index = list(range(start_index, end_index))
                    sequence_indexs.append(sequence_index)
                    same_target = False
                    start_index = 0
                    end_index = 0
            else:
                raise ValueError('Sequence labels should be `B`, `I`, or '
                                 f'`O` and not {sequence_label}. Sequence '
                                 f'label key used {sequence_key}\n'
                                 f'TargetText {self}')
        if end_index != 0:
            sequence_index = list(range(start_index, end_index))
            sequence_indexs.append(sequence_index)
        return sequence_indexs
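
    # The docstring example realised as a sketch (hypothetical labels): for
    # sequence labels ['O', 'B', 'I', 'O', 'B'] stored under the
    # `sequence_labels` key on an object with five tokens:
    #
    #   target.get_sequence_indexs('sequence_labels')  # [[1, 2], [4]]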
    @check_anonymised
    def get_sequence_spans(self, sequence_key: str,
                           confidence: Optional[float] = None) -> List[Span]:
        '''
        The following sequence label tags are supported: IOB-2. These are
        the tags that are currently generated by `sequence_labels`.

        :param sequence_key: Key to sequence labels such as BIO sequence
                             labels. An example key name would be
                             `sequence_labels` after the `sequence_labels`
                             function has been called, or more appropriately
                             `predicted_sequence_labels` when you have
                             predicted sequence labels.
        :param confidence: Optional argument that will return only spans
                           that have been predicted with a confidence higher
                           than this.
        :NOTE: As it is BIO labelling, in the case where all but one of the
               B and I labels is greater than the threshold, that span would
               not be returned, as one of the words in the multi-word target
               is less than the threshold.
        :returns: The span indexes from the sequence labels given, assuming
                  that the sequence labels are in BIO format.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises KeyError: If no `confidence` key is found. However
                          `confidence` is only required if the confidence
                          argument is set.
        :raises ValueError: If the sequence labels that are contained in the
                            sequence key value contain values other than
                            `B`, `I`, or `O`.
        :raises ValueError: If the confidence value is not between 0 and 1
        '''
        # The number of tokens, sequence labels, and token text indexes
        # should all be the same; they are if the `sequence_labels` function
        # is used
        if confidence is not None:
            self._key_error('confidence')
            if confidence > 1.0 or confidence < 0.0:
                raise ValueError('Confidence value has to be bounded '
                                 f'between 1 and 0 and not {confidence}')
        sequence_indexs: List[List[int]] = self.get_sequence_indexs(sequence_key)
        if not sequence_indexs:
            return []
        tokens = self['tokenized_text']
        token_text_indexs = token_index_alignment(self['text'], tokens)
        sequence_spans: List[Span] = []
        confidences = None
        if confidence is not None:
            confidences = self['confidence']
        for span_sequence_index in sequence_indexs:
            # Test that each sequence label was predicted with enough
            # confidence
            if confidence is not None:
                next_span = False
                for index in span_sequence_index:
                    if confidences[index] <= confidence:
                        next_span = True
                if next_span:
                    continue
            start_index = span_sequence_index[0]
            start_span = token_text_indexs[start_index][0]
            end_index = span_sequence_index[-1]
            end_span = token_text_indexs[end_index][1]
            sequence_spans.append(Span(start_span, end_span))
        return sequence_spans
    @check_anonymised
    def get_targets_from_sequence_labels(self, sequence_key: str,
                                         confidence: Optional[float] = None
                                         ) -> List[str]:
        '''
        This function's main use is when the sequence labels have been
        predicted on a piece of text that has no gold annotations.

        :param sequence_key: Key to sequence labels such as BIO sequence
                             labels. An example key name would be
                             `sequence_labels` after the `sequence_labels`
                             function has been called, or more appropriately
                             `predicted_sequence_labels` when you have
                             predicted sequence labels.
        :param confidence: Optional argument that will return only target
                           texts that have been predicted with a confidence
                           higher than this.
        :NOTE: As it is BIO labelling, in the case where all but one of the
               B and I labels is greater than the threshold, that target
               word would not be returned, as one of the words in the
               multi-word target is less than the threshold.
        :returns: The target texts that the sequence labels have predicted.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :raises KeyError: If no `tokenized_text` or `confidence` key is
                          found. However `confidence` is only required if
                          the confidence argument is set.
        :raises ValueError: If the confidence value is not between 0 and 1
        '''
        if confidence is not None:
            self._key_error('confidence')
            if confidence > 1.0 or confidence < 0.0:
                raise ValueError('Confidence value has to be bounded '
                                 f'between 1 and 0 and not {confidence}')
        self._key_error('tokenized_text')
        sequence_indexs: List[List[int]] = self.get_sequence_indexs(sequence_key)
        # No targets to extract
        if not sequence_indexs:
            return []
        tokens = self['tokenized_text']
        confidences = None
        if confidence is not None:
            confidences = self['confidence']
        targets = []
        for span_sequence_index in sequence_indexs:
            start_index = span_sequence_index[0]
            end_index = span_sequence_index[-1] + 1
            target_tokens = tokens[start_index: end_index]
            # Test that each token in the target tokens was predicted with a
            # great enough confidence
            if confidence is not None:
                next_span = False
                for index in span_sequence_index:
                    if confidences[index] <= confidence:
                        next_span = True
                if next_span:
                    continue
            target = ' '.join(target_tokens)
            targets.append(target)
        return targets
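
    # A joint sketch of `get_sequence_spans` and
    # `get_targets_from_sequence_labels`, continuing the hypothetical
    # `today barry davies went` example where `sequence_labels` is
    # ['O', 'B', 'I', 'O']:
    #
    #   target.get_sequence_spans('sequence_labels')
    #   # [Span(6, 18)]
    #   target.get_targets_from_sequence_labels('sequence_labels')
    #   # ['barry davies']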
    @check_anonymised
    def one_sample_per_span(self, remove_empty: bool = False) -> 'TargetText':
        '''
        This returns a similar TargetText instance where the new instance
        will only contain one target per span.

        This is for the cases where you can have a target, e.g. `food`, that
        has a different related category attached to it, e.g.

        TargetText(text=`$8 and there is much nicer, food, all of it great
        and continually refilled.`, text_id=`1`,
        targets=[`food`, `food`, `food`],
        categories=[`style`, `quality`, `price`],
        target_sentiments=[`pos`, `pos`, `pos`],
        spans=[Span(28, 32), Span(28, 32), Span(28, 32)])

        As we can see, the targets and the categories are linked; this is
        only really the case in the SemEval 2016 datasets from what I know
        currently. In the example case above it will transform it to the
        following:

        TargetText(text=`$8 and there is much nicer, food, all of it great
        and continually refilled.`, text_id=`1`, targets=[`food`],
        spans=[Span(28, 32)])

        This type of pre-processing is perfect for the Target Extraction
        task.

        :param remove_empty: If the TargetText instance contains any None
                             targets then these will be removed along with
                             their respective Spans.
        :returns: This returns a similar TargetText instance where the new
                  instance will only contain one target per span.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        '''
        text = self['text']
        text_id = self['text_id']
        targets: List[str] = []
        spans: List[Span] = []
        if self['spans'] is None:
            return TargetText(text=text, text_id=text_id)
        current_spans = self['spans']
        unique_spans = set(current_spans)
        spans = sorted(unique_spans, key=lambda x: x[0])
        temp_spans: List[Span] = []
        for span in spans:
            targets_text = text[span.start: span.end]
            if span.start == 0 and span.end == 0 and remove_empty:
                continue
            else:
                temp_spans.append(span)
                targets.append(targets_text)
        spans = temp_spans
        return TargetText(text=text, text_id=text_id,
                          targets=targets, spans=spans)
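
    # The docstring example as a sketch (hypothetical values):
    #
    #   target = TargetText(text=('$8 and there is much nicer, food, all of'
    #                             ' it great and continually refilled.'),
    #                       text_id='1', targets=['food', 'food', 'food'],
    #                       categories=['style', 'quality', 'price'],
    #                       target_sentiments=['pos', 'pos', 'pos'],
    #                       spans=[Span(28, 32)] * 3)
    #   deduped = target.one_sample_per_span()
    #   deduped['targets']  # ['food']
    #   deduped['spans']    # [Span(28, 32)]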
    @check_anonymised
    def left_right_target_contexts(self, incl_target: bool
                                   ) -> List[Tuple[str, str, str]]:
        '''
        :param incl_target: Whether or not the left and right contexts
                            should also include the target word.
        :returns: The text that is left and right of the target, as well as
                  the words in the target, for each target in the sentence.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        '''
        left_right_target_list = []
        text = self['text']
        if self['spans'] is not None:
            for span in self['spans']:
                span: Span
                span_start = span.start
                span_end = span.end
                if incl_target:
                    left_context = text[:span_end]
                    right_context = text[span_start:]
                else:
                    left_context = text[:span_start]
                    right_context = text[span_end:]
                target_context = text[span_start:span_end]
                contexts = (left_context, right_context, target_context)
                left_right_target_list.append(contexts)
        return left_right_target_list
    @check_anonymised
    def replace_target(self, target_index: int,
                       replacement_target_word: str) -> 'TargetText':
        '''
        :param target_index: The index of the target word to replace
        :param replacement_target_word: The target word to replace the
                                        target word at the given index
        :returns: Given the target index and replacement target word it will
                  replace the target at the index with the new target word
                  and return a new TargetText object with everything the
                  same apart from this new target.
        :raises ValueError: If the target_index is less than 0 or an index
                            number that does not exist.
        :raises OverLappingTargetsError: If the target to replace is
                                         contained within another target,
                                         e.g. in `what a great day`, if the
                                         text has two targets `great` and
                                         `great day` then it will raise this
                                         error if you replace either word,
                                         as each is within the other.
        :raises AnonymisedError: If the object has been anonymised then this
                                 method cannot be used.
        :Example: See the commented sketch below this method.
        '''
        self_dict = copy.deepcopy(dict(self))
        number_targets = len(self_dict['targets'])
        if target_index < 0 or target_index >= number_targets:
            raise ValueError('Not a valid target_index number. Number of '
                             f'targets in the current object '
                             f'{number_targets}')
        # Change the target word
        targets = self_dict['targets']
        target_to_be_replaced = targets[target_index]
        targets[target_index] = replacement_target_word
        # Change the target spans
        spans = self_dict['spans']
        span_to_change = spans[target_index]
        spans_to_change: List[int] = []
        for span_index, span in enumerate(spans):
            if span_index == target_index:
                continue
            span: Span
            # Check that there are no overlapping targets
            raise_in_target_error = False
            if (span.start >= span_to_change.start
                    and span.start < span_to_change.end):
                raise_in_target_error = True
            elif (span.end > span_to_change.start
                    and span.end <= span_to_change.end):
                raise_in_target_error = True
            if raise_in_target_error:
                raise OverLappingTargetsError('There are targets that share '
                                              f'the same context {self}')
            if span.start >= span_to_change.end:
                spans_to_change.append(span_index)
        difference_in_length = (len(replacement_target_word)
                                - len(target_to_be_replaced))
        # Shift all of the spans that come after the replaced target
        for span_index in spans_to_change:
            span = spans[span_index]
            new_start = span.start + difference_in_length
            new_end = span.end + difference_in_length
            spans[span_index] = Span(new_start, new_end)
        # Change the span of the target being replaced, by the end only
        new_end = span_to_change.end + difference_in_length
        spans[target_index] = Span(span_to_change.start, new_end)
        # Change the text
        text = self_dict['text']
        span_to_change_start = span_to_change.start
        span_to_change_end = span_to_change.end
        start_text = text[:span_to_change_start]
        end_text = text[span_to_change_end:]
        text = f'{start_text}{replacement_target_word}{end_text}'
        self_dict['targets'] = targets
        self_dict['spans'] = spans
        self_dict['text'] = text
        return TargetText(**self_dict)
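
    # A sketch showing how later spans shift when the replacement length
    # differs (hypothetical values):
    #
    #   target = TargetText(text='The food was great but the service was not',
    #                       text_id='0', targets=['food', 'service'],
    #                       spans=[Span(4, 8), Span(27, 34)])
    #   new_target = target.replace_target(0, 'pizza')
    #   new_target['text']   # 'The pizza was great but the service was not'
    #   new_target['spans']  # [Span(4, 9), Span(28, 35)]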
    def de_anonymise(self, text_dict: Dict[str, str]) -> None:
        '''
        This will set the `anonymised` attribute to False from True and set
        the `text` key value to the value in the `text` key within the
        `text_dict` argument.

        :param text_dict: A dictionary that contains the following two keys:
                          1. `text` and 2. `text_id`, where the `text_id`
                          has to match the current TargetText object
                          `text_id` and the `text` value will become the new
                          value in the `text` key for this TargetText
                          object.
        :raises ValueError: If the TargetText object `text_id` does not
                            match the `text_id` within the `text_dict`
                            argument.
        :raises AnonymisedError: If the `text` given does not pass the
                                 :py:meth:`sanitize` test.
        '''
        current_text_id = self['text_id']
        other_text_id = text_dict['text_id']
        if current_text_id != other_text_id:
            raise ValueError(f"The current `text_id` {current_text_id} "
                             "does not match that of the argument's "
                             f"`text_id` {other_text_id}. For TargetText "
                             f"{self}")
        text = text_dict['text']
        self._storage['text'] = text
        try:
            self.anonymised = False
        except AnonymisedError:
            del self._storage['text']
            sanitize_err = traceback.format_exc()
            raise AnonymisedError('Cannot de-anonymise this TargetText '
                                  f'{self} as it cannot pass the `sanitize`'
                                  ' check of which the following is the '
                                  f'error from said check {sanitize_err}')
    def in_order(self) -> bool:
        '''
        :returns: True if all the `targets` within this TargetText are in
                  sequential left to right order within the text.
        '''
        spans = self['spans']
        ordered_spans = sorted(spans)
        if ordered_spans != spans:
            return False
        return True
    def re_order(self, keys_not_to_order: Optional[List[str]] = None) -> None:
        '''
        Re-orders the TargetText object so that the targets are in a left to
        right order within the text; this will then re-order all values
        within this object that are in a list format into this order. Once
        the TargetText has been re-ordered it will return True when
        :py:meth:`target_extraction.data_types.TargetText.in_order` is
        called.

        :param keys_not_to_order: Any key values not to re-order using this
                                  function, e.g. `pos_tags`,
                                  `tokenized_text`, etc.
        :raises AssertionError: If running
                                :py:meth:`target_extraction.data_types.TargetText.in_order`
                                after being re-ordered does not return True.
        '''
        def sorting_by_index(index_order: List[int],
                             value_to_sort: List[Any]) -> List[Any]:
            sorted_value = []
            for index in index_order:
                sorted_value.append(value_to_sort[index])
            return sorted_value

        if keys_not_to_order is None:
            keys_not_to_order = []
        spans: List[Span] = self['spans']
        index_order = sorted(range(len(spans)), key=lambda k: spans[k],
                             reverse=False)
        new_key_values = {}
        for key, value in self._storage.items():
            try:
                if isinstance(value, list) and key not in keys_not_to_order:
                    # Edge case where the list can be just an empty list
                    if not value:
                        continue
                    # Need to check if the first instance of the value is a
                    # list and if so then that needs to be sorted and not
                    # the outer list
                    sorted_value = []
                    if isinstance(value[0], list):
                        for inner_value in value:
                            sorted_inner_value = sorting_by_index(index_order,
                                                                  inner_value)
                            sorted_value.append(sorted_inner_value)
                    else:
                        sorted_value = sorting_by_index(index_order, value)
                    assert sorted_value
                    new_key_values[key] = sorted_value
            except Exception:
                real_err = traceback.format_exc()
                err_msg = (f'The following error {real_err} has occurred on '
                           f'the following key {key} and value {value} for '
                           f'this TargetText {self}')
                raise Exception(err_msg)
        # Covers the rollback problem: only write the new values once all
        # keys have been re-ordered successfully
        for key, value in new_key_values.items():
            self._storage[key] = value
        self.sanitize()
        assert self.in_order(), (f'After re-ordering the object is still '
                                 f'not in order: {self}')
    def add_unique_key(self, id_key: str, id_key_name: str,
                       id_delimiter: str = '::') -> None:
        '''
        :param id_key: The name of the key within this TargetText that
                       requires unique ids that will be stored in
                       `id_key_name`.
        :param id_key_name: The name of the key to associate to these new
                            unique ids.
        :param id_delimiter: The delimiter to separate the `text_id` and the
                             index of the `id_key` that is being represented
                             by this unique id.
        :raises KeyError: If the `id_key_name` already exists within the
                          TargetText.
        :raises TypeError: If the value of `id_key` is not of type List.
        :Example: self.add_unique_key(`targets`, `targets_id`) where
                  `targets`=[`food`, `service`] and `text_id`=`12a5` will
                  add the key `targets_id` to self with the following value
                  = [`12a5::0`, `12a5::1`]
        '''
        self._key_error(id_key)
        text_id = self['text_id']
        if id_key_name in self:
            raise KeyError(f'The new id_key_name {id_key_name} '
                           f'already exists within {self}')
        if not isinstance(self[id_key], list):
            raise TypeError(f'The value of `id_key` {self[id_key]} in '
                            f'{self} has to be of type List and not '
                            f'{type(self[id_key])}')
        new_ids = []
        for index in range(len(self[id_key])):
            new_ids.append(f'{text_id}{id_delimiter}{index}')
        self[id_key_name] = new_ids
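
    # The docstring example as a runnable sketch (hypothetical values):
    #
    #   target = TargetText(text='The food was great but the service was not',
    #                       text_id='12a5', targets=['food', 'service'],
    #                       spans=[Span(4, 8), Span(27, 34)])
    #   target.add_unique_key('targets', 'targets_id')
    #   target['targets_id']  # ['12a5::0', '12a5::1']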
    @staticmethod
    def from_json(json_text: str, anonymised: bool = False) -> 'TargetText':
        '''
        This is required as the 'spans' are Span objects which are not json
        serializable and are required for TargetText, therefore this handles
        that special case.

        This function is also required as we have had to avoid using the
        __set__ function and add objects via the _storage dictionary
        underneath, so that we could add values to this object that are not
        within the constructor, like `tokenized_text`. To ensure that it is
        compatible with the TargetText concept we call the
        `TargetText.sanitize` method at the end.

        :param json_text: JSON representation of TargetText (can be from
                          TargetText.to_json)
        :param anonymised: Whether or not the TargetText object being loaded
                           is an anonymised version.
        :returns: A TargetText object
        :raises KeyError: If within the JSON representation there is no
                          `text_id` key. Or if anonymised is False, raises a
                          KeyError if there is no `text` key in the JSON
                          representation.
        '''
        json_target_text = json.loads(json_text)
        text = None
        if 'text_id' not in json_target_text:
            raise KeyError('The JSON text given does not contain a '
                           f'`text_id` field: {json_target_text}')
        if not anonymised:
            if 'text' not in json_target_text:
                raise KeyError('The JSON text given does not contain a '
                               f'`text` field: {json_target_text}')
            text = json_target_text['text']
        target_text = TargetText(text=text, anonymised=anonymised,
                                 text_id=json_target_text['text_id'])
        for key, value in json_target_text.items():
            if key == 'text' or key == 'text_id':
                continue
            if key == 'spans':
                if value is None:
                    target_text._storage[key] = None
                else:
                    all_spans = []
                    for span in value:
                        all_spans.append(Span(*span))
                    target_text._storage[key] = all_spans
            else:
                target_text._storage[key] = value
        target_text.sanitize()
        return target_text
    @staticmethod
    def targets_from_spans(text: str, spans: List[Span]) -> List[str]:
        '''
        :param text: The text that the spans are associated to.
        :param spans: A list of Span values that represent the character
                      indexes of the target words to be returned.
        :returns: The target words that are associated to the spans and text
                  given.
        '''
        targets = []
        if not spans:
            return targets
        for span in spans:
            target = text[span.start: span.end]
            targets.append(target)
        return targets
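
    # A one-line sketch (hypothetical values):
    #
    #   TargetText.targets_from_spans('today barry davies went',
    #                                 [Span(6, 18)])
    #   # ['barry davies']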
    @staticmethod
    def target_text_from_prediction(text: str, text_id: str,
                                    sequence_labels: List[str],
                                    tokenized_text: List[str],
                                    confidence: Optional[float] = None,
                                    confidences: Optional[List[float]] = None,
                                    **additional_data) -> 'TargetText':
        '''
        Creates a TargetText object from data that has come from the
        predictions of a Target Extract tagger, e.g. the dictionaries that
        are returned from
        :meth:`target_extraction.allen.allennlp_model.predict_sequences`

        :param text: Text to give to the TargetText object
        :param text_id: Text ID to give to the TargetText object
        :param sequence_labels: The predicted sequence labels
        :param tokenized_text: The tokens that were used to produce the
                               predicted sequence labels (should be returned
                               by the Target Extract tagger predictor).
        :param confidence: The level of confidence from the tagger that is
                           required for a target to be a target, e.g. 0.9
        :param confidences: The list of confidence values produced by the
                            Target Extract tagger predictor, to be used with
                            the confidence argument. The list of confidence
                            values should be the same size as the sequence
                            labels list and tokenized text.
        :param additional_data: Any other keyword arguments to provide to
                                the TargetText object
        :returns: A TargetText object with spans and targets values
        :raises ValueError: If the sequence labels, tokenized text and
                            confidences are not of the same length
        :raises ValueError: If the following keys are in the additional
                            data: 1. confidence, 2. text, 3. text_id,
                            4. tokenized_text, 5. sequence_labels,
                            6. targets, 7. spans. These keys will be
                            populated within the TargetText object
                            automatically.
        '''
        if len(sequence_labels) != len(tokenized_text):
            raise ValueError('Sequence labels and tokenized texts are not '
                             'of the same length:\nSequence labels '
                             f'{sequence_labels}\nTokenized text: '
                             f'{tokenized_text}')
        if confidence is not None and len(sequence_labels) != len(confidences):
            raise ValueError('Sequence labels and confidences are not of '
                             'the same length:\nSequence labels '
                             f'{sequence_labels}\nconfidences: '
                             f'{confidences}')
        not_allowed_additional_keys = {'confidence', 'text', 'text_id',
                                       'tokenized_text', 'sequence_labels',
                                       'targets', 'spans'}
        for key in additional_data:
            if key in not_allowed_additional_keys:
                raise ValueError('The following keys are not allowed in '
                                 'the additional data:\n'
                                 f'{not_allowed_additional_keys}')
        temp_target_text = TargetText(text_id=text_id, text=text,
                                      tokenized_text=tokenized_text,
                                      sequence_labels=sequence_labels,
                                      confidence=confidences)
        target_spans = temp_target_text.get_sequence_spans('sequence_labels',
                                                           confidence=confidence)
        targets = TargetText.targets_from_spans(text, target_spans)
        return TargetText(text_id=text_id, text=text, confidence=confidences,
                          tokenized_text=tokenized_text, targets=targets,
                          spans=target_spans,
                          sequence_labels=sequence_labels,
                          **additional_data)
[docs]class TargetTextCollection(MutableMapping):
    '''
    This is a data structure that inherits from MutableMapping which is
    essentially a python dictionary, however the underlying storage is an
    OrderedDict therefore if you iterate over it, the iteration will always
    be in the same order.

    This structure only contains TargetText instances.

    Attributes:

    1. name -- Name associated to the TargetTextCollection.
    2. metadata -- Any metadata to associate to the object e.g. domain of
       the dataset, all metadata is stored in a dictionary. By default the
       metadata will always have the `name` attribute within the metadata
       under the key `name`. If `anonymised` is also True then this will
       also be in the metadata under the key `anonymised`.
    3. anonymised -- If True then the data within the TargetText objects
       have no text but the rest of the metadata should exist.

    Methods:

    1. to_json -- Writes each TargetText instance as a dictionary using its
       own to_json function on a new line within the returned String. The
       returned String is not json compatible but if split by new line it
       is and is also compatible with the from_json method of TargetText.
    2. to_conll -- A CONLL formatted string where the format will be the
       following: `TOKEN#GOLD LABEL#PREDICTION 1#PREDICTION 2` where each
       token and its relevant labels are on separate new lines. The first
       line will always contain the following: `# {text_id: `value`}` where
       the text_id represents the `text_id` of this TargetText, this will
       allow the CONLL string to be uniquely identified back to this
       TargetText object. Also each TargetText CONLL string will be
       separated by a new line.
    3. to_conll_file -- Saves the TargetTextCollection to CONLL format.
       Useful for Sequence Labelling tasks.
    4. load_conll -- Loads the CONLL information into the collection.
    5. add -- Wrapper around __setitem__. Given as an argument a TargetText
       instance it will be added to the collection.
    6. to_json_file -- Saves the current TargetTextCollection to a json file
       which won't be strictly json but each line in the file will be, and
       each line in the file can be loaded in from String via
       TargetText.from_json. Also the file can be reloaded into a
       TargetTextCollection using TargetTextCollection.load_json.
    7. tokenize -- This applies the TargetText.tokenize method across all
       of the TargetText instances within the collection.
    8. pos_text -- This applies the TargetText.pos_text method across all
       of the TargetText instances within the collection.
    9. sequence_labels -- This applies the TargetText.sequence_labels
       method across all of the TargetText instances within the collection.
    10. force_targets -- This applies the TargetText.force_targets method
        across all of the TargetText instances within the collection.
    11. exact_match_score -- Recall, Precision, and F1 score in a Tuple.
        All of these measures are based on exact span matching rather than
        the matching of the sequence label tags, this is due to the
        annotation spans not always matching tokenization, therefore this
        removes the tokenization error that can come from the sequence
        label measures.
    12. samples_with_targets -- Returns all of the samples that have target
        spans as a TargetTextCollection.
    13. target_count -- A dictionary of target text as key and values as
        the number of times the target text occurs in this
        TargetTextCollection.
    14. one_sample_per_span -- This applies the
        TargetText.one_sample_per_span method across all of the TargetText
        instances within the collection to create a new collection with
        those new TargetText instances within it.
    15. number_targets -- Returns the total number of targets.
    16. number_categories -- Returns the total number of categories.
    17. category_count -- Returns a dictionary of categories as keys and
        values as the number of times the category occurs.
    18. target_sentiments -- A dictionary where the keys are target texts
        and the values are a List of sentiment values that have been
        associated to that target.
    19. dict_iterator -- Returns an iterator of all of the TargetText
        objects within the collection as dictionaries.
    20. unique_distinct_sentiments -- A set of the distinct sentiments
        within the collection. The length of the set represents the number
        of distinct sentiments within the collection.
    21. de_anonymise -- This will set the `anonymised` attribute to False
        from True and set the `text` key value to the value in the `text`
        key within the `text_dicts` argument for each of the TargetTexts in
        the collection. If any Error is raised this collection will revert
        back fully to being anonymised.
    22. sanitize -- This applies the TargetText.sanitize function to all of
        the TargetText instances within this collection, effectively
        ensuring that all of the instances follow the specified rules that
        TargetText instances should follow.
    23. in_order -- This returns True if all TargetText objects within the
        collection contain a list of targets that are in order of
        appearance within the text from left to right e.g. if the only
        TargetText in the collection contains two targets, where the first
        target in the `targets` list is the first (left most) target in the
        text, then this method would return True.
    24. re_order -- This will apply
        :py:meth:`target_extraction.data_types.TargetText.re_order` to each
        TargetText within the collection.
    25. add_unique_key -- Applies
        :py:meth:`target_extraction.data_types.TargetText.add_unique_key`
        to each TargetText within this collection.
    26. key_difference -- Given this collection and another it will return
        all of the keys that the other collection contains which this does
        not.
    27. combine_data_on_id -- Given this collection and another it will add
        all of the data from the other collection into this collection
        based on the unique key given.
    28. one_sentiment_text -- Adds the `text_sentiment_key` to each
        TargetText within the collection where the value will represent the
        sentiment value for the text based on the `sentiment_key` values
        and `average_sentiment` determining how to handle multiple
        sentiments. This will allow text level classifiers to be trained on
        target/aspect/category data.

    Static Functions:

    1. from_json -- Returns a TargetTextCollection object given the json
       like String from to_json. For example the json string can be the
       return of TargetTextCollection.to_json.
    2. load_json -- Returns a TargetTextCollection based on each new line
       in the given json file.
    3. combine -- Returns a TargetTextCollection that is the combination of
       all of those given.
    4. same_data -- Given a List of TargetTextCollections it will return a
       list of tuples specifying the overlap between the collections based
       on the samples' `text_id` and `text` key values. If it returns an
       empty list then there is no overlap between the collections. This is
       useful to find duplicates beyond the `text_id` as it checks the
       `text` value as well.
    '''
    def __init__(self, target_texts: Optional[List['TargetText']] = None,
                 name: Optional[str] = None,
                 metadata: Optional[Dict[str, Any]] = None,
                 anonymised: bool = False) -> None:
        '''
        :param target_texts: A list of TargetText instances to add to the
                             collection.
        :param name: Name to call the collection, this is added to the
                     metadata automatically and overrides the name key value
                     in the metadata if it exists.
        :param metadata: Any data that you would like to associate to this
                         TargetTextCollection.
        :param anonymised: Whether or not the TargetText objects should be
                           loaded in and anonymised, as well as stating
                           whether or not the whole collection should be
                           anonymised when loading in new TargetText objects.
        '''
        self._storage = OrderedDict()
        self._anonymised = anonymised

        if target_texts is not None:
            for target_text in target_texts:
                target_text.sanitize()
                self.add(target_text)
        self.metadata = None
        if metadata is not None:
            self.metadata = metadata
        if anonymised:
            self.metadata = {} if metadata is None else metadata
            self.metadata['anonymised'] = anonymised
        if name is not None:
            self.name = name
            self.metadata = {} if metadata is None else metadata
            self.metadata['name'] = name
        else:
            self.name = ''

    @property
    def name(self) -> str:
        '''
        :returns: The name attribute.
        '''
        return self._name

    @name.setter
    def name(self, name_string: str) -> None:
        '''
        Sets the value of the name attribute, and also updates the `name`
        key value in the `metadata` attribute.

        :param name_string: New name to give to the name attribute.
        '''
        self._name = name_string
        self.metadata = {} if self.metadata is None else self.metadata
        self.metadata['name'] = self._name

    @property
    def anonymised(self) -> bool:
        '''
        :returns: True if the data within the TargetTextCollection has been
                  anonymised. Anonymised data means that there is no text
                  associated with any of the TargetText objects within the
                  collection, but all of the metadata is there.
        '''
        return self._anonymised

    @anonymised.setter
    def anonymised(self, value: bool) -> None:
        '''
        Sets whether or not the `anonymised` attribute is True or False.
        This in effect performs
        :py:meth:`target_extraction.data_types.TargetText.anonymised` on
        each TargetText object within the collection if True. When you want
        to set this to False you need to perform
        :py:meth:`target_extraction.data_types.TargetTextCollection.de_anonymise`.

        :param value: True for anonymised, else False. If True this will
                      enforce that all the TargetText objects do not have a
                      `text` key/value and the attribute `anonymised` is
                      True.
        :raises AnonymisedError: If a TargetText object within the
                                 collection cannot be set to the
                                 `anonymised` value given. If this Error
                                 occurs then the object will have kept its
                                 original `anonymised` value.
        '''
        for target_text in self.values():
            target_text.anonymised = value
        self.metadata = {} if self.metadata is None else self.metadata
        self.metadata['anonymised'] = value
        self._anonymised = value
[docs]    def add(self, value: 'TargetText') -> None:
        '''
        Wrapper around __setitem__. Instead of having to add the value the
        usual way of finding the instance's 'text_id' and setting this
        container's key to this value, it does this for you.

        e.g. performs self[value['text_id']] = value

        :param value: The TargetText instance to store in the collection.
                      Will anonymise the TargetText object if the
                      collection's anonymised attribute is True.
        '''
        value.anonymised = self.anonymised
        self[value['text_id']] = value
[docs]    def to_json(self) -> str:
        '''
        Required as TargetTextCollection is not json serializable due to
        the 'spans' in the TargetText instances.

        :returns: The collection as a String where each line is a
                  TargetText instance encoded as a json dictionary. Any
                  metadata is JSON serialized on the final line.
        '''
        json_text = ''
        for index, target_text_instance in enumerate(self.values()):
            if index != 0:
                json_text += '\n'
            target_text_instance: TargetText
            json_text += target_text_instance.to_json()
        if self.metadata is not None:
            if json_text != '':
                json_text += '\n'
            json_text += json.dumps({'metadata': self.metadata})
        return json_text
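For instance, a one-sample collection would serialize as below (a sketch; the exact key order and default keys in each line depend on the TargetText instance):

    collection = TargetTextCollection(
        [TargetText(text='The camera is great', text_id='0')], name='demo')
    print(collection.to_json())
    # Roughly: {"text": "The camera is great", "text_id": "0", ...}
    #          {"metadata": {"name": "demo"}}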
[docs]    def to_conll(self, gold_label_key: str,
                 prediction_key: Optional[str] = None) -> str:
        '''
        This in effect performs the `to_conll` function for each TargetText
        within the collection and separates each of the CONLL strings with
        a new line.

        :param gold_label_key: A key that contains a sequence of labels
                               e.g. [`B`, `I`, `O`]. This can come from the
                               return of the :py:meth:`sequence_labels`
        :param prediction_key: Key to the predicted labels of the
                               `gold_label_key`. The prediction key values
                               are a list of lists of predicted labels. Each
                               list is therefore a different model run,
                               hence creating `PREDICTION 1`,
                               `PREDICTION 2` etc. Thus the values of
                               `prediction_key` must be of shape
                               (number runs, number tokens)
        :returns: A CONLL formatted string where the format will be the
                  following: `TOKEN#GOLD LABEL#PREDICTION 1#PREDICTION 2`
                  where each token and its relevant labels are on separate
                  new lines. The first line will always contain the
                  following: `# {text_id: `value`}` where the text_id
                  represents the `text_id` of the TargetText, this will
                  allow the CONLL string to be uniquely identified back to
                  the TargetText object. Also each TargetText CONLL string
                  will be separated by a new line.
        '''
        conll_string = ''
        for target_text in self.values():
            target_conll = target_text.to_conll(gold_label_key=gold_label_key,
                                                prediction_key=prediction_key)
            conll_string += f'{target_conll}\n\n'
        return conll_string
[docs]    def to_conll_file(self, conll_fp: Path, gold_label_key: str,
                      prediction_key: Optional[str] = None) -> None:
        '''
        Writes the output of `to_conll` to the `conll_fp` file.

        :param conll_fp: Write the CONLL string to this file path.
        :param gold_label_key: A key that contains a sequence of labels
                               e.g. [`B`, `I`, `O`]. This can come from the
                               return of the :py:meth:`sequence_labels`
        :param prediction_key: Key to the predicted labels of the
                               `gold_label_key`. The prediction key values
                               are a list of lists of predicted labels. Each
                               list is therefore a different model run,
                               hence creating `PREDICTION 1`,
                               `PREDICTION 2` etc. Thus the values of
                               `prediction_key` must be of shape
                               (number runs, number tokens)
        '''
        conll_string = self.to_conll(gold_label_key=gold_label_key,
                                     prediction_key=prediction_key)
        with conll_fp.open('w+') as conll_file:
            conll_file.write(conll_string)
[docs]    def load_conll(self, conll_fp: Path,
                   tokens_key: str = 'tokenized_text',
                   gold_label_key: Optional[str] = None,
                   prediction_key: Optional[str] = None) -> None:
        '''
        This takes the `conll_fp` and loads the CONLL data into the
        relevant TargetText samples in this collection using the TargetText
        `from_conll` function. The matching of TargetText with CONLL data
        is through the CONLL string containing `# {text_id: _id}` for each
        CONLL sentence/text.

        :param tokens_key: Key to save the CONLL tokens to, for the
                           TargetText.
        :param gold_label_key: Key to save the gold labels to. At least one
                               of `gold_label_key` and `prediction_key`
                               must not be `None`, for the TargetText.
        :param prediction_key: Key to save the prediction labels to. The
                               value will be of shape
                               (number runs, number tokens), for the
                               TargetText.
        '''
        def _line_divider(line: str) -> bool:
            return line.strip() == ''

        with conll_fp.open('r') as conll_file:
            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(conll_file,
                                                       _line_divider):
                if is_divider:
                    continue
                lines = list(lines)
                # The first line is the JSON comment holding the `text_id`
                # that matches the CONLL data to a TargetText.
                _id_line = lines[0]
                _id_line = _id_line.lstrip('#').strip()
                text_id = json.loads(_id_line)['text_id']
                # Find the relevant TargetText
                lines = [line.strip() for line in lines if line.strip() != '']
                conll_line = '\n'.join(lines[1:])
                self[text_id].from_conll(conll_line, tokens_key=tokens_key,
                                         gold_label_key=gold_label_key,
                                         prediction_key=prediction_key)
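A sketch of the CONLL round trip, assuming `collection` already holds the matching samples and using the `tokenize` and `sequence_labels` methods defined further down this class (the file name is hypothetical):

    from pathlib import Path

    conll_fp = Path('dataset.conll')  # hypothetical path
    collection.tokenize(str.split)    # whitespace tokens are character preserving
    collection.sequence_labels()
    collection.to_conll_file(conll_fp, gold_label_key='sequence_labels')
    # Later, load the gold labels back into the same samples under a new key.
    collection.load_conll(conll_fp, gold_label_key='gold_labels')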
    @staticmethod
    def _get_metadata(json_iterable: Iterable[str]
                      ) -> Tuple[Union[Dict[str, Any], None],
                                 Union[str, None], bool]:
        '''
        :param json_iterable: An iterable that generates JSON strings, of
                              which the last string contains the metadata
                              if it exists.
        :returns: The metadata for the collection being loaded, as a Tuple
                  of length 3 where the 3 items are: 1. The metadata,
                  2. The name of the collection, and 3. Whether it has been
                  anonymised. The first 2 are None by default and the third
                  is False by default.
        '''
        metadata = None
        name = None
        anonymised = False
        # A deque with maxlen 1 keeps only the last line of the iterable.
        for line in deque(json_iterable, 1):
            if line.strip():
                json_line = json.loads(line)
                if 'metadata' in json_line:
                    metadata = json_line['metadata']
                    if 'name' in metadata:
                        name = metadata['name']
                    if 'anonymised' in metadata:
                        anonymised = metadata['anonymised']
        return metadata, name, anonymised
[docs]    @staticmethod
    def from_json(json_text: str, **target_text_collection_kwargs
                  ) -> 'TargetTextCollection':
        '''
        Required as the json text is expected to be the return from
        self.to_json. This string is not parsable by a standard json
        decoder.

        :param json_text: Text that is expected to contain a dictionary
                          like object on each new line, where each line is
                          parsable by the TargetText.from_json method.
        :param target_text_collection_kwargs: Keyword arguments to give to
                                              the TargetTextCollection
                                              constructor.
        :returns: A TargetTextCollection based on each new line in the
                  given text.
        :raises AnonymisedError: If the `TargetText` objects being loaded
                                 are anonymised but the
                                 `target_text_collection_kwargs` argument
                                 contains `anonymised` False, as you cannot
                                 de-anonymise without performing
                                 :py:meth:`target_extraction.data_types.TargetTextCollection.de_anonymise`.
        '''
        if json_text.strip() == '':
            return TargetTextCollection(**target_text_collection_kwargs)

        target_text_instances = []
        metadata, name, anonymised = TargetTextCollection._get_metadata(json_text.split('\n'))
        for line in json_text.split('\n'):
            json_line = json.loads(line)
            if 'metadata' not in json_line:
                target_text_instance = TargetText.from_json(line, anonymised=anonymised)
                target_text_instances.append(target_text_instance)
        # Keyword arguments override the metadata
        if 'name' in target_text_collection_kwargs:
            name = target_text_collection_kwargs['name']
        if 'metadata' in target_text_collection_kwargs:
            metadata = target_text_collection_kwargs['metadata']
        if 'anonymised' in target_text_collection_kwargs:
            anonymised = target_text_collection_kwargs['anonymised']
        return TargetTextCollection(target_text_instances, name=name,
                                    metadata=metadata, anonymised=anonymised)
[docs]    @staticmethod
    def load_json(json_fp: Path, **target_text_collection_kwargs
                  ) -> 'TargetTextCollection':
        '''
        Allows loading a dataset from json, where the json file is expected
        to be the output from TargetTextCollection.to_json_file as the file
        will contain a json String on each line generated from
        TargetText.to_json. This will also load any metadata that was
        stored within the TargetTextCollection.

        :param json_fp: File that contains json strings generated from
                        TargetTextCollection.to_json_file
        :param target_text_collection_kwargs: Keyword arguments to give to
                                              the TargetTextCollection
                                              constructor. If there was any
                                              metadata stored within the
                                              loaded json then these
                                              keyword arguments would
                                              override the stored metadata.
        :returns: A TargetTextCollection based on each new line in the
                  given json file, and the optional metadata on the last
                  line.
        '''
        target_text_instances = []
        # First pass over the file recovers the metadata, which is stored
        # on the last line if it exists.
        with json_fp.open('r') as json_file:
            metadata, name, anonymised = TargetTextCollection._get_metadata(json_file)
        with json_fp.open('r') as json_file:
            for line in json_file:
                if line.strip():
                    json_line = json.loads(line)
                    if 'metadata' not in json_line:
                        target_text_instance = TargetText.from_json(line, anonymised)
                        target_text_instances.append(target_text_instance)
        # Keyword arguments override the metadata
        if 'name' in target_text_collection_kwargs:
            name = target_text_collection_kwargs['name']
        if 'metadata' in target_text_collection_kwargs:
            metadata = target_text_collection_kwargs['metadata']
        if 'anonymised' in target_text_collection_kwargs:
            anonymised = target_text_collection_kwargs['anonymised']
        return TargetTextCollection(target_text_instances, name=name,
                                    metadata=metadata, anonymised=anonymised)
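A minimal save/load sketch (the file name is hypothetical, and `to_json_file` is defined just below):

    from pathlib import Path

    json_fp = Path('collection.json')  # hypothetical path
    collection.to_json_file(json_fp, include_metadata=True)
    loaded = TargetTextCollection.load_json(json_fp)
    # The name is restored from the metadata line at the end of the file.
    assert loaded.name == collection.name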
[docs] def to_json_file(self, json_fp: Path, include_metadata: bool = False) -> None: ''' Saves the current TargetTextCollection to a json file which won't be strictly json but each line in the file will be and each line in the file can be loaded in from String via TargetText.from_json. Also the file can be reloaded into a TargetTextCollection using TargetTextCollection.load_json. :param json_fp: File path to the json file to save the current data to. :param include_metadata: Whether or not to include the metadata when writing to file. ''' with json_fp.open('w+') as json_file: for index, target_text_instance in enumerate(self.values()): target_text_instance: TargetText target_text_string = target_text_instance.to_json() if index != 0: target_text_string = f'\n{target_text_string}' json_file.write(target_text_string) if self.metadata is not None and include_metadata: metadata_to_write = {'metadata': self.metadata} json_file.write(f'\n{json.dumps(metadata_to_write)}')
[docs]    def tokenize(self, tokenizer: Callable[[str], List[str]]) -> None:
        '''
        This applies the TargetText.tokenize method across all of the
        TargetText instances within the collection.

        For a set of tokenizers that are definitely compatible see the
        target_extraction.tokenizers module.

        Ensures that the tokenization is character preserving.

        :param tokenizer: The tokenizer to use to tokenize the text for
                          each TargetText instance in the current collection
        :raises TypeError: If the tokenizer given does not return a List of
                           Strings.
        :raises ValueError: This is raised if any of the TargetText
                            instances in the collection contain an empty
                            string.
        :raises ValueError: If the tokenization is not character preserving.
        '''
        for index, target_text_instance in enumerate(self.values()):
            if index == 0:
                target_text_instance.tokenize(tokenizer, True)
            else:
                target_text_instance.tokenize(tokenizer, False)
[docs]    def pos_text(self, tagger: Callable[[str], List[str]]) -> None:
        '''
        This applies the TargetText.pos_text method across all of the
        TargetText instances within the collection.

        For a set of POS taggers that are definitely compatible see the
        target_extraction.pos_taggers module.

        :param tagger: POS tagger.
        :raises TypeError: If the POS tagger given does not return a List
                           of Strings.
        :raises ValueError: This is raised if any of the TargetText
                            instances in the collection contain an empty
                            string.
        :raises ValueError: If the Target Text instance has not been
                            tokenized.
        :raises ValueError: If the number of POS tags for a Target Text
                            instance does not match the number of tokens
                            that have been generated by the tokenizer
                            function.
        '''
        for index, target_text_instance in enumerate(self.values()):
            if index == 0:
                target_text_instance.pos_text(tagger, True)
            else:
                target_text_instance.pos_text(tagger, False)
[docs] def force_targets(self) -> None: ''' This applies the TargetText.force_targets method across all of the TargetText instances within the collection. ''' for target_text_instance in self.values(): target_text_instance.force_targets()
[docs]    def sequence_labels(self, return_errors: bool = False,
                        **target_sequence_label_kwargs
                        ) -> List['TargetText']:
        '''
        This applies the TargetText.sequence_labels method across all of
        the TargetText instances within the collection.

        :param return_errors: Returns TargetText objects that have caused
                              a ValueError to be raised.
        :param target_sequence_label_kwargs: Any keyword arguments to give
                                             to the TargetText
                                             `sequence_labels` function.
        :returns: A list of TargetText objects that have caused a
                  ValueError to be raised if `return_errors` is True, else
                  an empty list will be returned.
        :raises KeyError: If the current TargetText has not been tokenized.
        :raises ValueError: If two targets overlap the same token(s) e.g.
                            in `Laptop cover was great`, if `Laptop` and
                            `Laptop cover` are two separate targets this
                            should raise a ValueError as a token should
                            only be associated to one target.
        '''
        errored_targets = []
        for target_text_instance in self.values():
            if return_errors:
                try:
                    target_text_instance.sequence_labels(**target_sequence_label_kwargs)
                except ValueError:
                    errored_targets.append(target_text_instance)
            else:
                target_text_instance.sequence_labels(**target_sequence_label_kwargs)
        return errored_targets
[docs]    def exact_match_score(self,
                          predicted_sequence_key: str = 'predicted_sequence_labels'
                          ) -> Tuple[float, float, float,
                                     Dict[str, List[Tuple[str, Span]]]]:
        '''
        Just for clarification, we use the sequence label tags to find the
        predicted spans. However even a perfect sequence label score does
        not mean you will have a perfect exact span score, as the tokenizer
        used for the sequence labelling might not align perfectly with the
        annotated spans.

        The False Positive mistakes, False Negative mistakes, and correct
        True Positive dictionary keys are those names, with the values
        being a List of Tuples where the Tuple is made up of the TargetText
        instance ID and the Span that was incorrect (FP), not tagged (FN),
        or correct (TP). An example of this is as follows:
        {`FP`: [('1', Span(0, 4))], 'FN': [], 'TP': []}

        :param predicted_sequence_key: Key of the predicted sequence labels
                                       within this TargetText instance.
        :returns: Recall, Precision, and F1 score, and the False Positive
                  mistakes, False Negative mistakes, and correct True
                  Positives in a Dict. All of these measures are based on
                  exact span matching rather than the matching of the
                  sequence label tags, this is due to the annotation spans
                  not always matching tokenization, therefore this removes
                  the tokenization error that can come from the sequence
                  label measures.
        :raises KeyError: If there is no predicted sequence label key
                          within a TargetText.
        :raises ValueError: If the predicted or true spans contain multiple
                            spans that have the same span e.g.
                            [Span(4, 15), Span(4, 15)]
        '''
        # tp = True Positive count
        tp = 0.0
        num_pred_true = 0.0
        num_actually_true = 0.0
        fp_mistakes: List[Tuple[str, Span]] = []
        fn_mistakes: List[Tuple[str, Span]] = []
        correct_tp: List[Tuple[str, Span]] = []

        for target_text_index, target_text_instance in enumerate(self.values()):
            if target_text_index == 0:
                keys_to_check = ['spans', f'{predicted_sequence_key}']
                for key in keys_to_check:
                    target_text_instance._key_error(key)
            predicted_spans = target_text_instance.get_sequence_spans(predicted_sequence_key)
            # Add to the number of predicted true and actually true
            predicted_spans: List[Span]
            num_pred_true += len(predicted_spans)
            true_spans: List[Span] = target_text_instance['spans']
            if true_spans is None:
                true_spans = []
            num_actually_true += len(true_spans)

            # This should be impossible to get to
            if len(predicted_spans) != len(set(predicted_spans)):
                raise ValueError(f'Predicted spans {predicted_spans} contain'
                                 f' multiple of the same predicted span. '
                                 f'TargetText: {target_text_instance}')
            # This is possible
            if len(true_spans) != len(set(true_spans)):
                raise ValueError(f'True spans {true_spans} contain'
                                 f' multiple of the same true span. '
                                 f'TargetText: {target_text_instance}')
            text_id = target_text_instance['text_id']
            true_spans = set(true_spans)
            for predicted_span in predicted_spans:
                if predicted_span in true_spans:
                    tp += 1
                    correct_tp.append((text_id, predicted_span))
                else:
                    fp_mistakes.append((text_id, predicted_span))
            for true_span in true_spans:
                if true_span not in predicted_spans:
                    fn_mistakes.append((text_id, true_span))

        error_analysis_dict = {'FP': fp_mistakes, 'FN': fn_mistakes,
                               'TP': correct_tp}
        if tp == 0.0:
            return 0.0, 0.0, 0.0, error_analysis_dict
        recall = tp / num_actually_true
        precision = tp / num_pred_true
        f1 = (2 * precision * recall) / (precision + recall)
        return recall, precision, f1, error_analysis_dict
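A sanity-check sketch: copying the gold `sequence_labels` into the predicted key should give a near-perfect score, though per the note above the F1 is only exactly 1.0 when the tokenization aligns with the annotated spans:

    for target_text in collection.values():
        target_text['predicted_sequence_labels'] = target_text['sequence_labels']
    recall, precision, f1, errors = collection.exact_match_score()
    print(f'R {recall:.3f} P {precision:.3f} F1 {f1:.3f}')
    print(errors['FP'], errors['FN'])  # spans lost to tokenization mismatch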
[docs]    def samples_with_targets(self) -> 'TargetTextCollection':
        '''
        :returns: All of the samples that have targets as a
                  TargetTextCollection for this TargetTextCollection.
        :raises KeyError: If either `spans` or `targets` does not exist in
                          one or more of the TargetText instances within
                          this collection. These keys are protected keys
                          thus they should always exist, but this is just a
                          warning if you have got around the protected keys.
        '''
        sub_collection = TargetTextCollection()
        sub_collection.anonymised = self.anonymised
        for target_text in self.values():
            if target_text['spans'] and target_text['targets']:
                sub_collection.add(target_text)
        return sub_collection
[docs]    def target_count(self, lower: bool = False,
                     target_key: str = 'targets') -> Dict[str, int]:
        '''
        :Note: A target may not exist (i.e. be a `None` value), as the
               target can be combined with the category like in the SemEval
               2016 Restaurant dataset. In these cases the `None` targets
               are not included in the target count.

        :param lower: Whether or not to lower the target text.
        :param target_key: The key in each TargetText sample that contains
                           the list of target words.
        :returns: A dictionary of target text as key and values as the
                  number of times the target text occurs in this
                  TargetTextCollection
        '''
        target_count: Dict[str, int] = Counter()
        for target_dict in self.values():
            if target_dict[target_key]:
                for target in target_dict[target_key]:
                    if target is None:
                        continue
                    if lower:
                        target = target.lower()
                    target_count.update([target])
        return dict(target_count)
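For instance, with lowering enabled, mentions of `Camera` and `camera` collapse into one count (a sketch, assuming `collection` is populated):

    counts = collection.target_count(lower=True)
    most_common = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    print(most_common[:10])  # the ten most frequent lowercased targets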
[docs]    def target_sentiments(self, lower: bool = False,
                          unique_sentiment: bool = False
                          ) -> Dict[str, Union[List[str], Set[str]]]:
        '''
        :Note: A target may not exist (i.e. be a `None` value), as the
               target can be combined with the category like in the SemEval
               2016 Restaurant dataset. In these cases the `None` targets
               are not included.

        :param lower: Whether or not to lower the target text.
        :param unique_sentiment: Whether or not the return is a dictionary
                                 whose values are a List of Strings or, if
                                 True, a Set of Strings.
        :returns: A dictionary where the keys are target texts and the
                  values are a List of sentiment values that have been
                  associated to that target. A sentiment value can occur
                  more than once, indicating the number of times that
                  target has been associated with that sentiment, unless
                  unique_sentiment is True, in which case a Set of
                  sentiment values is used instead of a List.
        :Explanation: If the target `camera` has occurred with the
                      sentiment `positive` twice and `negative` once then
                      it will return
                      {`camera`: [`positive`, `positive`, `negative`]}.
                      However if `unique_sentiment` is True then it will
                      return: {`camera`: {`positive`, `negative`}}.
        '''
        target_sentiment_values: Dict[str, List[str]] = defaultdict(list)
        if unique_sentiment:
            target_sentiment_values: Dict[str, Set[str]] = defaultdict(set)
        for target_dict in self.values():
            if target_dict['targets'] and target_dict['target_sentiments']:
                for target, sentiment in zip(target_dict['targets'],
                                             target_dict['target_sentiments']):
                    if target is None:
                        continue
                    if lower:
                        target = target.lower()
                    if unique_sentiment:
                        target_sentiment_values[target].add(sentiment)
                    else:
                        target_sentiment_values[target].append(sentiment)
        return dict(target_sentiment_values)
[docs] def number_targets(self, incl_none_targets: bool = False) -> int: ''' :param incl_none_targets: Whether to include targets that are `None` and are therefore associated to the categories in the count. :returns: The total number of targets in the collection. ''' target_count = 0 for target_dict in self.values(): if target_dict['targets']: for target in target_dict['targets']: if not incl_none_targets and target is None: continue target_count += 1 return target_count
[docs] def number_categories(self) -> int: ''' :returns: The total number of categories in the collection :raises ValueError: If one of the category values in the list is of value None ''' return sum(self.category_count().values())
[docs]    def category_count(self) -> Dict[str, int]:
        '''
        :returns: A dictionary of categories as keys and values as the
                  number of times the category occurs in this
                  TargetTextCollection
        :raises ValueError: If any category has the value of None.
        '''
        categories_count = Counter()
        for target_dict in self.values():
            if target_dict['categories']:
                for category in target_dict['categories']:
                    if category is None:
                        raise ValueError('One of the category values is '
                                         f'None, within {target_dict}')
                    categories_count.update([category])
        return dict(categories_count)
[docs] def one_sample_per_span(self, remove_empty: bool = False ) -> 'TargetTextCollection': ''' This applies the TargetText.one_sample_per_span method across all of the TargetText instances within the collection to create a new collection with those new TargetText instances within it. :param remove_empty: If the TargetText instance contains any None targets then these will be removed along with their respective Spans. :returns: A new TargetTextCollection that has samples that come from this collection but has had the TargetText.one_sample_per_span method applied to it. ''' new_collection = TargetTextCollection() for target_text in self.values(): new_collection.add(target_text.one_sample_per_span(remove_empty=remove_empty)) return new_collection
[docs]    def dict_iterator(self) -> Iterable[Dict[str, Any]]:
        '''
        :returns: An iterator of all of the TargetText objects within the
                  collection as dictionaries.
        '''
        for target_text in self.values():
            target_text: TargetText
            yield dict(target_text)
[docs] def unique_distinct_sentiments(self, sentiment_key: str = 'target_sentiments' ) -> Set[int]: ''' :param sentiment_key: The key that represents the sentiment value for each TargetText object :returns: A set of the distinct sentiments within the collection. The length of the set represents the number of distinct sentiments within the collection. :raises TypeError: If the value in the sentiment_key is not of type list ''' unique_ds = set() for target_object in self.values(): sentiment_value = target_object[sentiment_key] if not isinstance(sentiment_value, list): raise TypeError(f'The sentiment key {sentiment_key} contains a' f' value that is not of type List: ' f'{sentiment_value}. TargetText object: ' f'{target_object}') unique_ds.add(len(set(sentiment_value))) # Need to remove 0's which come about because an empty list is of # length 0 if 0 in unique_ds: unique_ds.remove(0) return unique_ds
[docs]    def de_anonymise(self, text_dicts: Iterable[Dict[str, str]]) -> None:
        '''
        This will set the `anonymised` attribute to False from True and set
        the `text` key value of each TargetText in the collection to the
        value in the `text` key within the relevant dictionary from the
        `text_dicts` argument. If any Error is raised this collection will
        revert back fully to being anonymised.

        :param text_dicts: An iterable of dictionaries that contain the
                           following two keys: 1. `text` and 2. `text_id`,
                           where the `text_id` has to be a key within the
                           current collection. The `text` associated to
                           that id will become that TargetText object's
                           text value.
        :raises ValueError: If the length of the `text_dicts` does not
                            match that of the collection.
        :raises KeyError: If any of the `text_id`s in the `text_dicts` do
                          not match those within this collection.
        '''
        try:
            self_len = len(self)
            text_dict_len = {}
            for text_dict in text_dicts:
                text_dict_id = text_dict['text_id']
                text_dict_len[text_dict_id] = 1
                if text_dict_id not in self:
                    raise KeyError(f'The key {text_dict_id} from '
                                   '`text_dicts` is not in this collection.')
                self[text_dict_id].de_anonymise(text_dict)
            text_dict_len = len(text_dict_len)
            if self_len != text_dict_len:
                raise ValueError(f'The length of collection {self_len} is '
                                 'not equal to the length of the '
                                 f'`text_dicts` {text_dict_len}.')
        except Exception as e:
            # Clean up after the exception, as we have to preserve the
            # invariant that the collection is still anonymised.
            for target_text in self.values():
                if not target_text.anonymised:
                    target_text.anonymised = True
            raise e
        self.anonymised = False
[docs]    def sanitize(self) -> None:
        '''
        This applies the TargetText.sanitize function to all of the
        TargetText instances within this collection, effectively ensuring
        that all of the instances follow the specified rules that
        TargetText instances should follow.
        '''
        for target_text in self.values():
            target_text.sanitize()
[docs]    def in_order(self) -> bool:
        '''
        This returns True if all TargetText objects within the collection
        contain a list of targets that are in order of appearance within
        the text from left to right e.g. if the only TargetText in the
        collection contains two targets, where the first target in the
        `targets` list is the first (left most) target in the text, then
        this method would return True.

        :returns: True if all the `targets` within all the TargetText
                  objects in this collection are in sequential left to
                  right order within the text.
        '''
        for target_text in self.values():
            if not target_text.in_order():
                return False
        return True
[docs]    def re_order(self, keys_not_to_order: Optional[List[str]] = None) -> None:
        '''
        This will apply
        :py:meth:`target_extraction.data_types.TargetText.re_order` to each
        TargetText within the collection.

        :param keys_not_to_order: Any keys within the TargetTexts that do
                                  not need re-ordering
        '''
        # This takes into account the rollback problem, where an error
        # occurs halfway through performing the function and half the
        # collection has been re-ordered whereas the other half has not.
        # This will bring it back into a stable state.
        self_copy = copy.deepcopy(self._storage)
        try:
            for target_text in self.values():
                target_text.re_order(keys_not_to_order)
        except Exception as e:
            self._storage = self_copy
            raise e
[docs]    def add_unique_key(self, id_key: str, id_key_name: str,
                       id_delimiter: str = '::') -> None:
        '''
        Applies
        :py:meth:`target_extraction.data_types.TargetText.add_unique_key`
        to each TargetText within this collection.

        :param id_key: The name of the key within this TargetText that
                       requires unique ids that will be stored in
                       `id_key_name`.
        :param id_key_name: The name of the key to associate to these new
                            unique ids.
        :param id_delimiter: The delimiter to separate the `text_id` and
                             the index of the `id_key` that is being
                             represented by this unique id.
        '''
        for value in self.values():
            value.add_unique_key(id_key, id_key_name, id_delimiter=id_delimiter)
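A sketch of the resulting ids; per the docstring above, each id should be the `text_id` joined to the index by the delimiter (the exact values shown are illustrative):

    collection.add_unique_key('targets', 'targets_id')
    # A TargetText with `text_id` '0' and two targets would gain
    # e.g. {'targets_id': ['0::0', '0::1'], ...}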
[docs] def key_difference(self, other_collection: 'TargetTextCollection' ) -> List[str]: ''' :param other_collection: The collection that is being compared to this. :returns: A list of keys that represent all of the keys that are in the other (compared) collection and not in this collection. ''' this_keys = {key for value in self.values() for key in value.keys()} other_keys = {key for value in other_collection.values() for key in value.keys()} return list(other_keys.difference(this_keys))
[docs]    def combine_data_on_id(self, other_collection: 'TargetTextCollection',
                           id_key: str, data_keys: List[str],
                           raise_on_overwrite: bool = True,
                           check_same_ids: bool = True) -> None:
        '''
        :param other_collection: The collection that contains the data that
                                 is to be copied to this collection.
        :param id_key: The key that indicates in each TargetText within
                       this and the `other_collection` how the values are
                       to be copied from the `other_collection` to this
                       collection.
        :param data_keys: The keys of the values in each TargetText within
                          the `other_collection` that are to be copied to
                          the relevant TargetTexts within this collection.
                          It assumes that if any of the key/values are a
                          list of lists then the inner lists relate to the
                          targets and the outer list is not related to the
                          targets.
        :param raise_on_overwrite: If True will raise the
                                   :py:class:`target_extraction.data_types_util.OverwriteError`
                                   if any of the `data_keys` exist in any
                                   of the TargetTexts within this
                                   collection.
        :param check_same_ids: If True will ensure that this collection and
                               the other collection are of the same length
                               and check if each have the same unique ids.
        :raises AssertionError: If the number of IDs from the `id_key` does
                                not match the number of data to be added to
                                a data key.
        :raises ValueError: If `check_same_ids` is True and the two
                            collections are either not of the same length
                            or have different unique ids according to
                            `id_key` within the TargetText objects.
        :raises OverwriteError: If `raise_on_overwrite` is True and any of
                                the `data_keys` exist in any of the
                                TargetTexts within this collection.
        '''
        def sort_data_by_key(key: str, self_target_text: TargetText,
                             other_target_text: TargetText,
                             data_to_sort: List[Any]) -> List[Any]:
            '''
            :param key: A key that appears in both `self_target_text` and
                        `other_target_text`, where the key for both
                        represents values that appear in both and are
                        unique.
            :param self_target_text: A TargetText object where the values
                                     in `key` will determine the sorting
                                     performed on `data_to_sort`.
            :param other_target_text: The TargetText that `data_to_sort`
                                      comes from, whose `key` values are in
                                      `data_to_sort` order.
            :param data_to_sort: Data that has come from
                                 `other_target_text` that is to be sorted
                                 based on `key` values from
                                 `self_target_text`.
            :returns: The `data_to_sort` ordered by the values in
                      `self_target_text` key `key`.
            :raises AssertionError: If the number of IDs from the `key`
                                    does not match the number of
                                    data_to_sort.
            '''
            self_data_values = []
            num_ids = len(other_target_text[key])
            num_data = len(data_to_sort)
            assert_err = (f'The ID key {key} contains {num_ids}, however '
                          'the number of values/data to be added from the '
                          f'other TargetText is {num_data} which is '
                          f'{data_to_sort} '
                          f'OtherTargetText {other_target_text}\n'
                          f'SelfTargetText {self_target_text}')
            assert num_ids == num_data, assert_err
            for self_id_value in self_target_text[key]:
                index_other_id_value = other_target_text[key].index(self_id_value)
                self_data_values.append(data_to_sort[index_other_id_value])
            return self_data_values

        if check_same_ids:
            len_self = len(self)
            len_other = len(other_collection)
            if len_self != len_other:
                raise ValueError('The two collections are not the same length. '
                                 f'This length {len_self} other {len_other}')
            self_ids = {_id for value in self.values() for _id in value[id_key]}
            other_ids = {_id for value in other_collection.values()
                         for _id in value[id_key]}
            self_differences = self_ids.difference(other_ids)
            other_differences = other_ids.difference(self_ids)
            all_differences = self_differences.union(other_differences)
            if len(all_differences):
                raise ValueError(f'The two collections do not contain the same'
                                 f' ids. The difference between this and the '
                                 f'other are the following ids {self_differences}'
                                 f'\nThe difference between the other and this '
                                 f'is the following {other_differences}')
        # If an error occurs it would be good to have a rollback policy
        # that will return this collection back to its original self
        self_copy = copy.deepcopy(self._storage)
        try:
            for text_id, self_target_text in self.items():
                other_target_text = other_collection[text_id]
                # Cannot assume that the unique ids will be in the same order.
                for data_key in data_keys:
                    if data_key in self_target_text and raise_on_overwrite:
                        raise OverwriteError(f'The following data key {data_key}'
                                             ' exists in the following TargetText'
                                             f' {self_target_text} within this collection. '
                                             'The other TargetText that contains '
                                             'this data key to copy the data from '
                                             f'is {other_target_text}')
                    self_data_values = []
                    other_data_values = other_target_text[data_key]
                    # If the other_data_values is a list of lists, the
                    # sorting of the targets should only be applied to the
                    # inner lists.
                    is_inner_list = False
                    if isinstance(other_data_values, list):
                        if other_data_values:
                            if isinstance(other_data_values[0], list):
                                is_inner_list = True
                    if is_inner_list:
                        for other_inner_list_data in other_data_values:
                            self_inner_list_data = sort_data_by_key(id_key,
                                                                    self_target_text,
                                                                    other_target_text,
                                                                    other_inner_list_data)
                            self_data_values.append(self_inner_list_data)
                    else:
                        self_data_values = sort_data_by_key(id_key,
                                                            self_target_text,
                                                            other_target_text,
                                                            other_data_values)
                    self_target_text[data_key] = self_data_values
        except Exception as e:
            self._storage = self_copy
            raise e
[docs]    def one_sentiment_text(self, sentiment_key: str,
                           average_sentiment: bool = False,
                           text_sentiment_key: str = 'text_sentiment'
                           ) -> None:
        '''
        Adds the `text_sentiment_key` to each TargetText within the
        collection, where the value will represent the sentiment value for
        the text based on the `sentiment_key` values and
        `average_sentiment` determining how to handle multiple sentiments.
        This will allow text level classifiers to be trained on
        target/aspect/category data.

        :param sentiment_key: The key in the TargetTexts that represents
                              the sentiment for the TargetText's sentence.
        :param average_sentiment: If False it will only add the
                                  `text_sentiment_key` to TargetTexts that
                                  have one unique sentiment in the
                                  `sentiment_key` e.g. they can have more
                                  than one sentiment value in the
                                  `sentiment_key` but each one of those
                                  values has to be the same value. If True
                                  it will choose the most frequent
                                  sentiment; ties are decided by random
                                  choice. If there are no values in
                                  `sentiment_key` then `text_sentiment_key`
                                  will not be added to the TargetText.
        :param text_sentiment_key: The key to add the text level sentiment
                                   value to.
        '''
        for target_text in self.values():
            target_text: TargetText
            target_text._key_error(sentiment_key)
            sentiments = target_text[sentiment_key]
            if average_sentiment:
                if len(sentiments) == 1:
                    target_text[text_sentiment_key] = sentiments[0]
                elif len(sentiments) == 0:
                    continue
                else:
                    # Find all sentiments tied for the highest count and
                    # break ties randomly.
                    sentiment_counts = Counter(sentiments)
                    sorted_counts = sorted(sentiment_counts.items(),
                                           key=lambda x: x[1], reverse=True)
                    highest_count = sorted_counts[0][1]
                    highest_sentiment_values = []
                    for sentiment_value, count in sorted_counts:
                        if count == highest_count:
                            highest_sentiment_values.append(sentiment_value)
                    assert highest_sentiment_values
                    random_sentiment_value = random.choice(highest_sentiment_values)
                    target_text[text_sentiment_key] = random_sentiment_value
            else:
                if len(sentiments) == 1:
                    target_text[text_sentiment_key] = sentiments[0]
                elif len(sentiments) > 1 and len(set(sentiments)) == 1:
                    target_text[text_sentiment_key] = sentiments[0]
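A sketch of promoting target sentiments to a single text-level label, assuming `collection` holds `target_sentiments` values:

    collection.one_sentiment_text('target_sentiments', average_sentiment=True)
    for target_text in collection.values():
        # TargetTexts with no sentiment values will not have the new key,
        # hence .get rather than indexing.
        print(target_text['text_id'], target_text.get('text_sentiment'))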
[docs]    @staticmethod
    def combine(*collections) -> 'TargetTextCollection':
        '''
        :param collections: An iterator containing one or more
                            TargetTextCollections
        :returns: A TargetTextCollection that is the combination of all of
                  those given.

        :NOTE: If any of the collections are anonymised then the returned
               collection will also be anonymised, even if only one of the
               collections has been anonymised.
        '''
        target_objects: List['TargetText'] = []
        is_anonymised = False
        for collection in collections:
            if collection.anonymised:
                is_anonymised = True
            for target in collection.values():
                target_objects.append(target)
        return TargetTextCollection(target_objects, anonymised=is_anonymised)
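A usage sketch; note the length check assumes no shared `text_id`s, since samples with the same id would overwrite each other in the underlying dictionary:

    train = TargetTextCollection(name='train')
    test = TargetTextCollection(name='test')
    combined = TargetTextCollection.combine(train, test)
    assert len(combined) == len(train) + len(test)  # assumes disjoint text_ids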
[docs]    @staticmethod
    def same_data(collections: List['TargetTextCollection']
                  ) -> List[Tuple[List[Tuple['TargetText', 'TargetText']],
                                  Tuple[str, str]]]:
        '''
        :param collections: A list of TargetTextCollections to test for
                            duplicates based on `text_id` and `text` key
                            values.
        :returns: If the list is empty then there are no duplicates. Else a
                  list of tuples containing 1. A list of tuples of
                  duplicate TargetText instances, and 2. A tuple of the
                  names of the collections that the duplicate TargetText
                  instances came from.
        '''
        all_matches = []
        for collection_index, collection in enumerate(collections):
            for other_collection in collections[collection_index + 1:]:
                same_targets = []
                for target_text in collection.values():
                    for other_target_text in other_collection.values():
                        if target_text['text_id'] == other_target_text['text_id']:
                            same_targets.append((target_text, other_target_text))
                        elif target_text['text'] == other_target_text['text']:
                            same_targets.append((target_text, other_target_text))
                if same_targets:
                    all_matches.append((same_targets,
                                        (collection.name, other_collection.name)))
        return all_matches
    def __setitem__(self, key: str, value: 'TargetText') -> None:
        '''
        Will add the TargetText instance to the collection where the key
        should be the same as the TargetText instance's 'text_id'.

        :param key: Key to be added or changed
        :param value: TargetText instance associated to this key. The key
                      should be the same value as the TargetText instance's
                      'text_id' value. Furthermore if the
                      TargetTextCollection's `anonymised` attribute is True
                      then the TargetText object being added will also be
                      anonymised.
        '''
        if not isinstance(value, TargetText):
            raise TypeError('The value should be of type TargetText and not '
                            f'{type(value)}')
        text_id = value['text_id']
        if text_id != key:
            raise ValueError(f'The value `text_id`: {text_id} should be the '
                             f'same value as the key: {key}')
        # We copy it to stop any mutable objects from changing outside of
        # the collection
        value_copy = copy.deepcopy(value)
        value_copy.anonymised = self.anonymised
        self._storage[key] = value_copy

    def __delitem__(self, key: str) -> None:
        '''
        Given a key that matches a key within self._storage or self.keys()
        it will delete that key and value from this object.

        :param key: Key and its respective value to delete from this
                    object.
        '''
        del self._storage[key]

    def __eq__(self, other: 'TargetTextCollection') -> bool:
        '''
        Two TargetTextCollection instances are equal if they both have the
        same TargetText instances within them.

        :param other: Another TargetTextCollection object that is being
                      compared to this TargetTextCollection object.
        :returns: True if they have the same TargetText instances within
                  them.
        '''
        if not isinstance(other, TargetTextCollection):
            return False
        if len(self) != len(other):
            return False
        for key in self.keys():
            if key not in other:
                return False
        return True

    def __repr__(self) -> str:
        '''
        :returns: The String that the user sees when the instance is
                  printed, including within an interpreter.
        '''
        rep_text = 'TargetTextCollection('
        for key, value in self.items():
            rep_text += f'key: {key}, value: {value}'
            break
        if len(self) > 1:
            rep_text += '...)'
        else:
            rep_text += ')'
        return rep_text

    def __len__(self) -> int:
        '''
        :returns: The number of TargetText instances in the collection.
        '''
        return len(self._storage)

    def __iter__(self) -> Iterable[str]:
        '''
        Returns an iterator over the TargetText instances' 'text_id's that
        are stored in this collection. This is an ordered iterator as the
        underlying dictionary used to store the TargetText instances is an
        OrderedDict in self._storage.

        :returns: TargetText instances' 'text_id's that are stored in this
                  collection
        '''
        return iter(self._storage)

    def __getitem__(self, key: str) -> 'TargetText':
        '''
        :returns: A TargetText instance that is stored within this
                  collection.
        '''
        return self._storage[key]