Source code for bella.data_types

'''
Module that contains the various data types:

1. Target -- Mutable data store for a single Target value i.e. one training 
   example.
2. TargetCollection -- Mutable data store for Target data types.  i.e. A 
   data store that contains multiple Target instances.
'''

from collections.abc import MutableMapping
from collections import OrderedDict, defaultdict
import copy
import json
from pathlib import Path
import random as rand
from typing import List, Callable, Union, Dict, Tuple, Any, Optional, Set
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import seaborn as sns

from bella.tokenisers import whitespace
from bella.stanford_tools import constituency_parse

BELLA_DATASET_DIR: Path = Path.home().joinpath('.Bella', 'Datasets')

class Target(MutableMapping):
    '''
    Mutable data store for a single Target value. This should be used as the
    value for Target information where it contains all data required to be
    classified as Target data for Target based sentiment classification.

    Overrides the collections.abc.MutableMapping abstract class.

    Reference on how I created the class:
    http://www.kr41.net/2016/03-23-dont_inherit_python_builtin_dict_type.html
    https://docs.python.org/3/library/collections.abc.html

    Functions changed compared to normal:

    1. __delitem__ -- Will only delete the `target_id` key.
    2. __eq__ -- Two Targets are the same if they either have the same
       `target_id` or they have the same values for the minimum keys
       ['spans', 'text', 'target', 'sentiment'].
    3. __setitem__ -- Only allows you to add/modify the `predicted` key which
       represents the predicted sentiment for the Target instance.
    '''
    def __init__(self, spans, target_id, target, text, sentiment,
                 predicted=None, sentence_id=None, category=None,
                 augmented=None, transfer_data=None, original_target_id=None,
                 original_sentence_id=None, epoch_number=None):
        '''
        :param target: Target that the sentiment is about. e.g. Iphone
        :param sentiment: Sentiment of the target.
        :param text: The text context that the target and sentiment is within.
        :param target_id: Unique ID. Has to be unique within the
                          TargetCollection if it is put into a
                          TargetCollection.
        :param spans: List of tuples where each tuple is of length 2 and
                      contains the exclusive range of one instance of the
                      Target word in the text context. The reason it is a
                      list is because the Target word can be mentioned more
                      than once e.g. `The Iphone was great but the iphone is
                      small`. The first Int in the tuple has to be less than
                      the second Int.
        :param predicted: If given adds the predicted sentiment value.
        :param sentence_id: Unique ID of the sentence that the target is
                            within. More than one target can have the same
                            sentence.
        :param category: In some datasets there is category information where
                         the target is assigned a category. This comes from
                         the SemEval 2015 restaurant dataset.
        :param augmented: Whether or not the data comes from an augmented
                          dataset or has been produced from an augmentation
                          function.
        :param transfer_data: Whether or not the data comes from a transfer
                              dataset.
        :param original_target_id: If the augmented field is True then the
                                   original_target_id has to exist, as this
                                   records which original target was used to
                                   create this augmented target data.
        :param original_sentence_id: If the data has been augmented in such a
                                     way that the sentence text has been
                                     changed then this will store the sentence
                                     ID that relates to the original text.
        :param epoch_number: The epochs which this target should be sampled
                             from. This is only applicable when using this
                             with a custom sampler; it allows you to state
                             when you may want this target to be sampled. It
                             is a Set of integers as this allows you to
                             sample the target more than once.
        :type target: String
        :type sentiment: String or Int (Based on annotation schema)
        :type text: String
        :type target_id: String
        :type spans: list
        :type predicted: Same type as sentiment. Default None (Optional)
        :type sentence_id: String. Default None (Optional)
        :type category: String. Default None (Optional)
        :type augmented: bool. Default None (Optional)
        :type transfer_data: bool. Default None (Optional)
        :type original_target_id: String
        :type original_sentence_id: String
        :type epoch_number: Set of Integers. Default {-1} (Optional)
        :returns: Nothing. Constructor.
        :rtype: None
        '''
        if epoch_number is None:
            # The default is created here rather than in the signature to
            # avoid a shared mutable default argument.
            epoch_number = set([-1])
        if not isinstance(target_id, str):
            raise TypeError('The target ID has to be of type String and not '
                            '{}'.format(type(target_id)))
        if not isinstance(target, str):
            raise TypeError('The target has to be of type String and not {}'
                            .format(type(target)))
        if not isinstance(text, str):
            raise TypeError('The text has to be of type String and not {}'
                            .format(type(text)))
        if not isinstance(sentiment, (str, int)):
            raise TypeError('The sentiment has to be of type String or Int '
                            'and not {}'.format(type(sentiment)))
        if not isinstance(spans, list):
            raise TypeError('The spans has to be of type list and not {}'
                            .format(type(spans)))
        else:
            if len(spans) < 1:
                raise TypeError('spans has to contain at least one tuple '
                                'not None')
            else:
                for span in spans:
                    if not isinstance(span, tuple):
                        raise TypeError('Spans has to be a list of tuples '
                                        'not {}'.format(type(span)))
                    if len(span) != 2:
                        raise ValueError('Spans must contain tuples of '
                                         'length 2 not {}'.format(spans))
                    if not isinstance(span[0], int) or \
                       not isinstance(span[1], int):
                        raise TypeError('spans must be made of tuples '
                                        'containing two Integers not {}'
                                        .format(span))
                    if span[1] <= span[0]:
                        raise ValueError('The first integer in a span must '
                                         'be less than the second integer '
                                         '{}'.format(span))
        temp_dict = dict(spans=spans, target_id=target_id, target=target,
                         text=text, sentiment=sentiment,
                         epoch_number=epoch_number)
        if sentence_id is not None:
            if not isinstance(sentence_id, str):
                raise TypeError('`sentence_id` has to be a String and not {}'
                                .format(type(sentence_id)))
            temp_dict['sentence_id'] = sentence_id
        if category is not None:
            temp_dict['category'] = category
        if augmented is not None:
            temp_dict['augmented'] = augmented
            if original_target_id is None:
                aug_data_error = ('Cannot create a Target that is augmented '
                                  'without having the original target_id of '
                                  'the target data that was augmented to '
                                  'create this target data')
                raise ValueError(aug_data_error)
        if original_target_id is not None:
            temp_dict['original_target_id'] = original_target_id
        if original_sentence_id is not None:
            if augmented != True:
                raise ValueError('Cannot add original sentence id to a '
                                 'Target that has not been augmented.')
            temp_dict['original_sentence_id'] = original_sentence_id
        if transfer_data is not None:
            temp_dict['transfer_data'] = transfer_data
        self._storage = temp_dict
        if predicted is not None:
            self['predicted'] = predicted
    def __getitem__(self, key):
        return self._storage[key]

    def __iter__(self):
        return iter(self._storage)

    def __len__(self):
        return len(self._storage)

    def __delitem__(self, key):
        '''
        To ensure that the Target class maintains the minimum keys and values
        to allow an instance to be used in Target based machine learning, the
        keys and associated values that can be deleted are limited to:

        1. target_id

        :param key: The key and associated value to delete from the store.
        :returns: Updates the data store by removing key and value.
        :rtype: None
        '''
        accepted_keys = set(['target_id'])
        if key not in accepted_keys:
            raise KeyError('The only keys that can be deleted are the '
                           'following: {} the key you wish to delete {}'
                           .format(accepted_keys, key))
        del self._storage[key]

    def __setitem__(self, key, value):
        '''
        :param key: key (Only stores values for the `predicted` key)
        :param value: Predicted sentiment value which has to be the same data
                      type as the `sentiment` value.
        :type key: String (`predicted` is the only key accepted at the moment)
        :type value: Int or String.
        :returns: Nothing. Adds the predicted sentiment of the Target.
        :rtype: None.
        '''
        if key != 'predicted':
            raise KeyError('The only key that can be changed is the '
                           '`predicted` key not {}'.format(key))
        # Type checking of the predicted value against the stored sentiment
        # value is currently disabled:
        # raise_type = False
        # sent_value = self._storage['sentiment']
        # if isinstance(sent_value, int):
        #     if not isinstance(value, (int, np.int32, np.int64)):
        #         raise_type = True
        # elif not isinstance(value, type(sent_value)):
        #     raise_type = True
        # if raise_type:
        #     raise TypeError('Value to be stored for the `predicted` '
        #                     'sentiment has to be the same data type as the '
        #                     'sentiment value {} and not {}.'
        #                     .format(sent_value, type(value)))
        self._storage[key] = value

    def __repr__(self):
        '''
        :returns: The String the user sees when the instance is printed or
                  displayed within an interpreter.
        :rtype: String
        '''
        return 'Target({})'.format(self._storage)

    def __eq__(self, other):
        '''
        Two Target instances are equal if they are both Target instances and
        one of the following conditions holds:

        1. They have the same target_id (this is preferred)
        2. The minimum keys that all targets have to have
           ['spans', 'text', 'target', 'sentiment'] are all equal.

        :param other: The target instance that is being compared to the
                      current target instance.
        :type other: Target
        :returns: True if they are equal else False.
        :rtype: bool
        '''
        if not isinstance(other, Target):
            return False
        if 'target_id' in self and 'target_id' in other:
            if self['target_id'] != other['target_id']:
                return False
        else:
            minimum_keys = ['spans', 'text', 'target', 'sentiment']
            for key in minimum_keys:
                if not self[key] == other[key]:
                    return False
        return True

    def __array__(self):
        '''
        Function for converting the instance to a numpy array.
        '''
        return np.asarray(dict(self))

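# A minimal usage sketch of the Target class (illustrative only, not part of
# the original module). The values below are hypothetical.
#
#   >>> target = Target([(10, 16)], '1', 'Iphone', 'text with Iphone', 'pos')
#   >>> target['predicted'] = 'neg'  # `predicted` is the only settable key
#   >>> same_target = Target([(10, 16)], '1', 'Iphone', 'text with Iphone',
#   ...                      'pos')
#   >>> target == same_target  # equal because they share the same target_id
#   True
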
class TargetCollection(MutableMapping):
    '''
    Mutable data store for Target data types. i.e. A data store that contains
    multiple Target instances. This collection ensures that there are no two
    Target instances stored that have the same ID, as the storage of the
    collection is an OrderedDict.

    Overrides the collections.abc.MutableMapping abstract class.

    Functions:

    1. add -- Given a Target instance with an `id` key adds it to the data
       store.
    2. data -- Returns all of the Target instances stored as a list of Target
       instances.
    3. stored_sentiments -- Returns a set of unique sentiments stored.
    4. sentiment_data -- Returns the list of all sentiment values stored in
       the Target instances stored.
    5. add_pred_sentiment -- Adds a list of predicted sentiment values to the
       Target instances stored.
    6. confusion_matrix -- Returns the confusion matrix between the true and
       predicted sentiment values.
    7. subset_by_sentiment -- Creates a new TargetCollection based on the
       number of unique sentiments in a sentence.
    8. to_json_file -- Returns a Path to a json file that has stored the data
       as one json encoded sample per line. If the split argument is set it
       will return two Paths, the first being the training file and the
       second the test.
    9. categories_targets -- Returns two dictionaries. The first maps a
       category to a list of targets and the second maps a target to a list
       of categories. There is the option to use coarse grained categories.
    10. target_set -- Returns a set of all the targets within this dataset.
    11. subset_by_ids -- Returns a TargetCollection which is a subset of the
        current one, where the samples included in the subset are those with
        a `target_id` that is within the ids List.
    12. dataset_metric_scores -- Given a metric like accuracy returns all of
        the metric scores for the dataset. This assumes that the dataset has
        the `predicted` sentiment slot assigned.
    13. subset_by_targets -- Will subset the data so that only data points
        that contain the given targets are within the TargetCollection.

    Static functions:

    1. target_targets -- Returns a dictionary of targets as keys and values
       that are a list of related targets.
    2. split_dataset -- Splits the given dataset into two, the train and the
       test, based on the test_split fraction given.
    3. load_from_json -- Returns a TargetCollection given that the json file
       has a valid Target dictionary on each line of the json file. The json
       file can be created from the `to_json_file` method.

    Attributes:

    1. grouped_sentences -- A dictionary with sentence_id as keys and a list
       of target instances that have the same sentence_id as values.
    2. grouped_sentiments -- A dictionary where the keys are all possible
       sentiment values and the value is a list of Target(s) that only have
       that associated sentiment value.
    3. grouped_distinct_sentiments -- A dictionary where the keys are the
       number of distinct sentiments that the targets in the associated value
       have. A target has two distinct sentiments if the associated sentence
       contains targets that take on one of two sentiments where at least one
       target has a different sentiment to the rest. The targets in the
       values of this dictionary are stored as a list of Target.
    '''
    def __init__(self, list_of_target=None, name: Optional[str] = None):
        '''
        :param list_of_target: An iterator of Target instances e.g. a List of
                               Target instances.
        :param name: Optional name to associate to the collection.
        :type list_of_target: Iterable. Default None (Optional)
        :returns: Nothing. Constructor.
        :rtype: None
        '''
        self._storage = OrderedDict()
        # Flags that invalidate the caches of the grouped_* properties. They
        # are initialised here so the properties also work when the
        # collection starts out empty.
        self._data_has_changed = True
        self._data_has_changed_gs = True
        self._data_has_changed_gds = True
        if list_of_target is not None:
            if not hasattr(list_of_target, '__iter__'):
                raise TypeError('The list_of_target argument has to be '
                                'iterable')
            for target in list_of_target:
                self.add(target)
        self.name = name if name is not None else 'TargetCollection'
        self._grouped_sentences = None
        self._grouped_sentiments = None
        self._grouped_distinct_sentiments = None

    def __getitem__(self, key):
        return self._storage[key]

    def __setitem__(self, key, value):
        '''
        If the key already exists it will raise a KeyError.

        :param key: key that stores the index to the value
        :param value: value to store at the key's location
        :type key: hashable object
        :type value: Target
        :returns: Nothing. Adds data to the collection
        :rtype: None.
        '''
        # Required to make sure the grouped_sentences, grouped_sentiments,
        # and grouped_distinct_sentiments get recomputed instead of cached.
        self._data_has_changed = True
        self._data_has_changed_gs = True
        self._data_has_changed_gds = True
        if not isinstance(value, Target):
            raise TypeError('All values in this store have to be of type '
                            'Target not {}'.format(type(value)))
        if key in self._storage:
            raise KeyError('This key: `{}` already exists with value `{}`; '
                           'the new value for the same key is `{}`'
                           .format(key, self._storage[key], value))
        temp_value = copy.deepcopy(value)
        # As the id will be saved as the key it is no longer needed in the
        # target instance (value). However if the key does not match the
        # `target_id` raise a KeyError.
        if 'target_id' in value:
            if value['target_id'] != key:
                raise KeyError('Cannot add this to the data store as the key '
                               '{} is not the same as the `target_id` in the '
                               'Target instance value {}'.format(key, value))
            del temp_value['target_id']
        self._storage[key] = temp_value

    def __delitem__(self, key):
        del self._storage[key]

    def __iter__(self):
        return iter(self._storage)

    def __len__(self):
        return len(self._storage)

    def add(self, value):
        '''
        Adds the Target instance to the data store without having to extract
        out the target_id of the target as you would with __setitem__.

        :Example:

        >>> target = Target([(10, 16)], '1', 'Iphone', 'text with Iphone', 0)
        >>> target_col = TargetCollection()
        >>> # The add method is simpler to use than __setitem__
        >>> target_col.add(target)
        >>> # Equivalent using __setitem__ (on a fresh collection, as adding
        >>> # the same target_id twice raises a KeyError)
        >>> other_col = TargetCollection()
        >>> other_col[target['target_id']] = target

        :param value: Target instance with a `target_id` key
        :type value: Target
        :returns: Nothing. Adds the target instance to the data store.
        :rtype: None
        '''
        if not isinstance(value, Target):
            raise TypeError('All values in this store have to be of type '
                            'Target not {}'.format(type(value)))
        if 'target_id' not in value:
            raise ValueError('The Target instance given {} does not have a '
                             'target_id'.format(value))
        self[value['target_id']] = value

    def data(self):
        '''
        :returns: A list of all the Target instances stored.
        :rtype: list
        '''
        _data = []
        for _id, target_data in self.items():
            data_dict = {**target_data}
            data_dict['target_id'] = _id
            _data.append(Target(**data_dict))
        return _data

    def data_dict(self):
        '''
        :returns: Same as the data function but returns dicts instead of
                  Targets.
        :rtype: list
        '''
        _data = []
        for _id, target_data in self.items():
            data_dict = {**target_data}
            data_dict['target_id'] = _id
            _data.append(data_dict)
        return _data

    def stored_sentiments(self) -> Set[Any]:
        '''
        :returns: A set of all unique sentiment values of the target
                  instances in the data store.
        :rtype: set
        '''
        unique_sentiments = set()
        for target_data in self.values():
            unique_sentiments.add(target_data['sentiment'])
        return unique_sentiments

    def sentiment_data(self, mapper=None, sentiment_field='sentiment'):
        '''
        :param mapper: A dictionary that maps the keys to the values, where
                       the keys are the current unique sentiment values of
                       the target instances stored.
        :param sentiment_field: Determines if it should return the true
                                sentiment of the Targets `sentiment` or the
                                predicted value `predicted`.
        :type mapper: dict
        :type sentiment_field: String. Default `sentiment` (True values)
        :returns: A list of the sentiment value for each Target instance
                  stored.
        :rtype: list

        :Example of using the mapper:

        >>> target_col = TargetCollection([Target([(10, 16)], '1', 'Iphone',
        ...                                       'text with Iphone', 'pos')])
        >>> target_col.add(Target([(10, 15)], '2', 'Pixel',
        ...                       'text with Pixel', 'neg'))
        >>> # Get the unique sentiment values for each target instance
        >>> map_keys = target_col.stored_sentiments()
        >>> print(map_keys)
        {'pos', 'neg'}
        >>> mapper = {'pos': 1, 'neg': -1}
        >>> print(target_col.sentiment_data(mapper=mapper))
        [1, -1]
        '''
        allowed_fields = set(['sentiment', 'predicted'])
        if sentiment_field not in allowed_fields:
            raise ValueError('The `sentiment_field` has to be one of the '
                             'following values {} and not {}'
                             .format(allowed_fields, sentiment_field))
        if mapper is not None:
            if not isinstance(mapper, dict):
                raise TypeError('The mapper has to be of type dict and not '
                                '{}'.format(type(mapper)))
            allowed_keys = self.stored_sentiments()
            if len(mapper) != len(allowed_keys):
                raise ValueError('The mapper has to contain a mapping for '
                                 'each unique sentiment value {} and not a '
                                 'subset given {}'
                                 .format(allowed_keys, mapper.keys()))
            for map_key in mapper:
                if map_key not in allowed_keys:
                    raise ValueError('The mappings are not correct. The map '
                                     'key {} does not exist in the unique '
                                     'sentiment values in the store {}'
                                     .format(map_key, allowed_keys))
            return [mapper[target_data[sentiment_field]]
                    for target_data in self.values()]
        return [target_data[sentiment_field]
                for target_data in self.values()]

    def add_id_pred(self, id_pred):
        '''
        Adds predicted sentiments from a dictionary mapping `target_id` to
        the predicted sentiment value. Every stored target must have a
        prediction within id_pred.
        '''
        count = 0
        for targ_id in self:
            if targ_id in id_pred:
                self[targ_id]['predicted'] = id_pred[targ_id]
                count += 1
        if count != len(self):
            raise ValueError('We have only added {} predictions to {} '
                             'targets'.format(count, len(self)))

    def add_pred_sentiment(self, sent_preds: Union[List[Any], np.ndarray],
                           mapper: Optional[Dict[Any, Any]] = None) -> None:
        '''
        :param sent_preds: A list of predicted sentiments for all Target
                           instances stored, or a numpy array where the
                           columns are the different prediction runs and the
                           rows represent the associated Target instances.
        :param mapper: A dictionary mapping the predicted sentiment to
                       alternative values e.g. Integer values to String
                       values.
        :type sent_preds: list or numpy array
        :type mapper: dict
        :returns: Nothing. Adds the predicted sentiments to the Target
                  instances stored.
        '''
        if len(sent_preds) != len(self):
            raise ValueError('The length of the predicted sentiments {} is '
                             'not equal to the number of Target instances '
                             'stored {}'.format(len(sent_preds), len(self)))
        for index, target in enumerate(self.data()):
            predicted_sent = sent_preds[index]
            if mapper is not None:
                predicted_sent = mapper[predicted_sent]
            target_id = target['target_id']
            self._storage[target_id]['predicted'] = predicted_sent

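    # A minimal usage sketch for add_pred_sentiment (illustrative only, not
    # part of the original module). target_col is assumed to be an existing
    # collection holding two targets.
    #
    #   >>> preds = [1, -1]
    #   >>> target_col.add_pred_sentiment(preds,
    #   ...                               mapper={1: 'pos', -1: 'neg'})
    #   >>> target_col.sentiment_data(sentiment_field='predicted')
    #   ['pos', 'neg']
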
    def confusion_matrix(self, plot=False, norm=False):
        '''
        :param plot: To return a heatmap of the confusion matrix.
        :param norm: Normalise the values in the confusion matrix.
        :type plot: bool. Default False
        :type norm: bool. Default False
        :returns: A tuple of length two. 1. the confusion matrix
                  2. the plot of the confusion matrix if plot is True else
                  None.
        :rtype: tuple
        '''
        sentiment_values = sorted(self.stored_sentiments())
        true_values = self.sentiment_data()
        pred_values = self.sentiment_data(sentiment_field='predicted')
        conf_matrix = metrics.confusion_matrix(true_values, pred_values,
                                               labels=sentiment_values)
        if norm:
            conf_matrix = conf_matrix / conf_matrix.sum()
        conf_matrix = pd.DataFrame(conf_matrix, columns=sentiment_values,
                                   index=sentiment_values)
        ax = None
        if plot:
            if norm:
                ax = sns.heatmap(conf_matrix, annot=True, fmt='.2f')
            else:
                ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
        return conf_matrix, ax

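    # A minimal usage sketch for confusion_matrix (illustrative only, not
    # part of the original module). Assumes predictions have already been
    # added, e.g. via add_pred_sentiment above.
    #
    #   >>> conf_matrix, ax = target_col.confusion_matrix(plot=False)
    #   >>> conf_matrix  # a pandas DataFrame indexed by sentiment value
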
    def subset_by_sentiment(self, num_unique_sentiments):
        '''
        Creates a subset based on the number of unique sentiment values per
        sentence. E.g. if num_unique_sentiments = 2 then it will return all
        the Target instances that occur in sentences whose targets take on
        exactly two sentiment values. This can be used to test how well a
        method can extract exact sentiment information for the associated
        target.

        NOTE: Requires that all Target instances stored contain a
        sentence_id.

        :param num_unique_sentiments: Integer specifying the number of unique
                                      sentiments in the target instances per
                                      sentence.
        :type num_unique_sentiments: int
        :returns: A subset based on the number of unique sentiments per
                  sentence.
        :rtype: TargetCollection
        '''
        all_relevant_targets = []
        for targets in self.grouped_sentences.values():
            target_col = TargetCollection(targets)
            if len(target_col.stored_sentiments()) == num_unique_sentiments:
                all_relevant_targets.extend(targets)
        return TargetCollection(all_relevant_targets)

    def subset_by_targets(self, target_set: Set[str]):
        '''
        Will subset the data so that only data points that contain the given
        targets are within the TargetCollection.

        :param target_set: A set of (lower cased) targets used to sub-sample
                           the data collection.
        :returns: The current target collection reduced to the data points
                  whose targets are in the target set.
        :rtype: TargetCollection
        '''
        target_data = []
        for data in self.data_dict():
            if data['target'].lower() in target_set:
                target_data.append(Target(**data))
        return TargetCollection(target_data)

    def subset_by_sentence_length(self, length_condition):
        '''
        Returns a new TargetCollection containing only the targets whose
        sentence text satisfies the given length_condition function.
        '''
        all_relevant_targets = []
        for target in self.data():
            target_text = target['text']
            if length_condition(target_text):
                all_relevant_targets.append(target)
        return TargetCollection(all_relevant_targets)

    @staticmethod
    def load_from_json(json_path: Path) -> 'TargetCollection':
        '''
        Returns a TargetCollection given that the json file has a valid
        Target dictionary on each line of the json file. The json file can be
        created from the `to_json_file` method.

        :param json_path: Path to the json file that has a valid Target that
                          can be loaded as a json sequence on each new line.
        :returns: A TargetCollection of all targets that were stored in the
                  json file.
        '''
        target_list = []
        with json_path.open('r') as json_file:
            for line in json_file:
                target = json.loads(line)
                target['spans'] = [tuple(span) for span in target['spans']]
                if 'epoch_number' in target:
                    target['epoch_number'] = set(target['epoch_number'])
                target = Target(**target)
                target_list.append(target)
        return TargetCollection(target_list)

    @staticmethod
    def split_dataset(data: 'TargetCollection', test_split: float,
                      random: bool = False
                      ) -> Tuple['TargetCollection', 'TargetCollection']:
        '''
        Splits the data into training and test splits, where the amount of
        data in the test split is defined by the test_split fraction (the
        amount of training data will be 1 - test_split).

        :param data: data to split
        :param test_split: The fraction of the data to give to the test
                           split.
        :param random: If the splitting should be random, rather than the
                       last test_split of the data being the test and the
                       first (1 - test_split) being the training data.
        :returns: The training data and test data as a tuple.
        '''
        target_data = data.data_dict()
        data_size = len(target_data)
        test_data_size = int(data_size * test_split)
        test_data = []
        train_data = []
        if random:
            start_index = rand.randint(0, data_size)
            end_index = start_index + test_data_size
            if end_index > data_size:
                index_diff = end_index - data_size
                start_index -= index_diff
                end_index = data_size
            test_data = target_data[start_index: end_index]
            train_data = target_data[end_index:]
            train_data.extend(target_data[: start_index])
        else:
            test_data = target_data[: test_data_size]
            train_data = target_data[test_data_size:]
        train_data = [Target(**t_data) for t_data in train_data]
        test_data = [Target(**t_data) for t_data in test_data]
        return (TargetCollection(train_data), TargetCollection(test_data))

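    # A minimal usage sketch for split_dataset (illustrative only, not part
    # of the original module). target_col is assumed to be an existing
    # TargetCollection.
    #
    #   >>> train, test = TargetCollection.split_dataset(target_col, 0.2,
    #   ...                                              random=True)
    #   >>> len(test) == int(len(target_col) * 0.2)
    #   True
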
    def to_json_file(self, dataset_name: Union[str, List[str]],
                     split: Optional[float] = None, cache: bool = True,
                     group_by_sentence: bool = False,
                     random: bool = False) -> Union[Path, List[Path]]:
        '''
        Returns a Path to a json file that has stored the data as one json
        encoded sample per line. If the split argument is set it will return
        two Paths, the first being the training file and the second the test.

        The Path does not need to be specified as the data is saved to the
        `~/.Bella/Datasets` directory within your user space under the
        dataset_name. To split the data the
        bella.data_types.TargetCollection.split_dataset method is used.

        :param dataset_name: Name to associate to the dataset e.g.
                             `SemEval 2014 rest train`. If split is not None
                             then use a List of Strings e.g.
                             [`SemEval 2014 rest train`,
                             `SemEval 2014 rest dev`]
        :param split: Whether or not to split the dataset into train and test
                      splits. If not, use None; else specify the fraction of
                      the data to use for the test split.
        :param cache: If the data is already saved use the cache. Default is
                      to use the cached data.
        :param group_by_sentence: Whether the data should be grouped by
                                  sentence; this will then produce json lines
                                  that can contain more than one target in a
                                  sentence.
        :param random: Whether the splitting of the training and test data
                       should be random or not.
        '''
        def create_json_file(fp: Path, data: List[Dict[str, Any]]) -> None:
            '''
            Given a list of dictionaries that represent the Target data,
            converts these samples into json encoded samples which are saved
            one per line within the file at the given file path (fp).

            :param fp: File path that will store the json samples one per
                       line.
            :param data: List of dictionaries that represent the Target data.
            :return: Nothing. The data will be saved to the file.
            '''
            with fp.open('w+') as json_file:
                if group_by_sentence:
                    data = TargetCollection([Target(**d) for d in data])
                    data = data.grouped_sentences
                    for index, target_datas in enumerate(data.values()):
                        text = target_datas[0]['text']
                        sentiments = []
                        targets = []
                        spans = []
                        for target_data in target_datas:
                            sentiments.append(target_data['sentiment'])
                            spans.append(target_data['spans'])
                            targets.append(target_data['target'])
                        sentence_data = {'text': text,
                                         'sentiments': sentiments,
                                         'targets': targets, 'spans': spans}
                        json_encoded_data = json.dumps(sentence_data)
                        if index != 0:
                            json_encoded_data = f'\n{json_encoded_data}'
                        json_file.write(json_encoded_data)
                else:
                    for index, target_data in enumerate(data):
                        if 'epoch_number' in target_data:
                            target_data['epoch_number'] = list(
                                target_data['epoch_number'])
                        json_encoded_data = json.dumps(target_data)
                        if index != 0:
                            json_encoded_data = f'\n{json_encoded_data}'
                        json_file.write(json_encoded_data)

        # If splitting the data there have to be two dataset names, else one
        # name.
        if split is None:
            assert isinstance(dataset_name, str)
        elif isinstance(split, float):
            assert isinstance(dataset_name, list)
            assert len(dataset_name) == 2
        BELLA_DATASET_DIR.mkdir(parents=True, exist_ok=True)
        dataset_names = dataset_name
        if not isinstance(dataset_name, list):
            dataset_names = [dataset_name]
        all_paths_exist = True
        dataset_paths = []
        for name in dataset_names:
            dataset_path = BELLA_DATASET_DIR.joinpath(name)
            if not dataset_path.exists():
                all_paths_exist = False
            dataset_paths.append(dataset_path)
        # Caching
        if cache and all_paths_exist:
            print(f'Using cache for the following datasets: {dataset_names}')
            if split is None:
                return dataset_paths[0]
            return dataset_paths
        if split is None:
            create_json_file(dataset_paths[0], self.data_dict())
            return dataset_paths[0]
        # Splitting
        train, test = self.split_dataset(self, test_split=split,
                                         random=random)
        create_json_file(dataset_paths[0], train.data_dict())
        create_json_file(dataset_paths[1], test.data_dict())
        return dataset_paths

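    # A minimal round-trip sketch for to_json_file and load_from_json
    # (illustrative only, not part of the original module). The dataset name
    # is hypothetical.
    #
    #   >>> json_path = target_col.to_json_file('example dataset')
    #   >>> reloaded = TargetCollection.load_from_json(json_path)
    #   >>> len(reloaded) == len(target_col)
    #   True
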
    def categories_targets(self, filter: int = 2, coarse: bool = False
                           ) -> Tuple[Dict[str, List[str]],
                                      Dict[str, List[str]]]:
        '''
        Returns two dictionaries. The first maps a category to a list of
        targets and the second maps a target to a list of categories. There
        is the option to use coarse grained categories. Categories whose
        number of unique targets is below `filter` are excluded.
        '''
        if 'category' not in self.data()[0]:
            raise ValueError('The current TargetCollection must contain '
                             'targets that have a category key')
        temp_category_targets = defaultdict(set)
        for data in self.data():
            category = data['category']
            if coarse:
                category = category.split('#')[0]
            target = data['target']
            temp_category_targets[category].add(target)
        category_targets = {}
        target_categories = defaultdict(list)
        categories_filtered = []
        for category, targets in temp_category_targets.items():
            if len(targets) < filter:
                categories_filtered.append(category)
                continue
            category_targets[category] = list(targets)
            for target in targets:
                target_categories[target].append(category)
        # Report which categories were filtered out
        print(f'Filtered {len(categories_filtered)} categories which are:')
        for category in categories_filtered:
            print(category)
        return category_targets, dict(target_categories)

    @staticmethod
    def target_targets(target_set: set,
                       category_targets: Dict[str, List[str]]
                       ) -> Dict[str, List[str]]:
        '''
        Returns a dictionary of targets as keys and values that are a list of
        related targets. The related targets have all come from the
        category_targets dictionary, where a target is related if it is in
        the same category. If it is in multiple categories then all targets
        from each of those categories are related.

        :param target_set: A set of targets that are candidate keys in the
                           returned dictionary.
        :param category_targets: A dictionary of latent categories as keys
                                 and related targets as values.
        :returns: Dictionary of targets and their related targets as values,
                  where the related targets have come from the
                  category_targets dictionary.
        '''
        target_rel_targets = {}
        for target in target_set:
            rel_targets = []
            for _, cat_targets in category_targets.items():
                if target in cat_targets or target.lower() in cat_targets:
                    rel_targets.extend(cat_targets)
                else:
                    lower_cat_targets = [cat_target.lower()
                                         for cat_target in cat_targets]
                    if target.lower() in lower_cat_targets:
                        rel_targets.extend(cat_targets)
            if rel_targets:
                rel_targets = list(set(rel_targets))
                to_remove = set()
                for rel_target in rel_targets:
                    temp_rel_target = rel_target.lower()
                    if target.lower() == temp_rel_target:
                        to_remove.add(rel_target)
                for target_to_remove in to_remove:
                    rel_targets.remove(target_to_remove)
                if not to_remove:
                    raise ValueError('The target that maps to the list of '
                                     'related targets should itself be in '
                                     'the related targets before being '
                                     'filtered out')
                target_rel_targets[target] = rel_targets
        return target_rel_targets

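    # A minimal usage sketch for target_targets (illustrative only, not part
    # of the original module). The category and target names below are
    # hypothetical.
    #
    #   >>> category_targets = {'FOOD': ['pizza', 'pasta'],
    #   ...                     'SERVICE': ['waiter', 'pizza']}
    #   >>> rel = TargetCollection.target_targets({'pizza'},
    #   ...                                       category_targets)
    #   >>> sorted(rel['pizza'])  # 'pizza' itself is filtered out
    #   ['pasta', 'waiter']
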
    def target_set(self, lower: bool = False) -> set:
        '''
        Returns a set of all the targets within this dataset.

        :param lower: Whether to return the targets lower cased.
        :returns: A set of all the targets within this dataset.
        '''
        targets = set()
        for data in self.data():
            target = data['target']
            if lower:
                target = target.lower()
            targets.add(target)
        return targets

    def subset_by_ids(self, ids: List[str]) -> 'TargetCollection':
        '''
        Returns a TargetCollection which is a subset of the current one,
        where the samples included in the subset are those with a `target_id`
        that is within the ids List.

        :param ids: A list of `target_id`s that are to be included in the
                    subset.
        :returns: A subset of the current TargetCollection where the subset
                  only includes samples that have a `target_id` in the ids
                  given.
        '''
        subset_data = []
        for data_id in ids:
            rel_data = dict(self[data_id].items())
            rel_data['target_id'] = data_id
            subset_data.append(Target(**rel_data))
        return TargetCollection(subset_data)

    def dataset_metric_scores(self,
                              metric: Callable[[np.ndarray, np.ndarray],
                                               float],
                              **metric_kwargs) -> np.ndarray:
        '''
        Given a metric like accuracy returns all of the metric scores for the
        dataset. This assumes that the dataset has the `predicted` sentiment
        slot assigned.

        :param metric: Metric function e.g. f1_score
        :param metric_kwargs: Keyword arguments to provide to the metric
                              function e.g. `average` = `macro`
        :returns: An array of metric results. One for each column in the
                  predicted sentiment array.
        '''
        true_labels = self.sentiment_data()
        pred_matrix = self.sentiment_data(sentiment_field='predicted')
        score_vector = np.apply_along_axis(metric, 0, pred_matrix,
                                           true_labels, **metric_kwargs)
        return score_vector

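    # A minimal usage sketch for dataset_metric_scores (illustrative only,
    # not part of the original module). Assumes each target's `predicted`
    # slot holds an array with one entry per prediction run.
    #
    #   >>> from sklearn.metrics import f1_score
    #   >>> scores = target_col.dataset_metric_scores(f1_score,
    #   ...                                           average='macro')
    #   >>> scores.shape  # one score per prediction run
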
    # Not tested
    def targets_per_sentence(self):
        '''
        :returns: Dictionary with the number of targets as keys and as values
                  the number of sentences that have that many targets per
                  sentence.
        :rtype: dict

        :Example:

        If we have 5 sentences that contain 1 target each and 4 sentences
        that contain 3 targets each then it will return a dict like:
        {1: 5, 3: 4}
        '''
        targets_sentence = {}
        for targets in self.grouped_sentences.values():
            num_targets = len(targets)
            targets_sentence[num_targets] = \
                targets_sentence.get(num_targets, 0) + 1
        return targets_sentence

    # Not tested
    def avg_targets_per_sentence(self):
        return len(self) / self.number_sentences()

    # Not tested
    def number_sentences(self):
        return len(self.grouped_sentences)

    # Not tested
    def number_unique_targets(self):
        target_count = {}
        for target_instance in self.values():
            target = target_instance['target']
            target_count[target] = target_count.get(target, 0) + 1
        return len(target_count)

    def no_targets_sentiment(self):
        sentiment_targets = {}
        for target in self.values():
            sentiment = target['sentiment']
            sentiment_targets[sentiment] = \
                sentiment_targets.get(sentiment, 0) + 1
        return sentiment_targets

    def ratio_targets_sentiment(self):
        no_sentiment_target = self.no_targets_sentiment()
        total_targets = sum(no_sentiment_target.values())
        ratio_sentiment_targets = {}
        for sentiment, no_targets in no_sentiment_target.items():
            ratio_sentiment_targets[sentiment] = \
                round(no_targets / total_targets, 2)
        return ratio_sentiment_targets

    @property
    def grouped_sentences(self) -> Dict[str, List['Target']]:
        '''
        A dictionary with sentence_id as keys and a list of target instances
        that have the same sentence_id as values.

        It stores a cache of this result; the cache will expire once the data
        within the collection has changed, at which point this value is
        recomputed.

        :returns: A dictionary of sentence_id as keys and a list of target
                  instances that have the same sentence_id as values.
        '''
        # If the data has changed, or this has never been computed,
        # re-compute; else return the cached result.
        if self._data_has_changed or self._grouped_sentences is None:
            self._data_has_changed = False
            sentence_targets = defaultdict(list)
            for target in self.data():
                if 'sentence_id' not in target:
                    raise ValueError(f'A Target instance {target} does not '
                                     'have a sentence_id which is required.')
                sentence_id = target['sentence_id']
                sentence_targets[sentence_id].append(target)
            self._grouped_sentences = sentence_targets
        return self._grouped_sentences

    @property
    def grouped_sentiments(self) -> Dict[Any, List['Target']]:
        '''
        A dictionary where the keys are all possible sentiment values and the
        value is a list of Target(s) that only have that associated sentiment
        value.

        It stores a cache of this result; the cache will expire once the data
        within the collection has changed, at which point this value is
        recomputed.

        :returns: A dictionary of sentiment values as keys and a list of
                  target instances that have that sentiment value.
        '''
        # If the data has changed, or this has never been computed,
        # re-compute; else return the cached result.
        if self._data_has_changed_gs or self._grouped_sentiments is None:
            self._data_has_changed_gs = False
            sentiment_targets = defaultdict(list)
            for target in self.data():
                sentiment_targets[target['sentiment']].append(target)
            self._grouped_sentiments = sentiment_targets
        return self._grouped_sentiments

    @property
    def grouped_distinct_sentiments(self) -> Dict[Any, List['Target']]:
        '''
        A dictionary where the keys are the number of distinct sentiments
        that the targets in the associated value have. A target has two
        distinct sentiments if the associated sentence contains targets that
        take on one of two sentiments where at least one target has a
        different sentiment to the rest. The targets in the values of this
        dictionary are stored as a list of Target.

        It stores a cache of this result; the cache will expire once the data
        within the collection has changed, at which point this value is
        recomputed.

        :returns: A dictionary of distinct sentiments per sentence as keys
                  and a list of target instances that have the same number of
                  distinct sentiments per sentence.
        '''
        # If the data has changed, or this has never been computed,
        # re-compute; else return the cached result.
        if self._data_has_changed_gds or \
           self._grouped_distinct_sentiments is None:
            self._data_has_changed_gds = False
            distinct_sentiment_targets = defaultdict(list)
            for targets in self.grouped_sentences.values():
                target_col = TargetCollection(targets)
                num_unique_sentiments = len(target_col.stored_sentiments())
                distinct_sentiment_targets[num_unique_sentiments]\
                    .extend(targets)
            self._grouped_distinct_sentiments = distinct_sentiment_targets
        return self._grouped_distinct_sentiments

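    # A minimal usage sketch of the grouped_* properties (illustrative only,
    # not part of the original module). Assumes all stored targets have a
    # sentence_id.
    #
    #   >>> by_sentence = target_col.grouped_sentences
    #   >>> by_sentiment = target_col.grouped_sentiments
    #   >>> # Targets from sentences containing exactly 2 distinct sentiments
    #   >>> two_distinct = target_col.grouped_distinct_sentiments[2]
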
    def group_by_sentence(self):
        '''
        This is now deprecated, please use the grouped_sentences property.

        :returns: A dictionary of sentence_id as keys and a list of target
                  instances that have the same sentence_id as values.
        :rtype: defaultdict (default is list)
        '''
        dep_warning = ('This is now deprecated, please use the '
                       'grouped_sentences property.')
        warnings.warn(dep_warning, DeprecationWarning)
        sentence_targets = defaultdict(list)
        for target in self.data():
            if 'sentence_id' not in target:
                raise ValueError('A Target instance {} does not have '
                                 'a sentence_id which is required.'
                                 .format(target))
            sentence_id = target['sentence_id']
            sentence_targets[sentence_id].append(target)
        return sentence_targets

    def avg_constituency_depth(self):
        '''
        Returns the average constituency parse tree depth across all of the
        stored targets' sentences.
        '''
        avg_depths = []
        for data in self.values():
            sentence_trees = constituency_parse(data['text'])
            tree_depths = [tree.height() - 1 for tree in sentence_trees]
            avg_depth = sum(tree_depths) / len(sentence_trees)
            avg_depths.append(avg_depth)
        return sum(avg_depths) / len(avg_depths)

    def avg_sentence_length_per_target(self, tokeniser=whitespace):
        all_sentence_lengths = []
        for data in self.values():
            all_sentence_lengths.append(len(tokeniser(data['text'])))
        return sum(all_sentence_lengths) / len(all_sentence_lengths)

    def word_list(self, tokeniser: Callable[[str], List[str]],
                  min_df: int = 0, lower: bool = True) -> List[str]:
        '''
        :param tokeniser: Tokeniser function to tokenise the text of each
                          target/sample.
        :param min_df: Optional. The minimum percentage of documents a token
                       must occur in to be included.
        :param lower: Optional. Whether to lower case the text.
        :return: A word list of all tokens that occur in this data collection
                 given min_df.
        '''
        token_df = defaultdict(int)
        num_df = 0
        for target in self.values():
            num_df += 1
            tokens = tokeniser(target['text'])
            if lower:
                tokens = [token.lower() for token in tokens]
            # Document frequency: count each token at most once per text
            for token in set(tokens):
                token_df[token] += 1
        min_df_value = int((num_df / 100) * min_df)
        word_list = [token for token, df in token_df.items()
                     if df > min_df_value]
        return word_list

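    # A minimal usage sketch for word_list (illustrative only, not part of
    # the original module). Uses the whitespace tokeniser imported at the top
    # of this module.
    #
    #   >>> vocab = target_col.word_list(whitespace, min_df=1)
    #   >>> # vocab holds tokens occurring in more than ~1% of stored texts
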
    @staticmethod
    def combine_collections(*args):
        all_targets = []
        for collection in args:
            all_targets.extend(collection.data())
        return TargetCollection(all_targets)

    def __eq__(self, other):
        '''
        Two TargetCollections are considered equal if they are the same
        length and share the same keys (`target_id`s).
        '''
        if len(self) != len(other):
            return False
        for key in self:
            if key not in other:
                return False
        return True

    def __repr__(self):
        '''
        :returns: The String the user sees when the instance is printed or
                  displayed within an interpreter. Shows only the first and
                  last Target when more than two are stored.
        :rtype: String
        '''
        target_strings = ''
        self_len = len(self)
        if self_len > 2:
            for index, target in enumerate(self.data()):
                if index == 0:
                    target_strings += '{} ... '.format(target)
                if index == self_len - 1:
                    target_strings += '{}'.format(target)
        else:
            for target in self.data():
                target_strings += '{}, '.format(target)
        if target_strings != '':
            target_strings = target_strings.rstrip(', ')
        return 'TargetCollection({})'.format(target_strings)