# Source code for bella.error_analysis

'''
Functions that, given a TargetCollection and any number of kwargs, will
return a list of `target_id`s from the given TargetCollection.

These functions are used to subset a TargetCollection so that we can compute
subset metrics, e.g. the accuracy score for only those samples within the
TargetCollection whose originating sentence contains a single sentiment.
'''
from collections import defaultdict
from typing import Dict, Any, Set, List

from bella.data_types import TargetCollection, Target

def targets_to_samples(dataset: TargetCollection, targets: Set[str],
                       lower: bool = True) -> List[Target]:
    '''
    Filters a dataset down to the samples whose target word is a member of
    the given set of target words.

    :param dataset: TargetCollection containing samples
    :param targets: The target words a sample must match to be kept
    :param lower: Whether each sample's target word is lower cased before
                  it is looked up in `targets`. The words inside `targets`
                  are never modified, so when this is True the caller must
                  supply them already lower cased.
    :returns: The samples of the dataset, in the order `dataset.data()`
              yields them, whose target word is within the `targets` set.
    '''
    def _normalise(word):
        # Only the sample side of the comparison is ever case folded.
        return word.lower() if lower else word

    return [sample for sample in dataset.data()
            if _normalise(sample['target']) in targets]
def target_sentiments(dataset: TargetCollection, lower: bool = True
                      ) -> Dict[str, Set[Any]]:
    '''
    Collects, for every target word in the dataset, the set of sentiment
    values that word has been labelled with.

    E.g. if the target `camera` only ever appears with a positive and a
    negative label (never neutral) the returned mapping would contain
    {`camera`: {`positive`, `negative`}}

    :param dataset: TargetCollection containing samples
    :param lower: Whether to lower case the target words before grouping.
    :returns: A dictionary (a `defaultdict` of sets) where the keys are
              target words and the values are the sets of sentiment values
              that have been associated to those targets.
    '''
    sentiments_by_target = defaultdict(set)
    for sample in dataset.data():
        word = sample['target']
        word = word.lower() if lower else word
        sentiments_by_target[word].add(sample['sentiment'])
    return sentiments_by_target
def same_one_sentiment(test_dataset: TargetCollection,
                       train_dataset: TargetCollection,
                       lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target word carries exactly one sentiment
    label in the train set and exactly one in the test set, with that
    single label being the same in both.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_lookup = target_sentiments(train_dataset, lower)
    test_lookup = target_sentiments(test_dataset, lower)

    matching_targets = set()
    for sample in test_dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        # Targets missing from either split cannot be compared.
        if word not in train_lookup or word not in test_lookup:
            continue
        seen_in_train = train_lookup[word]
        seen_in_test = test_lookup[word]
        if len(seen_in_train) != 1 or len(seen_in_test) != 1:
            continue
        if seen_in_train == seen_in_test:
            matching_targets.add(word)

    matching_samples = targets_to_samples(test_dataset, matching_targets,
                                          lower)
    return [sample['target_id'] for sample in matching_samples]
def same_multi_sentiment(test_dataset: TargetCollection,
                         train_dataset: TargetCollection,
                         lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target word carries more than one
    sentiment label in both the train and the test set, where the set of
    labels is identical across the two.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_lookup = target_sentiments(train_dataset, lower)
    test_lookup = target_sentiments(test_dataset, lower)

    def _has_same_multiple_labels(word):
        # A target must be present in both splits before it is compared.
        if word not in train_lookup or word not in test_lookup:
            return False
        seen_in_train = train_lookup[word]
        seen_in_test = test_lookup[word]
        if len(seen_in_train) <= 1 or len(seen_in_test) <= 1:
            return False
        return seen_in_train == seen_in_test

    words = (sample['target'].lower() if lower else sample['target']
             for sample in test_dataset.data())
    multi_targets = {word for word in words
                     if _has_same_multiple_labels(word)}

    multi_samples = targets_to_samples(test_dataset, multi_targets, lower)
    return [sample['target_id'] for sample in multi_samples]
def similar_sentiment(test_dataset: TargetCollection,
                      train_dataset: TargetCollection,
                      lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target word carries more than one
    sentiment label in the train or the test set, and whose train and test
    label sets overlap without being identical.

    E.g. the target `camera` could occur with `positive` and `negative`
    sentiment in the test set and only `negative` in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_lookup = target_sentiments(train_dataset, lower)
    test_lookup = target_sentiments(test_dataset, lower)

    overlapping_targets = set()
    for sample in test_dataset.data():
        word = sample['target']
        if lower:
            word = word.lower()
        if not (word in train_lookup and word in test_lookup):
            continue
        seen_in_train = train_lookup[word]
        seen_in_test = test_lookup[word]
        # At least one side has to carry several labels.
        if len(seen_in_train) <= 1 and len(seen_in_test) <= 1:
            continue
        # Identical label sets are not merely "similar".
        if seen_in_train == seen_in_test:
            continue
        if not seen_in_test.isdisjoint(seen_in_train):
            overlapping_targets.add(word)

    overlapping_samples = targets_to_samples(test_dataset,
                                             overlapping_targets, lower)
    return [sample['target_id'] for sample in overlapping_samples]
def different_sentiment(test_dataset: TargetCollection,
                        train_dataset: TargetCollection,
                        lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target word exists in both the train and
    the test set but whose test sentiment labels share nothing with its
    train sentiment labels.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_lookup = target_sentiments(train_dataset, lower)
    test_lookup = target_sentiments(test_dataset, lower)

    def _labels_never_overlap(word):
        if word not in train_lookup or word not in test_lookup:
            return False
        return test_lookup[word].isdisjoint(train_lookup[word])

    words = (sample['target'].lower() if lower else sample['target']
             for sample in test_dataset.data())
    disjoint_targets = {word for word in words
                        if _labels_never_overlap(word)}

    disjoint_samples = targets_to_samples(test_dataset, disjoint_targets,
                                          lower)
    return [sample['target_id'] for sample in disjoint_samples]
def unknown_targets(test_dataset: TargetCollection,
                    train_dataset: TargetCollection,
                    lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset
    sample ids that contain targets that did not exist in the training
    data.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    # Only the train side needs indexing: every target visited below is
    # read from the test dataset itself, so checking that it also exists
    # in a test-side index would always succeed.
    known_targets = target_sentiments(train_dataset, lower)

    unseen_targets = set()
    for sample in test_dataset.data():
        word = sample['target']
        if lower:
            word = word.lower()
        if word not in known_targets:
            unseen_targets.add(word)

    unseen_samples = targets_to_samples(test_dataset, unseen_targets, lower)
    return [sample['target_id'] for sample in unseen_samples]