# Source code for bella.error_analysis

'''
Functions that given a TargetCollection and any number of kwargs will 
return a list of `target_id`s from the given TargetCollection.

These functions are used to subset TargetCollection so that we can get 
subset metrics e.g. Accuracy score for samples within the TargetCollection 
that have only one sentiment within the sentence that sample came from
'''
from collections import defaultdict
from typing import Dict, Any, Set, List

from bella.data_types import TargetCollection, Target

def targets_to_samples(dataset: TargetCollection, targets: Set[str],
                       lower: bool = True) -> List[Target]:
    '''
    Filters a dataset down to the samples whose target word is a member of
    the `targets` set.

    :param dataset: TargetCollection containing samples
    :param targets: The target words a sample must match to be kept
    :param lower: Whether each sample's target word is lower cased before it
                  is looked up in `targets`. The words inside `targets` are
                  never modified here, so when this is True the caller must
                  already have lower cased them.
    :returns: The samples whose target word is in `targets`, in the order
              that `dataset.data()` provides them.
    '''
    def normalise(word: str) -> str:
        return word.lower() if lower else word

    return [sample for sample in dataset.data()
            if normalise(sample['target']) in targets]

def target_sentiments(dataset: TargetCollection,
                      lower: bool = True) -> Dict[str, Set[Any]]:
    '''
    Collects, for every target word in the dataset, the set of sentiment
    values that the target has been labelled with.

    E.g. if the target `camera` only ever appears with a positive and a
    negative label, never neutral, then the returned dictionary contains
    {`camera`: {`positive`, `negative`}}

    :param dataset: TargetCollection containing samples
    :param lower: Whether to lower case the target words before grouping.
    :returns: A dictionary (a `defaultdict` of sets) where the keys are
              target words and the values are the sets of sentiment values
              seen with those targets.
    '''
    seen_labels = defaultdict(set)
    for sample in dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        seen_labels[word].add(sample['sentiment'])
    return seen_labels

def same_one_sentiment(test_dataset: TargetCollection,
                       train_dataset: TargetCollection,
                       lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target has exactly one sentiment label in
    the train set, exactly one in the test set, and that label is the same
    in both.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_labels = target_sentiments(train_dataset, lower)
    test_labels = target_sentiments(test_dataset, lower)

    matching_targets = set()
    for sample in test_dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        if word not in train_labels or word not in test_labels:
            continue
        seen_in_train = train_labels[word]
        seen_in_test = test_labels[word]
        # Equal sets have equal sizes, so checking one size covers both.
        if seen_in_train == seen_in_test and len(seen_in_train) == 1:
            matching_targets.add(word)

    matching_samples = targets_to_samples(test_dataset, matching_targets,
                                          lower)
    return [sample['target_id'] for sample in matching_samples]

def same_multi_sentiment(test_dataset: TargetCollection,
                         train_dataset: TargetCollection,
                         lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target has more than one sentiment label
    in both the train and the test set, where the set of labels is the same
    in both.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_labels = target_sentiments(train_dataset, lower)
    test_labels = target_sentiments(test_dataset, lower)

    matching_targets = set()
    for sample in test_dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        if word not in train_labels or word not in test_labels:
            continue
        seen_in_train = train_labels[word]
        seen_in_test = test_labels[word]
        # Equal sets have equal sizes, so checking one size covers both.
        if seen_in_train == seen_in_test and len(seen_in_train) > 1:
            matching_targets.add(word)

    matching_samples = targets_to_samples(test_dataset, matching_targets,
                                          lower)
    return [sample['target_id'] for sample in matching_samples]

def similar_sentiment(test_dataset: TargetCollection,
                      train_dataset: TargetCollection,
                      lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target has more than one sentiment label
    in the train or the test set, where the train and test labels overlap
    but are not identical. E.g. the target `camera` could occur with
    `positive` and `negative` sentiment in the test set and only `negative`
    in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_labels = target_sentiments(train_dataset, lower)
    test_labels = target_sentiments(test_dataset, lower)

    overlapping_targets = set()
    for sample in test_dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        if word not in train_labels or word not in test_labels:
            continue
        seen_in_train = train_labels[word]
        seen_in_test = test_labels[word]
        # Skip targets that have at most one label on both sides.
        if len(seen_in_train) <= 1 and len(seen_in_test) <= 1:
            continue
        # Partial overlap only: identical label sets are not "similar".
        if seen_in_train != seen_in_test and seen_in_test & seen_in_train:
            overlapping_targets.add(word)

    overlapping_samples = targets_to_samples(test_dataset,
                                             overlapping_targets, lower)
    return [sample['target_id'] for sample in overlapping_samples]

def different_sentiment(test_dataset: TargetCollection,
                        train_dataset: TargetCollection,
                        lower: bool = True) -> List[str]:
    '''
    Finds the test samples whose target also appears in the train set but
    with sentiment labels that share nothing with the labels the target has
    in the test set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_labels = target_sentiments(train_dataset, lower)
    test_labels = target_sentiments(test_dataset, lower)

    disjoint_targets = set()
    for sample in test_dataset.data():
        word = sample['target'].lower() if lower else sample['target']
        if word not in train_labels or word not in test_labels:
            continue
        if test_labels[word].isdisjoint(train_labels[word]):
            disjoint_targets.add(word)

    disjoint_samples = targets_to_samples(test_dataset, disjoint_targets,
                                          lower)
    return [sample['target_id'] for sample in disjoint_samples]

def unknown_targets(test_dataset: TargetCollection,
                    train_dataset: TargetCollection,
                    lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset sample
    ids that contain targets that did not exist in the training data.

    Only the identity of each target matters here, so the train set is
    reduced to a set of target words and the test set is scanned once.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words before comparing
                  the test targets against the train targets
    :returns: A list of sample ids from the test dataset, in the order that
              `test_dataset.data()` provides the samples.
    '''
    def normalise(word: str) -> str:
        return word.lower() if lower else word

    known_targets = {normalise(sample['target'])
                     for sample in train_dataset.data()}
    return [sample['target_id'] for sample in test_dataset.data()
            if normalise(sample['target']) not in known_targets]