Source code for target_extraction.analysis.sentiment_metrics

'''
This module contains functions that expect a TargetTextCollection that
contains a `target_sentiments` key, which holds the true sentiment values,
and a prediction key e.g. `sentiment_predictions`. Given these, each function
will return either a single metric score e.g. Accuracy, or a list of scores,
depending on the arguments given to the function and on whether the
`sentiment_predictions` key contains an array of values.

Arguments for all functions in this module:

1. TargetTextCollection -- Contains the true and predicted sentiment scores
2. true_sentiment_key -- Key that contains the true sentiment scores 
   for each target in the TargetTextCollection
3. predicted_sentiment_key -- Key that contains the predicted sentiment scores  
   for each target in the TargetTextCollection
4. average -- If the predicting model was run *N* times, whether or not to 
   average the score over the *N* runs. Assumes array_scores is False.
5. array_scores -- If average is False and you have a model that has predicted 
   *N* times, then this will return the *N* scores, one for each run.
6. assert_number_labels -- Assert that this number of unique labels exists 
   in the true sentiment key. If this is None then the 
   assertion is not performed.
7. ignore_label_differences -- If True then the ValueError will not be 
   raised if the predicted sentiment values are not in the true 
   sentiment values. See :py:func:`get_labels` for more details.

:raises ValueError: If the prediction model has been run *N* times where 
                    *N > 1* and `average` and `array_scores` are either both 
                    True or both False.
:raises ValueError: If the number of predictions made per target is 
                    inconsistent between targets or zero. 
:raises ValueError: If only one set of model predictions exists, in which 
                    case `average` and `array_scores` should both be False.
:raises KeyError: If either the `true_sentiment_key` or 
                  `predicted_sentiment_key` does not exist.
:raises LabelError: If `assert_number_labels` is not None and the number of 
                    unique true labels does not equal `assert_number_labels`.
'''
import functools
from typing import Union, Optional, Callable, Tuple, List, Any
import statistics

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

from target_extraction.data_types import TargetTextCollection, TargetText
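
# A minimal sketch (hypothetical labels and run counts) of the per-target
# data layout these metric functions assume: the true sentiments are one
# label per target, and the predictions are a list of model runs, each run
# holding one predicted label per target. `array_scores=True` corresponds to
# one score per run and `average=True` to the mean of those per-run scores.
example_true_labels = ['pos', 'neg', 'neu']
example_run_predictions = [['pos', 'neg', 'pos'],   # run 1
                           ['pos', 'pos', 'neu']]   # run 2
example_per_run_scores = [accuracy_score(example_true_labels, run) 
                          for run in example_run_predictions]
example_average_score = statistics.mean(example_per_run_scores)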

class LabelError(Exception):
    '''
    If the number of unique labels does not match your expected number of 
    unique labels.
    '''

    def __init__(self, true_number_unique_labels: int, 
                 number_unique_labels_wanted: int) -> None:
        '''
        :param true_number_unique_labels: Number of unique labels that came 
                                          from the dataset
        :param number_unique_labels_wanted: Expected number of unique labels 
                                            that should be in the dataset.
        '''
        error_string = ('Number of unique labels in the dataset '
                        f'{true_number_unique_labels}. The number of unique '
                        'labels expected in the dataset '
                        f'{number_unique_labels_wanted}')
        super().__init__(error_string)
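
# A small sketch of when LabelError is raised (hypothetical counts): the
# dataset contained 2 unique labels where 3 were expected.
try:
    raise LabelError(true_number_unique_labels=2, number_unique_labels_wanted=3)
except LabelError as example_error:
    # str(example_error) == ('Number of unique labels in the dataset 2. The '
    #                        'number of unique labels expected in the dataset 3')
    pass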
def metric_error_checks(func: Callable[[TargetTextCollection, str, str, bool, 
                                        bool, Optional[int], bool], 
                                       Union[float, np.ndarray]]
                        ) -> Callable[[TargetTextCollection, str, str, bool, 
                                       bool, Optional[int], bool], 
                                      Union[float, np.ndarray]]:
    '''
    Decorator for the metric functions within this module. Will raise any of 
    the Errors stated above in the module documentation before the metric 
    function is called.
    '''
    @functools.wraps(func)
    def wrapper(target_collection: TargetTextCollection, 
                true_sentiment_key: str, predicted_sentiment_key: str,
                average: bool, array_scores: bool, 
                assert_number_labels: Optional[int] = None,
                ignore_label_differences: bool = True,
                **kwargs) -> Union[float, np.ndarray]:
        # Check that the TargetTextCollection contains both the true and 
        # predicted sentiment keys
        unique_label_set = set()
        total_number_model_predictions = 0
        for target_object in target_collection.values():
            target_object: TargetText
            target_object._key_error(true_sentiment_key)
            target_object._key_error(predicted_sentiment_key)
            for true_label in target_object[true_sentiment_key]:
                unique_label_set.add(true_label)
            # Cannot have an inconsistent number of model predictions
            number_model_predictions = len(target_object[predicted_sentiment_key])
            if total_number_model_predictions == 0:
                total_number_model_predictions = number_model_predictions
            else:
                if total_number_model_predictions != number_model_predictions:
                    raise ValueError('The number of predictions made per '
                                     'Target within the collection is '
                                     'different. One possible cause is a '
                                     'TargetText that has no targets, in '
                                     'which case this error will be raised. '
                                     'TargetText that has an error: '
                                     f'{target_object}\nThe number of '
                                     'predictions that this object should '
                                     f'have: {total_number_model_predictions}')
        # Cannot have zero predictions
        if total_number_model_predictions == 0:
            raise ValueError('The number of predictions made per target is zero')
        # Perform the LabelError check
        if assert_number_labels is not None:
            number_unique_labels = len(unique_label_set)
            if number_unique_labels != assert_number_labels:
                raise LabelError(number_unique_labels, assert_number_labels)
        # If the dataset has one model prediction per target then average and 
        # array_scores should be False
        if number_model_predictions == 1:
            if average or array_scores:
                raise ValueError('When there is only one set of predictions '
                                 'per target then `average` and '
                                 '`array_scores` have to both be False')
        else:
            if average == array_scores:
                raise ValueError('As the number of model predictions is > 1 '
                                 'then either `average` or `array_scores` has '
                                 'to be True but not both.')
        return func(target_collection, true_sentiment_key, 
                    predicted_sentiment_key, average, array_scores, 
                    assert_number_labels, ignore_label_differences, **kwargs)
    return wrapper
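
# A brief sketch of the `average`/`array_scores` combinations that the checks
# above allow, expressed as a hypothetical helper (`n_runs` being the number
# of prediction runs per target): a single run requires both flags to be
# False, while multiple runs require exactly one of them to be True.
def example_valid_flag_combination(n_runs: int, average: bool, 
                                   array_scores: bool) -> bool:
    if n_runs == 1:
        return not average and not array_scores
    return average != array_scores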
def get_labels(target_collection: TargetTextCollection, 
               true_sentiment_key: str, predicted_sentiment_key: str,
               labels_per_text: bool = False, 
               ignore_label_differences: bool = True
               ) -> Tuple[Union[List[Any], List[List[Any]]], 
                          Union[List[List[Any]], List[List[List[Any]]]]]:
    '''
    :param target_collection: Collection of targets that have true and 
                              predicted sentiment values.
    :param true_sentiment_key: Key that contains the true sentiment scores 
                               for each target in the TargetTextCollection
    :param predicted_sentiment_key: Key that contains the predicted sentiment 
                                    scores for each target in the 
                                    TargetTextCollection. It assumes that the 
                                    predictions are a List of Lists where the 
                                    outer list corresponds to the number of 
                                    model runs and the inner list to the number 
                                    of targets to predict for. See the second 
                                    Tuple of the example return for an example 
                                    of this.
    :param labels_per_text: If True, instead of returning a List[Any] it will 
                            return a List[List[Any]] where the inner list 
                            represents the predictions per text, rather than 
                            the normal case where it is all predictions 
                            ignoring which text they came from.
    :param ignore_label_differences: If True then the ValueError will not be 
                                     raised if the predicted sentiment values 
                                     are not in the true sentiment values.
    :returns: A tuple of 1; true sentiment values 2; predicted sentiment 
              values, where the predicted sentiment values is a list of 
              predicted sentiment values, one for each model's predictions. 
              See `Example of return 2` for an example of what this means, 
              where in that example there are two texts/sentences.
    :raises ValueError: If the number of predicted sentiment values is not 
                        equal to the number of true sentiment values.
    :raises ValueError: If the labels in the predicted sentiment values are 
                        not in the true sentiment values.
    :Example of return 1: (['pos', 'neg', 'neu'], 
                           [['neg', 'pos', 'neu'], ['neu', 'pos', 'neu']])
    :Example of return 2: ([['pos'], ['neg', 'neu']], 
                           [[['neg'], ['pos', 'neu']], 
                            [['neu'], ['pos', 'neu']]])
    '''
    all_predicted_values: List[List[Any]] = []
    all_true_values: List[Any] = []
    for target_object in target_collection.values():
        target_object: TargetText
        true_values = target_object[true_sentiment_key]
        if labels_per_text:
            all_true_values.append(true_values)
        else:
            all_true_values.extend(true_values)

        predicted_values_lists = target_object[predicted_sentiment_key]
        # Create a list per model run
        if all_predicted_values == []:
            for _ in predicted_values_lists:
                all_predicted_values.append([])
        for index, prediction_list in enumerate(predicted_values_lists):
            if labels_per_text:
                all_predicted_values[index].append(prediction_list)
            else:
                all_predicted_values[index].extend(prediction_list)
    # Check that the number of values in the predicted values is the same as 
    # the number of values in the true list
    true_number_values = len(all_true_values)
    for prediction_list in all_predicted_values:
        number_predictions = len(prediction_list)
        if number_predictions != true_number_values:
            raise ValueError(f'Number of targets predicted {number_predictions}. '
                             f'Number of targets {true_number_values}. '
                             'These should be the same!')
    # Check that the values in True are the same as those in predicted
    if labels_per_text:
        unique_true_values = set([value for values in all_true_values 
                                  for value in values])
    else:
        unique_true_values = set(all_true_values)
    for prediction_list in all_predicted_values:
        if labels_per_text:
            unique_predicted_values = set([value for values in prediction_list 
                                           for value in values])
        else:
            unique_predicted_values = set(prediction_list)
        if (unique_predicted_values.difference(unique_true_values) and 
                not ignore_label_differences):
            raise ValueError('Values in the predicted sentiment are not in '
                             'the True sentiment values. Values in predicted '
                             f'{unique_predicted_values}, values in True '
                             f'{unique_true_values}')
    return (all_true_values, all_predicted_values)
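
# A short sketch of the two return shapes, reusing the labels from the
# docstring examples above (hypothetical data). With labels_per_text=False
# the true labels come back flat; with labels_per_text=True they stay grouped
# per text, which is the shape `strict_text_accuracy` relies on below.
example_flat_true = ['pos', 'neg', 'neu']
example_per_text_true = [['pos'], ['neg', 'neu']]
assert [label for text in example_per_text_true for label in text] == example_flat_true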
@metric_error_checks
def accuracy(target_collection: TargetTextCollection, true_sentiment_key: str,
             predicted_sentiment_key: str, average: bool, array_scores: bool,
             assert_number_labels: Optional[int] = None,
             ignore_label_differences: bool = True
             ) -> Union[float, List[float]]:
    '''
    :param ignore_label_differences: See :py:func:`get_labels`

    Accuracy score. Description at top of module explains arguments.
    '''
    true_values, predicted_values_list = get_labels(target_collection, 
                                                    true_sentiment_key, 
                                                    predicted_sentiment_key,
                                                    ignore_label_differences=ignore_label_differences)
    scores: List[float] = []
    for predicted_values in predicted_values_list:
        scores.append(accuracy_score(true_values, predicted_values))
    if average:
        return statistics.mean(scores)
    elif array_scores:
        return scores
    else:
        assert 1 == len(scores)
        return scores[0]
@metric_error_checks
def macro_f1(target_collection: TargetTextCollection, true_sentiment_key: str,
             predicted_sentiment_key: str, average: bool, array_scores: bool,
             assert_number_labels: Optional[int] = None,
             ignore_label_differences: bool = True,
             **kwargs) -> Union[float, List[float]]:
    '''
    :param ignore_label_differences: See :py:func:`get_labels`
    :param **kwargs: These are the keyword arguments to give to the underlying 
                     scikit-learn :py:func:`f1_score`. Note that the only 
                     argument given to :py:func:`f1_score` that cannot be 
                     changed is `average`. If you want the F1 score for one 
                     label this can still be done by providing the `labels` 
                     argument where the value would be the label you want the 
                     F1 score for e.g. `labels` = [`positive`].

    Macro F1 score. Description at top of module explains arguments.
    '''
    true_values, predicted_values_list = get_labels(target_collection, 
                                                    true_sentiment_key, 
                                                    predicted_sentiment_key,
                                                    ignore_label_differences=ignore_label_differences)
    scores: List[float] = []
    for predicted_values in predicted_values_list:
        kwargs['average'] = 'macro'
        scores.append(f1_score(true_values, predicted_values, **kwargs))
    if average:
        return statistics.mean(scores)
    elif array_scores:
        return scores
    else:
        assert 1 == len(scores)
        return scores[0]
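
# A minimal sketch of the `labels` keyword argument mentioned in the docstring
# above (hypothetical labels): restricting scikit-learn's f1_score to a single
# label makes the macro average cover only that label, i.e. it returns that
# label's F1 score.
example_true = ['positive', 'negative', 'positive', 'neutral']
example_pred = ['positive', 'positive', 'positive', 'neutral']
example_positive_f1 = f1_score(example_true, example_pred, 
                               average='macro', labels=['positive'])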
@metric_error_checks
def strict_text_accuracy(target_collection: TargetTextCollection, 
                         true_sentiment_key: str, 
                         predicted_sentiment_key: str, average: bool, 
                         array_scores: bool, 
                         assert_number_labels: Optional[int] = None,
                         ignore_label_differences: bool = True
                         ) -> Union[float, List[float]]:
    '''
    This metric is performed at the text/sentence level, where a sample is not 
    one target but all of the targets within a text. A sample is correct if 
    all targets within the text have been predicted correctly. This will 
    return the average of the correct predictions. Strict Text Accuracy is 
    also known as STAC.

    This metric also assumes that every text within the `target_collection` 
    contains at least one target. If this is not the case a ValueError will 
    be raised.

    :param ignore_label_differences: See :py:func:`get_labels`
    '''
    true_values, predicted_values_list = get_labels(target_collection, 
                                                    true_sentiment_key, 
                                                    predicted_sentiment_key, 
                                                    labels_per_text=True,
                                                    ignore_label_differences=ignore_label_differences)
    true_values: List[List[Any]]
    predicted_values_list: List[List[List[Any]]]

    scores: List[float] = []
    num_texts = float(len(true_values))
    for predicted_values in predicted_values_list:
        score = 0
        for true_value, predicted_value in zip(true_values, predicted_values):
            if true_value == predicted_value:
                score += 1
        scores.append(float(score) / num_texts)
    if average:
        return statistics.mean(scores)
    elif array_scores:
        return scores
    else:
        assert 1 == len(scores)
        return scores[0]
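
# A short worked sketch of STAC with hypothetical per-text labels: the first
# text has all of its targets predicted correctly, the second does not, so
# the strict text accuracy for this single run is 1 / 2 = 0.5.
example_true_per_text = [['pos'], ['neg', 'neu']]
example_pred_per_text = [['pos'], ['neg', 'pos']]
example_stac = sum(1 for true, pred in zip(example_true_per_text, example_pred_per_text)
                   if true == pred) / len(example_true_per_text)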