Source code for target_extraction.analysis.dataset_statistics

'''
This module allows TargetTextCollection objects to be analysed and report 
overall statistics.
'''
from collections import defaultdict
from typing import Dict, Any, List, Union, Callable
import statistics

import pandas as pd

from target_extraction.data_types import TargetTextCollection
from target_extraction.tokenizers import spacy_tokenizer

def get_sentiment_counts(collection: TargetTextCollection, sentiment_key: str,
                         normalised: bool = True) -> Dict[str, float]:
    '''
    :param collection: The collection containing the sentiment data
    :param sentiment_key: The key in each TargetText within the collection
                          that contains the True sentiment value.
    :param normalised: Whether to normalise the values in the dictionary
                       by the number of targets in the collection.
    :returns: A dictionary where the keys are sentiment values and the values
              are the number of times they occur in the collection.
    '''
    sentiment_count = defaultdict(lambda: 0)
    for target_text in collection.values():
        if target_text[sentiment_key] is not None:
            for sentiment_value in target_text[sentiment_key]:
                sentiment_count[sentiment_value] += 1
    number_targets = collection.number_targets()
    assert number_targets == sum(sentiment_count.values())
    if normalised:
        for sentiment, count in sentiment_count.items():
            sentiment_count[sentiment] = float(count) / float(number_targets)
    return dict(sentiment_count)

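# A minimal usage sketch (assumes a hypothetical `train` TargetTextCollection whose
# gold labels are stored under the standard 'target_sentiments' key):
#
#     counts = get_sentiment_counts(train, sentiment_key='target_sentiments',
#                                   normalised=False)
#     # e.g. {'positive': 120, 'neutral': 40, 'negative': 80}
#     fractions = get_sentiment_counts(train, sentiment_key='target_sentiments')
#     # e.g. {'positive': 0.5, 'neutral': 0.17, 'negative': 0.33}
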
def average_target_per_sentences(collection: TargetTextCollection,
                                 sentence_must_contain_targets: bool) -> float:
    '''
    :param collection: Collection to calculate average target per sentence (ATS) on.
    :param sentence_must_contain_targets: Whether or not the sentences within the
                                          collection must contain at least one
                                          target. This filtering affects the value
                                          of the denominator stated in the returns.
    :returns: The ATS for the given collection, which is:
              number of targets / number of sentences
    '''
    number_targets = float(collection.number_targets())
    if sentence_must_contain_targets:
        number_sentences = len(collection.samples_with_targets())
    else:
        number_sentences = len(collection)
    return number_targets / float(number_sentences)

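# Usage sketch (hypothetical `train` collection): with 100 targets over 80 sentences,
# 10 of which contain no target, the two settings differ only in the denominator:
#
#     average_target_per_sentences(train, sentence_must_contain_targets=False)
#     # 100 / 80 = 1.25
#     average_target_per_sentences(train, sentence_must_contain_targets=True)
#     # 100 / 70 ~= 1.43
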
def tokens_per_target(collection: TargetTextCollection, target_key: str,
                      tokeniser: Callable[[str], List[str]],
                      normalise: bool = False,
                      cumulative_percentage: bool = False) -> Dict[int, int]:
    '''
    :param collection: collection to analyse
    :param target_key: The key within each sample in the collection that contains
                       the list of targets to be analysed. This can also be the
                       predicted target key, which might be useful for error
                       analysis.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens.
                      See :py:mod:`target_extraction.tokenizers` for a module of
                      compatible tokenisers.
    :param normalise: Whether to normalise the values by the total number of
                      targets. (This does not change the return if
                      `cumulative_percentage` is True)
    :param cumulative_percentage: If True the return will not be frequency counts
                                  of the number of tokens in each target but rather
                                  the cumulative percentage of targets with that
                                  number of tokens.
    :returns: A dictionary where the keys are the target length based on the number
              of tokens in the target and the values are the number of targets in
              the dataset that contain that number of tokens (the same target can
              be counted more than once if it exists in the dataset more than
              once). **This is a defaultdict where the value will be 0 if the key
              does not exist.**
    '''
    lengths = defaultdict(lambda: 0)
    target_count = collection.target_count(lower=False, target_key=target_key)
    total_target_count = sum(target_count.values())
    for target, count in target_count.items():
        length = len(tokeniser(target))
        if normalise:
            count = count / total_target_count
        lengths[length] += count
    if cumulative_percentage:
        lengths = sorted(lengths.items(), key=lambda x: x[0])
        temp_lengths = {}
        current_percentage = 0.0
        for length, count in lengths:
            percentage = (count / total_target_count) * 100
            temp_lengths[length] = current_percentage + percentage
            current_percentage += percentage
        lengths = temp_lengths
    return lengths

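# Usage sketch (hypothetical `train` collection; whitespace tokenisation via
# `str.split` is used purely for illustration, any Callable[[str], List[str]] such
# as target_extraction.tokenizers.spacy_tokenizer() works):
#
#     length_counts = tokens_per_target(train, 'targets', str.split)
#     # e.g. {1: 350, 2: 120, 3: 30} -- raw frequency of target token lengths
#     length_cdf = tokens_per_target(train, 'targets', str.split,
#                                    cumulative_percentage=True)
#     # e.g. {1: 70.0, 2: 94.0, 3: 100.0} -- cumulative percentage of targets
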
def tokens_per_sentence(collection: TargetTextCollection,
                        tokeniser: Callable[[str], List[str]]) -> Dict[int, int]:
    '''
    :param collection: The collection to generate the statistic for.
    :param tokeniser: The tokeniser to use to split the sentences/texts into
                      tokens. If the collection has already been tokenised then
                      the tokens in the `tokenized_text` key within each sample
                      in the collection will be used, else the tokens will be
                      produced within this function and saved to that key as well.
                      See :py:mod:`target_extraction.tokenizers` for a module of
                      compatible tokenisers.
    :returns: A dictionary of sentence lengths and their frequency.
              **This is a defaultdict where the value will be 0 if the key does
              not exist.**
    '''
    if_tokenised = 'tokenized_text' in next(collection.dict_iterator())
    if not if_tokenised:
        collection.tokenize(tokeniser)
    length_count = defaultdict(lambda: 0)
    for target_text in collection.values():
        length_count[len(target_text['tokenized_text'])] += 1
    return length_count

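# Usage sketch (hypothetical `train` collection). Note that this can mutate the
# collection: if the 'tokenized_text' key is missing, the collection is tokenised
# in place before counting.
#
#     from target_extraction.tokenizers import spacy_tokenizer
#     sentence_lengths = tokens_per_sentence(train, spacy_tokenizer())
#     # e.g. {12: 40, 13: 35, ...} -- 40 sentences contain exactly 12 tokens
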
def _statistics_to_dataframe(collection_statistics: List[Dict[str, Union[str, int, float]]]
                             ) -> pd.DataFrame:
    '''
    :param collection_statistics: The dictionaries to be converted into a single
                                  dataframe.
    :returns: The given collection statistics as a dataframe where the columns are
              the key names and the values are the associated values from the list
              of dictionaries.
    '''
    pd_dict = defaultdict(list)
    for collection_statistic in collection_statistics:
        for stat_key, stat_value in collection_statistic.items():
            pd_dict[stat_key].append(stat_value)
    return pd.DataFrame(pd_dict)

def dataset_target_extraction_statistics(collections: List[TargetTextCollection],
                                         lower_target: bool = True,
                                         target_key: str = 'targets',
                                         tokeniser: Callable[[str], List[str]] = spacy_tokenizer(),
                                         dataframe_format: bool = False,
                                         incl_sentence_statistics: bool = True
                                         ) -> List[Dict[str, Union[str, int, float]]]:
    '''
    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting them
    :param target_key: The key within each sample in each collection that contains
                       the list of targets to be analysed. This can also be the
                       predicted target key, which might be useful for error
                       analysis.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens.
                      See :py:mod:`target_extraction.tokenizers` for a module of
                      compatible tokenisers. This is required to give statistics
                      on target length.
    :param dataframe_format: If True the return will be a pandas dataframe instead
                             of a list of dictionaries.
    :param incl_sentence_statistics: If False, statistics about the sentences will
                                     not be included. This is so that the
                                     statistics can still be created for datasets
                                     that have been anonymised.
    :returns: A list of dictionaries each containing the statistics for the
              associated collection. Each dictionary will have the following keys:

              1. Name -- this comes from the collection's name attribute
              2. No. Sentences -- number of sentences in the collection
              3. No. Sentences(t) -- number of sentences that contain targets.
              4. No. Targets -- number of targets
              5. No. Uniq Targets -- number of unique targets
              6. ATS -- Average Target per Sentence (ATS)
              7. ATS(t) -- ATS but where all sentences in the collection must
                 contain at least one target.
              8. TL 1 % -- Percentage of targets that are length 1 based on the
                 number of tokens.
              9. TL 2 % -- Percentage of targets that are length 2 based on the
                 number of tokens.
              10. TL 3+ % -- Percentage of targets that are length 3+ based on the
                  number of tokens.
              11. Mean Sentence Length -- Mean sentence length based on the tokens
                  provided by the `tokenized_text` key in each TargetText within
                  the collections. If this key does not exist then the collection
                  will be tokenised using the given tokeniser argument.
              12. Mean Sentence Length(t) -- `Mean Sentence Length` but where all
                  sentences in the collection must contain at least one target.
    '''
    dataset_stats: List[Dict[str, Union[str, int, float]]] = []
    for collection in collections:
        collection_stats = {}
        collection_stats['Name'] = collection.name
        collection_stats['No. Sentences'] = len(collection)
        collection_stats['No. Sentences(t)'] = len(collection.samples_with_targets())
        collection_stats['No. Targets'] = collection.number_targets()
        collection_stats['No. Uniq Targets'] = len(collection.target_count(lower=lower_target))
        collection_stats['ATS'] = round(average_target_per_sentences(collection, False), 2)
        collection_stats['ATS(t)'] = round(average_target_per_sentences(collection, True), 2)

        target_lengths = tokens_per_target(collection, target_key, tokeniser,
                                           normalise=True)
        collection_stats['TL 1 %'] = round(target_lengths[1] * 100, 2)
        collection_stats['TL 2 %'] = round(target_lengths[2] * 100, 2)
        three_plus = sum([fraction for token_length, fraction in target_lengths.items()
                          if token_length > 2])
        collection_stats['TL 3+ %'] = round(three_plus * 100, 2)

        if not incl_sentence_statistics:
            dataset_stats.append(collection_stats)
            continue

        for samples_with_targets_only in [False, True]:
            if samples_with_targets_only:
                sentence_lengths = tokens_per_sentence(collection.samples_with_targets(),
                                                       tokeniser)
            else:
                sentence_lengths = tokens_per_sentence(collection, tokeniser)
            sentence_lengths_flattened = []
            for length, count in sentence_lengths.items():
                sentence_lengths_flattened.extend([length] * count)
            mean_sentence_length = round(statistics.mean(sentence_lengths_flattened), 2)
            if samples_with_targets_only:
                collection_stats['Mean Sentence Length(t)'] = mean_sentence_length
            else:
                collection_stats['Mean Sentence Length'] = mean_sentence_length
        dataset_stats.append(collection_stats)
    if dataframe_format:
        return _statistics_to_dataframe(dataset_stats)
    return dataset_stats

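# Usage sketch (hypothetical `train` and `test` collections):
#
#     stats = dataset_target_extraction_statistics([train, test])
#     # [{'Name': 'train', 'No. Sentences': ..., 'TL 1 %': ..., ...},
#     #  {'Name': 'test', ...}]
#     stats_df = dataset_target_extraction_statistics([train, test],
#                                                     dataframe_format=True)
#     # the same statistics as a pandas DataFrame with one row per collection
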
def dataset_target_sentiment_statistics(collections: List[TargetTextCollection],
                                        lower_target: bool = True,
                                        target_key: str = 'targets',
                                        tokeniser: Callable[[str], List[str]] = spacy_tokenizer(),
                                        sentiment_key: str = 'target_sentiments',
                                        dataframe_format: bool = False,
                                        incl_sentence_statistics: bool = True
                                        ) -> Union[List[Dict[str, Union[str, int, float]]], pd.DataFrame]:
    '''
    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting them
    :param target_key: The key within each sample in each collection that contains
                       the list of targets to be analysed. This can also be the
                       predicted target key, which might be useful for error
                       analysis.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens.
                      See :py:mod:`target_extraction.tokenizers` for a module of
                      compatible tokenisers. This is required to give statistics
                      on target length.
    :param sentiment_key: The key in each TargetText within each collection that
                          contains the True sentiment value.
    :param dataframe_format: If True the return will be a pandas dataframe instead
                             of a list of dictionaries.
    :param incl_sentence_statistics: If False, statistics about the sentences will
                                     not be included. This is so that the
                                     statistics can still be created for datasets
                                     that have been anonymised.
    :returns: A list of dictionaries each containing the statistics for the
              associated collection. Each dictionary will have the keys from
              :py:func:`dataset_target_extraction_statistics` and the following
              in addition:

              1. POS (%) -- Number (Percentage) of positive targets
              2. NEU (%) -- Number (Percentage) of neutral targets
              3. NEG (%) -- Number (Percentage) of negative targets
    '''
    initial_dataset_stats = dataset_target_extraction_statistics(collections,
                                                                 lower_target=lower_target,
                                                                 target_key=target_key,
                                                                 tokeniser=tokeniser,
                                                                 dataframe_format=False,
                                                                 incl_sentence_statistics=incl_sentence_statistics)
    dataset_stats = []
    for collection, collection_stats in zip(collections, initial_dataset_stats):
        sentiment_percent = get_sentiment_counts(collection, normalised=True,
                                                 sentiment_key=sentiment_key)
        sentiment_percent = {sentiment_name: round((fraction * 100), 2)
                             for sentiment_name, fraction in sentiment_percent.items()}
        sentiment_count = get_sentiment_counts(collection, normalised=False,
                                               sentiment_key=sentiment_key)
        pos_value = f'{sentiment_count["positive"]} ({sentiment_percent["positive"]})'
        collection_stats['POS (%)'] = pos_value
        neu_value = f'{sentiment_count["neutral"]} ({sentiment_percent["neutral"]})'
        collection_stats['NEU (%)'] = neu_value
        neg_value = f'{sentiment_count["negative"]} ({sentiment_percent["negative"]})'
        collection_stats['NEG (%)'] = neg_value
        dataset_stats.append(collection_stats)
    if dataframe_format:
        return _statistics_to_dataframe(dataset_stats)
    return dataset_stats

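# Usage sketch (hypothetical `train` and `test` collections with gold labels under
# 'target_sentiments'). Note that the sentiment columns assume the label set
# {'positive', 'neutral', 'negative'}; a collection missing any of those labels
# will raise a KeyError when the column value is formatted.
#
#     sentiment_stats_df = dataset_target_sentiment_statistics([train, test],
#                                                               dataframe_format=True)
#     # adds 'POS (%)', 'NEU (%)' and 'NEG (%)' columns, e.g. '120 (50.0)'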