'''
This module allows TargetTextCollection objects to be analysed and
overall statistics to be reported on them.
'''
from collections import defaultdict
from typing import Dict, Any, List, Union, Callable
import statistics
import pandas as pd
from target_extraction.data_types import TargetTextCollection
from target_extraction.tokenizers import spacy_tokenizer
def get_sentiment_counts(collection: TargetTextCollection,
sentiment_key: str,
normalised: bool = True) -> Dict[str, float]:
'''
:param collection: The collection containing the sentiment data
:param sentiment_key: The key in each TargetText within the collection that
contains the True sentiment value.
:param normalised: Whether to normalise the values in the dictionary
by the number of targets in the collection.
    :returns: A dictionary where the keys are sentiment values and the values
              are the number of times they occur in the collection (or the
              fraction of the total if `normalised` is True).
'''
sentiment_count = defaultdict(lambda: 0)
for target_text in collection.values():
if target_text[sentiment_key] is not None:
for sentiment_value in target_text[sentiment_key]:
sentiment_count[sentiment_value] += 1
number_targets = collection.number_targets()
assert number_targets == sum(sentiment_count.values())
if normalised:
for sentiment, count in sentiment_count.items():
sentiment_count[sentiment] = float(count) / float(number_targets)
return dict(sentiment_count)
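# A minimal usage sketch for `get_sentiment_counts`; `collection` here is a
# hypothetical, already loaded TargetTextCollection whose samples carry a
# `target_sentiments` key, and the counts are illustrative only:
#
#     counts = get_sentiment_counts(collection,
#                                   sentiment_key='target_sentiments',
#                                   normalised=False)
#     # e.g. {'positive': 120, 'negative': 45, 'neutral': 35}
#     fractions = get_sentiment_counts(collection,
#                                      sentiment_key='target_sentiments')
#     # e.g. {'positive': 0.6, 'negative': 0.225, 'neutral': 0.175}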
def average_target_per_sentences(collection: TargetTextCollection,
sentence_must_contain_targets: bool) -> float:
'''
:param collection: Collection to calculate average target per sentence (ATS)
on.
    :param sentence_must_contain_targets: Whether or not the sentences within the
                                          collection must contain at least one
                                          target. This filtering affects the
                                          value of the denominator stated in
                                          the returns.
    :returns: The ATS for the given collection, which is:
              number of targets / number of sentences
'''
number_targets = float(collection.number_targets())
if sentence_must_contain_targets:
number_sentences = len(collection.samples_with_targets())
else:
number_sentences = len(collection)
return number_targets / float(number_sentences)
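# A quick sketch of the ATS calculation with hypothetical numbers: a
# collection of 100 sentences containing 150 targets, where only 80 of the
# sentences contain at least one target:
#
#     average_target_per_sentences(collection, False)  # 150 / 100 = 1.5
#     average_target_per_sentences(collection, True)   # 150 / 80 = 1.875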
def tokens_per_target(collection: TargetTextCollection,
target_key: str,
tokeniser: Callable[[str], List[str]],
normalise: bool = False,
cumulative_percentage: bool = False) -> Dict[int, int]:
'''
:param collection: collection to analyse
:param target_key: The key within each sample in the collection that contains
the list of targets to be analysed. This can also be the
predicted target key, which might be useful for error
analysis.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens.
                      See :py:mod:`target_extraction.tokenizers` for a module
                      of compatible tokenisers.
    :param normalise: Whether to normalise the values by the total number of
                      targets. (This does not change the return if
                      `cumulative_percentage` is True)
    :param cumulative_percentage: Whether the return should not be frequency
                                  counts of the number of tokens in each target
                                  but rather the cumulative percentage of
                                  targets with that number of tokens.
    :returns: A dictionary where the keys are target lengths, based on the
              number of tokens in the target, and the values are the number of
              targets in the dataset that contain that number of tokens (the
              same target can be counted more than once if it exists in the
              dataset more than once). **This is a defaultdict where the value
              will be 0 if the key does not exist**, unless
              `cumulative_percentage` is True, in which case a plain dict is
              returned.
'''
lengths = defaultdict(lambda: 0)
target_count = collection.target_count(lower=False, target_key=target_key)
total_target_count = sum(target_count.values())
for target, count in target_count.items():
length = len(tokeniser(target))
if normalise:
count = count / total_target_count
lengths[length] += count
    if cumulative_percentage:
        sorted_lengths = sorted(lengths.items(), key=lambda x: x[0])
        temp_lengths = {}
        current_percentage = 0.0
        for length, count in sorted_lengths:
            # If `normalise` is True the counts are already fractions of the
            # total, so they only need scaling to percentages here.
            if normalise:
                percentage = count * 100
            else:
                percentage = (count / total_target_count) * 100
            temp_lengths[length] = current_percentage + percentage
            current_percentage += percentage
        lengths = temp_lengths
return lengths
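# A usage sketch (assumes a hypothetical loaded `collection` with a
# `targets` key); `spacy_tokenizer` is the tokeniser imported at the top of
# this module, and the counts and percentages are illustrative only:
#
#     token_counts = tokens_per_target(collection, target_key='targets',
#                                      tokeniser=spacy_tokenizer())
#     # e.g. {1: 500, 2: 120, 3: 15} -- 500 single-token targets, etc.
#     cumulative = tokens_per_target(collection, target_key='targets',
#                                    tokeniser=spacy_tokenizer(),
#                                    cumulative_percentage=True)
#     # e.g. {1: 78.7, 2: 97.6, 3: 100.0} -- 78.7% of targets are one
#     # token long, 97.6% are at most two tokens long, etc.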
def tokens_per_sentence(collection: TargetTextCollection,
tokeniser: Callable[[str], List[str]]) -> Dict[int, int]:
'''
:param collection: The collection to generate the statistic for.
    :param tokeniser: The tokeniser to use to split the sentences/texts into
                      tokens. If the collection has already been tokenised then
                      the tokens in the `tokenized_text` key within each sample
                      in the collection will be used, else the tokens will be
                      produced within this function and saved to that key as
                      well. See :py:mod:`target_extraction.tokenizers` for a
                      module of compatible tokenisers.
:returns: A dictionary of sentence lengths and their frequency.
**This is a defaultdict where the value will be 0 if the key
does not exist.**
'''
    is_tokenised = 'tokenized_text' in next(collection.dict_iterator())
    if not is_tokenised:
collection.tokenize(tokeniser)
length_count = defaultdict(lambda: 0)
for target_text in collection.values():
length_count[len(target_text['tokenized_text'])] += 1
return length_count
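# A usage sketch (assumes a hypothetical loaded `collection`); the counts
# are illustrative only:
#
#     sentence_lengths = tokens_per_sentence(collection,
#                                            tokeniser=spacy_tokenizer())
#     # e.g. {8: 20, 12: 35, ...} meaning 20 sentences are 8 tokens long,
#     # 35 sentences are 12 tokens long, and so on.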
def _statistics_to_dataframe(collection_statistics: List[Dict[str, Union[str,int,float]]]
) -> pd.DataFrame:
'''
:param collection_statistics: The dictionaries to be converted into
a single dataframe.
:returns: The collection statistics given into a dataframe where all columns
are the key names and the values are the associated values in the
keys from the list of dictionaries.
'''
pd_dict = defaultdict(list)
for collection_statistic in collection_statistics:
for stat_key, stat_value in collection_statistic.items():
pd_dict[stat_key].append(stat_value)
return pd.DataFrame(pd_dict)
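# A sketch of the conversion performed by `_statistics_to_dataframe` (the
# statistic names and values below are hypothetical):
#
#     _statistics_to_dataframe([{'Name': 'Train', 'ATS': 1.5},
#                               {'Name': 'Test', 'ATS': 1.2}])
#     # A DataFrame with columns `Name` and `ATS` and one row per
#     # input dictionary.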
def dataset_target_sentiment_statistics(collections: List[TargetTextCollection],
lower_target: bool = True,
target_key: str = 'targets',
tokeniser: Callable[[str], List[str]]=spacy_tokenizer(),
sentiment_key: str = 'target_sentiments',
dataframe_format: bool = False,
incl_sentence_statistics: bool = True
) -> Union[List[Dict[str, Union[str,int,float]]],
pd.DataFrame]:
'''
:param collections: A list of collections
:param lower_target: Whether to lower case the targets before counting them
:param target_key: The key within each sample in each collection that contains
the list of targets to be analysed. This can also be the
predicted target key, which might be useful for error
analysis.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens.
                      See :py:mod:`target_extraction.tokenizers` for a module
                      of compatible tokenisers. This is required to give
                      statistics on target length.
:param sentiment_key: The key in each TargetText within each collection that
contains the True sentiment value.
    :param dataframe_format: If True, the return will be a pandas dataframe
                             instead of a list of dictionaries.
    :param incl_sentence_statistics: If False, statistics about the sentences
                                     will not be included. This is so that
                                     statistics can still be created for
                                     datasets that have been anonymised.
:returns: A list of dictionaries each containing the statistics for the
associated collection. Each dictionary will have the keys from
:py:func:`dataset_target_extraction_statistics` and the following
in addition:
1. POS (%) -- Number (Percentage) of positive targets
2. NEU (%) -- Number (Percentage) of neutral targets
              3. NEG (%) -- Number (Percentage) of negative targets
'''
initial_dataset_stats = dataset_target_extraction_statistics(collections,
lower_target=lower_target,
target_key=target_key,
tokeniser=tokeniser,
dataframe_format=False,
incl_sentence_statistics=incl_sentence_statistics)
dataset_stats = []
for collection, collection_stats in zip(collections, initial_dataset_stats):
sentiment_percent = get_sentiment_counts(collection, normalised=True,
sentiment_key=sentiment_key)
sentiment_percent = {sentiment_name: round((fraction * 100), 2)
for sentiment_name, fraction in sentiment_percent.items()}
sentiment_count = get_sentiment_counts(collection, normalised=False,
sentiment_key=sentiment_key)
pos_value = f'{sentiment_count["positive"]} ({sentiment_percent["positive"]})'
collection_stats['POS (%)'] = pos_value
neu_value = f'{sentiment_count["neutral"]} ({sentiment_percent["neutral"]})'
collection_stats['NEU (%)'] = neu_value
neg_value = f'{sentiment_count["negative"]} ({sentiment_percent["negative"]})'
collection_stats['NEG (%)'] = neg_value
dataset_stats.append(collection_stats)
if dataframe_format:
return _statistics_to_dataframe(dataset_stats)
return dataset_stats
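# A usage sketch (assumes `train` and `test` are hypothetical, already loaded
# TargetTextCollection objects whose samples carry a `target_sentiments` key):
#
#     stats = dataset_target_sentiment_statistics([train, test],
#                                                 dataframe_format=True)
#     # One row per collection; alongside the extraction statistics each row
#     # contains `POS (%)`, `NEU (%)`, and `NEG (%)` columns formatted as
#     # `count (percentage)`, e.g. `120 (60.0)`.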