'''
This module contains all the functions that will parse a particular dataset
into a `target_extraction.data_types.TargetTextCollection` object.
Functions:
1. semeval_2014
2. semeval_2016
3. download_election_folder
4. multi_aspect_multi_sentiment_atsa
5. multi_aspect_multi_sentiment_acsa
'''
import json
from pathlib import Path
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from xml.etree.ElementTree import ParseError
from typing import List, Union, Dict, Any, Optional
import tempfile
import zipfile
import tarfile
from allennlp.common.file_utils import cached_path
import requests
from target_extraction.data_types import TargetTextCollection, TargetText
from target_extraction.data_types_util import Span
CACHE_DIRECTORY = Path(Path.home(), '.bella_tdsa')
def _semeval_extract_data(sentence_tree: Element, conflict: bool
) -> TargetTextCollection:
'''
:param sentence_tree: The root element of the XML tree that has come
from a SemEval XML formatted XML File.
:param conflict: Whether or not to include targets or categories that
have the `conflict` sentiment value. True is to include
conflict targets and categories.
:returns: The SemEval data formatted into a
`target_extraction.data_types.TargetTextCollection` object.
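Each `sentence` element is expected to hold a `text` child and,
depending on the task year, either `aspectTerms`/`aspectCategories`
children (SemEval 2014) or an `Opinions` child (SemEval 2016). An
illustrative fragment of the SemEval 2014 layout (not a complete
file)::

    <sentence id="813">
        <text>The food was great.</text>
        <aspectTerms>
            <aspectTerm term="food" polarity="positive" from="4" to="8"/>
        </aspectTerms>
    </sentence>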
'''
target_text_collection = TargetTextCollection()
for sentence in sentence_tree:
text_id = sentence.attrib['id']
targets: List[str] = []
target_sentiments: List[Union[str, int]] = []
spans: List[Span] = []
category_sentiments: List[Union[str, int]] = []
categories: List[str] = []
for data in sentence:
if data.tag == 'text':
text = data.text
text = text.replace(u'\xa0', u' ')
elif data.tag == 'aspectTerms':
for target in data:
# If the sentiment is `conflict` and the conflict argument is
# False, skip this target
target_sentiment = target.attrib['polarity']
if not conflict and target_sentiment == 'conflict':
continue
targets.append(target.attrib['term'].replace(u'\xa0', u' '))
target_sentiments.append(target_sentiment)
span_from = int(target.attrib['from'])
span_to = int(target.attrib['to'])
spans.append(Span(span_from, span_to))
elif data.tag == 'aspectCategories':
for category in data:
# If the sentiment is `conflict` and the conflict argument is
# False, skip this category
category_sentiment = category.attrib['polarity']
if not conflict and category_sentiment == 'conflict':
continue
categories.append(category.attrib['category'])
category_sentiments.append(category.attrib['polarity'])
elif data.tag == 'Opinions':
for opinion in data:
category_target_sentiment = opinion.attrib['polarity']
if not conflict and category_target_sentiment == 'conflict':
continue
# Handle the case where some of the SemEval 16 files do
# not contain targets and are only category sentiment files
if 'target' in opinion.attrib:
# Handle the case where there is a category but no
# target
target_text = opinion.attrib['target'].replace(u'\xa0', u' ')
span_from = int(opinion.attrib['from'])
span_to = int(opinion.attrib['to'])
# Special cases for poor annotation in SemEval 2016
# task 5 subtask 1 Restaurant dataset
if text_id == 'DBG#2:15' and target_text == 'NULL':
span_from = 0
span_to = 0
if text_id == "en_Patsy'sPizzeria_478231878:2"\
and target_text == 'NULL':
span_to = 0
if text_id == "en_MercedesRestaurant_478010602:1" \
and target_text == 'NULL':
span_to = 0
if text_id == "en_MiopostoCaffe_479702043:9" \
and target_text == 'NULL':
span_to = 0
if text_id == "en_MercedesRestaurant_478010600:1" \
and target_text == 'NULL':
span_from = 0
span_to = 0
if target_text == 'NULL':
target_text = None
# More special cases for poor annotation in SemEval 2016
# task 5 subtask 1 Restaurant dataset, this time fixing target texts
if text_id == '1490757:0':
target_text = 'restaurant'
if text_id == 'TR#1:0' and span_from == 27:
target_text = 'spot'
if text_id == 'TFS#5:26':
target_text = "environment"
if text_id == 'en_SchoonerOrLater_477965850:10':
target_text = 'Schooner or Later'
targets.append(target_text)
spans.append(Span(span_from, span_to))
categories.append(opinion.attrib['category'])
target_sentiments.append(category_target_sentiment)
target_text_kwargs = {'targets': targets, 'spans': spans, 'text_id': text_id,
'target_sentiments': target_sentiments,
'categories': categories, 'text': text,
'category_sentiments': category_sentiments}
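# An empty list means the sentence had no annotations of that kind,
# so store None rather than an empty list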
for key in target_text_kwargs:
if not target_text_kwargs[key]:
target_text_kwargs[key] = None
target_text = TargetText(**target_text_kwargs)
target_text_collection.add(target_text)
return target_text_collection
def semeval_2014(data_fp: Path, conflict: bool) -> TargetTextCollection:
'''
The sentiment labels are the following: 1. negative, 2. neutral,
3. positive, and 4. conflict. The `conflict` label will not appear if
the argument `conflict` is False.
:param data_fp: Path to the SemEval 2014 formatted file.
:param conflict: Whether or not to include targets or categories that
have the `conflict` sentiment value. True is to include
conflict targets and categories.
:returns: The SemEval 2014 data formatted into a
`target_extraction.data_types.TargetTextCollection` object.
:raises SyntaxError: If the file passed is not a SemEval
formatted file.
:raises `xml.etree.ElementTree.ParseError`: If the file passed is
not formatted correctly e.g.
mismatched tags
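
:Example: A minimal usage sketch, where `semeval_2014_train.xml` is a
placeholder path to a SemEval 2014 XML file you have downloaded.

>>> from pathlib import Path
>>> from target_extraction.dataset_parsers import semeval_2014
>>> collection = semeval_2014(Path('semeval_2014_train.xml'),
...                           conflict=False)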
'''
tree = ET.parse(data_fp)
sentences = tree.getroot()
if sentences.tag != 'sentences':
raise SyntaxError('The root of all SemEval 2014 XML files should '
f'be `sentences` and not {sentences.tag}')
return _semeval_extract_data(sentences, conflict)
def semeval_2016(data_fp: Path, conflict: bool) -> TargetTextCollection:
'''
This is only for subtask 1 files where the review is broken down into
sentences. If the data contains targets and not just categories, the
target and category sentiments are linked and are all stored in the
`target_sentiments` field. As some of the datasets only contain
category information, the sentiment values are always stored in the
`target_sentiments` field so that the format is the same across
domains.
The sentiment labels are the following: 1. negative, 2. neutral,
3. positive, and 4. conflict. The `conflict` label will not appear if
the argument `conflict` is False.
:param data_fp: Path to the SemEval 2016 formatted file.
:param conflict: Whether or not to include targets and categories that
have the `conflict` sentiment value. True is to include
conflict targets and categories.
:returns: The SemEval 2016 data formatted into a
`target_extraction.data_types.TargetTextCollection` object.
:raises SyntaxError: If the file passed is not a SemEval
formatted file.
:raises `xml.etree.ElementTree.ParseError`: If the file passed is
not formatted correctly e.g.
mismatched tags
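
:Example: A minimal usage sketch, where `semeval_2016_train.xml` is a
placeholder path to a SemEval 2016 subtask 1 XML file you have
downloaded.

>>> from pathlib import Path
>>> from target_extraction.dataset_parsers import semeval_2016
>>> collection = semeval_2016(Path('semeval_2016_train.xml'),
...                           conflict=False)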
'''
tree = ET.parse(data_fp)
reviews = tree.getroot()
if reviews.tag != 'Reviews':
raise SyntaxError('The root of all SemEval 2016 XML files should '
f'be `Reviews` and not {reviews.tag}')
all_target_texts: List[TargetText] = []
for review in reviews:
if len(review) != 1:
raise SyntaxError('The number of `sentences` tags under the '
'`review` tag should be just 1 and not '
f'{len(review)}')
sentences = review[0]
review_target_texts = list(_semeval_extract_data(sentences,
conflict).values())
all_target_texts.extend(review_target_texts)
return TargetTextCollection(all_target_texts)
def download_election_folder(cache_dir: Optional[Path] = None) -> Path:
'''
Downloads the data for the Election Twitter dataset by
`Wang et al, 2017 <https://www.aclweb.org/anthology/E17-1046>`_ that can be found
`here <https://figshare.com/articles/EACL_2017_-_Multi-target_UK_election_Twitter_sentiment_corpus/4479563/1>`_
This is then further used in the following functions
:func:`target_extraction.dataset_parsers.wang_2017_election_twitter_train`
and
:func:`target_extraction.dataset_parsers.wang_2017_election_twitter_test`
as a way to get the data.
:param cache_dir: The directory where all of the data is stored for
this code base. If None then the cache directory is
`dataset_parsers.CACHE_DIRECTORY`
:returns: The Path to the `Wang 2017 Election Twitter` folder within the
`cache_dir`.
:raises FileNotFoundError: If not all of the files were downloaded
the first time. This will require the user
to delete either the cache directory or the
`Wang 2017 Election Twitter` folder within
the cache directory.
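
:Example: A minimal usage sketch. The first call downloads the data,
subsequent calls return the cached folder.

>>> from target_extraction.dataset_parsers import download_election_folder
>>> data_dir = download_election_folder()
>>> train_id_fp = data_dir / 'train_id.txt'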
'''
def untar_folder(tar_file: Path, folder_to_extract_to: Path) -> None:
with tarfile.open(tar_file) as _tar_file:
_tar_file.extractall(folder_to_extract_to)
if cache_dir is None:
cache_dir = CACHE_DIRECTORY
dataset_folder_fp = Path(cache_dir, 'Wang 2017 Election Twitter')
annotation_fp = Path(dataset_folder_fp, 'annotations')
tweets_fp = Path(dataset_folder_fp, 'tweets')
test_id_fp = Path(dataset_folder_fp, 'test_id.txt')
train_id_fp = Path(dataset_folder_fp, 'train_id.txt')
if dataset_folder_fp.exists():
# The following Paths must exist in the folder for it to be the correct
# downloaded directory, else a FileNotFoundError is raised
path_to_exist = [annotation_fp, tweets_fp, test_id_fp, train_id_fp]
for _path in path_to_exist:
if not _path.exists():
file_not_err = (f'The following file was not found: {_path}. '
'It should exist in the corresponding data '
'directory. To resolve this problem please '
'either delete the whole directory '
f'{dataset_folder_fp} or use a different '
f'cache directory other than {cache_dir}')
raise FileNotFoundError(file_not_err)
# As the folder exists and contains all of the data return as we should
# not download something for the sake of downloading it
return dataset_folder_fp
dataset_folder_fp.mkdir(parents=True, exist_ok=True)
download_url = 'http://ndownloader.figshare.com/articles/4479563/versions/1'
response = requests.get(download_url, stream=True)
with tempfile.NamedTemporaryFile('wb+') as download_file:
for chunk in response.iter_content(chunk_size=128):
download_file.write(chunk)
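# ZipFile seeks within the file object itself to locate the zip's
# central directory, so no explicit seek(0) is required here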
with zipfile.ZipFile(download_file) as download_zip:
download_zip.extractall(dataset_folder_fp)
# Need to untar the annotations and tweets folders
annotation_tar_file = Path(dataset_folder_fp, 'annotations.tar.gz')
tweet_tar_file = Path(dataset_folder_fp, 'tweets.tar.gz')
untar_folder(annotation_tar_file, annotation_fp)
untar_folder(tweet_tar_file, tweets_fp)
return dataset_folder_fp
def _wang_2017_election_parser(train: bool, cache_dir: Optional[Path] = None
) -> TargetTextCollection:
'''
Parser for the Election Twitter dataset by
`Wang et al, 2017 <https://www.aclweb.org/anthology/E17-1046>`_ that can be found
`here <https://figshare.com/articles/EACL_2017_-_Multi-target_UK_election_Twitter_sentiment_corpus/4479563/1>`_
:param train: Whether to return the Train data. If False returns the
test data.
:param cache_dir: The directory where all of the data is stored for
this code base. If None then the cache directory is
`dataset_parsers.CACHE_DIRECTORY`
:returns: Either the training or test dataset of the Election Twitter
dataset.
:raises FileNotFoundError: If not all of the files were downloaded
the first time. This will require the user
to delete either the cache directory or the
`Wang 2017 Election Twitter` folder within
the cache directory.
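
:Example: A minimal sketch. As this is a private helper it is normally
called through the public train and test wrapper functions.

>>> from target_extraction.dataset_parsers import _wang_2017_election_parser
>>> train_collection = _wang_2017_election_parser(train=True)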
'''
def get_tweet_data(tweet_folder: Path) -> Dict[str, Dict[str, Any]]:
'''
:param tweet_folder: Directory containing files where each one represents
a Tweet and some target dependent sentiment data.
:returns: A dictionary of tweet IDs as keys and JSON data representing
the tweet target dependent sentiment data as values
'''
data = {}
for file_path in tweet_folder.iterdir():
file_name = file_path.stem
tweet_id = file_name.lstrip('5')
with open(file_path, 'r') as tweet_data:
data[tweet_id] = json.load(tweet_data)
return data
def parse_tweet(tweet_data: Dict[str, Any], annotation_data: Dict[str, Any],
tweet_id: str) -> TargetText:
'''
:param tweet_data: Data containing the Tweet information
:param annotation_data: Data containing the annotation data on the
Tweet
:param tweet_id: ID of the Tweet
:returns: The Tweet data in
:class:`target_extraction.data_types.TargetText` format
:raises ValueError: If the Target offset cannot be found.
'''
def get_offsets(from_offset: int, tweet_text: str, target: str) -> Span:
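# The annotated `from` offset does not always line up exactly with
# the tweet text, so try the offset and its +/- 1 neighbours until
# the spanned text matches the target (case-insensitive)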
offset_shifts = [0, -1, 1]
for offset_shift in offset_shifts:
from_offset_shift = from_offset + offset_shift
to_offset = from_offset_shift + len(target)
offsets = Span(from_offset_shift, to_offset)
offset_text = tweet_text[from_offset_shift : to_offset].lower()
if offset_text == target.lower():
return offsets
raise ValueError(f'Offset {from_offset} does not match target text'
f' {target}. Full text {tweet_text}\nid {tweet_id}')
target_id = str(tweet_id)
target_text = tweet_data['content']
target_categories = None
target_category_sentiments = None
targets = []
target_spans = []
target_sentiments = []
for entity in tweet_data['entities']:
target_sentiment = annotation_data['items'][str(entity['id'])]
if target_sentiment == 'doesnotapply':
continue
target = entity['entity']
target_span = get_offsets(entity['offset'], target_text, target)
# Take the target from the text as sometimes the original label
# is lower cased when it should not be according to the text.
target = target_text[target_span.start: target_span.end]
targets.append(target)
target_spans.append(target_span)
target_sentiments.append(target_sentiment)
return TargetText(target_text, target_id, targets, target_spans,
target_sentiments, target_categories,
target_category_sentiments)
def get_data(tweet_id_file: Path, all_tweet_data: Dict[str, Dict[str, Any]],
all_annotation_data: Dict[str, Dict[str, Any]]
) -> TargetTextCollection:
'''
:param tweet_id_file: File Path containing a Tweet ID on each new line
:param all_tweet_data: Dictionary containing data about the Tweet where
the keys are Tweet IDs and values a Dict of
information about the Tweet.
:param all_annotation_data: Dictionary containing annotation data about
the Tweet where the keys are Tweet ID's
and values are the annotation data about
the Tweet in a form of a Dict.
:returns: The Twitter data formatted into a
:class:`target_extraction.data_types.TargetTextCollection`
object.
'''
targets = []
with tweet_id_file.open('r') as tweet_id_data:
for tweet_id in tweet_id_data:
tweet_id = tweet_id.strip()
tweet_data = all_tweet_data[tweet_id]
anno_data = all_annotation_data[tweet_id]
targets.append(parse_tweet(tweet_data, anno_data, tweet_id))
return TargetTextCollection(targets)
data_fp = download_election_folder(cache_dir)
tweets_folder = Path(data_fp, 'tweets', 'tweets')
annotations_folder = Path(data_fp, 'annotations', 'annotations')
tweet_data = get_tweet_data(tweets_folder)
annotation_data = get_tweet_data(annotations_folder)
ids_file = Path(data_fp, 'train_id.txt')
if not train:
ids_file = Path(data_fp, 'test_id.txt')
return get_data(ids_file, tweet_data, annotation_data)
def multi_aspect_multi_sentiment_atsa(dataset: str,
cache_dir: Optional[Path] = None,
original: bool = True
) -> TargetTextCollection:
'''
The data for this function when downloaded is stored within:
`Path(cache_dir, 'Jiang 2019 MAMS ATSA')`
:NOTE: As each sentence/`TargetText` object has to have a `text_id`,
and no IDs exist in this dataset, the IDs are created from the
position of the sentence in the dataset, prefixed with the
dataset split, e.g. the first sentence/`TargetText` object in
the `train` dataset has the id `train$0`
:param dataset: Either `train`, `val` or `test`, determines the dataset that
is returned.
:param cache_dir: The directory where all of the data is stored for
this code base. If None then the cache directory is
`dataset_parsers.CACHE_DIRECTORY`
:param original: This does not affect `val` or `test`. If True then it will
download the original training data from the `original paper
<https://www.aclweb.org/anthology/D19-1654.pdf>`_. Else
it will download the cleaned training dataset version. The
cleaned version only differs in a few samples, and these
differences relate to overlapping targets. See this `notebook
for the full differences
<https://github.com/apmoore1/target-extraction/blob/master/tutorials/Difference_between_MAMS_ATSA_original_and_MAMS_ATSA_cleaned.ipynb>`_.
:returns: The `train`, `val`, or `test` dataset from the
Multi-Aspect-Multi-Sentiment dataset (MAMS) ATSA version.
Dataset came from the `A Challenge Dataset and Effective Models
for Aspect-Based Sentiment Analysis, EMNLP 2019
<https://www.aclweb.org/anthology/D19-1654.pdf>`_
:raises ValueError: If the `dataset` value is not `train`, `val`, or `test`
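
:Example: A minimal usage sketch. The data is downloaded on the first
call and cached under the default cache directory.

>>> from target_extraction.dataset_parsers import multi_aspect_multi_sentiment_atsa
>>> train = multi_aspect_multi_sentiment_atsa('train')
>>> val = multi_aspect_multi_sentiment_atsa('val')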
'''
accepted_datasets = {'train', 'val', 'test'}
if dataset not in accepted_datasets:
raise ValueError('dataset has to be one of these values '
f'{accepted_datasets}, not {dataset}')
if cache_dir is None:
cache_dir = CACHE_DIRECTORY
data_folder = Path(cache_dir, 'Jiang 2019 MAMS ATSA')
data_folder.mkdir(parents=True, exist_ok=True)
dataset_url = {'train': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ATSA/raw/train.xml',
'val': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ATSA/raw/val.xml',
'test': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ATSA/raw/test.xml'}
url = dataset_url[dataset]
if dataset == 'train' and not original:
url = 'https://raw.githubusercontent.com/apmoore1/target-extraction/master/data/MAMS/MAMS_ATSA_cleaned_train.xml'
data_fp = Path(cached_path(url, cache_dir=data_folder))
# Parsing the data
target_text_collection = TargetTextCollection()
tree = ET.parse(data_fp)
sentences = tree.getroot()
for sentence_id, sentence in enumerate(sentences):
targets: List[str] = []
target_sentiments: List[Union[str, int]] = []
spans: List[Span] = []
for data in sentence:
if data.tag == 'text':
text = data.text
text = text.replace(u'\xa0', u' ')
elif data.tag == 'aspectTerms':
for target in data:
target_sentiment = target.attrib['polarity']
target_sentiments.append(target_sentiment)
targets.append(target.attrib['term'].replace(u'\xa0', u' '))
span_from = int(target.attrib['from'])
span_to = int(target.attrib['to'])
spans.append(Span(span_from, span_to))
else:
raise ValueError(f'This tag {data.tag} should not occur '
'within a sentence tag')
target_text_kwargs = {'targets': targets, 'spans': spans,
'text_id': f'{dataset}${str(sentence_id)}',
'target_sentiments': target_sentiments,
'categories': None, 'text': text,
'category_sentiments': None}
for key in target_text_kwargs:
if not target_text_kwargs[key]:
target_text_kwargs[key] = None
target_text = TargetText(**target_text_kwargs)
target_text_collection.add(target_text)
return target_text_collection
def multi_aspect_multi_sentiment_acsa(dataset: str,
cache_dir: Optional[Path] = None
) -> TargetTextCollection:
'''
The data for this function when downloaded is stored within:
`Path(cache_dir, 'Jiang 2019 MAMS ACSA')`
:NOTE: As each sentence/`TargetText` object has to have a `text_id`,
and no IDs exist in this dataset, the IDs are created from the
position of the sentence in the dataset, prefixed with the
dataset split, e.g. the first sentence/`TargetText` object in
the `train` dataset has the id `train$0`
For reference this dataset has 8 different aspect categories.
:param dataset: Either `train`, `val` or `test`, determines the dataset that
is returned.
:param cache_dir: The directory where all of the data is stored for
this code base. If None then the cache directory is
`dataset_parsers.CACHE_DIRECTORY`
:returns: The `train`, `val`, or `test` dataset from the
Multi-Aspect-Multi-Sentiment dataset (MAMS) ACSA version.
Dataset came from the `A Challenge Dataset and Effective Models
for Aspect-Based Sentiment Analysis, EMNLP 2019
<https://www.aclweb.org/anthology/D19-1654.pdf>`_
:raises ValueError: If the `dataset` value is not `train`, `val`, or `test`
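
:Example: A minimal usage sketch. `dataset` selects the split, and the
downloaded file is cached after the first call.

>>> from target_extraction.dataset_parsers import multi_aspect_multi_sentiment_acsa
>>> test = multi_aspect_multi_sentiment_acsa('test')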
'''
accepted_datasets = {'train', 'val', 'test'}
if dataset not in accepted_datasets:
raise ValueError('dataset has to be one of these values '
f'{accepted_datasets}, not {dataset}')
if cache_dir is None:
cache_dir = CACHE_DIRECTORY
data_folder = Path(cache_dir, 'Jiang 2019 MAMS ACSA')
data_folder.mkdir(parents=True, exist_ok=True)
dataset_url = {'train': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ACSA/raw/train.xml',
'val': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ACSA/raw/val.xml',
'test': 'https://github.com/siat-nlp/MAMS-for-ABSA/raw/master/data/MAMS-ACSA/raw/test.xml'}
url = dataset_url[dataset]
data_fp = Path(cached_path(url, cache_dir=data_folder))
# Parsing the data
category_text_collection = TargetTextCollection()
tree = ET.parse(data_fp)
sentences = tree.getroot()
for sentence_id, sentence in enumerate(sentences):
categories: List[str] = []
category_sentiments: List[Union[str, int]] = []
for data in sentence:
if data.tag == 'text':
text = data.text
text = text.replace(u'\xa0', u' ')
elif data.tag == 'aspectCategories':
for category in data:
category_sentiment = category.attrib['polarity']
category_sentiments.append(category_sentiment)
categories.append(category.attrib['category'].replace(u'\xa0', u' '))
else:
raise ValueError(f'This tag {data.tag} should not occur '
'within a sentence tag')
category_text_kwargs = {'targets': None, 'spans': None,
'text_id': f'{dataset}${str(sentence_id)}',
'target_sentiments': None,
'categories': categories, 'text': text,
'category_sentiments': category_sentiments}
for key in category_text_kwargs:
if not category_text_kwargs[key]:
category_text_kwargs[key] = None
category_text = TargetText(**category_text_kwargs)
category_text_collection.add(category_text)
return category_text_collection