'''
A set of functions which either produce contexts and related targets based
on syntactic parsing, or normalise targets and text so that the targets can
be identified within the parsed text.
'''
import re
from bella.data_types import Target
from bella.dependency_parsers import stanford
def normalise_target(target, text, sorted_target_spans, renormalise=False,
                     parser=None):
    '''
    Normalises a target word so that it can be found as a single token in
    the text once the text has been dependency parsed.

    Multi-word targets are joined with `_` (or fully concatenated with
    special `#`/`@` handling when `renormalise` is True) and the normalised
    target is substituted into the text at each of the given spans. If the
    normalised target then matches more words in the text than there are
    spans, it is made unique by stripping `@`/`#` characters or by wrapping
    it in `$` (or `<...>` for the stanford parser).

    :param target: the target word(s) to normalise.
    :param text: the text that the target spans index into.
    :param sorted_target_spans: (start, end) character offsets of each
        occurrence of the target, sorted by start offset in reverse order.
    :param renormalise: use the stricter second-pass normalisation.
    :param parser: the dependency parser function the text is destined for;
        the stanford parser requires extra character stripping.
    :type target: String
    :type text: String
    :type sorted_target_spans: list
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: the normalised target, which occurs in the normalised text
        exactly as many times as it has spans.
    :rtype: String
    :raises ValueError: if no unique normalisation could be found.
    '''
    def add_target_to_text(target):
        # Substitute `target` into the text at every span, padded with
        # spaces so it cannot join onto neighbouring words, then collapse
        # the whitespace again.
        target_added_text = text
        for start, end in sorted_target_spans:
            start_text = target_added_text[: start]
            end_text = target_added_text[end:]
            start_text += ' {} '.format(target)
            target_added_text = start_text + end_text
        return ' '.join(target_added_text.split())

    def check_target_unique(target, text, num_target_spans):
        # True when the target matches the text exactly as many times as it
        # has spans i.e. there are no accidental matches on other words.
        escaped_target = re.escape(target)
        num_target_occurences = len(re.findall(escaped_target, text))
        return num_target_occurences == num_target_spans

    norm_target = target.strip()
    norm_text = text
    num_target_spans = len(sorted_target_spans)
    split_target = target.split()
    num_spaces_in_target = len(split_target)
    # Converts words such as `LG Flat Screen` into `LGFlatScreen`
    # (renormalise) or `LG_FlatScreen`. This is done as the parser won't
    # keep a word as a word if it is `LG_Flat_Screen`
    if renormalise:
        norm_target = ''.join(split_target)
        if norm_target[0] == '#' or norm_target[0] == '@':
            # Keep only the leading `#` or `@` and drop any repeated ones
            # later in the word.
            if '#' in norm_target[1:]:
                norm_target = norm_target.replace('#', '')
                norm_target = '#{}'.format(norm_target)
            elif '@' in norm_target[1:]:
                norm_target = norm_target.replace('@', '')
                norm_target = '@{}'.format(norm_target)
        if parser == stanford:
            # presumably the stanford tokeniser splits on `@` and on a
            # trailing `#` -- TODO confirm against the parser
            norm_target = norm_target.replace('@', '')
            if norm_target[-1] == '#':
                norm_target = norm_target.replace('#', '')
    elif num_spaces_in_target > 2:
        first_word = split_target[0]
        joined_rest_words = ''.join(split_target[1:])
        norm_target = '_'.join([first_word, joined_rest_words])
    # Keeps the word as a whole word.
    elif num_spaces_in_target == 2:
        temp_split_target = []
        for word_index, split_word in enumerate(split_target):
            if word_index != 0:
                split_word = split_word.replace('#', '')
            temp_split_target.append(split_word)
        norm_target = '_'.join(temp_split_target)
    # Gets rid of anything that is not a word, @ or #
    norm_target = re.sub(r'[^\w@#]', '', norm_target)
    # Put the normalised targets into the text
    norm_text = add_target_to_text(norm_target)
    # Checks whether a word that is the normalised target word already
    # exists in the text and if so changes it to a word that does not by
    # stripping special characters or putting dollar signs around it.
    if not check_target_unique(norm_target, norm_text, num_target_spans):
        if parser == stanford:
            norm_target = norm_target.replace('@', '')
            norm_target = norm_target.replace('#', '')
        elif '@' in norm_target:
            norm_target = norm_target.replace('@', '')
        else:
            if renormalise and len(split_target) == 1:
                if norm_target[0] == '#':
                    norm_target = '${}$'.format(norm_target[1:])
            else:
                norm_target = '${}$'.format(norm_target)
        norm_text = add_target_to_text(norm_target)
        if not check_target_unique(norm_target, norm_text, num_target_spans):
            # Last resort: wrap the target in characters that cannot occur
            # in a normal token.
            if parser == stanford:
                norm_target = '<{}>'.format(norm_target)
                norm_text = add_target_to_text(norm_target)
                if check_target_unique(norm_target,
                                       norm_text, num_target_spans):
                    return norm_target
            elif '$' not in norm_target:
                norm_target = '${}$'.format(norm_target)
                norm_text = add_target_to_text(norm_target)
                if check_target_unique(norm_target,
                                       norm_text, num_target_spans):
                    return norm_target
            raise ValueError('Normalised word {} occurs in the text more times '
                             'than it spans {}. Text {}'
                             .format(norm_target, num_target_spans, norm_text))
    return norm_target
def target_normalisation(target_dict, renormalise=False, parser=None):
    '''
    Given a target instance it normalises the target by removing whitespaces
    between target words and inserting `_`. Then inserting the normalised word
    into where the target spans appear and adding whitespace around the target
    word incase other words are joined on. Returns the text with the inserted
    normalised words and the normalised target word.

    :param target_dict: target instance.
    :param renormalise: whether to apply the stricter second-pass
        normalisation in `normalise_target`.
    :param parser: the dependency parser function the text is destined for.
    :type target_dict: Target
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: Tuple of two Strings containing. Text with normalised targets and \
    the normalised target
    :rtype: tuple
    '''
    # Spans are processed in reverse start-offset order so that replacing a
    # later span does not shift the character offsets of an earlier one.
    sorted_spans = sorted(target_dict['spans'], key=lambda span: span[0],
                          reverse=True)
    org_text = target_dict['text']
    target = target_dict['target']
    target = normalise_target(target, org_text, sorted_spans, renormalise,
                              parser=parser)
    # Substitute the normalised target into the text at each span, padded
    # with whitespace so it cannot join onto neighbouring words.
    for start_index, end_index in sorted_spans:
        start_text = org_text[: start_index]
        end_text = org_text[end_index:]
        start_text += ' {} '.format(target)
        org_text = start_text + end_text
    # Collapse the extra whitespace introduced by the padding.
    org_text = ' '.join(org_text.split())
    return org_text, target
def normalise_context(target_dicts, lower, renormalise=False, parser=None):
    '''
    Given a list of target dicts and if the text should be lower cased returns
    all of the text and targets within those target dicts as lists where the
    text and targets have been normalised to ensure the targets within the
    text can be identified.

    :param target_dicts: list of dicts
    :param lower: state if the text within the dicts should be lower cased
    :param renormalise: whether to apply the stricter second-pass
        normalisation in `normalise_target`.
    :param parser: the dependency parser function the texts are destined for.
    :type target_dicts: list
    :type lower: bool
    :type renormalise: bool. Default False
    :type parser: function. Default None
    :returns: A tuple of length two which contains a list of normalised texts \
    and a list of normalised targets.
    :rtype: tuple
    '''
    # Normalise the target and text
    all_text = []
    all_norm_targets = []
    for target_dict in target_dicts:
        norm_text, norm_target = target_normalisation(target_dict, renormalise,
                                                      parser=parser)
        if lower:
            norm_text = norm_text.lower()
            norm_target = norm_target.lower()
        all_text.append(norm_text)
        all_norm_targets.append(norm_target)
    return all_text, all_norm_targets
def dependency_relation_context(target_dicts, parser, lower=False,
                                n_relations=(1, 1)):
    '''
    Given a list of target dicts where each target dict has a sentence that
    contains one or more of the same target. Returns a list of a list of Strings
    where each String is associated to a target within the target sentence.
    The String is a concatenation of n_relations depth of dependency relations
    where each relation is a child of the target. e.g. n_relations = (1, 1)
    will return a String of the concatenation of the children of the target
    within the dependency tree. n_relations = (1, 2) will return the children
    of the target and the children of the children.

    :param target_dicts: list of dictionaries where each dictionary is associated \
    to a target sentence.
    :param parser: function that performs dependency parsing
    :param lower: Whether to lower case the text
    :param n_relations: The depth of the dependency relation text from the target \
    to return. Represented as a tuple of two ints the first defining the \
    starting depth the second end depth e.g. (1, 2) will return depths one and \
    two of dependency tree.
    :type target_dicts: list
    :type parser: function
    :type lower: bool. Default False.
    :type n_relations: tuple. Default (1, 1).
    :returns: A list of a list of Strings where each String represents a specific \
    target word within a target sentence dependency related text at n_relations \
    depth.
    :rtype: list
    :raises ValueError: if, even after re-normalisation, the number of
        targets found in the parse does not match the number of spans.
    '''
    # Normalise the target and text
    targets = [target_dict['target'] for target_dict in target_dicts]
    norm_texts, norm_targets = normalise_context(target_dicts, lower,
                                                 parser=parser)
    # Get contexts
    all_dependency_tokens = parser(norm_texts)
    all_contexts = []
    for index, dependency_tokens in enumerate(all_dependency_tokens):
        for attempts in range(1, 3):
            contexts = []
            norm_target = norm_targets[index]
            # This only happens if the first normalisation does not work
            if attempts == 2:
                text, norm_target = normalise_context([target_dicts[index]],
                                                      lower=lower, parser=parser,
                                                      renormalise=True)
                norm_target = norm_target[0]
                dependency_tokens = parser(text)[0]
            for dependency_token in dependency_tokens:
                current_token = targets[index]
                if lower:
                    current_token = current_token.lower()
                if dependency_token.token == norm_target:
                    # Collect the dependency related words and map the
                    # original target surface form back to the normalised one.
                    all_related_words = dependency_token.get_n_relations(n_relations)
                    related_text = ' '.join(all_related_words)
                    related_text = related_text.replace(current_token, norm_target)
                    contexts.append(related_text)
            rel_target = target_dicts[index]
            valid_num_targets = len(rel_target['spans'])
            if valid_num_targets != len(contexts):
                # First failure: retry with the stricter re-normalisation.
                if attempts == 1:
                    continue
                raise ValueError('The number of identified targets `{}` not equal '\
                                 'to the number of targets in the data `{}`'\
                                 .format(contexts, rel_target))
            # Ensure the returned data type is consistent
            if contexts == []:
                raise ValueError('This should not happen as each data type should '\
                                 'have a target {}'.format(rel_target))
            all_contexts.append(contexts)
            break
    return all_contexts
def dependency_context(target_dicts, parser, lower=False):
    '''
    Given a list of target dicts it will normalise the target word to ensure
    that it is seperated and if it is a multi word target join the target words
    together to ensure when it is processed by the dependency parser it is treated
    as a singular word.

    Given a list of target sentences returns a list of contexts where each contexts
    is associated to a target sentence and each contexts contains a target context
    for each target word in the target sentence. A target context is a dict which
    contains `text` and `span` keys where the values correspond to all the
    dependency related words as a String and the span are the indexs to the target
    word within the text.

    :param target_dicts: list of dictionaries where each dictionary is associated \
    to a target sentence.
    :param parser: function that performs dependency parsing
    :param lower: Whether to lower case the texts before processing them with the \
    parser.
    :type target_dicts: list
    :type parser: function
    :type lower: bool Default False
    :returns: A list of a list of dicts where each list is contains many contexts.
    :rtype: list
    :raises ValueError: if, even after re-normalisation, the number of
        targets found in the parse does not match the number of spans.
    '''
    # Normalise the target and text
    targets = [target_dict['target'] for target_dict in target_dicts]
    norm_texts, norm_targets = normalise_context(target_dicts, lower,
                                                 parser=parser)
    # Get contexts
    all_dependency_tokens = parser(norm_texts)
    all_contexts = []
    for index, dependency_tokens in enumerate(all_dependency_tokens):
        for attempts in range(1, 3):
            contexts = []
            norm_target = norm_targets[index]
            # This only happens if the first normalisation does not work
            if attempts == 2:
                text, norm_target = normalise_context([target_dicts[index]],
                                                      lower=lower, parser=parser,
                                                      renormalise=True)
                norm_target = norm_target[0]
                dependency_tokens = parser(text)[0]
            for dependency_token in dependency_tokens:
                current_target = targets[index]
                if lower:
                    current_target = current_target.lower()
                if dependency_token.token == norm_target:
                    # Pass the (normalised, original) pair so that the
                    # connected text can be de-normalised to the original
                    # target surface form.
                    norm_org = (norm_target, current_target)
                    text_span = dependency_token\
                                .connected_target_span(renormalise=norm_org)
                    connected_text, target_span = text_span
                    contexts.append({'text' : connected_text,
                                     'span' : target_span})
            rel_target = target_dicts[index]
            valid_num_targets = len(rel_target['spans'])
            if valid_num_targets != len(contexts):
                # First failure: retry with the stricter re-normalisation.
                if attempts == 1:
                    continue
                raise ValueError('The number of identified targets `{}` not equal '\
                                 'to the number of targets in the data `{}`'\
                                 ' norm target {}'\
                                 .format(contexts, rel_target, norm_target))
            # Ensure the returned data type is consistent
            if contexts == []:
                raise ValueError('This should not happen as each data type should '\
                                 'have a target {}'.format(rel_target))
            all_contexts.append(contexts)
            break
    return all_contexts
def context(all_context_dicts, specific_context, inc_target=False):
    '''
    Returns a list of a list of Strings based on the location of the target word
    in the text within the target dict (NOTE the target word can occur more than
    once hence why a list is returned as the context is returned for each
    occurence). Context can be one of the following:

    1. left - left of the target occurence.
    2. right - right of the target occurence.
    3. target - target word/words of each target occurence.
    4. full - whole text repeated for each occurence.

    If the target only occur once in the text then for that text occurence the
    length of the list will be one.

    :param all_context_dicts: A list of dicts where each dict contains text and \
    span keys.
    :param specific_context: String specifying the context e.g. left.
    :param inc_target: Whether to include the target word in the context text. \
    (Only applies for left and right context.)
    :type all_context_dicts: list
    :type specific_context: String
    :type inc_target: Boolean Default False
    :returns: A list of of a list of context strings
    :rtype: list
    :raises ValueError: if specific_context is not one of the acceptable
        context names.
    '''
    acceptable_contexts = {'left', 'right', 'target', 'full'}
    if specific_context not in acceptable_contexts:
        # BUG FIX: the message previously formatted the `context` function
        # object itself instead of the offending `specific_context` value.
        raise ValueError('context parameter can only be one of the following {}'\
                         ' not {}'.format(acceptable_contexts, specific_context))
    all_contexts = []
    for context_dicts in all_context_dicts:
        contexts = []
        for context_dict in context_dicts:
            text = context_dict['text']
            target_span = context_dict['span']
            start_char = target_span[0]
            end_char = target_span[1]
            if specific_context == 'left':
                if inc_target:
                    contexts.append(text[:end_char])
                else:
                    contexts.append(text[:start_char])
            elif specific_context == 'right':
                if inc_target:
                    contexts.append(text[start_char:])
                else:
                    contexts.append(text[end_char:])
            elif specific_context == 'target':
                contexts.append(text[start_char:end_char])
            elif specific_context == 'full':
                contexts.append(text)
            else:
                raise ValueError('context parameter should only be `right` or '\
                                 '`left` not {} there must be a logic error'\
                                 .format(specific_context))
        all_contexts.append(contexts)
    return all_contexts