# Source code for bella.scikit_features.word_vector

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

from bella.word_vectors import WordVectors

class ContextWordVectors(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn style transformer that replaces every token of every
    context with its word vector and stacks the vectors of one context into
    a single matrix.

    The transformer is stateless: :py:meth:`fit` learns nothing and
    :py:meth:`transform` only reads ``self.vectors`` and ``self.zero_token``.
    '''

    def __init__(self, vectors=None, zero_token='$$$ZERO_TOKEN$$$'):
        '''
        :param vectors: Iterable of word vector objects. Each object must \
        provide a ``lookup_vector(token)`` method (presumably \
        :py:class:`bella.word_vectors.WordVectors` instances -- confirm \
        against the callers). It has to be set before calling \
        :py:meth:`transform`.
        :param zero_token: Token that is looked up in place of the tokens of \
        a context that contains no tokens at all.
        :type vectors: list
        :type zero_token: String
        '''
        # Parameters are stored untouched, no validation is performed here.
        self.vectors = vectors
        self.zero_token = zero_token

    def fit(self, context_tokens, y=None):
        '''
        Kept for consistency with the TransformerMixin. Nothing is learnt
        from the data.

        :param context_tokens: Ignored.
        :param y: Ignored.
        :returns: The transformer itself.
        :rtype: ContextWordVectors
        '''

        return self

    def fit_transform(self, context_tokens, y=None):
        '''
        Same as :py:meth:`transform`; as fitting does nothing it is skipped.

        :param context_tokens: See ``contexts_tokens`` of \
        :py:meth:`transform`.
        :param y: Ignored.
        :returns: See :py:meth:`transform`.
        :rtype: list
        '''

        return self.transform(context_tokens)

    def _token_vector(self, token):
        '''
        Looks the token up in every word vector object of ``self.vectors``
        and concatenates the results, in the order of ``self.vectors``, into
        one vector.

        :param token: The token to look up.
        :type token: String
        :returns: The concatenation of the vectors found for the token.
        :rtype: numpy.ndarray
        '''

        return np.hstack([word_vector.lookup_vector(token)
                          for word_vector in self.vectors])

    def transform(self, contexts_tokens):
        '''
        Given a list of contexts (either right, left or target) which are made
        up of lists of tokens return the tokens as a word vector matrix.

        The word vector matrix is a word vector for each token but instead of
        storing in a list it is stored in a numpy.ndarray that has one row
        per token, e.g. assuming ``lookup_vector`` returns 1-D vectors the
        shape is: (number of tokens, length of word vector).

        When more than one word vector object is given in ``self.vectors``
        the vectors found for a token are concatenated.

        A context without any tokens is represented by a matrix with a
        single row: the vector returned when looking up ``self.zero_token``.

        Example of the input
        [[['context', 'one'], ['context', 'two']], [['another context']]]

        :param contexts_tokens: A list of data of which each data contains a list \
        of contexts which contains a list of tokens.
        :type contexts_tokens: list
        :returns: The same list but with word vectors as numpy.ndarray instead \
        of tokens which are Strings
        :rtype: list
        '''
        context_word_vectors = []
        for context_tokens in contexts_tokens:
            all_contexts = []
            for context in context_tokens:
                token_vectors = [self._token_vector(token)
                                 for token in context]
                # Padding: an empty context still has to produce a matrix.
                if not token_vectors:
                    token_vectors.append(self._token_vector(self.zero_token))
                all_contexts.append(self.list_to_matrix(token_vectors))
            context_word_vectors.append(all_contexts)
        return context_word_vectors

    @staticmethod
    def list_to_matrix(word_vector_list):
        '''
        Converts a list of numpy.ndarrays (vectors) into a numpy.ndarray
        (matrix) where row i of the matrix is vector i of the list.

        :param word_vector_list: list of numpy.ndarray
        :type word_vector_list: list
        :returns: a matrix of the numpy.ndarray
        :rtype: numpy.ndarray
        :raises ValueError: If the number of rows in the matrix is not the \
        number of vectors in the list.
        '''

        num_rows = len(word_vector_list)
        matrix = np.asarray(word_vector_list)
        # Sanity check that each vector ended up as exactly one row.
        if matrix.shape[0] != num_rows:
            raise ValueError('The matrix row should equal the number of tokens')
        return matrix