Source code for bella.models.target

'''
Module contains all of the classes that represent Machine Learning models
that are within `Vo and Zhang 2015 paper \
<https://ijcai.org/Proceedings/15/Papers/194.pdf>`_:

1. :py:class:`bella.models.target.TargetInd` -- Target Indepdent model
2. :py:class:`bella.models.target.TargetDepMinus` -- Target Dependent Minus
   model
3. :py:class:`bella.models.target.TargetDep` -- Target Dependent model
4. :py:class:`bella.models.target.TargetDepPlus` -- Target Dependent Plus model
'''

from typing import Any, List, Callable, Dict

import sklearn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

import bella
from bella.tokenisers import ark_twokenize
from bella.neural_pooling import matrix_max, matrix_min, matrix_avg
from bella.neural_pooling import matrix_median, matrix_prod, matrix_std
from bella.scikit_features.context import Context
from bella.scikit_features.lexicon_filter import LexiconFilter
from bella.scikit_features.join_context_vectors import JoinContextVectors
from bella.scikit_features.neural_pooling import NeuralPooling
from bella.scikit_features.tokeniser import ContextTokeniser
from bella.scikit_features.word_vector import ContextWordVectors
from bella.models.base import SKLearnModel


[docs]class TargetInd(SKLearnModel):
    '''
    Attributes:

    1. model -- Machine learning model. Expects it to be a
       :py:class:`sklearn.pipeline.Pipeline` instance.
    2. fitted -- If the machine learning model has been fitted (default False)
    3. model_parameters -- The parameters that are set in the machine
       learning model. E.g. Parameter could be the tokeniser used.

    Methods:

    1. fit -- Fit the model according to the given training data.
    2. predict -- Predict class labels for samples in X.
    3. probabilities -- The probability of each class label for all samples
       in X.
    4. __repr__ -- Name of the machine learning model.

    Class Methods:

    1. get_parameters -- Transform the given parameters into a dictonary
       that is accepted as model parameters.
    2. get_cv_parameters -- Transform the given parameters into a list of
       dictonaries that is accepted as `param_grid` parameter in
       :py:class:`sklearn.model_selection.GridSearchCV`
    3. name -- -- Returns the name of the model.

    Functions:

    1. save -- Given a instance of this class will save it to a file.
    2. load -- Loads an instance of this class from a file.
    3. pipeline -- Machine Learning model that is used as the base template
       for the model attribute.
    '''

[docs]    @classmethod
    def name(cls) -> str:
        return 'Target Independent'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

[docs]    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01,
                 random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # The parameters here go into the self.get_parameters method
        super().__init__(word_vectors, tokeniser, lower, C, random_state,
                         scale)

[docs]    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the model
        attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('contexts', Context('full')),
            ('tokens', ContextTokeniser()),
            ('word_vectors', ContextWordVectors()),
            ('pool_funcs', FeatureUnion([
                ('max_pipe', Pipeline([
                    ('max', NeuralPooling(matrix_max)),
                    ('join', JoinContextVectors(matrix_median))
                ])),
                ('min_pipe', Pipeline([
                    ('min', NeuralPooling(matrix_min)),
                    ('join', JoinContextVectors(matrix_median))
                ])),
                ('avg_pipe', Pipeline([
                    ('avg', NeuralPooling(matrix_avg)),
                    ('join', JoinContextVectors(matrix_median))
                ])),
                ('prod_pipe', Pipeline([
                    ('min', NeuralPooling(matrix_prod)),
                    ('join', JoinContextVectors(matrix_median))
                ])),
                ('std_pipe', Pipeline([
                    ('min', NeuralPooling(matrix_std)),
                    ('join', JoinContextVectors(matrix_median))
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

[docs]    @classmethod
    def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                                  ) -> Dict[str, Any]:
        '''
        Converts the output of :py:meth:`get_parameters` into a dictionary that 
        can be used as input into :py:meth:`get_parameters`.

        :returns: A dictonary that can be used as keyword arguments into the 
                  :py:meth:`get_parameters` method
        '''

        parameter_names = ['word_vectors__', '__tokeniser', '__lower', 'scale',
                           '__C', '__random_state']
        name_parameter = {}
        for parameter_name in parameter_names:
            for name, parameter in parameter_dict.items():
                if parameter_name in name:
                    real_parameter_name = parameter_name.strip('_')
                    name_parameter[real_parameter_name] = parameter
        return name_parameter

[docs]    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictonary that is accepted as
        model parameters

        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        params_dict = {}
        # Add word vectors to Pipeline model
        params_dict = cls._add_to_params_dict(params_dict,
                                              cls._get_word_vector_names(),
                                              word_vectors)
        # Add tokenisers to Pipeline model
        tokenisers_names = [param_name + '__tokeniser'
                            for param_name in cls._get_tokeniser_names()]
        params_dict = cls._add_to_params_dict(params_dict,
                                              tokenisers_names,
                                              tokeniser)
        # Add if the words should be lower cased
        lower_names = [param_name + '__lower'
                       for param_name in cls._get_tokeniser_names()]
        params_dict = cls._add_to_params_dict(params_dict, lower_names, lower)
        # Add how the data should be scaled before going into the SVM
        # If None then it means no scaling happens
        params_dict = cls._add_to_params_dict(params_dict, ['scale'],
                                              scale)
        # Add the C value for the SVM
        params_dict = cls._add_to_params_dict(params_dict, ['svm__C'], C)
        # Add the random state for the SVM
        params_dict = cls._add_to_params_dict(params_dict,
                                              ['svm__random_state'],
                                              random_state)
        return params_dict

[docs]    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.01], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictonaries that is
        accepted as `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of a list of word vectors e.g. [[SSWE()],
                             [SSWE(), GloveCommonCrawl()]].
        :param tokenisers: A list of tokeniser to be used e.g.
                           :py:meth:`str.split`. Default [ark_twokenize]
        :param lowers: A list of bool values which indicate whether to lower
                       case the input words. Default [True]
        :param C: A list of C values for the :py:class:`sklearn.svm.SVC`
                  estimator that is used in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      clases or None if no scaling is to be used. Default
                      [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        params_list = []
        # Word Vectors
        params_list = cls._add_to_params(params_list, word_vectors,
                                         cls._get_word_vector_names())
        # Tokeniser
        tokenisers_names = [param_name + '__tokeniser'
                            for param_name in cls._get_tokeniser_names()]
        params_list = cls._add_to_params(params_list, tokeniser,
                                         tokenisers_names)
        # Lower
        lower_names = [param_name + '__lower'
                       for param_name in cls._get_tokeniser_names()]
        params_list = cls._add_to_params(params_list, lower, lower_names)
        # C
        params_list = cls._add_to_all_params(params_list, 'svm__C', C)
        # Random State
        params_list = cls._add_to_all_params(params_list, 'svm__random_state',
                                             random_state)
        # Scale
        params_list = cls._add_to_all_params(params_list, 'scale', scale)
        return params_list

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        Method to be overidden by subclasses as each pipeline will be different
        and will have different parameter names for where the word vectors are
        set.

        :returns: A list of of parameter names where the word vectors are set
                  in the pipeline.
        '''

        return ['word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        Method to be overidden by subclasses as each pipeline will be different
        and will have different parameter names for where tokenisers are
        used.

        :returns: A list of of parameter names where the tokenisers are used
                  in the pipeline.
        '''

        return ['tokens']


[docs]class TargetDepMinus(TargetInd):

[docs]    @classmethod
    def name(cls) -> str:
        return 'Target Dependent Minus'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

[docs]    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.025,
                 random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Wether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is used
                             in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Inherit from SKLearnModel __init__ method
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, tokeniser, lower, C,
                                        random_state, scale)

[docs]    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the model
        attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

[docs]    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.025,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictonary that is accepted as
        model parameters

        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Wether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is used
                             in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        return super().get_parameters(word_vectors, tokeniser, lower, C,
                                      random_state, scale)

[docs]    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.025], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictonaries that is
        accepted as `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of a list of word vectors e.g. [[SSWE()],
                             [SSWE(), GloveCommonCrawl()]].
        :param tokenisers: A list of tokeniser to be used e.g.
                           :py:meth:`str.split`. Default [ark_twokenize]
        :param lowers: A list of bool values which indicate whether to lower
                       case the input words. Default [True]
        :param C: A list of C values for the :py:class:`sklearn.svm.SVC`
                  estimator that is used in the pipeline. Default [0.025]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      clases or None if no scaling is to be used. Default
                      [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        return super().get_cv_parameters(word_vectors, tokeniser, lower, C,
                                         random_state, scale)

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the word vectors are set
        in the pipeline.
        '''

        return ['union__left__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__target__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the tokenisers are set
        in the pipeline.
        '''

        return ['union__left__tokens',
                'union__right__tokens',
                'union__target__tokens']


[docs]class TargetDep(TargetInd):
    '''
    Target-dep model from `Vo and Zhang 2015 paper
    <https://ijcai.org/Proceedings/15/Papers/194.pdf>`_.
    '''

[docs]    @classmethod
    def name(cls) -> str:
        return 'Target Dependent'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

[docs]    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01,
                 random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Wether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Inherit from SKLearnModel __init__ method
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, tokeniser, lower, C,
                                        random_state, scale)

[docs]    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the model
        attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('full', Pipeline([
                    ('contexts', Context('full')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

[docs]    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictonary that is accepted as
        model parameters

        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Wether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        return super().get_parameters(word_vectors, tokeniser, lower, C,
                                      random_state, scale)

[docs]    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.01], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictonaries that is
        accepted as `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of a list of word vectors e.g. [[SSWE()],
                             [SSWE(), GloveCommonCrawl()]].
        :param tokenisers: A list of tokeniser to be used e.g.
                           :py:meth:`str.split`. Default [ark_twokenize]
        :param lowers: A list of bool values which indicate whether to lower
                       case the input words. Default [True]
        :param C: A list of C values for the :py:class:`sklearn.svm.SVC`
                  estimator that is used in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      clases or None if no scaling is to be used. Default
                      [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        return super().get_cv_parameters(word_vectors, tokeniser, lower, C,
                                         random_state, scale)

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the word vectors are set
        in the pipeline.
        '''

        return ['union__left__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__target__word_vectors__vectors',
                'union__full__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the tokenisers are set
        in the pipeline.
        '''

        return ['union__left__tokens',
                'union__right__tokens',
                'union__target__tokens',
                'union__full__tokens']


[docs]class TargetDepPlus(TargetInd):

[docs]    @classmethod
    def name(cls) -> str:
        return 'Target Dependent Plus'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

[docs]    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 senti_lexicon: 'bella.lexicons.Lexicon',
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01,
                 random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param senti_lexicon: Sentiment Lexicon to be used for the Left and
                              Right sentiment context (LS and RS).
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is used
                             in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Inherit from SKLearnModel __init__ method
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, senti_lexicon, tokeniser,
                                        lower, C, random_state, scale)

[docs]    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the model
        attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('left_s', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('filter', LexiconFilter()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('right_s', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('filter', LexiconFilter()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ])),
                ('full', Pipeline([
                    ('contexts', Context('full')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('prod_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))
                        ])),
                        ('std_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))
                        ]))
                    ]))
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

[docs]    @classmethod
    def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                                  ) -> Dict[str, Any]:
        '''
        Converts the output of :py:meth:`get_parameters` into a dictionary that 
        can be used as input into :py:meth:`get_parameters`.

        :returns: A dictonary that can be used as keyword arguments into the 
                  :py:meth:`get_parameters` method
        '''

        parameter_names = ['word_vectors__', '__tokeniser', '__lower', 'scale',
                           '__C', '__random_state', '__lexicon']
        name_parameter = {}
        for parameter_name in parameter_names:
            for name, parameter in parameter_dict.items():
                if parameter_name in name:
                    real_parameter_name = parameter_name.strip('_')
                    if parameter_name == '__lexicon':
                        real_parameter_name = 'senti_lexicon'
                    name_parameter[real_parameter_name] = parameter
        return name_parameter

[docs]    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       senti_lexicon: 'bella.lexicons.Lexicon',
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictonary that is accepted as
        model parameters

        :param word_vectors: A list of one or more word vectors to be used as
                             feature vector lookups. If more than one is used
                             the word vectors are concatenated together to
                             create a the feature vector for each word.
        :param senti_lexicon: Sentiment Lexicon to be used for the Left and
                              Right sentiment context (LS and RS).
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.SVC` estimator
                  that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.SVC` estimator that is used
                             in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        params_dict = super().get_parameters(word_vectors, tokeniser, lower, C,
                                             random_state, scale)
        params_dict = cls._add_to_params_dict(params_dict,
                                              cls._get_word_senti_names(),
                                              senti_lexicon)
        return params_dict

[docs]    @classmethod
    def get_cv_parameters(cls,
                          word_vectors: List[List['bella.word_vectors\
                                                        .WordVectors']],
                          senti_lexicon: List['bella.lexicons.Lexicon'],
                          tokeniser=[ark_twokenize],
                          lower=[True], C=[0.01], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictonaries that is
        accepted as `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of a list of word vectors e.g. [[SSWE()],
                             [SSWE(), GloveCommonCrawl()]].
        :param senti_lexicon: A list of Sentiment Lexicons to be explored for
                              the Left and Right sentiment context (LS and RS).
                              Default None, use the sentiment lexicons already
                              within the model.
        :param tokenisers: A list of tokeniser to be used e.g.
                           :py:meth:`str.split`. Default [ark_twokenize]
        :param lowers: A list of bool values which indicate whether to lower
                       case the input words. Default [True]
        :param C: A list of C values for the :py:class:`sklearn.svm.SVC`
                  estimator that is used in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.SVC` estimator that is
                             used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      clases or None if no scaling is to be used. Default
                      [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        params_list = super().get_cv_parameters(word_vectors, tokeniser,
                                                lower, C, random_state, scale)
        params_list = cls._add_to_params(params_list, senti_lexicon,
                                         cls._get_word_senti_names())
        return params_list

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the word vectors are set
        in the pipeline.
        '''

        return ['union__left__word_vectors__vectors',
                'union__left_s__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__right_s__word_vectors__vectors',
                'union__target__word_vectors__vectors',
                'union__full__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the tokenisers are set
        in the pipeline.
        '''

        return ['union__left__tokens',
                'union__left_s__tokens',
                'union__right__tokens',
                'union__right_s__tokens',
                'union__target__tokens',
                'union__full__tokens']

    @staticmethod
    def _get_word_senti_names() -> List[str]:
        '''
        :returns: A list of of parameter names where the sentiment lexicons
        are set in the pipeline.
        '''

        return ['union__left_s__filter__lexicon',
                'union__right_s__filter__lexicon']