
'''
Module containing all of the classes that represent Machine Learning models
from the `Vo and Zhang 2015 paper \
<https://ijcai.org/Proceedings/15/Papers/194.pdf>`_:

1. :py:class:`bella.models.target.TargetInd` -- Target Independent model
2. :py:class:`bella.models.target.TargetDepMinus` -- Target Dependent Minus
   model
3. :py:class:`bella.models.target.TargetDep` -- Target Dependent model
4. :py:class:`bella.models.target.TargetDepPlus` -- Target Dependent Plus model
'''

from typing import Any, List, Callable, Dict

import sklearn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

import bella
from bella.tokenisers import ark_twokenize
from bella.neural_pooling import matrix_max, matrix_min, matrix_avg
from bella.neural_pooling import matrix_median, matrix_prod, matrix_std
from bella.scikit_features.context import Context
from bella.scikit_features.lexicon_filter import LexiconFilter
from bella.scikit_features.join_context_vectors import JoinContextVectors
from bella.scikit_features.neural_pooling import NeuralPooling
from bella.scikit_features.tokeniser import ContextTokeniser
from bella.scikit_features.word_vector import ContextWordVectors
from bella.models.base import SKLearnModel


def _pooling_union() -> FeatureUnion:
    '''
    The neural pooling features that all of the models share: max, min,
    average, product and standard deviation pooling over each word matrix,
    where each pooled context is joined together by the median.

    :returns: A new :py:class:`sklearn.pipeline.FeatureUnion` of the
              pooling functions.
    '''
    return FeatureUnion([
        ('max_pipe', Pipeline([
            ('max', NeuralPooling(matrix_max)),
            ('join', JoinContextVectors(matrix_median))
        ])),
        ('min_pipe', Pipeline([
            ('min', NeuralPooling(matrix_min)),
            ('join', JoinContextVectors(matrix_median))
        ])),
        ('avg_pipe', Pipeline([
            ('avg', NeuralPooling(matrix_avg)),
            ('join', JoinContextVectors(matrix_median))
        ])),
        ('prod_pipe', Pipeline([
            ('prod', NeuralPooling(matrix_prod)),
            ('join', JoinContextVectors(matrix_median))
        ])),
        ('std_pipe', Pipeline([
            ('std', NeuralPooling(matrix_std)),
            ('join', JoinContextVectors(matrix_median))
        ]))
    ])


class TargetInd(SKLearnModel):
    '''
    Target Independent model from the `Vo and Zhang 2015 paper \
    <https://ijcai.org/Proceedings/15/Papers/194.pdf>`_.

    Attributes:

    1. model -- Machine learning model. Expects it to be a
       :py:class:`sklearn.pipeline.Pipeline` instance.
    2. fitted -- If the machine learning model has been fitted (default
       False)
    3. model_parameters -- The parameters that are set in the machine
       learning model. E.g. Parameter could be the tokeniser used.

    Methods:

    1. fit -- Fit the model according to the given training data.
    2. predict -- Predict class labels for samples in X.
    3. probabilities -- The probability of each class label for all samples
       in X.
    4. __repr__ -- Name of the machine learning model.

    Class Methods:

    1. get_parameters -- Transform the given parameters into a dictionary
       that is accepted as model parameters.
    2. get_cv_parameters -- Transform the given parameters into a list of
       dictionaries that is accepted as the `param_grid` parameter in
       :py:class:`sklearn.model_selection.GridSearchCV`
    3. name -- Returns the name of the model.

    Functions:

    1. save -- Given an instance of this class will save it to a file.
    2. load -- Loads an instance of this class from a file.
    3. pipeline -- Machine Learning model that is used as the base template
       for the model attribute.
    '''
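
    # A minimal usage sketch (an assumption for illustration: ``SSWE`` is a
    # word-vector class from ``bella.word_vectors`` and ``train_data`` /
    # ``train_labels`` are in the format that ``fit`` expects):
    #
    #     from bella.word_vectors import SSWE
    #     model = TargetInd(word_vectors=[SSWE()])
    #     model.fit(train_data, train_labels)
    #     predictions = model.predict(test_data)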

    @classmethod
    def name(cls) -> str:
        return 'Target Independent'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01, random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # The parameters here go into the self.get_parameters method
        super().__init__(word_vectors, tokeniser, lower, C, random_state,
                         scale)

    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the
        model attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('contexts', Context('full')),
            ('tokens', ContextTokeniser()),
            ('word_vectors', ContextWordVectors()),
            ('pool_funcs', _pooling_union()),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])
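
    # The step names in the pipeline above are what the keys returned by
    # :py:meth:`get_parameters` refer to, e.g. ``svm__C`` sets ``C`` on the
    # ``('svm', LinearSVC())`` step. All of the settable names can be
    # listed with ``TargetInd.pipeline().get_params().keys()``.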

    @classmethod
    def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                                  ) -> Dict[str, Any]:
        '''
        Converts a dictionary of pipeline parameters, as returned by
        :py:meth:`get_parameters`, back into a dictionary of keyword
        arguments that :py:meth:`get_parameters` accepts.

        :returns: A dictionary that can be used as keyword arguments to the
                  :py:meth:`get_parameters` method
        '''
        parameter_names = ['word_vectors__', '__tokeniser', '__lower',
                           'scale', '__C', '__random_state']
        name_parameter = {}
        for parameter_name in parameter_names:
            for name, parameter in parameter_dict.items():
                if parameter_name in name:
                    # e.g. 'svm__C' contains '__C' and becomes the
                    # keyword 'C'
                    real_parameter_name = parameter_name.strip('_')
                    name_parameter[real_parameter_name] = parameter
        return name_parameter
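
    # A sketch of the mapping this performs, where ``sswe`` stands in for a
    # word-vector instance:
    #
    #     {'word_vectors__vectors': [sswe], 'tokens__tokeniser': str.split,
    #      'tokens__lower': True, 'scale': MinMaxScaler(), 'svm__C': 0.01,
    #      'svm__random_state': 42}
    #
    # is normalised back to the keyword arguments of ``get_parameters``:
    #
    #     {'word_vectors': [sswe], 'tokeniser': str.split, 'lower': True,
    #      'scale': MinMaxScaler(), 'C': 0.01, 'random_state': 42}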

    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictionary that is accepted as
        model parameters.

        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        params_dict = {}
        # Add word vectors to Pipeline model
        params_dict = cls._add_to_params_dict(params_dict,
                                              cls._get_word_vector_names(),
                                              word_vectors)
        # Add tokenisers to Pipeline model
        tokenisers_names = [param_name + '__tokeniser'
                            for param_name in cls._get_tokeniser_names()]
        params_dict = cls._add_to_params_dict(params_dict, tokenisers_names,
                                              tokeniser)
        # Add if the words should be lower cased
        lower_names = [param_name + '__lower'
                       for param_name in cls._get_tokeniser_names()]
        params_dict = cls._add_to_params_dict(params_dict, lower_names,
                                              lower)
        # Add how the data should be scaled before going into the SVM.
        # If None then no scaling happens.
        params_dict = cls._add_to_params_dict(params_dict, ['scale'], scale)
        # Add the C value for the SVM
        params_dict = cls._add_to_params_dict(params_dict, ['svm__C'], C)
        # Add the random state for the SVM
        params_dict = cls._add_to_params_dict(params_dict,
                                              ['svm__random_state'],
                                              random_state)
        return params_dict
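
    # For example, assuming ``sswe`` is a word-vector instance, the default
    # arguments produce (a sketch):
    #
    #     TargetInd.get_parameters(word_vectors=[sswe])
    #     # {'word_vectors__vectors': [sswe],
    #     #  'tokens__tokeniser': ark_twokenize, 'tokens__lower': True,
    #     #  'scale': MinMaxScaler(), 'svm__C': 0.01,
    #     #  'svm__random_state': 42}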

    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.01], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictionaries that is
        accepted as the `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of lists of word vectors e.g.
                             [[SSWE()], [SSWE(), GloveCommonCrawl()]].
        :param tokeniser: A list of tokenisers to be used e.g.
                          :py:meth:`str.split`. Default [ark_twokenize]
        :param lower: A list of bool values which indicate whether to lower
                      case the input words. Default [True]
        :param C: A list of C values for the
                  :py:class:`sklearn.svm.LinearSVC` estimator that is used
                  in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      classes or None if no scaling is to be used.
                      Default [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        params_list = []
        # Word Vectors
        params_list = cls._add_to_params(params_list, word_vectors,
                                         cls._get_word_vector_names())
        # Tokeniser
        tokenisers_names = [param_name + '__tokeniser'
                            for param_name in cls._get_tokeniser_names()]
        params_list = cls._add_to_params(params_list, tokeniser,
                                         tokenisers_names)
        # Lower
        lower_names = [param_name + '__lower'
                       for param_name in cls._get_tokeniser_names()]
        params_list = cls._add_to_params(params_list, lower, lower_names)
        # C
        params_list = cls._add_to_all_params(params_list, 'svm__C', C)
        # Random State
        params_list = cls._add_to_all_params(params_list,
                                             'svm__random_state',
                                             random_state)
        # Scale
        params_list = cls._add_to_all_params(params_list, 'scale', scale)
        return params_list
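
    # A minimal sketch of feeding the returned list to
    # :py:class:`sklearn.model_selection.GridSearchCV` (``SSWE`` and
    # ``GloveCommonCrawl`` are assumed to come from ``bella.word_vectors``):
    #
    #     from sklearn.model_selection import GridSearchCV
    #     param_grid = TargetInd.get_cv_parameters(
    #         word_vectors=[[SSWE()], [SSWE(), GloveCommonCrawl()]],
    #         C=[0.01, 0.1])
    #     search = GridSearchCV(TargetInd.pipeline(), param_grid=param_grid)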

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        Method to be overridden by subclasses as each pipeline will be
        different and will have different parameter names for where the
        word vectors are set.

        :returns: A list of parameter names where the word vectors are set
                  in the pipeline.
        '''
        return ['word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        Method to be overridden by subclasses as each pipeline will be
        different and will have different parameter names for where
        tokenisers are used.

        :returns: A list of parameter names where the tokenisers are used
                  in the pipeline.
        '''
        return ['tokens']


class TargetDepMinus(TargetInd):
    '''
    Target-dep- model from the `Vo and Zhang 2015 paper \
    <https://ijcai.org/Proceedings/15/Papers/194.pdf>`_.
    '''

    @classmethod
    def name(cls) -> str:
        return 'Target Dependent Minus'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.025,
                 random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Calls the SKLearnModel __init__ method directly, skipping
        # TargetInd's __init__.
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, tokeniser, lower, C,
                                        random_state, scale)

    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the
        model attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.025,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictionary that is accepted as
        model parameters.

        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        return super().get_parameters(word_vectors, tokeniser, lower, C,
                                      random_state, scale)

    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.025], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictionaries that is
        accepted as the `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of lists of word vectors e.g.
                             [[SSWE()], [SSWE(), GloveCommonCrawl()]].
        :param tokeniser: A list of tokenisers to be used e.g.
                          :py:meth:`str.split`. Default [ark_twokenize]
        :param lower: A list of bool values which indicate whether to lower
                      case the input words. Default [True]
        :param C: A list of C values for the
                  :py:class:`sklearn.svm.LinearSVC` estimator that is used
                  in the pipeline. Default [0.025]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      classes or None if no scaling is to be used.
                      Default [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        return super().get_cv_parameters(word_vectors, tokeniser, lower, C,
                                         random_state, scale)

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of parameter names where the word vectors are set
                  in the pipeline.
        '''
        return ['union__left__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__target__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of parameter names where the tokenisers are set in
                  the pipeline.
        '''
        return ['union__left__tokens',
                'union__right__tokens',
                'union__target__tokens']


class TargetDep(TargetInd):
    '''
    Target-dep model from the `Vo and Zhang 2015 paper \
    <https://ijcai.org/Proceedings/15/Papers/194.pdf>`_.
    '''

    @classmethod
    def name(cls) -> str:
        return 'Target Dependent'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01, random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Calls the SKLearnModel __init__ method directly, skipping
        # TargetInd's __init__.
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, tokeniser, lower, C,
                                        random_state, scale)

    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the
        model attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('full', Pipeline([
                    ('contexts', Context('full')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictionary that is accepted as
        model parameters.

        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        return super().get_parameters(word_vectors, tokeniser, lower, C,
                                      random_state, scale)

    @classmethod
    def get_cv_parameters(cls, word_vectors, tokeniser=[ark_twokenize],
                          lower=[True], C=[0.01], random_state=[42],
                          scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictionaries that is
        accepted as the `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of lists of word vectors e.g.
                             [[SSWE()], [SSWE(), GloveCommonCrawl()]].
        :param tokeniser: A list of tokenisers to be used e.g.
                          :py:meth:`str.split`. Default [ark_twokenize]
        :param lower: A list of bool values which indicate whether to lower
                      case the input words. Default [True]
        :param C: A list of C values for the
                  :py:class:`sklearn.svm.LinearSVC` estimator that is used
                  in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      classes or None if no scaling is to be used.
                      Default [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        return super().get_cv_parameters(word_vectors, tokeniser, lower, C,
                                         random_state, scale)

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of parameter names where the word vectors are set
                  in the pipeline.
        '''
        return ['union__left__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__target__word_vectors__vectors',
                'union__full__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of parameter names where the tokenisers are set in
                  the pipeline.
        '''
        return ['union__left__tokens',
                'union__right__tokens',
                'union__target__tokens',
                'union__full__tokens']


class TargetDepPlus(TargetInd):
    '''
    Target-dep+ model from the `Vo and Zhang 2015 paper \
    <https://ijcai.org/Proceedings/15/Papers/194.pdf>`_.
    '''

    @classmethod
    def name(cls) -> str:
        return 'Target Dependent Plus'

    def __repr__(self) -> str:
        '''
        Name of the machine learning model.
        '''
        return self.name()

    def __init__(self, word_vectors: List['bella.word_vectors.WordVectors'],
                 senti_lexicon: 'bella.lexicons.Lexicon',
                 tokeniser: Callable[[str], List[str]] = ark_twokenize,
                 lower: bool = True, C: float = 0.01, random_state: int = 42,
                 scale: Any = MinMaxScaler()) -> None:
        '''
        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param senti_lexicon: Sentiment Lexicon to be used for the Left and
                              Right sentiment contexts (LS and RS).
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        '''
        # Calls the SKLearnModel __init__ method directly, skipping
        # TargetInd's __init__.
        # The parameters here go into the self.get_parameters method
        super(TargetInd, self).__init__(word_vectors, senti_lexicon,
                                        tokeniser, lower, C, random_state,
                                        scale)
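
    # A minimal usage sketch (assumptions for illustration: ``SSWE`` comes
    # from ``bella.word_vectors`` and ``some_lexicon`` is a
    # ``bella.lexicons.Lexicon`` instance):
    #
    #     model = TargetDepPlus(word_vectors=[SSWE()],
    #                           senti_lexicon=some_lexicon)
    #     model.fit(train_data, train_labels)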

    @staticmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the
        model attribute.

        :returns: The template machine learning model
        '''
        return Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('left_s', Pipeline([
                    ('contexts', Context('left')),
                    ('tokens', ContextTokeniser()),
                    ('filter', LexiconFilter()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('right', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('right_s', Pipeline([
                    ('contexts', Context('right')),
                    ('tokens', ContextTokeniser()),
                    ('filter', LexiconFilter()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('target', Pipeline([
                    ('contexts', Context('target')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ])),
                ('full', Pipeline([
                    ('contexts', Context('full')),
                    ('tokens', ContextTokeniser()),
                    ('word_vectors', ContextWordVectors()),
                    ('pool_funcs', _pooling_union())
                ]))
            ])),
            ('scale', MinMaxScaler()),
            ('svm', LinearSVC())
        ])

    @classmethod
    def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                                  ) -> Dict[str, Any]:
        '''
        Converts a dictionary of pipeline parameters, as returned by
        :py:meth:`get_parameters`, back into a dictionary of keyword
        arguments that :py:meth:`get_parameters` accepts.

        :returns: A dictionary that can be used as keyword arguments to the
                  :py:meth:`get_parameters` method
        '''
        parameter_names = ['word_vectors__', '__tokeniser', '__lower',
                           'scale', '__C', '__random_state', '__lexicon']
        name_parameter = {}
        for parameter_name in parameter_names:
            for name, parameter in parameter_dict.items():
                if parameter_name in name:
                    real_parameter_name = parameter_name.strip('_')
                    # The keyword argument is senti_lexicon not lexicon
                    if parameter_name == '__lexicon':
                        real_parameter_name = 'senti_lexicon'
                    name_parameter[real_parameter_name] = parameter
        return name_parameter

    @classmethod
    def get_parameters(cls,
                       word_vectors: List['bella.word_vectors.WordVectors'],
                       senti_lexicon: 'bella.lexicons.Lexicon',
                       tokeniser: Callable[[str], List[str]] = ark_twokenize,
                       lower: bool = True, C: float = 0.01,
                       random_state: int = 42,
                       scale: Any = MinMaxScaler()) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictionary that is accepted as
        model parameters.

        :param word_vectors: A list of one or more word vectors to be used
                             as feature vector lookups. If more than one is
                             used the word vectors are concatenated together
                             to create the feature vector for each word.
        :param senti_lexicon: Sentiment Lexicon to be used for the Left and
                              Right sentiment contexts (LS and RS).
        :param tokeniser: Tokeniser to be used e.g. :py:meth:`str.split`
        :param lower: Whether to lower case the words
        :param C: The C value for the :py:class:`sklearn.svm.LinearSVC`
                  estimator that is used in the pipeline.
        :param random_state: The random_state value for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline.
        :param scale: How to scale the data before input into the estimator.
                      If no scaling is to be used set this to None.
        :return: Model parameters
        '''
        params_dict = super().get_parameters(word_vectors, tokeniser, lower,
                                             C, random_state, scale)
        # Add the sentiment lexicon to the left and right sentiment filters
        params_dict = cls._add_to_params_dict(params_dict,
                                              cls._get_word_senti_names(),
                                              senti_lexicon)
        return params_dict

    @classmethod
    def get_cv_parameters(cls,
                          word_vectors: List[List[
                              'bella.word_vectors.WordVectors']],
                          senti_lexicon: List['bella.lexicons.Lexicon'],
                          tokeniser=[ark_twokenize], lower=[True], C=[0.01],
                          random_state=[42], scale=[MinMaxScaler()]):
        '''
        Transform the given parameters into a list of dictionaries that is
        accepted as the `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`

        :param word_vectors: A list of lists of word vectors e.g.
                             [[SSWE()], [SSWE(), GloveCommonCrawl()]].
        :param senti_lexicon: A list of Sentiment Lexicons to be explored
                              for the Left and Right sentiment contexts (LS
                              and RS).
        :param tokeniser: A list of tokenisers to be used e.g.
                          :py:meth:`str.split`. Default [ark_twokenize]
        :param lower: A list of bool values which indicate whether to lower
                      case the input words. Default [True]
        :param C: A list of C values for the
                  :py:class:`sklearn.svm.LinearSVC` estimator that is used
                  in the pipeline. Default [0.01]
        :param random_state: A list of random_state values for the
                             :py:class:`sklearn.svm.LinearSVC` estimator
                             that is used in the pipeline. Default [42]
        :param scale: List of scale values. The list can include
                      :py:class:`sklearn.preprocessing.MinMaxScaler` type of
                      classes or None if no scaling is to be used.
                      Default [:py:class:`sklearn.preprocessing.MinMaxScaler`]
        :return: Parameters to explore through cross validation
        '''
        params_list = super().get_cv_parameters(word_vectors, tokeniser,
                                                lower, C, random_state,
                                                scale)
        # Sentiment Lexicons
        params_list = cls._add_to_params(params_list, senti_lexicon,
                                         cls._get_word_senti_names())
        return params_list

    @staticmethod
    def _get_word_vector_names() -> List[str]:
        '''
        :returns: A list of parameter names where the word vectors are set
                  in the pipeline.
        '''
        return ['union__left__word_vectors__vectors',
                'union__left_s__word_vectors__vectors',
                'union__right__word_vectors__vectors',
                'union__right_s__word_vectors__vectors',
                'union__target__word_vectors__vectors',
                'union__full__word_vectors__vectors']

    @staticmethod
    def _get_tokeniser_names() -> List[str]:
        '''
        :returns: A list of parameter names where the tokenisers are set in
                  the pipeline.
        '''
        return ['union__left__tokens',
                'union__left_s__tokens',
                'union__right__tokens',
                'union__right_s__tokens',
                'union__target__tokens',
                'union__full__tokens']

    @staticmethod
    def _get_word_senti_names() -> List[str]:
        '''
        :returns: A list of parameter names where the sentiment lexicons are
                  set in the pipeline.
        '''
        return ['union__left_s__filter__lexicon',
                'union__right_s__filter__lexicon']