Source code for bella.models.base

'''
This module contains all of the main base classes for the machine learning
models. These are grouped into 3 categories: 1. Mixin, 2. Abstract, and
3. Concrete.


Mixin classes - A function-based class that contains functions that do not
rely on the type of model and are therefore useful for all models:

1. :py:class:`bella.models.base.ModelMixin`

Abstract classes - Used to enforce the functions that all of the machine
learning models must have. This is also the class that inherits from the
Mixin class:

1. :py:class:`bella.models.base.BaseModel`

Concrete classes - These are more concrete classes that still contain some
abstract methods. However, they are the classes to inherit from to create a
machine learning model based on a certain framework e.g. scikit-learn or
Keras:

1. :py:class:`bella.models.base.SKLearnModel`
2. :py:class:`bella.models.base.KerasModel`
'''

from abc import ABC, abstractmethod
from collections import defaultdict
import copy
import os
from pathlib import Path
import pickle
import random as rn
import tempfile
from typing import Any, List, Dict, Union, Tuple, Callable
from multiprocessing.pool import Pool

import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import preprocessing
import numpy as np
import pandas as pd
import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import tensorflow as tf

import bella
from bella.data_types import TargetCollection, Target


class ModelMixin():
    '''
    Mixin class for all of the machine learning models. It contains
    functions only, so that they are as generic as possible.

    Functions:

    1. train_val_split -- Splits the training dataset into a train and
       validation set in a stratified split.
    '''

    @staticmethod
    def _convert_to_targets(data: List[Dict[str, Any]]
                            ) -> List['bella.data_types.Target']:
        '''
        Converts a list of dictionaries into a list of
        :py:class:`bella.data_types.Target`.
        '''
        all_targets = []
        for target in data:
            all_targets.append(Target(**target))
        return all_targets

    @staticmethod
    def train_val_split(train: 'TargetCollection', split_size: float = 0.2,
                        seed: Union[None, int] = 42
                        ) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray]]:
        '''
        Splits the training dataset into a train and validation set in a
        stratified split.

        :param train: The training dataset that is to be split.
        :param split_size: Fraction of the dataset to assign to the
                           validation set.
        :param seed: Seed value to give to the stratified splitter. If None
                     then it uses the random state of numpy.
        :return: Two tuples of length two where each tuple is the train and
                 validation splits respectively, and each tuple contains the
                 data (X) and class labels (y) respectively. Returns
                 ((X_train, y_train), (X_val, y_val))
        '''
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_size,
                                          random_state=seed)
        data = np.asarray(train.data_dict())
        sentiment = np.asarray(train.sentiment_data())
        for train_indexs, test_indexs in splitter.split(data, sentiment):
            train_data = data[train_indexs]
            test_data = data[test_indexs]

        train = TargetCollection(ModelMixin._convert_to_targets(train_data))
        val = TargetCollection(ModelMixin._convert_to_targets(test_data))
        X_train = np.array(train.data_dict())
        y_train = np.array(train.sentiment_data())
        X_val = np.array(val.data_dict())
        y_val = np.array(val.sentiment_data())
        return (X_train, y_train), (X_val, y_val)

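    # For reference, the stratified split performed above is equivalent to
    # the following use of sklearn on plain arrays. This is a minimal,
    # self-contained sketch that does not depend on bella's data types:
    #
    #     X = np.arange(10).reshape(10, 1)
    #     y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    #     splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
    #                                       random_state=42)
    #     for train_idx, val_idx in splitter.split(X, y):
    #         X_train, y_train = X[train_idx], y[train_idx]
    #         X_val, y_val = X[val_idx], y[val_idx]
    #     # Both splits keep the 50/50 class balance of ``y``.
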
class BaseModel(ModelMixin, ABC):
    '''
    Abstract class for all of the machine learning models.

    Attributes:

    1. model -- Machine learning model that is associated to this instance.
    2. fitted -- If the machine learning model has been fitted (default
       False)

    Methods:

    1. fit -- Fit the model according to the given training data.
    2. predict -- Predict class labels for samples in X.
    3. probabilities -- The probability of each class label for all samples
       in X.
    4. __repr__ -- Name of the machine learning model.

    Class Methods:

    1. name -- Returns the name of the model.

    Functions:

    1. save -- Saves the given machine learning model instance to a file.
    2. load -- Loads the entire machine learning model from a file.
    3. evaluate_parameter -- Fit and predict given training, validation and
       test data for the given model when the given parameter is changed on
       the model.
    4. evaluate_parameters -- Same as evaluate_parameter however it
       evaluates over many parameter values for the same parameter.
    '''

    @abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        '''
        Fit the model according to the given training data.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :return: The `model` attribute will now be trained.
        '''

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        '''
        Predict class labels for samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Predicted class label per sample, shape = [n_samples]
        '''

    @abstractmethod
    def probabilities(self, X: np.ndarray) -> np.ndarray:
        '''
        The probability of each class label for all samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Probability of each class label for all samples, shape =
                 [n_samples, n_classes]
        '''

    @abstractmethod
    def __repr__(self) -> str:
        '''
        Name of the machine learning model.

        :return: Name of the machine learning model.
        '''

    @staticmethod
    @abstractmethod
    def save(model: 'BaseModel', save_fp: Path) -> None:
        '''
        Saves the entire machine learning model to a file.

        :param model: The machine learning model instance to be saved.
        :param save_fp: File path of the location that the model is to be
                        saved to.
        :return: Nothing.
        '''

    @staticmethod
    @abstractmethod
    def load(load_fp: Path) -> 'bella.models.base.BaseModel':
        '''
        Loads the entire machine learning model from a file.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''

    @staticmethod
    @abstractmethod
    def evaluate_parameter(model: 'BaseModel',
                           train: Tuple[np.ndarray, np.ndarray],
                           val: Union[None, Tuple[np.ndarray, np.ndarray]],
                           test: np.ndarray, parameter_name: str,
                           parameter: Any) -> Tuple[Any, np.ndarray]:
        '''
        Given a model, sets the `parameter_name` to `parameter`, fits the
        model, and returns a tuple of the changed parameter and the
        predictions of the model on the test data, using the train and
        validation data for fitting.

        :param model: :py:class:`bella.models.base.BaseModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)` or None if not required. This
                    is only required if the model requires validation data
                    like the :py:class:`bella.models.base.KerasModel` models
                    do.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g. optimiser
        :param parameter: value to assign to the parameter e.g.
                          :py:class:`keras.optimizers.RMSprop`
        :return: A tuple of (parameter value, predictions)
        '''

    @staticmethod
    @abstractmethod
    def evaluate_parameters(model: 'bella.models.base.BaseModel',
                            train: Tuple[np.ndarray, np.ndarray],
                            val: Union[None, Tuple[np.ndarray, np.ndarray]],
                            test: np.ndarray, parameter_name: str,
                            parameters: List[Any], n_jobs: int
                            ) -> List[Tuple[Any, np.ndarray]]:
        '''
        Performs :py:func:`bella.models.base.BaseModel.evaluate_parameter`
        on one `parameter_name` but with multiple parameter values.

        This is useful if you would like to know the effect of changing the
        values of a parameter. It can also perform the task in a
        multiprocessing manner if `n_jobs` > 1.

        :param model: :py:class:`bella.models.base.BaseModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)` or None if not required. This
                    is only required if the model requires validation data
                    like the :py:class:`bella.models.base.KerasModel` models
                    do.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g. optimiser
        :param parameters: A list of values to assign to the parameter e.g.
                           [:py:class:`keras.optimizers.RMSprop`]
        :param n_jobs: Number of cpus to use for multiprocessing. If 1 then
                       it will not multiprocess.
        :return: A list of tuples of (parameter value, predictions)
        '''

    @classmethod
    @abstractmethod
    def name(cls) -> str:
        '''
        Returns the name of the model.

        :return: Name of the model
        '''

    @property
    def model(self) -> Any:
        '''
        Machine learning model that is associated to this instance.

        :return: The machine learning model
        '''
        return self._model

    @model.setter
    def model(self, value) -> None:
        '''
        Sets the model attribute

        :param value: The value to assign to the model attribute
        '''
        self._model = value

    @property
    def fitted(self) -> bool:
        '''
        If the machine learning model has been fitted (default False)

        :return: True or False
        '''
        return self._fitted

    @fitted.setter
    def fitted(self, value: bool) -> None:
        '''
        Sets the fitted attribute

        :param value: The value to assign to the fitted attribute
        '''
        self._fitted = value

class KerasModel(BaseModel):
    '''
    Concrete class that is designed to be used as the base class for all
    machine learning models that are based on the
    `Keras library <https://keras.io>`_.

    Attributes:

    1. tokeniser -- Tokeniser the model uses e.g. :py:meth:`str.split`.
    2. embeddings -- The word embeddings the model uses e.g.
       :py:class:`bella.word_vectors.SSWE`
    3. lower -- If the model lower cases the words when pre-processing the
       data
    4. reproducible -- Whether to be reproducible. If None then it is
       quicker to run. Else provide an `int` that will represent the random
       seed value.
    5. patience -- Number of epochs with no improvement before training is
       stopped.
    6. batch_size -- Number of samples per gradient update.
    7. epochs -- Number of times to train over the entire training set
       before stopping.
    8. optimiser -- Optimiser the model uses e.g.
       :py:class:`keras.optimizers.SGD`
    9. optimiser_params -- Parameters for the optimiser. If None uses the
       default for the optimiser being used.

    Abstract Methods:

    1. keras_model -- Keras machine learning model that represents the class
       e.g. single forward LSTM.
    2. create_training_text -- Converts the training and validation data
       into a format that the Keras model can take as input.
    3. create_training_y -- Converts the training and validation targets
       into a format that can be used by the Keras model.

    Methods:

    1. fit -- Fit the model according to the given training and validation
       data.
    2. probabilities -- The probability of each class label for all samples
       in X.
    3. predict -- Predict class labels for samples in X.

    Functions:

    1. save -- Given an instance of this class will save it to a file.
    2. load -- Loads an instance of this class from a file.
    3. evaluate_parameter -- Fit and predict given training, validation and
       test data for the given model when the given parameter is changed on
       the model.
    4. evaluate_parameters -- Same as evaluate_parameter however it
       evaluates over many parameter values for the same parameter.
    '''

    @abstractmethod
    def keras_model(self, num_classes: int) -> 'keras.models.Model':
        '''
        Keras machine learning model that represents the class e.g. a single
        forward LSTM.

        :returns: Keras machine learning model
        '''
        pass

    @abstractmethod
    def create_training_text(self, train_data: List[Dict[str, Any]],
                             validation_data: List[Dict[str, Any]]
                             ) -> Tuple[Any, Any]:
        '''
        Converts the training and validation data into a format that the
        Keras model can take as input.

        :return: A tuple of length two containing the Keras model training
                 and validation input respectively.
        '''

    @abstractmethod
    def create_training_y(self, train_y: np.ndarray,
                          validation_y: np.ndarray
                          ) -> Tuple[np.ndarray, np.ndarray]:
        '''
        Converts the training and validation targets into a format that can
        be used by the Keras model.

        :return: A tuple of length two containing two arrays, the first for
                 training and the second for validation.
        '''

    @abstractmethod
    def _pre_process(self, data_dicts: Dict[str, Any], training: bool):
        '''
        Converts the training or validation data into a format that will be
        used by the keras model.

        This function is normally used to process the training and the
        validation data to be returned together by
        :py:meth:`bella.models.base.KerasModel.create_training_text`
        '''

    def process_text(self, texts: List[str], max_length: int,
                     padding: str = 'pre', truncate: str = 'pre'
                     ) -> Tuple[int, np.ndarray]:
        '''
        Given a list of Strings, tokenises the text, lower cases it if set,
        and then converts the tokens into integers representing the tokens
        in the embeddings. Lastly it pads the data based on the max_length
        param.

        If the max_length is smaller than the sentence size it truncates the
        sentence. If max_length = -1 then the max_length is that of the
        longest sentence in the texts.

        :params texts: List of texts
        :params max_length: How many tokens a sentence can contain. If it is
                            -1 then it uses the sentence with the most
                            tokens as the max_length parameter.
        :params padding: Which side of the sentence to pad: `pre` beginning,
                         `post` end.
        :params truncate: Which side of the sentence to truncate: `pre`
                          beginning, `post` end.
        :returns: A tuple of length 2 containing: 1. The max_length
                  parameter, 2. A matrix of shape [n_samples, pad_size]
                  where each integer in the matrix represents the word
                  embedding lookup.
        :raises ValueError: If the max_length argument is equal to or less
                            than 0. Or if the calculated max_length is 0.
        '''
        if max_length == 0:
            raise ValueError('The max length of a sequence cannot be zero')
        elif max_length < -1:
            raise ValueError('The max length has to be either -1 or above '
                             f'zero not {max_length}')

        # Process the text into integers based on the embeddings given
        all_sequence_data = []
        max_sequence = 0
        for text in texts:
            sequence_data = []
            tokens = self.tokeniser(text)
            for token in tokens:
                if self.lower:
                    token = token.lower()
                # If the token does not exist it should lookup the unknown
                # word vector
                sequence_data.append(self.embeddings.word2index[token])
            sequence_length = len(sequence_data)
            if sequence_length > max_sequence:
                max_sequence = sequence_length
            all_sequence_data.append(sequence_data)
        if max_sequence == 0:
            raise ValueError('The max sequence length is 0 suggesting no '
                             'data was provided for training or testing')
        # Pad the sequences
        # If max pad size is set and training the model set the
        # test_pad_size to max sequence length
        if max_length == -1:
            max_length = max_sequence
        return (max_length,
                preprocessing.sequence.pad_sequences(all_sequence_data,
                                                     maxlen=max_length,
                                                     dtype='int32', value=0,
                                                     padding=padding,
                                                     truncating=truncate))

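    # Illustrative sketch of the padding/truncation step that process_text
    # relies on, using keras.preprocessing directly and small integers in
    # place of real embedding-index lookups:
    #
    #     sequences = [[3, 7, 2], [5, 1]]
    #     padded = preprocessing.sequence.pad_sequences(
    #         sequences, maxlen=4, dtype='int32', value=0,
    #         padding='pre', truncating='pre')
    #     # padded == [[0, 3, 7, 2],
    #     #            [0, 0, 5, 1]]
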
    def fit(self, X: np.ndarray, y: np.ndarray,
            validation_data: Tuple[np.ndarray, np.ndarray],
            verbose: int = 0,
            continue_training: bool = False) -> 'keras.callbacks.History':
        '''
        Fit the model according to the given training and validation data.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :param validation_data: Tuple of `(x_val, y_val)`. Used to evaluate
                                the model at each epoch. Will not be trained
                                on this data.
        :param verbose: 0 = silent, 1 = progress
        :param continue_training: Whether the model that has already been
                                  trained should be trained further.
        :return: A record of training loss values and metrics values at
                 successive epochs, as well as validation loss values and
                 validation metrics values.
        '''
        X_val, y_val = validation_data
        if sum(y_val < 0) or sum(y < 0):
            raise ValueError('The class labels have to be greater than 0')
        X, X_val = self.create_training_text(X, X_val)
        if isinstance(X, tuple):
            X = list(X)
            X_val = list(X_val)
        y, y_val = self.create_training_y(y, y_val)
        num_classes = y.shape[1]
        if verbose:
            print(f'Number of classes in the data {num_classes}')
        if not continue_training:
            self.fitted = False
            self._to_be_reproducible(self.reproducible)
            self.model = self.keras_model(num_classes)
        elif self.fitted and not continue_training:
            raise ValueError('The model is already fitted')
        model = self.model
        if not continue_training:
            model.compile(optimizer=self.optimiser(**self.optimiser_params),
                          metrics=['accuracy'],
                          loss='categorical_crossentropy')
        with tempfile.NamedTemporaryFile() as weight_file:
            # Set up the callbacks
            model_checkpoint = ModelCheckpoint(weight_file.name,
                                               monitor='val_loss',
                                               save_best_only=True,
                                               save_weights_only=True,
                                               mode='min')
            early_stopping = EarlyStopping(monitor='val_loss', mode='min',
                                           patience=self.patience)
            callbacks = [early_stopping, model_checkpoint]
            history = model.fit(X, y, validation_data=(X_val, y_val),
                                epochs=self.epochs, callbacks=callbacks,
                                verbose=verbose,
                                batch_size=self.batch_size)
            # Load the best model from the saved weight file
            model.load_weights(weight_file.name)
        self.model = model
        self.fitted = True
        return history

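    # The fit method above pairs ModelCheckpoint with EarlyStopping so that
    # the weights kept are those from the best validation-loss epoch. A
    # minimal, self-contained Keras sketch of that pattern (the toy model
    # and random data below are purely illustrative):
    #
    #     X = np.random.rand(100, 4)
    #     y = keras.utils.to_categorical(np.random.randint(0, 2, 100), 2)
    #     toy = keras.models.Sequential([
    #         keras.layers.Dense(8, activation='relu', input_shape=(4,)),
    #         keras.layers.Dense(2, activation='softmax')])
    #     toy.compile(optimizer='sgd', loss='categorical_crossentropy',
    #                 metrics=['accuracy'])
    #     with tempfile.NamedTemporaryFile() as weight_file:
    #         callbacks = [EarlyStopping(monitor='val_loss', mode='min',
    #                                    patience=3),
    #                      ModelCheckpoint(weight_file.name,
    #                                      monitor='val_loss',
    #                                      save_best_only=True,
    #                                      save_weights_only=True,
    #                                      mode='min')]
    #         toy.fit(X, y, validation_split=0.2, epochs=20,
    #                 callbacks=callbacks, verbose=0)
    #         toy.load_weights(weight_file.name)  # best-epoch weights
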
    def probabilities(self, X: np.ndarray) -> np.ndarray:
        '''
        The probability of each class label for all samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Probability of each class label for all samples, shape =
                 [n_samples, n_classes]
        '''
        if self.fitted is False:
            raise ValueError('The model has not been fitted please run the '
                             '`fit` method.')
        # Convert from a sequence of dictionaries into texts and then
        # integers that represent the tokens in the text within the
        # embedding space.
        sequence_test_data = self._pre_process(X, training=False)
        predicted_values = self.model.predict(sequence_test_data)
        return predicted_values

    def predict(self, X: np.ndarray) -> np.ndarray:
        '''
        Predict class labels for samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Predicted class label per sample, shape = [n_samples]
        '''
        return np.argmax(self.probabilities(X), axis=1)

    @staticmethod
    def save(model: 'bella.models.base.KerasModel', save_fp: Path) -> None:
        '''
        Given a Keras model and a file path, it will save the data required
        to restore the model.

        :param model: The machine learning model instance to be saved.
        :param save_fp: File path of the location that the model is to be
                        saved to.
        :return: Nothing.
        :raises ValueError: If the model has not been fitted or if the model
                            is not of type
                            :py:class:`bella.models.base.KerasModel`
        '''
        if not isinstance(model, KerasModel):
            raise ValueError('The model parameter has to be of type '
                             f'KerasModel not {type(model)}')
        if model.fitted:
            model_fp = save_fp.with_suffix('.h5')
            model.model.save(model_fp)

            attributes_fp = save_fp.with_suffix('.pkl')
            with attributes_fp.open('wb') as attributes_file:
                # optimiser cannot be pickled
                attributes = model.model_parameters()
                del attributes['class_params']['optimiser']
                pickle.dump(attributes, attributes_file)
        else:
            raise ValueError(f'The model {str(model)} has not been fitted. '
                             'This can be done by using the `fit` method')

    @staticmethod
    def load(load_fp: Path) -> 'bella.models.base.KerasModel':
        '''
        Loads an instance of this class from a file.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''
        model_fp = str(load_fp.with_suffix('.h5').resolve())
        attributes_fp = load_fp.with_suffix('.pkl')
        with attributes_fp.open('rb') as attributes_file:
            attributes = pickle.load(attributes_file)
        # optimiser has to be recovered as it could not be pickled in the
        # model parameters
        keras_model = keras.models.load_model(model_fp)
        attributes['class_params']['optimiser'] = keras_model.optimizer
        model_class = attributes.pop('class')
        model = model_class(**attributes['class_params'])
        for name, class_attr in attributes['class_attrs'].items():
            setattr(model, name, class_attr)
        model.model = keras_model
        model.fitted = True
        return model

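    # Illustrative usage sketch of the save/load pair above. ``model`` is
    # assumed to be a fitted KerasModel subclass instance and ``X_test`` a
    # test matrix; both names are placeholders:
    #
    #     save_path = Path('best_model')  # '.h5'/'.pkl' suffixes are added
    #     KerasModel.save(model, save_path)
    #     restored = KerasModel.load(save_path)
    #     assert (restored.predict(X_test) == model.predict(X_test)).all()
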
    @staticmethod
    def evaluate_parameter(model: 'bella.models.base.KerasModel',
                           train: Tuple[np.ndarray, np.ndarray],
                           val: Tuple[np.ndarray, np.ndarray],
                           test: np.ndarray, parameter_name: str,
                           parameter: Any) -> Tuple[Any, np.ndarray]:
        '''
        Given a model, sets the `parameter_name` to `parameter`, fits the
        model, and returns a tuple of the changed parameter and the
        predictions of the model on the test data, using the train and
        validation data for fitting.

        :param model: KerasModel instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)`. Used to evaluate the model at
                    each epoch. Will not be trained on this data.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g. optimiser
        :param parameter: value to assign to the parameter e.g.
                          :py:class:`keras.optimizers.RMSprop`
        :return: A tuple of (parameter value, predictions)
        '''
        setattr(model, parameter_name, parameter)
        model.fit(train[0], train[1], val)
        predictions = model.predict(test)
        return (parameter, predictions)

    @staticmethod
    def evaluate_parameters(model: 'bella.models.base.KerasModel',
                            train: Tuple[np.ndarray, np.ndarray],
                            val: Tuple[np.ndarray, np.ndarray],
                            test: np.ndarray, parameter_name: str,
                            parameters: List[Any], n_jobs: int
                            ) -> List[Tuple[Any, np.ndarray]]:
        '''
        Performs :py:func:`bella.models.base.KerasModel.evaluate_parameter`
        on one `parameter_name` but with multiple parameter values.

        This is useful if you would like to know the effect of changing the
        values of a parameter. It can also perform the task in a
        multiprocessing manner if `n_jobs` > 1.

        :param model: :py:class:`bella.models.base.KerasModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)`. Used to evaluate the model at
                    each epoch. Will not be trained on this data.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g. optimiser
        :param parameters: A list of values to assign to the parameter e.g.
                           [:py:class:`keras.optimizers.RMSprop`]
        :param n_jobs: Number of cpus to use for multiprocessing. If 1 then
                       it will not multiprocess.
        :return: A list of tuples of (parameter value, predictions)
        '''
        func_args = ((model, train, val, test, parameter_name, parameter)
                     for parameter in parameters)
        if n_jobs == 1:
            return [KerasModel.evaluate_parameter(*args)
                    for args in func_args]
        with Pool(n_jobs) as pool:
            return pool.starmap(KerasModel.evaluate_parameter, func_args)

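    # Illustrative sketch: comparing optimisers with evaluate_parameters.
    # ``model`` is assumed to be a concrete KerasModel subclass instance and
    # the train/val/test arrays to have been built, e.g. via
    # ModelMixin.train_val_split; all of those names are placeholders:
    #
    #     results = KerasModel.evaluate_parameters(
    #         model, train=(X_train, y_train), val=(X_val, y_val),
    #         test=X_test, parameter_name='optimiser',
    #         parameters=[keras.optimizers.SGD, keras.optimizers.Adam,
    #                     keras.optimizers.RMSprop],
    #         n_jobs=1)
    #     for optimiser, predictions in results:
    #         print(optimiser, predictions.shape)
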
    @staticmethod
    def _to_be_reproducible(reproducible: Union[int, None]) -> None:
        '''
        To make the method reproducible or not. If it is not needed then we
        can use all the python threads.

        :param reproducible: If an int is provided this int is used as the
                             seed value. Else None should be given if it is
                             not to be reproducible.
        '''
        if reproducible is not None:
            os.environ['PYTHONHASHSEED'] = '0'
            np.random.seed(reproducible)
            rn.seed(reproducible)
            # Forces tensorflow to use only one thread
            session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                          inter_op_parallelism_threads=1)
            tf.set_random_seed(reproducible)

            sess = tf.Session(graph=tf.get_default_graph(),
                              config=session_conf)
            keras.backend.set_session(sess)
        else:
            np.random.seed(None)
            rn.seed(np.random.randint(0, 1000))
            tf.set_random_seed(np.random.randint(0, 1000))

    @property
    def tokeniser(self) -> Callable[[str], List[str]]:
        '''
        tokeniser attribute

        :return: The tokeniser used in the model
        '''
        return self._tokeniser

    @tokeniser.setter
    def tokeniser(self, value: Callable[[str], List[str]]) -> None:
        '''
        Sets the tokeniser attribute

        :param value: The value to assign to the tokeniser attribute
        '''
        self.fitted = False
        self._tokeniser = value

    @property
    def embeddings(self) -> 'bella.word_vectors.WordVectors':
        '''
        embeddings attribute

        :return: The embeddings used in the model
        '''
        return self._embeddings

    @embeddings.setter
    def embeddings(self, value: 'bella.word_vectors.WordVectors') -> None:
        '''
        Sets the embeddings attribute

        :param value: The value to assign to the embeddings attribute
        '''
        self.fitted = False
        self._embeddings = value

    @property
    def lower(self) -> bool:
        '''
        lower attribute

        :return: The lower used in the model
        '''
        return self._lower

    @lower.setter
    def lower(self, value: bool) -> None:
        '''
        Sets the lower attribute

        :param value: The value to assign to the lower attribute
        '''
        self.fitted = False
        self._lower = value

    @property
    def reproducible(self) -> Union[int, None]:
        '''
        reproducible attribute

        :return: The reproducible used in the model
        '''
        return self._reproducible

    @reproducible.setter
    def reproducible(self, value: Union[int, None]) -> None:
        '''
        Sets the reproducible attribute

        :param value: The value to assign to the reproducible attribute
        '''
        self.fitted = False
        self._reproducible = value

    @property
    def patience(self) -> int:
        '''
        patience attribute

        :return: The patience used in the model
        '''
        return self._patience

    @patience.setter
    def patience(self, value: int) -> None:
        '''
        Sets the patience attribute

        :param value: The value to assign to the patience attribute
        '''
        self.fitted = False
        self._patience = value

    @property
    def batch_size(self) -> int:
        '''
        batch_size attribute

        :return: The batch_size used in the model
        '''
        return self._batch_size

    @batch_size.setter
    def batch_size(self, value: int) -> None:
        '''
        Sets the batch_size attribute

        :param value: The value to assign to the batch_size attribute
        '''
        self.fitted = False
        self._batch_size = value

    @property
    def epochs(self) -> int:
        '''
        epochs attribute

        :return: The epochs used in the model
        '''
        return self._epochs

    @epochs.setter
    def epochs(self, value: int) -> None:
        '''
        Sets the epochs attribute

        :param value: The value to assign to the epochs attribute
        '''
        self.fitted = False
        self._epochs = value

    @property
    def optimiser(self) -> 'keras.optimizers.Optimizer':
        '''
        optimiser attribute

        :return: The optimiser used in the model
        '''
        return self._optimiser

    @optimiser.setter
    def optimiser(self, value: 'keras.optimizers.Optimizer') -> None:
        '''
        Sets the optimiser attribute

        :param value: The value to assign to the optimiser attribute
        '''
        self.fitted = False
        self._optimiser = value

    @property
    def optimiser_params(self) -> Union[Dict[str, Any], None]:
        '''
        optimiser_params attribute

        :return: The optimiser_params used in the model
        '''
        return self._optimiser_params

    @optimiser_params.setter
    def optimiser_params(self, value: Union[Dict[str, Any], None]) -> None:
        '''
        Sets the optimiser_params attribute

        :param value: The value to assign to the optimiser_params attribute
        '''
        self.fitted = False
        self._optimiser_params = value

class SKLearnModel(BaseModel):
    '''
    Concrete class that is designed to be used as the base class for all
    machine learning models that are based on the
    `scikit learn library <http://scikit-learn.org/stable/>`_.

    At the moment it expects all of the machine learning models to use a
    `SVM <http://scikit-learn.org/0.19/modules/classes.html#module-sklearn.svm>`_
    as their classifier. This is due to assuming the model will have the
    method :py:meth:`sklearn.svm.SVC.decision_function` to get
    `probabilities`.

    **NOTE** each time the *model_parameters* are set it resets the model
    i.e. the *fitted* attribute is :py:class:`False`

    Attributes:

    1. model -- Machine learning model. Expects it to be a
       :py:class:`sklearn.pipeline.Pipeline` instance.
    2. fitted -- If the machine learning model has been fitted (default
       False)
    3. model_parameters -- The parameters that are set in the machine
       learning model. E.g. Parameter could be the tokeniser used.

    Abstract Class Methods:

    1. get_parameters -- Transform the given parameters into a dictionary
       that is accepted as model parameters.
    2. get_cv_parameters -- Transform the given parameters into a list of
       dictionaries that is accepted as the `param_grid` parameter in
       :py:class:`sklearn.model_selection.GridSearchCV`
    3. normalise_parameter_names -- Converts the output of
       :py:meth:`get_parameters` into a dictionary that can be used as input
       into :py:meth:`get_parameters`. This is required so that
       :py:func:`evaluate_parameters` can work with this class.

    Methods:

    1. fit -- Fit the model according to the given training data.
    2. predict -- Predict class labels for samples in X.
    3. probabilities -- The probability of each class label for all samples
       in X.
    4. __repr__ -- Name of the machine learning model.

    Functions:

    1. save -- Given an instance of this class will save it to a file.
    2. load -- Loads an instance of this class from a file.
    3. evaluate_parameter -- Fit and predict given training, validation and
       test data for the given model when the given parameter is changed on
       the model.
    4. evaluate_parameters -- Same as evaluate_parameter however it
       evaluates over many parameter values for the same parameter.
    5. grid_search_model -- Given a model class it will perform a Grid
       Search over the parameters you give to the model's
       :py:func:`bella.models.base.SKLearnModel.get_cv_parameters` function
       via the keyword arguments. Returns a pandas dataframe representation
       of the grid search results.
    6. get_grid_score -- Given the return of :py:func:`grid_search_model`
       will return the grid scores as a List of the mean test accuracy
       result.
    7. models_best_parameter -- Given a list of models and their base model
       arguments, it will find the best parameter value out of the values
       given for that parameter while keeping the base model arguments
       constant for each model.

    Abstract Functions:

    1. pipeline -- Machine Learning model that is used as the base template
       for the model attribute. Expects it to be a
       :py:class:`sklearn.pipeline.Pipeline` instance.
    '''

    def __init__(self, *args, **kwargs) -> None:
        self.model = self.pipeline()
        self.fitted = False
        self._model_parameters = self.get_parameters(*args, **kwargs)
        self.model.set_params(**self._model_parameters)

    def fit(self, X: np.ndarray, y: np.ndarray):
        '''
        Fit the model according to the given training data.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :return: The `model` attribute will now be trained.
        '''
        self.model.fit(X, y)
        self.fitted = True

    def predict(self, X: np.ndarray):
        '''
        Predict class labels for samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Predicted class label per sample, shape = [n_samples]
        :raises ValueError: If the model has not been fitted
        '''
        if self.fitted:
            return self.model.predict(X)
        raise ValueError(f'The model {str(self)} has not been fitted. '
                         'This can be done by using the `fit` method')

    def probabilities(self, X: np.ndarray):
        '''
        The probability of each class label for all samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Probability of each class label for all samples, shape =
                 [n_samples, n_classes]
        :raises ValueError: If the model has not been fitted
        '''
        if self.fitted:
            return self.model.decision_function(X)
        raise ValueError(f'The model {str(self)} has not been fitted. '
                         'This can be done by using the `fit` method')

    @property
    def model_parameters(self) -> Dict[str, Any]:
        '''
        The parameters that are set in the machine learning model. E.g.
        Parameter could be the tokeniser used.

        :return: parameters of the machine learning model
        '''
        return self._model_parameters

    @model_parameters.setter
    def model_parameters(self, value: Dict[str, Any]) -> None:
        '''
        Set the parameters of the machine learning model.

        :param value: The new parameters of the machine learning model
        '''
        self._model_parameters = self.get_parameters(**value)
        self.model.set_params(**self._model_parameters)
        self.fitted = False

    @staticmethod
    def save(model: 'bella.models.base.SKLearnModel', save_fp: Path,
             compress: int = 0) -> None:
        '''
        Given an instance of this class will save it to a file.

        :param model: The machine learning model instance to be saved.
        :param save_fp: File path of the location that the model is to be
                        saved to.
        :param compress: Optional (default 0). Level of compression, 0 is no
                         compression and 9 is the most compressed. The more
                         compressed the smaller the file but the slower the
                         read/write time.
        :return: Nothing.
        :raises ValueError: If the model has not been fitted or if the model
                            is not of type
                            :py:class:`bella.models.base.SKLearnModel`
        '''
        if not isinstance(model, SKLearnModel):
            raise ValueError('The model parameter has to be of type '
                             f'SKLearnModel not {type(model)}')
        if model.fitted:
            joblib.dump(model, save_fp, compress=compress)
        else:
            raise ValueError(f'The model {str(model)} has not been fitted. '
                             'This can be done by using the `fit` method')

    @staticmethod
    def load(load_fp: Path) -> 'bella.models.base.SKLearnModel':
        '''
        Loads an instance of this class from a file.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''
        return joblib.load(load_fp)

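    # Illustrative usage sketch of the save/load pair above. ``model`` is
    # assumed to be a fitted SKLearnModel subclass instance and ``X_test`` a
    # test matrix; both names are placeholders:
    #
    #     SKLearnModel.save(model, Path('target_model.joblib'), compress=3)
    #     restored = SKLearnModel.load(Path('target_model.joblib'))
    #     assert (restored.predict(X_test) == model.predict(X_test)).all()
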
    @staticmethod
    def evaluate_parameter(model: 'bella.models.base.SKLearnModel',
                           train: Tuple[np.ndarray, np.ndarray], val: None,
                           test: np.ndarray, parameter_name: str,
                           parameter: Any) -> Tuple[Any, np.ndarray]:
        '''
        Given a model, sets the `parameter_name` to `parameter`, fits the
        model, and returns a tuple of the changed parameter and the
        predictions of the model on the test data, using the train data for
        fitting.

        :param model: :py:class:`bella.models.base.SKLearnModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Use None. This is only kept to keep the API clean.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               word_vectors
        :param parameter: value to assign to the parameter e.g.
                          :py:class:`bella.word_vectors.SSWE`
        :return: A tuple of (parameter value, predictions)
        '''
        original_parameters = model._model_parameters
        original_parameters = model.normalise_parameter_names(
            original_parameters)
        original_parameters[parameter_name] = parameter
        model.model_parameters = original_parameters
        model.fit(train[0], train[1])
        predictions = model.predict(test)
        return (parameter, predictions)

    @staticmethod
    def evaluate_parameters(model: 'bella.models.base.SKLearnModel',
                            train: Tuple[np.ndarray, np.ndarray], val: None,
                            test: np.ndarray, parameter_name: str,
                            parameters: List[Any], n_jobs: int
                            ) -> List[Tuple[Any, np.ndarray]]:
        '''
        Performs
        :py:func:`bella.models.base.SKLearnModel.evaluate_parameter` on one
        `parameter_name` but with multiple parameter values.

        This is useful if you would like to know the effect of changing the
        values of a parameter. It can also perform the task in a
        multiprocessing manner if `n_jobs` > 1.

        :param model: :py:class:`bella.models.base.SKLearnModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Use None. This is only kept to keep the API clean.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               word_vectors
        :param parameters: A list of values to assign to the parameter e.g.
                           [:py:class:`bella.word_vectors.SSWE`]
        :param n_jobs: Number of cpus to use for multiprocessing. If 1 then
                       it will not multiprocess.
        :return: A list of tuples of (parameter value, predictions)
        '''
        func_args = ((model, train, val, test, parameter_name, parameter)
                     for parameter in parameters)
        if n_jobs == 1:
            return [SKLearnModel.evaluate_parameter(*args)
                    for args in func_args]
        with Pool(n_jobs) as pool:
            return pool.starmap(SKLearnModel.evaluate_parameter, func_args)

    @staticmethod
    def grid_search_model(model: 'bella.models.base.SKLearnModel',
                          X: np.ndarray, y: np.ndarray, n_cpus: int = 1,
                          num_folds: int = 5, **kwargs) -> pd.DataFrame:
        '''
        Given a model class it will perform a Grid Search over the
        parameters you give to the model's
        :py:func:`bella.models.base.SKLearnModel.get_cv_parameters` function
        via the keyword arguments. Returns a pandas dataframe representation
        of the grid search results.

        :param model: The class of the model to use, not an instance of the
                      model.
        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :param n_cpus: Number of estimators to fit in parallel. Default 1.
        :param num_folds: Number of Stratified cross validation folds.
                          Default 5.
        :param kwargs: Keyword arguments to give to the model's
                       :py:func:`bella.models.base.SKLearnModel.get_cv_parameters`
                       function.
        :return: Pandas dataframe representation of the grid search results.
        '''
        stratified_folds = StratifiedKFold(num_folds)
        grid_params = model.get_cv_parameters(**kwargs)
        grid_model = GridSearchCV(model.pipeline(), grid_params,
                                  cv=stratified_folds, n_jobs=n_cpus,
                                  return_train_score=False)
        grid_model.fit(X, y)
        return pd.DataFrame(grid_model.cv_results_)

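    # For reference, a self-contained sklearn sketch of what
    # grid_search_model wraps: a Pipeline grid-searched with StratifiedKFold.
    # In bella the subclass supplies the pipeline and param_grid via
    # pipeline() and get_cv_parameters(); the iris data and SVC pipeline
    # here are purely illustrative:
    #
    #     from sklearn.datasets import load_iris
    #     from sklearn.pipeline import Pipeline
    #     from sklearn.preprocessing import StandardScaler
    #     from sklearn.svm import SVC
    #
    #     X, y = load_iris(return_X_y=True)
    #     pipe = Pipeline([('scale', StandardScaler()), ('svm', SVC())])
    #     param_grid = [{'svm__C': [0.1, 1.0, 10.0]}]
    #     grid = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(5),
    #                         n_jobs=1, return_train_score=False)
    #     grid.fit(X, y)
    #     results = pd.DataFrame(grid.cv_results_)
    #     print(results[['param_svm__C', 'mean_test_score']])
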
    @staticmethod
    def get_grid_score(grid_scores: pd.DataFrame,
                       associated_param: Union[None, str] = None
                       ) -> Union[List[float], List[Tuple[float, str]]]:
        '''
        Given the return of :py:func:`grid_search_model` will return the
        grid scores as a List of the mean test accuracy result.

        :param grid_scores: Return of :py:func:`grid_search_model`
        :param associated_param: Optional. The name of the parameter you
                                 want to associate to the score. E.g.
                                 lexicon, as you have grid searched over
                                 different lexicons and you want the return
                                 to be associated with the lexicon name e.g.
                                 [(0.68, 'MPQA'), (0.70, 'NRC')]
        :return: A list of test scores from the grid search and, if
                 associated_param is not None, a list of scores and
                 parameter names.
        '''
        extracted_scores = grid_scores['mean_test_score'].astype(float)
        extracted_scores = extracted_scores.round(4) * 100
        extracted_scores = extracted_scores.tolist()
        if associated_param is not None:
            if associated_param not in grid_scores:
                for column_name in grid_scores.columns:
                    if associated_param in column_name:
                        associated_param = column_name
            associated_param = grid_scores[associated_param]
            associated_param = associated_param.apply(str).tolist()
            extracted_scores = list(zip(extracted_scores, associated_param))
        return extracted_scores

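    # Continuing the sketch above, the scores can then be pulled out and
    # associated with the searched parameter (the values shown are purely
    # illustrative):
    #
    #     scores = SKLearnModel.get_grid_score(results,
    #                                          associated_param='svm__C')
    #     # e.g. [(92.0, '0.1'), (96.67, '1.0'), (97.33, '10.0')]
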
    @staticmethod
    def models_best_parameter(models_kwargs: List[Tuple['bella.models.base.SKLearnModel',
                                                         Dict[str, Any]]],
                              param_name: str, param_values: List[Any],
                              X: List[Any], y: np.ndarray,
                              n_cpus: int = 1, num_folds: int = 5
                              ) -> Dict['bella.models.base.SKLearnModel', str]:
        '''
        Given a list of models and their base model arguments, it will find
        the best parameter value out of the values given for that parameter
        while keeping the base model arguments constant for each model.

        This essentially performs `num_folds` fold cross validation grid
        search for the one parameter given, across all models given.

        :param models_kwargs: A list of tuples where each tuple contains a
                              model and the model's keyword arguments to
                              give to its `get_cv_parameters` method. These
                              arguments are the model's standard arguments
                              that are not to be changed.
        :param param_name: Name of the parameter to be changed. This name
                           has to be the name of the keyword argument in the
                           model's `get_cv_parameters` method.
        :param param_values: The different values to assign to the
                             param_name argument.
        :param X: The training samples.
        :param y: The training target samples.
        :return: A dictionary of model and the name of the best parameter.
        '''
        model_best_param = {}
        for model, model_kwargs in models_kwargs:
            temp_model_kwargs = {**model_kwargs, param_name: param_values}
            grid_results = model.grid_search_model(model, X, y,
                                                   n_cpus=n_cpus,
                                                   num_folds=num_folds,
                                                   **temp_model_kwargs)
            param_scores = model.get_grid_score(grid_results, param_name)
            param_scores = sorted(param_scores, key=lambda x: x[1],
                                  reverse=True)
            best_param = sorted(param_scores, key=lambda x: x[0])[-1][1]
            model_best_param[model] = best_param
        return model_best_param

    @classmethod
    @abstractmethod
    def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                                  ) -> Dict[str, Any]:
        '''
        Converts the output of :py:meth:`get_parameters` into a dictionary
        that can be used as input into :py:meth:`get_parameters`.

        :returns: A dictionary that can be used as keyword arguments into
                  the :py:meth:`get_parameters` method
        '''
        pass

    @classmethod
    @abstractmethod
    def get_parameters(cls) -> Dict[str, Any]:
        '''
        Transform the given parameters into a dictionary that is accepted as
        model parameters.
        '''
        pass

    @classmethod
    @abstractmethod
    def get_cv_parameters(cls) -> List[Dict[str, List[Any]]]:
        '''
        Transform the given parameters into a list of dictionaries that is
        accepted as the `param_grid` parameter in
        :py:class:`sklearn.model_selection.GridSearchCV`
        '''
        pass

    @staticmethod
    def _add_to_params_dict(params_dict: Dict[str, Any], keys: List[str],
                            value: Any) -> Dict[str, Any]:
        '''
        Given a dictionary, adds the value to each key in the list of keys
        into the dictionary. Returns the updated dictionary.

        Normally used in subclasses' :py:meth:`get_parameters`

        :param params_dict: Dictionary to be updated
        :param keys: list of keys
        :param value: value to be added to each key in the list of keys.
        :returns: The dictionary updated
        '''
        if not isinstance(keys, list):
            raise ValueError('The keys parameter has to be of type list and '
                             f'not {type(keys)}')
        for key in keys:
            params_dict[key] = value
        return params_dict

    @staticmethod
    def _add_to_params(params_list: Union[List[Dict[str, List[Any]]], List],
                       to_add: List[Any],
                       to_add_names: List[str]
                       ) -> List[Dict[str, List[Any]]]:
        '''
        Used to add parameters that are stated multiple times in the same
        pipeline and that must have the same value. Therefore to add them
        you have to copy the current parameter list N times, where N is the
        length of the to_add list. Returns the updated parameter list.

        Normally used in subclasses' :py:meth:`get_cv_parameters`

        :param params_list: A list of dicts where each dict contains
                            parameters and corresponding values that are to
                            be searched for. Can be an empty List.
        :param to_add: List of values that are to be added to the search
                       space.
        :param to_add_names: List of names that are associated to the
                             values.
        :returns: The updated params_list
        :raises TypeError: If any of the arguments are not of type
                           :py:class:`List`
        '''
        # Check the type of the arguments
        if not isinstance(params_list, list):
            raise TypeError(f'params_list: {params_list}\nShould be of type '
                            f'list not {type(params_list)}')
        if not isinstance(to_add_names, list):
            raise TypeError(f'to_add_names: {to_add_names}\nShould be of '
                            f'type list not {type(to_add_names)}')
        param_name = to_add_names[0]
        if len(to_add_names) > 1:
            param_name = ''.join(param_name.split('__')[:-1])
        if not isinstance(to_add, list):
            raise TypeError('If using get_cv_parameters this is due to '
                            f'parameter {param_name} not being of type list.'
                            f'\nto_add: {to_add} should be of '
                            f'type List not {type(to_add)}.')

        num_params = len(params_list)
        num_to_add = len(to_add)
        new_param_list = []
        # Catch the case that params_list was originally empty
        if num_params == 0:
            for _ in range(num_to_add):
                new_param_list.append([defaultdict(list)])
        else:
            for _ in range(num_to_add):
                new_param_list.append(copy.deepcopy(params_list))

        for index, param in enumerate(to_add):
            for param_name in to_add_names:
                for sub_list in new_param_list[index]:
                    sub_list[param_name].append(param)
        params_list = [param_dict for sub_list in new_param_list
                       for param_dict in sub_list]
        return params_list

    @staticmethod
    def _add_to_all_params(params_list: List[Dict[str, List[Any]]],
                           param_name: str, param_value: List[Any]
                           ) -> List[Dict[str, List[Any]]]:
        '''
        Used to add param_name and its associated param_value to each
        dictionary of parameters in the params_list.

        Normally used in subclasses' :py:meth:`get_cv_parameters`

        :param params_list: A list of dicts where each dict contains
                            parameters and corresponding values that are to
                            be searched for.
        :param param_name: The name associated to the parameter value to be
                           added to the params_list.
        :param param_value: The list of values associated to the param_name
                            that are added to the params_list.
        :returns: The updated params_list
        :raises TypeError: If the param_value is not of type
                           :py:class:`List`
        '''
        if not isinstance(param_value, list):
            raise TypeError(f'{param_name} should be of type list not '
                            f'{type(param_value)}')
        for param_dict in params_list:
            param_dict[param_name] = param_value
        return params_list

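    # Illustrative sketch of how a subclass's get_parameters /
    # get_cv_parameters might use the helpers above. The pipeline parameter
    # names ('word_vectors__vectors', 'tokens__tokeniser') and the string
    # stand-ins for WordVectors objects are assumptions for illustration:
    #
    #     sswe, glove = 'SSWE-vectors', 'GloVe-vectors'
    #     params = SKLearnModel._add_to_params_dict(
    #         {}, ['word_vectors__vectors'], sswe)
    #     # params == {'word_vectors__vectors': 'SSWE-vectors'}
    #
    #     param_grid = SKLearnModel._add_to_params(
    #         [], [[sswe], [glove]], ['word_vectors__vectors'])
    #     param_grid = SKLearnModel._add_to_all_params(
    #         param_grid, 'tokens__tokeniser', [str.split])
    #     # param_grid == [{'word_vectors__vectors': [['SSWE-vectors']],
    #     #                 'tokens__tokeniser': [str.split]},
    #     #                {'word_vectors__vectors': [['GloVe-vectors']],
    #     #                 'tokens__tokeniser': [str.split]}]
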
    @staticmethod
    @abstractmethod
    def pipeline() -> 'sklearn.pipeline.Pipeline':
        '''
        Machine Learning model that is used as the base template for the
        model attribute.

        :returns: The template machine learning model
        '''
        pass
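
# A minimal, hypothetical sketch of what a concrete SKLearnModel subclass
# has to supply: pipeline, get_parameters, get_cv_parameters,
# normalise_parameter_names, plus the name/__repr__ abstractions inherited
# from BaseModel. The pipeline steps and parameter names below are
# illustrative only and are not taken from bella's real models.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.pipeline import Pipeline
#     from sklearn.svm import LinearSVC
#
#     class ExampleTextModel(SKLearnModel):
#
#         @staticmethod
#         def pipeline() -> 'sklearn.pipeline.Pipeline':
#             return Pipeline([('tfidf', TfidfVectorizer()),
#                              ('svm', LinearSVC())])
#
#         @classmethod
#         def get_parameters(cls, C: float = 1.0) -> Dict[str, Any]:
#             return {'svm__C': C}
#
#         @classmethod
#         def get_cv_parameters(cls, C: List[float] = [1.0]
#                               ) -> List[Dict[str, List[Any]]]:
#             return [{'svm__C': C}]
#
#         @classmethod
#         def normalise_parameter_names(cls,
#                                       parameter_dict: Dict[str, Any]
#                                       ) -> Dict[str, Any]:
#             return {'C': parameter_dict['svm__C']}
#
#         @classmethod
#         def name(cls) -> str:
#             return 'Example Text Model'
#
#         def __repr__(self) -> str:
#             return self.name()
#
#     model = ExampleTextModel(C=0.5)
#     model.fit(['great phone', 'poor battery', 'great battery',
#                'poor phone'], np.array([1, 0, 1, 0]))
#     print(model.predict(['great phone battery']))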