'''
Module contains all of the main base classes for the machine learning models
these are grouped into 3 categories; 1. Mixin, 2. Abstract, and 3. Concrete.
Mixin classes - This is a function based class that contains functions that
do not rely on the type of model and are useful for all:
1. :py:class:`bella.models.base.ModelMixin`
Abstract classes - This is used to enforce all the functions that all
the machine learning models must have. This is also the class that inherits
the Mixin class:
1. :py:class:`bella.models.base.BaseModel`
Concrete classes - These are more concrete classes that still contain some
abstract methods. However they are the classes to inherit from to create a
machine learning model based on a certain framework e.g. SKlearn or Keras:
1. :py:class:`bella.models.base.SKLearnModel`
2. :py:class:`bella.models.base.KerasModel`
'''
from abc import ABC, abstractmethod
from collections import defaultdict
import copy
import os
from pathlib import Path
import pickle
import random as rn
import tempfile
from typing import Any, List, Dict, Union, Tuple, Callable
from multiprocessing.pool import Pool
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import preprocessing
import numpy as np
import pandas as pd
import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import tensorflow as tf
import bella
from bella.data_types import TargetCollection, Target
class ModelMixin():
    '''
    Mixin class for all of the machine learning models. Contains functions
    only, so they are as generic as possible and do not depend on the type
    of the underlying model.

    Functions:

    1. train_val_split -- Splits the training dataset into a train and
       validation set in a stratified split.
    '''

    @staticmethod
    def _convert_to_targets(data: List[Dict[str, Any]]
                            ) -> List['bella.data_types.Target']:
        '''
        Convert a list of dictionaries into a list of
        :py:class:`bella.data_types.Target`.

        :param data: Dictionaries whose keys match the
                     :py:class:`bella.data_types.Target` constructor
                     arguments.
        :return: One Target per input dictionary.
        '''
        return [Target(**fields) for fields in data]

    @staticmethod
    def train_val_split(train: 'TargetCollection',
                        split_size: float = 0.2, seed: Union[None, int] = 42
                        ) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray]]:
        '''
        Split the training dataset into a train and a validation set using
        a stratified split.

        :param train: The training dataset that needs to be split.
        :param split_size: Fraction of the dataset to assign to the
                           validation set.
        :param seed: Seed value to give to the stratified splitter. If
                     None then it uses the random state of numpy.
        :return: Two tuples of length two where each tuple is the train
                 and validation splits respectively, and each tuple contains
                 the data (X) and class labels (y) respectively. Returns
                 ((X_train, y_train), (X_val, y_val))
        '''
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_size,
                                          random_state=seed)
        samples = np.asarray(train.data_dict())
        labels = np.asarray(train.sentiment_data())
        # n_splits=1 so this loop runs exactly once; it is the documented
        # way of consuming the splitter's generator.
        for train_indexs, test_indexs in splitter.split(samples, labels):
            train_samples = samples[train_indexs]
            val_samples = samples[test_indexs]
        train_split = TargetCollection(
            ModelMixin._convert_to_targets(train_samples))
        val_split = TargetCollection(
            ModelMixin._convert_to_targets(val_samples))
        X_train = np.array(train_split.data_dict())
        y_train = np.array(train_split.sentiment_data())
        X_val = np.array(val_split.data_dict())
        y_val = np.array(val_split.sentiment_data())
        return (X_train, y_train), (X_val, y_val)
class BaseModel(ModelMixin, ABC):
    '''
    Abstract class that every machine learning model in this package
    implements.

    Attributes:

    1. model -- Machine learning model that is associated to this instance.
    2. fitted -- If the machine learning model has been fitted
       (default False)

    Methods:

    1. fit -- Fit the model according to the given training data.
    2. predict -- Predict class labels for samples in X.
    3. probabilities -- The probability of each class label for all samples
       in X.
    4. __repr__ -- Name of the machine learning model.

    Class Methods:

    1. name -- Returns the name of the model.

    Functions:

    1. save -- Saves the given machine learning model instance to a file.
    2. load -- Loads the entire machine learning model from a file.
    3. evaluate_parameter -- fit and predict given training, validation and
       test data the given model when the given parameter is changed on the
       model.
    4. evaluate_parameters -- same as evaluate_parameter however it
       evaluates over many parameter values for the same parameter.
    '''

    @abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        '''
        Fit the model according to the given training data.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :return: The `model` attribute will now be trained.
        '''

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        '''
        Predict class labels for samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Predicted class label per sample, shape = [n_samples]
        '''

    @abstractmethod
    def probabilities(self, X: np.ndarray) -> np.ndarray:
        '''
        The probability of each class label for all samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Probability of each class label for all samples, shape = \
                 [n_samples, n_classes]
        '''

    @abstractmethod
    def __repr__(self) -> str:
        '''
        Name of the machine learning model.

        :return: Name of the machine learning model.
        '''

    @staticmethod
    @abstractmethod
    def save(model: 'BaseModel', save_fp: Path) -> None:
        '''
        Saves the entire machine learning model to a file.

        :param model: The machine learning model instance to be saved.
        :param save_fp: File path of the location that the model is to be \
                        saved to.
        :return: Nothing.
        '''

    @staticmethod
    @abstractmethod
    def load(load_fp: Path) -> 'bella.models.base.BaseModel':
        '''
        Loads the entire machine learning model from a file.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''

    @staticmethod
    @abstractmethod
    def evaluate_parameter(model: 'BaseModel',
                           train: Tuple[np.ndarray, np.ndarray],
                           val: Union[None, Tuple[np.ndarray, np.ndarray]],
                           test: np.ndarray, parameter_name: str,
                           parameter: Any) -> Tuple[Any, np.ndarray]:
        '''
        Given a model will set the `parameter_name` to `parameter`, fit the
        model, and return a Tuple of the parameter changed and the
        predictions of the model on the test data, using the train and
        validation data for fitting.

        :param model: :py:class:`bella.models.base.BaseModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)` or None if not required.
                    This is only required if the model requires validation
                    data like the :py:class:`bella.models.base.KerasModel`
                    models do.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               optimiser
        :param parameter: value to assign to the parameter e.g.
                          :py:class:`keras.optimizers.RMSprop`
        :return: A tuple of (parameter value, predictions)
        '''

    @staticmethod
    @abstractmethod
    def evaluate_parameters(model: 'bella.models.base.BaseModel',
                            train: Tuple[np.ndarray, np.ndarray],
                            val: Union[None,
                                       Tuple[np.ndarray, np.ndarray]],
                            test: np.ndarray, parameter_name: str,
                            parameters: List[Any], n_jobs: int
                            ) -> List[Tuple[Any, np.ndarray]]:
        '''
        Performs :py:func:`bella.models.base.BaseModel.evaluate_parameter`
        on one `parameter_name` but with multiple parameter values.

        This is useful if you would like to know the effect of changing the
        values of a parameter. It can also perform the task in a
        multiprocessing manner if `n_jobs` > 1.

        :param model: :py:class:`bella.models.base.BaseModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)` or None if not required.
                    This is only required if the model requires validation
                    data like the :py:class:`bella.models.base.KerasModel`
                    models do.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               optimiser
        :param parameters: A list of values to assign to the parameter e.g.
                           [:py:class:`keras.optimizers.RMSprop`]
        :param n_jobs: Number of cpus to use for multiprocessing if 1 then
                       will not multiprocess.
        :return: A list of tuples of (parameter value, predictions)
        '''

    @classmethod
    @abstractmethod
    def name(cls) -> str:
        '''
        Returns the name of the model.

        :return: Name of the model
        '''

    @property
    def model(self) -> Any:
        '''
        Machine learning model that is associated to this instance.

        :return: The machine learning model
        '''
        return self._model

    @model.setter
    def model(self, value) -> None:
        '''
        Sets the model attribute.

        :param value: The value to assign to the model attribute
        '''
        self._model = value

    @property
    def fitted(self) -> bool:
        '''
        If the machine learning model has been fitted (default False).

        :return: True or False
        '''
        return self._fitted

    @fitted.setter
    def fitted(self, value: bool) -> None:
        '''
        Sets the fitted attribute.

        :param value: The value to assign to the fitted attribute
        '''
        self._fitted = value
class KerasModel(BaseModel):
    '''
    Concrete class that is designed to be used as the base class for all
    machine learning models that are based on the
    `Keras library <https://keras.io>`_.

    Attributes:

    1. tokeniser -- Tokeniser the model uses e.g. :py:meth:`str.split`.
    2. embeddings -- The word embeddings the model uses e.g.
       :py:class:`bella.word_vectors.SSWE`
    3. lower -- Whether the model lower cases the words when pre-processing
       the data.
    4. reproducible -- Whether to be reproducible. If None then it is
       quicker to run. Else provide an `int` that will represent the random
       seed value.
    5. patience -- Number of epochs with no improvement before training
       is stopped.
    6. batch_size -- Number of samples per gradient update.
    7. epochs -- Number of times to train over the entire training set
       before stopping.
    8. optimiser -- Optimiser the model uses
       e.g. :py:class:`keras.optimizers.SGD`
    9. optimiser_params -- Parameters for the optimiser. If None uses the
       defaults of the optimiser being used.

    Abstract Methods:

    1. keras_model -- Keras machine learning model that represents the
       class e.g. single forward LSTM.
    2. create_training_text -- Converts the training and validation data
       into a format that the keras model can take as input.
    3. create_training_y -- Converts the training and validation targets
       into a format that can be used by the keras model.

    Methods:

    1. fit -- Fit the model according to the given training and validation
       data.
    2. probabilities -- The probability of each class label for all samples
       in X.
    3. predict -- Predict class labels for samples in X.

    Functions:

    1. save -- Given an instance of this class will save it to a file.
    2. load -- Loads an instance of this class from a file.
    3. evaluate_parameter -- fit and predict given training, validation and
       test data the given model when the given parameter is changed on the
       model.
    4. evaluate_parameters -- same as evaluate_parameter however it
       evaluates over many parameter values for the same parameter.
    '''

    @abstractmethod
    def keras_model(self, num_classes: int) -> 'keras.models.Model':
        '''
        Keras machine learning model that represents the class e.g.
        single forward LSTM.

        :param num_classes: Number of classes the model will predict.
        :returns: Keras machine learning model
        '''

    @abstractmethod
    def create_training_text(self, train_data: List[Dict[str, Any]],
                             validation_data: List[Dict[str, Any]]
                             ) -> Tuple[Any, Any]:
        '''
        Converts the training and validation data into a format that the
        keras model can take as input.

        :return: A tuple of length two containing the keras model training
                 and validation input respectively.
        '''

    @abstractmethod
    def create_training_y(self, train_y: np.ndarray,
                          validation_y: np.ndarray
                          ) -> Tuple[np.ndarray, np.ndarray]:
        '''
        Converts the training and validation targets into a format that can
        be used by the keras model.

        :return: A tuple of length two containing two arrays, the first for
                 training and the second for validation.
        '''

    @abstractmethod
    def _pre_process(self, data_dicts: Dict[str, Any], training: bool):
        '''
        Converts the training or validation data into a format that will be
        used by the keras model.

        This function is normally used to process the training and the
        validation data to be returned together by
        :py:meth:`bella.models.base.KerasModel.create_training_text`
        '''

    def process_text(self, texts: List[str], max_length: int,
                     padding: str = 'pre', truncate: str = 'pre'
                     ) -> Tuple[int, np.ndarray]:
        '''
        Given a list of Strings, tokenise the text, lower case if set, and
        then convert the tokens into integers representing the tokens in
        the embeddings. Lastly pad the data based on the max_length param.

        If max_length is smaller than the sentence's size it truncates the
        sentence. If max_length = -1 then the max_length is that of the
        longest sentence in the texts.

        :params texts: List of texts
        :params max_length: How many tokens a sentence can contain. If it
                            is -1 then it uses the sentence with the most
                            tokens as the max_length parameter.
        :params padding: Which side of the sentence to pad: `pre` beginning,
                         `post` end.
        :params truncate: Which side of the sentence to truncate: `pre`
                          beginning `post` end.
        :returns: A tuple of length 2 containing: 1. The max_length
                  parameter, 2. A matrix of shape [n_samples, pad_size]
                  where each integer in the matrix represents the word
                  embedding lookup.
        :raises ValueError: If the max_length argument is equal to or less
                            than 0. Or if the calculated max_length is 0.
        '''
        if max_length == 0:
            raise ValueError('The max length of a sequence cannot be zero')
        elif max_length < -1:
            raise ValueError('The max length has to be either -1 or above '
                             f'zero not {max_length}')
        # Process the text into integers based on the embeddings given
        all_sequence_data = []
        max_sequence = 0
        for text in texts:
            sequence_data = []
            tokens = self.tokeniser(text)
            for token in tokens:
                if self.lower:
                    token = token.lower()
                # If the token does not exist it should lookup the unknown
                # word vector
                sequence_data.append(self.embeddings.word2index[token])
            sequence_length = len(sequence_data)
            if sequence_length > max_sequence:
                max_sequence = sequence_length
            all_sequence_data.append(sequence_data)
        if max_sequence == 0:
            raise ValueError('The max sequence length is 0 suggesting no '
                             'data was provided for training or testing')
        # Pad the sequences
        # If max pad size is set and training the model set the
        # test_pad_size to max sequence length
        if max_length == -1:
            max_length = max_sequence
        return (max_length,
                preprocessing.sequence.pad_sequences(all_sequence_data,
                                                     maxlen=max_length,
                                                     dtype='int32',
                                                     value=0,
                                                     padding=padding,
                                                     truncating=truncate))

    def fit(self, X: np.ndarray, y: np.ndarray,
            validation_data: Tuple[np.ndarray, np.ndarray],
            verbose: int = 0,
            continue_training: bool = False) -> 'keras.callbacks.History':
        '''
        Fit the model according to the given training and validation data.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :param validation_data: Tuple of `(x_val, y_val)`. Used to evaluate
                                the model at each epoch. Will not be
                                trained on this data.
        :param verbose: 0 = silent, 1 = progress
        :param continue_training: Whether the model that has already been
                                  trained should be trained further.
        :return: A record of training loss values and metrics values at
                 successive epochs, as well as validation loss values and
                 validation metrics values.
        :raises ValueError: If the class labels contain negative values, or
                            if `continue_training` is True but the model
                            has never been fitted.
        '''
        X_val, y_val = validation_data
        # The targets are later converted to a one-hot representation so
        # they must be non-negative. Assumes y and y_val are numeric
        # numpy arrays.
        if sum(y_val < 0) or sum(y < 0):
            raise ValueError('The class labels have to be greater than 0')
        # Bug fix: the original code had an unreachable
        # `elif self.fitted and not continue_training` branch (only
        # evaluated when continue_training was True, so it never fired).
        # Without this guard continuing an unfitted model fails later with
        # a confusing AttributeError when accessing `self.model`.
        if continue_training and not self.fitted:
            raise ValueError('Cannot continue training a model that has '
                             'not been fitted. Run `fit` with '
                             'continue_training=False first')
        X, X_val = self.create_training_text(X, X_val)
        if isinstance(X, tuple):
            X = list(X)
            X_val = list(X_val)
        y, y_val = self.create_training_y(y, y_val)
        num_classes = y.shape[1]
        if verbose:
            print(f'Number of classes in the data {num_classes}')
        if not continue_training:
            # Fresh training run: reset and rebuild the underlying model.
            self.fitted = False
            self._to_be_reproducible(self.reproducible)
            self.model = self.keras_model(num_classes)
        model = self.model
        if not continue_training:
            # Bug fix: `optimiser_params` is documented to allow None
            # ("uses default for the optimiser") but `**None` raises
            # TypeError; fall back to an empty dict.
            optimiser_params = (self.optimiser_params
                                if self.optimiser_params is not None else {})
            model.compile(optimizer=self.optimiser(**optimiser_params),
                          metrics=['accuracy'],
                          loss='categorical_crossentropy')
        with tempfile.NamedTemporaryFile() as weight_file:
            # Set up the callbacks: keep the weights of the best epoch
            # (lowest validation loss) and stop early when the validation
            # loss stops improving.
            # NOTE(review): re-opening a NamedTemporaryFile by name while
            # it is still open does not work on Windows.
            model_checkpoint = ModelCheckpoint(weight_file.name,
                                               monitor='val_loss',
                                               save_best_only=True,
                                               save_weights_only=True,
                                               mode='min')
            early_stopping = EarlyStopping(monitor='val_loss', mode='min',
                                           patience=self.patience)
            callbacks = [early_stopping, model_checkpoint]
            history = model.fit(X, y, validation_data=(X_val, y_val),
                                epochs=self.epochs, callbacks=callbacks,
                                verbose=verbose,
                                batch_size=self.batch_size)
            # Load the best model from the saved weight file
            model.load_weights(weight_file.name)
        self.model = model
        self.fitted = True
        return history

    def probabilities(self, X: np.ndarray) -> np.ndarray:
        '''
        The probability of each class label for all samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Probability of each class label for all samples, shape = \
                 [n_samples, n_classes]
        :raises ValueError: If the model has not been fitted
        '''
        if self.fitted is False:
            raise ValueError('The model has not been fitted please run the '
                             '`fit` method.')
        # Convert from a sequence of dictionaries into texts and then
        # integers that represent the tokens in the text within the
        # embedding space.
        sequence_test_data = self._pre_process(X, training=False)
        predicted_values = self.model.predict(sequence_test_data)
        return predicted_values

    def predict(self, X: np.ndarray) -> np.ndarray:
        '''
        Predict class labels for samples in X.

        :param X: Test samples matrix, shape = [n_samples, n_features]
        :return: Predicted class label per sample, shape = [n_samples]
        '''
        return np.argmax(self.probabilities(X), axis=1)

    @staticmethod
    def save(model: 'bella.models.base.KerasModel', save_fp: Path) -> None:
        '''
        Given a Keras model and a file path it will save the data required
        to restore the model: the keras model itself (`.h5` file) and the
        model's attributes (`.pkl` file).

        :param model: The machine learning model instance to be saved.
        :param save_fp: File path of the location that the model is to be
                        saved.
        :return: Nothing.
        :raises ValueError: If the model has not been fitted or if the
                            model is not of type
                            :py:class:`bella.models.base.KerasModel`
        '''
        if not isinstance(model, KerasModel):
            # Bug fix: error message previously said "KearsModel".
            raise ValueError('The model parameter has to be of type '
                             f'KerasModel not {type(model)}')
        if model.fitted:
            model_fp = save_fp.with_suffix('.h5')
            model.model.save(model_fp)
            attributes_fp = save_fp.with_suffix('.pkl')
            with attributes_fp.open('wb') as attributes_file:
                # optimiser cannot be pickled; it is restored from the
                # saved keras model in `load`.
                attributes = model.model_parameters()
                del attributes['class_params']['optimiser']
                pickle.dump(attributes, attributes_file)
        else:
            raise ValueError(f'The model {str(model)} has not been fitted. '
                             'This can be done by using the `fit` method')

    @staticmethod
    def load(load_fp: Path) -> 'bella.models.base.KerasModel':
        '''
        Loads an instance of this class from a file.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''
        model_fp = str(load_fp.with_suffix('.h5').resolve())
        attributes_fp = load_fp.with_suffix('.pkl')
        with attributes_fp.open('rb') as attributes_file:
            attributes = pickle.load(attributes_file)
            # optimiser has to be recovered as it could not be pickled in
            # the model parameters
            keras_model = keras.models.load_model(model_fp)
            attributes['class_params']['optimiser'] = keras_model.optimizer
            model_class = attributes.pop('class')
            model = model_class(**attributes['class_params'])
            for name, class_attr in attributes['class_attrs'].items():
                setattr(model, name, class_attr)
            model.model = keras_model
            model.fitted = True
        return model

    @staticmethod
    def evaluate_parameter(model: 'bella.models.base.KerasModel',
                           train: Tuple[np.ndarray, np.ndarray],
                           val: Tuple[np.ndarray, np.ndarray],
                           test: np.ndarray, parameter_name: str,
                           parameter: Any) -> Tuple[Any, np.ndarray]:
        '''
        Given a model will set the `parameter_name` to `parameter`, fit the
        model, and return a Tuple of the parameter changed and the
        predictions of the model on the test data, using the train and
        validation data for fitting.

        :param model: KerasModel instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)`. Used to evaluate the model
                    at each epoch. Will not be trained on this data.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               optimiser
        :param parameter: value to assign to the parameter e.g.
                          :py:class:`keras.optimizers.RMSprop`
        :return: A tuple of (parameter value, predictions)
        '''
        setattr(model, parameter_name, parameter)
        model.fit(train[0], train[1], val)
        predictions = model.predict(test)
        return (parameter, predictions)

    @staticmethod
    def evaluate_parameters(model: 'bella.models.base.KerasModel',
                            train: Tuple[np.ndarray, np.ndarray],
                            val: Tuple[np.ndarray, np.ndarray],
                            test: np.ndarray, parameter_name: str,
                            parameters: List[Any], n_jobs: int
                            ) -> List[Tuple[Any, np.ndarray]]:
        '''
        Performs :py:func:`bella.models.base.KerasModel.evaluate_parameter`
        on one `parameter_name` but with multiple parameter values.

        This is useful if you would like to know the effect of changing the
        values of a parameter. It can also perform the task in a
        multiprocessing manner if `n_jobs` > 1.

        :param model: :py:class:`bella.models.base.KerasModel` instance
        :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
        :param val: Tuple of `(X_val, y_val)`. Used to evaluate the model
                    at each epoch. Will not be trained on this data.
        :param test: `X_test` data to predict on.
        :param parameter_name: Name of the parameter to change e.g.
                               optimiser
        :param parameters: A list of values to assign to the parameter e.g.
                           [:py:class:`keras.optimizers.RMSprop`]
        :param n_jobs: Number of cpus to use for multiprocessing if 1 then
                       will not multiprocess.
        :return: A list of tuples of (parameter value, predictions)
        '''
        func_args = ((model, train, val, test, parameter_name, parameter)
                     for parameter in parameters)
        if n_jobs == 1:
            return [KerasModel.evaluate_parameter(*args)
                    for args in func_args]
        with Pool(n_jobs) as pool:
            return pool.starmap(KerasModel.evaluate_parameter, func_args)

    @staticmethod
    def _to_be_reproducible(reproducible: Union[int, None]) -> None:
        '''
        To make the method reproducible or not. If it is not needed then
        we can use all the python threads.

        :param reproducible: If int is provided this int is used as the
                             seed value. Else None should be given if it
                             is not to be reproducible.
        '''
        if reproducible is not None:
            os.environ['PYTHONHASHSEED'] = '0'
            np.random.seed(reproducible)
            rn.seed(reproducible)
            # Forces tensorflow to use only one thread
            session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                          inter_op_parallelism_threads=1)
            tf.set_random_seed(reproducible)
            sess = tf.Session(graph=tf.get_default_graph(),
                              config=session_conf)
            keras.backend.set_session(sess)
        else:
            np.random.seed(None)
            rn.seed(np.random.randint(0, 1000))
            tf.set_random_seed(np.random.randint(0, 1000))

    @property
    def tokeniser(self) -> Callable[[str], List[str]]:
        '''
        tokeniser attribute

        :return: The tokeniser used in the model
        '''
        return self._tokeniser

    @tokeniser.setter
    def tokeniser(self, value: Callable[[str], List[str]]) -> None:
        '''
        Sets the tokeniser attribute. Resets `fitted` as the model must be
        re-trained with the new setting.

        :param value: The value to assign to the tokeniser attribute
        '''
        self.fitted = False
        self._tokeniser = value

    @property
    def embeddings(self) -> 'bella.word_vectors.WordVectors':
        '''
        embeddings attribute

        :return: The embeddings used in the model
        '''
        return self._embeddings

    @embeddings.setter
    def embeddings(self, value: 'bella.word_vectors.WordVectors') -> None:
        '''
        Sets the embeddings attribute. Resets `fitted`.

        :param value: The value to assign to the embeddings attribute
        '''
        self.fitted = False
        self._embeddings = value

    @property
    def lower(self) -> bool:
        '''
        lower attribute

        :return: The lower used in the model
        '''
        return self._lower

    @lower.setter
    def lower(self, value: bool) -> None:
        '''
        Sets the lower attribute. Resets `fitted`.

        :param value: The value to assign to the lower attribute
        '''
        self.fitted = False
        self._lower = value

    @property
    def reproducible(self) -> Union[int, None]:
        '''
        reproducible attribute

        :return: The reproducible used in the model
        '''
        return self._reproducible

    @reproducible.setter
    def reproducible(self, value: Union[int, None]) -> None:
        '''
        Sets the reproducible attribute. Resets `fitted`.

        :param value: The value to assign to the reproducible attribute
        '''
        self.fitted = False
        self._reproducible = value

    @property
    def patience(self) -> int:
        '''
        patience attribute

        :return: The patience used in the model
        '''
        return self._patience

    @patience.setter
    def patience(self, value: int) -> None:
        '''
        Sets the patience attribute. Resets `fitted`.

        :param value: The value to assign to the patience attribute
        '''
        self.fitted = False
        self._patience = value

    @property
    def batch_size(self) -> int:
        '''
        batch_size attribute

        :return: The batch_size used in the model
        '''
        return self._batch_size

    @batch_size.setter
    def batch_size(self, value: int) -> None:
        '''
        Sets the batch_size attribute. Resets `fitted`.

        :param value: The value to assign to the batch_size attribute
        '''
        self.fitted = False
        self._batch_size = value

    @property
    def epochs(self) -> int:
        '''
        epochs attribute

        :return: The epochs used in the model
        '''
        return self._epochs

    @epochs.setter
    def epochs(self, value: int) -> None:
        '''
        Sets the epochs attribute. Resets `fitted`.

        :param value: The value to assign to the epochs attribute
        '''
        self.fitted = False
        self._epochs = value

    @property
    def optimiser(self) -> 'keras.optimizers.Optimizer':
        '''
        optimiser attribute

        :return: The optimiser used in the model
        '''
        return self._optimiser

    @optimiser.setter
    def optimiser(self, value: 'keras.optimizers.Optimizer') -> None:
        '''
        Sets the optimiser attribute. Resets `fitted`.

        :param value: The value to assign to the optimiser attribute
        '''
        self.fitted = False
        self._optimiser = value

    @property
    def optimiser_params(self) -> Union[Dict[str, Any], None]:
        '''
        optimiser_params attribute

        :return: The optimiser_params used in the model
        '''
        return self._optimiser_params

    @optimiser_params.setter
    def optimiser_params(self, value: Union[Dict[str, Any], None]) -> None:
        '''
        Sets the optimiser_params attribute. Resets `fitted`.

        :param value: The value to assign to the optimiser_params attribute
        '''
        self.fitted = False
        self._optimiser_params = value
[docs]class SKLearnModel(BaseModel):
'''
Concrete class that is designed to be used as the base class for all
machine learning models that are based on the
`scikit learn library <http://scikit-learn.org/stable/>`_.
At the moment expects all of the machine learning models to use a
`SVM <http://scikit-learn.org/0.19/modules/cla\
sses.html#module-sklearn.svm>`_ as their classifier. This is due to
assuming the model will have the method
:py:meth:`sklearn.svm.SVC.decision_function` to get `probabilities`.
**NOTE** each time the *model_parameters* are set it resets the model
i.e. the *fitted* attribute is :py:class:`False`
Attributes:
1. model -- Machine learning model. Expects it to be a
:py:class:`sklearn.pipeline.Pipeline` instance.
2. fitted -- If the machine learning model has been fitted (default False)
3. model_parameters -- The parameters that are set in the machine
learning model. E.g. Parameter could be the tokeniser used.
Abstract Class Methods:
1. get_parameters -- Transform the given parameters into a dictionary
that is accepted as model parameters.
2. get_cv_parameters -- Transform the given parameters into a list of
dictionaries that is accepted as `param_grid` parameter in
:py:class:`sklearn.model_selection.GridSearchCV`
3. normalise_parameter_names -- Converts the output of
:py:meth:`get_parameters` into a dictionary that can be used as input
into :py:meth:`get_parameters`. This is required so that
:py:func:`evaluate_parameters` can work with this class.
Methods:
1. fit -- Fit the model according to the given training data.
2. predict -- Predict class labels for samples in X.
3. probabilities -- The probability of each class label for all samples
in X.
4. __repr__ -- Name of the machine learning model.
Functions:
1. save -- Given a instance of this class will save it to a file.
2. load -- Loads an instance of this class from a file.
3. evaluate_parameter -- fit and predict given training, validation and
test data the given model when the given parameter is changed on the
model.
4. evaluate_parameters -- same as evaluate_parameter however it
evaluates over many parameter values for the same parameter.
5. grid_search_model -- Given a model class it will perform a Grid Search
over the parameters you give to the models
:py:func:`bella.models.base.SKLearnModel.get_cv_parameters` function
via the keyword arguments. Returns a pandas dataframe representation of
the grid search results.
6. get_grid_score -- Given the return of the :py:func:`grid_search_model`
will return the grid scores as a List of the mean test accuracy result.
7. models_best_parameter -- Given a list of models and their base model
arguments, it will find the best parameter value out of the values
given for that parameter while keeping the base model arguments
constant for each model.
Abstract Functions:
1. Pipeline -- Machine Learning model that is used as the base template
for the model attribute. Expects it to be a
:py:class:`sklearn.pipeline.Pipeline` instance.
'''
[docs] def __init__(self, *args, **kwargs) -> None:
self.model = self.pipeline()
self.fitted = False
self._model_parameters = self.get_parameters(*args, **kwargs)
self.model.set_params(**self._model_parameters)
    def fit(self, X: np.ndarray, y: np.ndarray):
        '''
        Fit the model according to the given training data.

        Marks the instance as fitted on success.

        :param X: Training samples matrix, shape = [n_samples, n_features]
        :param y: Training targets, shape = [n_samples]
        :return: The `model` attribute will now be trained.
        '''
        self.model.fit(X, y)
        self.fitted = True
[docs] def predict(self, X: np.ndarray):
'''
Predict class labels for samples in X.
:param X: Test samples matrix, shape = [n_samples, n_features]
:return: Predicted class label per sample, shape = [n_samples]
:raises ValueError: If the model has not been fitted
'''
if self.fitted:
return self.model.predict(X)
raise ValueError(f'The model {str(self)} has not been fitted. '
'This can be done by using the `fit` method')
[docs] def probabilities(self, X: np.ndarray):
'''
The probability of each class label for all samples in X.
:param X: Test samples matrix, shape = [n_samples, n_features]]
:return: Probability of each class label for all samples, shape =
[n_samples, n_classes]
:raises ValueError: If the model has not been fitted
'''
if self.fitted:
return self.model.decision_function(X)
raise ValueError(f'The model {str(self)} has not been fitted. '
'This can be done by using the `fit` method')
    @property
    def model_parameters(self) -> Dict[str, Any]:
        '''
        The parameters that are set in the machine learning model, e.g.
        the tokeniser used.

        :return: parameters of the machine learning model
        '''
        return self._model_parameters
@model_parameters.setter
def model_parameters(self, value: Dict[str, Any]) -> None:
'''
Set the parameters of the machine learning model.
:param value: The new parameters of the machine learning model
'''
self._model_parameters = self.get_parameters(**value)
self.model.set_params(**self._model_parameters)
self.fitted = False
[docs] @staticmethod
def save(model: 'bella.models.base.SKLearnModel',
save_fp: Path, compress: int = 0) -> None:
'''
Given an instance of this class will save it to a file.
:param model: The machine learning model instance to be saved.
:param save_fp: File path of the location that the model is to be
saved to.
:param compress: Optional (default 0). Level of compression 0 is no
compression and 9 is the most compressed. The more
compressed the lower the read/write time.
:return: Nothing.
:raises ValueError: If the model has not been fitted or if the model
is not of type
:py:class:`bella.models.base.SKLearn`
'''
if not isinstance(model, SKLearnModel):
raise ValueError('The model parameter has to be of type '
f'SKLearnModel not {type(model)}')
if model.fitted:
joblib.dump(model, save_fp, compress=compress)
else:
raise ValueError(f'The model {str(model)} has not been fitted. '
'This can be done by using the `fit` method')
    @staticmethod
    def load(load_fp: Path) -> 'bella.models.base.SKLearnModel':
        '''
        Loads an instance of this class from a file saved by `save`.

        :param load_fp: File path of the location that the model was saved
                        to.
        :return: self
        '''
        return joblib.load(load_fp)
[docs] @staticmethod
def evaluate_parameter(model: 'bella.models.base.SKLearnModel',
train: Tuple[np.ndarray, np.ndarray],
val: None,
test: np.ndarray, parameter_name: str,
parameter: Any) -> Tuple[Any, np.ndarray]:
'''
Given a model will set the `parameter_name` to `parameter` fit the
model and return the a Tuple of parameter changed and predictions of
the model on the test data, using the train and validation data for
fitting.
:param model: :py:class:`bella.models.base.SKLearn` instance
:param train: Tuple of `(X_train, y_train)`. Used to fit the model.
:param val: Use None. This is only kept to keep the API clean.
:param test: `X_test` data to predict on.
:param parameter_name: Name of the parameter to change
e.g. word_vectors
:param parameter: value to assign to the parameter e.g.
:py:class:`bella.word_vectors.SSWE`
:return: A tuple of (parameter value, predictions)
'''
original_parameters = model._model_parameters
original_parameters = model.normalise_parameter_names(original_parameters)
original_parameters[parameter_name] = parameter
model.model_parameters = original_parameters
model.fit(train[0], train[1])
predictions = model.predict(test)
return (parameter, predictions)
@staticmethod
def evaluate_parameters(model: 'bella.models.base.SKLearnModel',
                        train: Tuple[np.ndarray, np.ndarray],
                        val: None,
                        test: np.ndarray, parameter_name: str,
                        parameters: List[Any], n_jobs: int
                        ) -> List[Tuple[Any, np.ndarray]]:
    '''
    Runs :py:func:`bella.models.base.SKLearnModel.evaluate_parameter`
    for one `parameter_name` over multiple candidate values.

    Useful for measuring the effect of changing a single parameter.
    When `n_jobs` > 1 the evaluations are distributed over a
    multiprocessing pool.

    :param model: :py:class:`bella.models.base.SKLearn` instance
    :param train: Tuple of `(X_train, y_train)`. Used to fit the model.
    :param val: Use None. This is only kept to keep the API clean.
    :param test: `X_test` data to predict on.
    :param parameter_name: Name of the parameter to change e.g.
                           word_vectors
    :param parameters: A list of values to assign to the parameter e.g.
                       [:py:class:`bella.word_vectors.SSWE`]
    :param n_jobs: Number of cpus to use for multiprocessing if 1 then
                   will not multiprocess.
    :return: A list of tuples of (parameter value, predictions)
    '''
    all_args = [(model, train, val, test, parameter_name, param_value)
                for param_value in parameters]
    if n_jobs != 1:
        with Pool(n_jobs) as pool:
            return pool.starmap(SKLearnModel.evaluate_parameter, all_args)
    return [SKLearnModel.evaluate_parameter(*args) for args in all_args]
@staticmethod
def grid_search_model(model: 'bella.models.base.SKLearnModel',
                      X: np.ndarray, y: np.ndarray, n_cpus: int = 1,
                      num_folds: int = 5, **kwargs) -> pd.DataFrame:
    '''
    Perform a grid search over the parameter space produced by the model
    class's :py:func:`bella.models.base.SKLearnModel.get_cv_parameters`
    method (fed by the keyword arguments), using stratified K-fold cross
    validation.

    :param model: The class of the model to use not an instance of the
                  model.
    :param X: Training samples matrix, shape = [n_samples, n_features]
    :param y: Training targets, shape = [n_samples]
    :param n_cpus: Number of estimators to fit in parallel. Default 1.
    :param num_folds: Number of Stratified cross validation folds.
                      Default 5.
    :param kwargs: Keyword arguments passed through to the models
                   `get_cv_parameters` method.
    :return: Pandas dataframe representation of the grid search results.
    '''
    search_space = model.get_cv_parameters(**kwargs)
    searcher = GridSearchCV(model.pipeline(), search_space,
                            cv=StratifiedKFold(num_folds), n_jobs=n_cpus,
                            return_train_score=False)
    searcher.fit(X, y)
    return pd.DataFrame(searcher.cv_results_)
[docs] @staticmethod
def get_grid_score(grid_scores: pd.DataFrame,
associated_param: Union[None, str] = None
) -> Union[List[float], List[Tuple[float, str]]]:
'''
Given the return of the :py:func:`grid_search_model` will return
the grid scores as a List of the mean test accuracy result.
:param grid_scores: Return of the :py:func:`grid_search_model`
:param associated_param: Optional. The name of the parameter you want
to associate to the score. E.g. lexicon as you
have grid searched over different lexicons and
you want the return to be associated with the
lexicon name e.g. [(0.68, 'MPQA),
(0.70, 'NRC')]
:return: A list of test scores from the grid search and if
associated_param is not None a list of scores and parameter
names.
'''
extracted_scores = grid_scores['mean_test_score'].astype(float)
extracted_scores = extracted_scores.round(4) * 100
extracted_scores = extracted_scores.tolist()
if associated_param is not None:
if associated_param not in grid_scores:
for column_name in grid_scores.columns:
if associated_param in column_name:
associated_param = column_name
associated_param = grid_scores[associated_param]
associated_param = associated_param.apply(str).tolist()
extracted_scores = list(zip(extracted_scores, associated_param))
return extracted_scores
[docs] @staticmethod
def models_best_parameter(models_kwargs: List[Tuple['bella.models.base.SKLearnModel',
Dict[str, Any]]],
param_name: str, param_values: List[Any],
X: List[Any], y: np.ndarray, n_cpus: int = 1,
num_folds: int = 5
) -> Dict['bella.models.base.SKLearnModel', str]:
'''
Given a list of models and their base model arguments, it will
find the best parameter value out of the values given for that
parameter while keeping the base model arguments constant for
each model.
This essentially performs 5 fold cross validation grid search
for the one parameter given, across all models given.
:param models_kwargs: A list of tuples where each tuple contains
a model and the models keyword arguments to
give to its `get_cv_parameters` method. These
arguments are the models standard arguments
that are not to be changed.
:param param_name: Name of the parameter to be changed. This name
has to be the name of the keyword argument in
the models `get_cv_parameters` method.
:param param_values: The different values to assign to the param_name
argument.
:param X: The training samples.
:param y: The training target samples.
:return: A dictionary of model and the name of the best parameter.
'''
model_best_param = {}
for model, model_kwargs in models_kwargs:
temp_model_kwargs = {**model_kwargs, param_name: param_values}
grid_results = model.grid_search_model(model, X, y, n_cpus=n_cpus,
num_folds=num_folds,
**temp_model_kwargs)
param_scores = model.get_grid_score(grid_results, param_name)
param_scores = sorted(param_scores, key=lambda x: x[1],
reverse=True)
best_param = sorted(param_scores, key=lambda x: x[0])[-1][1]
model_best_param[model] = best_param
return model_best_param
@classmethod
@abstractmethod
def normalise_parameter_names(cls, parameter_dict: Dict[str, Any]
                              ) -> Dict[str, Any]:
    '''
    Converts the output of :py:meth:`get_parameters` (fully-qualified
    pipeline parameter names) back into a dictionary that can be used
    as keyword-argument input to :py:meth:`get_parameters`.

    Used by :py:meth:`evaluate_parameter` to round-trip a model's
    current parameters before overriding one of them.

    :returns: A dictionary that can be used as keyword arguments into the
              :py:meth:`get_parameters` method
    '''
    pass
@classmethod
@abstractmethod
def get_parameters(cls) -> Dict[str, Any]:
    '''
    Transform the given keyword arguments into a dictionary that is
    accepted as model parameters (the result is fed to the underlying
    model's ``set_params`` by the `model_parameters` setter).
    '''
    pass
@classmethod
@abstractmethod
def get_cv_parameters(cls) -> List[Dict[str, List[Any]]]:
    '''
    Transform the given keyword arguments into a list of dictionaries
    that is accepted as the `param_grid` parameter in
    :py:class:`sklearn.model_selection.GridSearchCV`
    (consumed by :py:meth:`grid_search_model`).
    '''
    pass
@staticmethod
def _add_to_params_dict(params_dict: Dict[str, Any], keys: List[str],
value: Any) -> Dict[str, Any]:
'''
Given a dictionary it adds the value to each key in the list of keys
into the dictionary. Returns the updated dictionary.
Normally used in subclasses :py:meth:`get_parameters`
:param params_dict: Dictionary to be updated
:param keys: list of keys
:param value: value to be added to each key in the list of keys.
:returns: The dictionary updated
'''
if not isinstance(keys, list):
raise ValueError('The keys parameter has to be of type list and '
f'not {type(keys)}')
for key in keys:
params_dict[key] = value
return params_dict
@staticmethod
def _add_to_params(params_list: Union[List[Dict[str, List[Any]]], List],
to_add: List[Any],
to_add_names: List[str]) -> List[Dict[str, List[Any]]]:
'''
Used to add parameters that are stated multiple times in the same
pipeline that must have the same value.
Therefore to add them you have to copy the current parameter
list N amount of times where N is the length of the to_add list.
Returns the updated parameter list. Method to add parameters that
are set in multiple parts of the pipeline but should contain the
same value.
Normally used in subclasses :py:meth:`get_cv_parameters`
:params_list: A list of dicts where each dict contains parameters and
corresponding values that are to be searched for. Can be
an empty List.
:param to_add: List of values that are to be added to the search space.
:param to_add_names: List of names that are associated to the values.
:returns: The updated params_list
:raises TypeError: If any of the arguments are not of type
:py:class:`List`
'''
# Check the type of the argument
if not isinstance(params_list, list):
raise TypeError(f'params_list: {params_list}\nShould be of type '
f'list not {type(params_list)}')
if not isinstance(to_add_names, list):
raise TypeError(f'to_add_names: {to_add_names}\nShould be of type '
f'list not {type(to_add_names)}')
param_name = to_add_names[0]
if len(to_add_names) > 1:
param_name = ''.join(param_name.split('__')[:-1])
if not isinstance(to_add, list):
raise TypeError('If using get_cv_parameters this is due to '
f'parameter {param_name} not being of type list.'
f'\nto_add: {to_add} should be of '
f'type List not {type(to_add)}.')
num_params = len(params_list)
num_to_add = len(to_add)
new_param_list = []
# Catch the case that params_list was originally empty
if num_params == 0:
for _ in range(num_to_add):
new_param_list.append([defaultdict(list)])
else:
for _ in range(num_to_add):
new_param_list.append(copy.deepcopy(params_list))
for index, param in enumerate(to_add):
for param_name in to_add_names:
for sub_list in new_param_list[index]:
sub_list[param_name].append(param)
params_list = [param_dict for sub_list in new_param_list
for param_dict in sub_list]
return params_list
@staticmethod
def _add_to_all_params(params_list: List[Dict[str, List[Any]]],
param_name: str, param_value: List[Any]
) -> List[Dict[str, List[Any]]]:
'''
Used to add param_name and its associated param_value to each
dictionary of parameters in the params_list.
Normally used in subclasses :py:meth:`get_cv_parameters`
:param params_list: A list of dicts where each dict contains
parameters and corresponding values that are to be
searched for.
:param param_name: The name associated to the parameter value to be
added to the params_list.
:param param_value: The list of values associated to the param_name
that are added to the params_list.
:returns: The updated params_list
:raises TypeError: If the param_value is not of type :py:class:`List`
'''
if not isinstance(param_value, list):
raise TypeError(f'{param_name} should be of type list not '
f'{type(param_value)}')
for param_dict in params_list:
param_dict[param_name] = param_value
return params_list
@staticmethod
@abstractmethod
def pipeline() -> 'sklearn.pipeline.Pipeline':
    '''
    Machine Learning model that is used as the base template for the model
    attribute (also used as the estimator in :py:meth:`grid_search_model`).

    :returns: The template machine learning model
    '''
    pass