# Source code for bella.scikit_features.tokeniser

'''
Module contains a Class that is a scikit learn Transformer.

Classes:

1. ContextTokeniser - Converts a list of String lists into token lists. See the
transformer method of the class for more details.
'''
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

from bella import tokenisers


class ContextTokeniser(BaseEstimator, TransformerMixin):
    '''
    Scikit learn transformer class. Converts a list of String lists into
    lists of tokens.

    Attributes:

    1. self.tokeniser - tokeniser function. Given a String returns a list of
       Strings. Default whitespace tokeniser.
    2. self.lower - whether to lower case the tokens. Default False.

    See :py:func:`bella.tokenisers` for more tokeniser functions that can be
    used here or create your own function.
    '''

    def __init__(self, tokeniser=tokenisers.whitespace, lower=False):
        # Stored verbatim so sklearn's get_params/set_params work unchanged.
        self.tokeniser = tokeniser
        self.lower = lower

    def fit(self, target_contexts, y=None):
        '''Kept for consistency with the TransformerMixin.'''
        return self

    def fit_transform(self, target_contexts, y=None):
        '''See self.transform.'''
        return self.transform(target_contexts)

    def transform(self, target_contexts):
        '''
        Given a list of String lists where each String represents a context
        per target span it returns those Strings as a list of Strings
        (tokens).

        :param target_contexts: A list of String lists e.g. \
        [['It was nice this morning', 'It was nice this morning but not \
        yesterday morning'], ['another day']] where each String is a span \
        context for a target.
        :type target_contexts: list
        :returns: A list of Strings (tokens) per span context. e.g. \
        [[['It', 'was', 'nice', 'this', 'morning'], ['It', 'was', 'nice', \
        'this', 'morning', 'but', 'not', 'yesterday', 'morning']], \
        [['another', 'day']]]
        :rtype: list
        '''
        return [[self._tokenise(span_context)
                 for span_context in target_span_contexts]
                for target_span_contexts in target_contexts]

    def _tokenise(self, span_context):
        # Apply the configured tokeniser to one span context, lower-casing
        # the resulting tokens when self.lower is set.
        tokens = self.tokeniser(span_context)
        if self.lower:
            return [token.lower() for token in tokens]
        return tokens