from typing import Dict, List
from overrides import overrides
import torch
from allennlp.common.checks import ConfigurationError
from allennlp.common.util import pad_sequence_to_length
from allennlp.data.tokenizers.token import Token
from allennlp.data.token_indexers.token_indexer import TokenIndexer, IndexedTokenList
from allennlp.data.vocabulary import Vocabulary
def _make_bos_eos(
character: int,
padding_character: int,
beginning_of_word_character: int,
end_of_word_character: int,
max_word_length: int,
):
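    """
    Build the character id sequence for a BOS/EOS pseudo-word: <begin word>, the given
    special character, <end word>, then padding out to `max_word_length`.
    """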
char_ids = [padding_character] * max_word_length
char_ids[0] = beginning_of_word_character
char_ids[1] = character
char_ids[2] = end_of_word_character
return char_ids
class ELMoCharacterMapper:
"""
Maps individual tokens to sequences of character ids, compatible with ELMo.
    To be consistent with previously trained models, we include it here as a special case of
    the existing character indexers.
    Optional additional special tokens with designated character ids can be added via
    `tokens_to_add`.
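
    # Example

    A minimal illustrative sketch of the mapping (assuming the default constructor):

    ```python
    mapper = ELMoCharacterMapper()
    char_ids = mapper.convert_word_to_char_ids("hello")
    # Always a fixed-length list of `max_word_length` (50) ids, offset by 1 so that 0 is padding.
    assert len(char_ids) == ELMoCharacterMapper.max_word_length
    ```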
"""
max_word_length = 50
# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
beginning_of_sentence_character = 256 # <begin sentence>
end_of_sentence_character = 257 # <end sentence>
beginning_of_word_character = 258 # <begin word>
end_of_word_character = 259 # <end word>
padding_character = 260 # <padding>
beginning_of_sentence_characters = _make_bos_eos(
beginning_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)
end_of_sentence_characters = _make_bos_eos(
end_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)
bos_token = "<S>"
eos_token = "</S>"
def __init__(self, tokens_to_add: Dict[str, int] = None) -> None:
self.tokens_to_add = tokens_to_add or {}
    def convert_word_to_char_ids(self, word: str) -> List[int]:
if word in self.tokens_to_add:
char_ids = [ELMoCharacterMapper.padding_character] * ELMoCharacterMapper.max_word_length
char_ids[0] = ELMoCharacterMapper.beginning_of_word_character
char_ids[1] = self.tokens_to_add[word]
char_ids[2] = ELMoCharacterMapper.end_of_word_character
elif word == ELMoCharacterMapper.bos_token:
char_ids = ELMoCharacterMapper.beginning_of_sentence_characters
elif word == ELMoCharacterMapper.eos_token:
char_ids = ELMoCharacterMapper.end_of_sentence_characters
else:
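            # Encode to UTF-8 bytes, truncating so the word plus the begin/end-of-word
            # markers fits within max_word_length ids.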
word_encoded = word.encode("utf-8", "ignore")[
: (ELMoCharacterMapper.max_word_length - 2)
]
char_ids = [ELMoCharacterMapper.padding_character] * ELMoCharacterMapper.max_word_length
char_ids[0] = ELMoCharacterMapper.beginning_of_word_character
for k, chr_id in enumerate(word_encoded, start=1):
char_ids[k] = chr_id
char_ids[len(word_encoded) + 1] = ELMoCharacterMapper.end_of_word_character
        # +1 because id 0 is reserved for masking/padding
return [c + 1 for c in char_ids]
def __eq__(self, other) -> bool:
if isinstance(self, other.__class__):
return self.__dict__ == other.__dict__
return NotImplemented
@TokenIndexer.register("custom_elmo_characters")
class CustomELMoTokenCharactersIndexer(TokenIndexer):
"""
Convert a token to an array of character ids to compute ELMo representations.
# Parameters
namespace : `str`, optional (default=`elmo_characters`)
tokens_to_add : `Dict[str, int]`, optional (default=`None`)
        If not None, provides a mapping from special tokens to character
        ids. When using pre-trained models, each character id must be
        less than 261, and we recommend using otherwise unused ids (e.g. 1-32).
token_min_padding_length : `int`, optional (default=`0`)
See :class:`TokenIndexer`.
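
    # Example

    A minimal usage sketch (assuming the default namespace and no extra special tokens):

    ```python
    indexer = CustomELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token("a"), Token("sentence")], Vocabulary())
    # indices["tokens"] holds one 50-element character id list per input token.
    ```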
"""
def __init__(
self,
namespace: str = "elmo_characters",
tokens_to_add: Dict[str, int] = None,
token_min_padding_length: int = 0,
) -> None:
super().__init__(token_min_padding_length)
self._namespace = namespace
self._mapper = ELMoCharacterMapper(tokens_to_add)
    @overrides
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
pass
    @overrides
def tokens_to_indices(
self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
# TODO(brendanr): Retain the token to index mappings in the vocabulary and remove this
# https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113
texts = [token.text for token in tokens]
if any(text is None for text in texts):
raise ConfigurationError(
"ELMoTokenCharactersIndexer needs a tokenizer that retains text"
)
return {"tokens": [self._mapper.convert_word_to_char_ids(text) for text in texts]}
    @overrides
def get_empty_token_list(self) -> IndexedTokenList:
return {"tokens": []}
    @overrides
def as_padded_tensor_dict(
self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
# Overriding this method only because we need a different padding token than the default.
tensor_dict = {}
def padding_token():
return [0] * ELMoCharacterMapper.max_word_length
tensor_dict["tokens"] = torch.LongTensor(
pad_sequence_to_length(
tokens["tokens"], padding_lengths["tokens"], default_value=padding_token
)
)
return tensor_dict
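

# A minimal, hypothetical end-to-end sketch (assumption: this module is run directly as a
# script); it shows how `tokens_to_indices` and `as_padded_tensor_dict` fit together.
if __name__ == "__main__":
    indexer = CustomELMoTokenCharactersIndexer()
    indexed = indexer.tokens_to_indices([Token("Hi"), Token("there")], Vocabulary())
    padded = indexer.as_padded_tensor_dict(indexed, padding_lengths={"tokens": 4})
    # Expected shape: (4, 50) -- two real token rows plus two all-zero padding rows.
    print(padded["tokens"].shape)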