Source code for Fireworks.toolbox.text

import os
import pickle
from collections import defaultdict
from itertools import count

import numpy as np
from bidict import bidict

# NOTE: `silph` (which provides RAW_DIR) and `special_tokens` are referenced below
# and are assumed to be defined or imported elsewhere in the package.

def character_tokenizer(sequence):
    """ Splits sequence into a list of characters. """
    if sequence:
        return [*sequence]  # This unpacks an iterable into a list of elements

def space_tokenizer(sequence):
    """ Splits sequence based on spaces. """
    if sequence:
        return sequence.split(' ')

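A quick illustrative sketch of the tokenizers (not part of the module). Note that both return None for an empty or otherwise falsy input, because the `if sequence` guard falls through:

    character_tokenizer('hi')        # -> ['h', 'i']
    space_tokenizer('hello world')   # -> ['hello', 'world']
    space_tokenizer('')              # -> None (falsy input)
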
def pad_sequence(sequence, max_length, embeddings_dict, pad_token='EOS'):
    """ Adds EOS tokens until sequence length is max_length. """
    n = len(sequence)
    padded = sequence.copy()
    if n < max_length:
        if type(padded) is np.ndarray:
            padded = np.append(padded, [embeddings_dict[pad_token] for _ in range(max_length - n)], axis=0)
        else:
            padded.extend([embeddings_dict[pad_token] for _ in range(max_length - n)])
    return np.array(padded)

def pad(batch, embeddings_dict, pad_token='EOS'):  # TODO: Don't rely on embeddings dict for EOS tokens
    """ Pads all embeddings in a batch to be the same length. """
    lengths = [len(x) for x in batch]
    max_length = max(lengths)
    padded_embeddings = np.stack([pad_sequence(seq, max_length, embeddings_dict, pad_token) for seq in batch])
    return padded_embeddings

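A minimal padding sketch, assuming a toy `embeddings_dict` whose values are 2-dimensional vectors; `pad` brings every sequence up to the length of the longest one and stacks the result into a single array:

    toy = {'EOS': np.zeros(2), 'a': np.ones(2), 'b': 2 * np.ones(2)}
    batch = [[toy['a'], toy['b']], [toy['a']]]   # lengths 2 and 1
    padded = pad(batch, toy)                     # second sequence is padded with the EOS vector
    padded.shape                                 # -> (2, 2, 2)
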
def apply_embeddings(sequence, embeddings_dict, tokenizer):
    """
    Decomposes sequence into tokens using tokenizer and then converts tokens to embeddings using embeddings_dict.
    """
    if sequence:
        embeddings = [embeddings_dict['SOS']]
        embeddings.extend([
            embeddings_dict[token] if token in embeddings_dict else embeddings_dict['UNK']
            for token in tokenizer(sequence)
        ])
        embeddings.append(embeddings_dict['EOS'])
        # if vectorizor:
        #     embeddings = [vectorizor(embedding) for embedding in embeddings]
        return embeddings
    else:  # In case the input sequence is empty
        return [embeddings_dict['SOS'], embeddings_dict['EOS']]

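An illustrative sketch with an index-valued `embeddings_dict` (the values could just as well be vectors): tokens outside the dictionary map to 'UNK', and 'SOS'/'EOS' markers are added at the ends:

    index_dict = {'SOS': 0, 'EOS': 1, 'UNK': 2, 'hello': 3, 'world': 4}
    apply_embeddings('hello there', index_dict, space_tokenizer)   # -> [0, 3, 2, 1]
    apply_embeddings('', index_dict, space_tokenizer)              # -> [0, 1]
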
def create_pretrained_embeddings(embeddings_file):  # TODO: Write test
    """ Loads embeddings vectors from file into a dict. """
    embeddings_dict = {}
    with open(embeddings_file) as vectors:
        for line in vectors:
            splits = line.split(' ')
            embeddings_dict[splits[0]] = np.array([float(x) for x in splits[1:]])
    return embeddings_dict

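The expected file layout is one token per line followed by its vector components, all separated by single spaces (the plain-text format used by GloVe-style embedding files). With a hypothetical two-line file `vectors.txt`:

    # vectors.txt:
    #   the 0.1 0.2 0.3
    #   cat 0.4 0.5 0.6
    embeddings_dict = create_pretrained_embeddings('vectors.txt')
    embeddings_dict['cat']   # -> array([0.4, 0.5, 0.6])
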
def load_embeddings(name='glove840b'):  # TODO: write test
    """ Loads serialized embeddings from pickle. """
    with open(os.path.join(silph.RAW_DIR, name), 'rb') as pretrained:
        embeddings_dict = pickle.load(pretrained)
    return embeddings_dict

def make_vocabulary(text, tokenizer=None, cutoff_rule=None):  # TODO: write test
    """
    Converts an iterable of phrases into the set of unique tokens that are in the vocabulary.
    """
    counts = defaultdict(lambda: 0)
    if tokenizer is None:
        tokenizer = space_tokenizer
    # vocabulary = set()
    for phrase in text:
        if phrase is not None:
            tokens = tokenizer(phrase)
            # vocabulary.add(tokens)
            for token in tokens:
                counts[token] += 1
    if cutoff_rule:
        counts = cutoff_rule(counts)
    return dict(counts)  # The keys of this dict are the unique tokens.

def make_indices(vocabulary):  # TODO: write test
    """
    Constructs a dictionary of token names to indices from a vocabulary.
    Each index value corresponds to a one-hot vector.
    """
    embeddings_dict = bidict({token: i for token, i in zip(vocabulary, count(start=3))})
    embeddings_dict['SOS'] = special_tokens['SOS']
    embeddings_dict['EOS'] = special_tokens['EOS']
    embeddings_dict['UNK'] = special_tokens['UNK']
    return embeddings_dict

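A sketch of building the index mapping from a small corpus. Token indices start at 3 because the lower slots are reserved for the special tokens; the exact values come from `special_tokens`, which is assumed here to map 'SOS', 'EOS' and 'UNK' to 0, 1 and 2:

    counts = make_vocabulary(['the cat', 'the dog'])   # -> {'the': 2, 'cat': 1, 'dog': 1}
    indices = make_indices(counts)
    # -> bidict({'the': 3, 'cat': 4, 'dog': 5, 'SOS': 0, 'EOS': 1, 'UNK': 2})
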
def too_big(dataset, start, end, dim=300, cutoff=620000):
    """
    Calculates if a batch consisting of dataset[start:end] is too big based on cutoff.
    """
    sizes = [len(x) for x in dataset[start:end]['embeddings']]
    max_size = max(sizes)
    batch_size = end - start
    size = dim * max_size * batch_size
    return size > cutoff
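
The check is a rough element count: embedding dimension times the longest sequence in the slice times the number of rows in the slice. With the defaults (dim=300, cutoff=620000), for example:

    # 300 * 40 * 64 = 768000 > 620000   -> too big
    # 300 * 40 * 32 = 384000 <= 620000  -> fits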