from abc import ABC, abstractmethod
from Bio import SeqIO
import pandas as pd
import Fireworks
from Fireworks import Pipe, HookedPassThroughPipe
from Fireworks import Message
from Fireworks.utils import index_to_list
from Fireworks.core.cache import LRUCache, LFUCache, UnlimitedCache
from Fireworks.utils import one_hot
from itertools import count
import types
import random
from bidict import bidict
import torch
import numpy as np
class BioSeqPipe(Pipe):
"""
Class for representing biosequence data.
    Specifically, this class can read biological data files (such as FASTA) and iterate through them as a Pipe.
This can serve as the first Pipe in a pipeline for analyzing genomic data.
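    For example (a minimal sketch; 'genes.fasta' is a hypothetical path):
    ::
        fasta = BioSeqPipe('genes.fasta')
        for message in fasta: # Each Message holds one record's sequence, id, name, description, and dbxrefs
            print(message['ids'])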
"""
name = 'BioSeqPipe'
    def __init__(self, path, input=None, filetype='fasta', **kwargs):
"""
Args:
            path: Path on disk where the file is located; will be supplied to the SeqIO.parse function.
            input: Input Pipe. Default is None.
            filetype: Type of file that will be supplied to the SeqIO.parse function. Default is 'fasta'.
            kwargs: Optional keyword arguments that will be supplied to the SeqIO.parse function.
"""
Pipe.__init__(self, input=input)
self.path = path
self.filetype = filetype
self.kwargs = kwargs
self.seq = SeqIO.parse(self.path, self.filetype, **self.kwargs)
    def reset(self):
"""
Resets the iterator to the beginning of the file. Specifically, it calls SeqIO.parse again as if reinstantiating the object.
"""
self.seq = SeqIO.parse(self.path, self.filetype, **self.kwargs)
return self
    def __next__(self):
        """
        Each iteration through a SeqIO parser produces a SeqRecord. This Pipe converts that record to a Message and returns the result.
        """
        gene = next(self.seq) # Raises StopIteration once the file is exhausted
        return Message({
            'sequences': [str(gene.seq)],
            'ids': [gene.id],
            'names': [gene.name],
            'descriptions': [gene.description],
            'dbxrefs': [gene.dbxrefs],
        })
def __iter__(self):
return self.reset()
class LoopingPipe(Pipe):
"""
This Pipe can take any iterator and make it appear to be indexable by iterating through the input
as needed to reach any given index.
The input Pipe must implement __next__ and reset (to be repeatable),
and this will simulate __getitem__ by repeatedly looping through the iterator as needed.
For example, say we have a Pipe that iterates through the lines of a FASTA file:
::
fasta = BioSeqPipe('genes.fasta')
    This Pipe can only iterate through the file in one direction. If we want to access arbitrary elements,
we can do this:
::
        clock = LoopingPipe(input=fasta)
clock[10]
clock[2:6]
len(clock)
All of these actions are now possible. Note that this is in general an expensive process, because the Pipe
has to iterate one at a time to get to the index it needs. In practice, this Pipe should pipe its output
to a CachingPipe that can store values in memory. This approach enables you to process datasets that don't
entirely fit in memory; you can stream in what you need and cache portions. From the perspective of the downstream
Pipes, every element of the dataset is accessible as if it were in memory.
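    For example, the caching pattern described above looks like this:
    ::
        cache = CachingPipe(input=clock, cache_size=100)
        cache[10] # The LoopingPipe steps forward as needed; the CachingPipe stores the result for reuse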
"""
name = 'LoopingPipe'
def __init__(self, input, *args, **kwargs):
super().__init__(input, *args, **kwargs)
self.check_input()
self.reset()
self.length = None
def __getitem__(self, index):
"""
        Retrieves items in index by looping through the input as many times as needed.
"""
# TODO: Check if index exceeds length, either explicitly or implicitly.
# Sort index
index = sorted(index_to_list(index))
above = [i for i in index if i >= self.position] # Go forward to reach these
below = [i for i in index if i < self.position] # Will have to reset the loop to reach these
if len(above) > 0:
above_values = Fireworks.cat([self.step_forward(i+1) for i in above])
else:
above_values = Message()
if len(below) > 0:
self.reset() # Position will now be reset to 0
below_values = Fireworks.cat([self.step_forward(i+1) for i in below])
else:
below_values = Message()
return below_values.append(above_values) # TODO: Resort this message so values are in the order requested by index
def __len__(self):
"""
Length is also calculated implicitly as items are accessed.
"""
if self.length is not None:
return self.length
else:
self.compute_length()
return self.length
def __next__(self): # QUESTION: Should __next__ calls be supported for this Pipe?
"""
Go forward one step. This can be used to loop over the inputs while automatically recording
length once one cycle has been performed.
"""
if self.length is None:
return self.step_forward(1)
else:
p = (self.position + 1) % self.length
self.position = p
return self[p]
    def reset(self):
        """
        Calls reset on the input Pipe and sets position to 0.
        """
        try:
            self.input.reset()
        except AttributeError: # The input may not implement reset
            pass
        self.position = 0
    def compute_length(self):
"""
Step forward as far as the inputs will allow and compute length.
Length is also calculated implicitly as items are accessed.
Note: If the inputs are infinite, then this will go on forever.
"""
while True:
try:
self.step_forward(self.position+1)
except (StopIteration, IndexError):
self.reset()
break
    def step_forward(self, n):
"""
Steps forward through inputs until position = n and then returns that value.
This also updates the internal length variable if the iterator ends due to this method call.
"""
if self.length is not None and n > self.length:
raise IndexError("Requested index is out of bounds for inputs with length {0}.".format(self.length))
if n < self.position:
raise IndexError("Can only step forward to a value higher than current position.")
x = Message()
for _ in range(n - self.position):
try:
                x = next(self.input)
self.position += 1
except StopIteration:
self.length = self.position
raise IndexError("Requested index is out of bounds for inputs with length {0}.".format(self.length))
return x
class CachingPipe(Pipe):
"""
This Pipe can be used to dynamically cache elements from upstream Pipes.
Whenever data is requested by index, this Pipe will intercept the request and add that message alongside
the index to its internal cache.
This can be useful for dealing with datasets that don't fit in memory or are streamed in. You can cache portions
of the dataset as you use them. By combining this with a LoopingPipe, you can create the illusion of making
the entire dataset available to downstream Pipes regardless of the type and size of the original data.
    More specifically, given an input Pipe that implements __getitem__, this Pipe will store all calls to __getitem__ in
    an internal cache; thereafter, __getitem__ calls will either read from the cache or trigger __getitem__ calls
    on the input and an update to the cache.
For example,
::
fasta = BioSeqPipe(path='genes.fasta')
        clock = LoopingPipe(input=fasta)
        cache = CachingPipe(input=clock, cache_size=100) # cache_size is optional; default=100
    This sets up a pipeline that reads lines from a FASTA file and accesses and caches elements as requests are made:
::
cache[20:40] # This will be cached
cache[25:30] # This will read from the cache
cache[44] # This will update the cache
cache[40:140] # This will fill the cache, flushing out old elements
cache[25:30] # This will read from the dataset and update the cache again
"""
    def __init__(self, input, *args, cache_size=100, buffer_size=0, cache_type='LRU', infinite=False, **kwargs):
"""
        Args:
            input: The input Pipe, which must implement __getitem__.
            args: Positional arguments for superclass initialization.
            cache_size (int): Number of elements to hold in the cache. Default = 100
            buffer_size (int): Number of elements to clear once the cache is full. This is useful if you expect frequent cache misses and
                don't expect to reuse previously cached elements. Default = 0
            cache_type (str): Type of cache. Choices are 'LRU' (least-recently-used) and 'LFU' (least-frequently-used). Default = 'LRU'
            infinite (bool): If True, the cache will have no upper limit on size and will never clear itself.
"""
super().__init__(input, *args, **kwargs)
self.check_input()
self.length = None
self.lower_bound = 0
self.upper_bound = None
self.infinite = infinite
self.cache_size = cache_size
self.cache_type = cache_type
self.buffer_size = buffer_size
self.init_cache(*args, **kwargs)
# @abstractmethod # TODO: Make different types of caches implementable via subclasses
    def init_cache(self, *args, **kwargs):
"""
This should initialize a cache object called self.cache
"""
choices = {'LRU': LRUCache, 'LFU': LFUCache}
        self.cache = choices[self.cache_type](max_size=self.cache_size, buffer_size=self.buffer_size)
def __getitem__(self, index):
index = index_to_list(index)
if self.length and len(index): # Implicit length check if length is known
if max(index) >= self.length:
raise IndexError("Requested index is out of bounds for inputs with length {0}.".format(self.length))
# Identify what is in the cache and what isn't.
in_cache = [i for i in index if i in self.cache.pointers] # Elements of index corresponding to in_cache elements
in_cache_indices = [j for i,j in zip(index, count()) if i in self.cache.pointers] # Indices in index corresponding to in_cache elements
not_in_cache = [i for i in index if i not in self.cache.pointers] # Elements of index corresponding to not_in_cache elements
not_in_cache_indices = [j for i,j in zip(index, count()) if i not in self.cache.pointers] # Indices in index corresponding to not_in_cache elements
# Retrieve from cache existing elements
in_cache_elements = self.cache[in_cache] # elements in cache corresponding to indices in cache
# Update cache to have other elements
not_in_cache_elements = self.input[not_in_cache]
self.cache[not_in_cache] = not_in_cache_elements
# Reorder and merge requested elements
message = in_cache_elements.append(not_in_cache_elements)
indices = in_cache_indices
indices.extend(not_in_cache_indices)
# Elements must be reordered based on their order in index
permutation = indices.copy()
for i,j in zip(indices, count()):
permutation[i] = j
message = message.permute(permutation)
# Implicit update of internal knowledge of length
if len(index) and self.length is None and not self.infinite:
l = max(index)
if l > self.lower_bound:
self.lower_bound = l
return message
def __len__(self):
"""
        Length is computed implicitly and lazily. If any operation causes the Pipe
        to reach the end of its input, that position is stored as the length.
Alternatively, if this method is called before that happens, the Pipe will attempt to
loop to the end and calculate the length.
"""
if self.infinite:
raise ValueError("Pipe is labelled as having infinite length (ie. yields items indefinitely).")
elif self.length is None:
self.compute_length()
return self.length
else:
return self.length
    def compute_length(self):
"""
Step forward as far as the inputs will allow and compute length.
Note: If the inputs are infinite, then this will go on forever.
"""
if not self.infinite:
while True:
try:
self[1 + self.lower_bound] # Use knowledge of lower bound from previous getitem calls to accelerate this process
except IndexError:
self.length = 1 + self.lower_bound
break
else:
raise ValueError("Pipe is labelled as having infinite length (ie. yields items indefinitely).")
class Title2LabelPipe(HookedPassThroughPipe):
    """
    This Pipe takes one Pipe as input and inserts a column called 'labels'
    containing the provided title of the input Pipe into all outputs.
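    For example (a minimal sketch; `fasta` is assumed to be an existing Pipe):
    ::
        labeled = Title2LabelPipe('genes', fasta)
        message = next(iter(labeled)) # message['labels'] contains 'genes' in every row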
"""
    def __init__(self, title, input, *args, labels_column='labels', **kwargs):
super().__init__(input, *args, **kwargs)
self.labels_column = labels_column
self.label = title
def _get_item_hook(self, message):
return self.insert_labels(message)
def _next_hook(self, message):
return self.insert_labels(message)
"""
NOTE: Explore using the below code as an approach to make a general purpose wrapper Pipes,
ie. Pipes that modify all function calls from their input.
"""
# def __getattr__(self, *args, **kwargs):
# """
# Pass through all methods of the input pipe while adding labels.
# """
# output = self.input.__getattribute__(*args, **kwargs)
# if type(output) is types.MethodType: # Wrap the method in a converter
# return self.method_wrapper(output)
# else:
# return self.attribute_wrapper(output)
#
# def method_wrapper(self, function):
# """
# Wraps method with a label attacher such that whenever the method is called, the output is modified
# by adding the label.
# """
#
# def new_function(*args, **kwargs):
#
# output = function(*args, **kwargs)
# try:
# output = Message(output)
# except:
# return output
#
# return self.insert_labels(output)
#
# return new_function
#
# def attribute_wrapper(self, attribute):
# """
# Wraps attribute with new label if attribute returns a message.
# """
# try:
# output = Message(attribute)
# except:
# return attribute
#
# return self.insert_labels(output)
    def insert_labels(self, message):
l = len(message)
message[self.labels_column] = [self.label for _ in range(l)]
return message
class LabelerPipe(Pipe):
"""
This Pipe implements a to_tensor function that converts labels contained in messages to tensors based on an internal labels dict.
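    For example (a minimal sketch; `batcher` and `batch` are hypothetical upstream objects):
    ::
        labeler = LabelerPipe(input=batcher, labels=['yeast', 'human'])
        batch = labeler.to_tensor(batch) # batch['labels'] becomes a one-hot torch.Tensor; the original strings move to batch['labels_name']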
"""
def __init__(self, input, labels, *args, **kwargs):
super().__init__(input, *args, **kwargs)
self.labels = labels
        self.labels_dict = bidict({label: i for i, label in enumerate(labels)})
    def to_tensor(self, batch, labels_column='labels'):
batch['labels_name'] = batch[labels_column]
labels = batch[labels_column]
labels = [self.labels_dict[label] for label in labels]
labels = torch.Tensor([one_hot(label,len(self.labels_dict)) for label in labels])
batch['labels'] = labels
try: # Allow an upstream Pipe to attempt to perform to_tensor if it can
batch = self.recursive_call('to_tensor', batch)
return batch
except AttributeError:
return batch
class RepeaterPipe(Pipe):
"""
Given an input Pipe that is iterable, enables repeat iteration.
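    For example (a minimal sketch; `fasta` is assumed to be an iterable Pipe):
    ::
        repeater = RepeaterPipe(fasta, repetitions=3)
        for message in repeater: # Iterates through fasta three times before raising StopIteration
            ...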
"""
    def __init__(self, input, *args, repetitions=10, **kwargs):
        super().__init__(input, *args, **kwargs)
        if not isinstance(repetitions, int):
            raise ValueError("Number of repetitions must be provided as an integer.")
        self.repetitions = repetitions
        self.iteration = 0 # Tracks how many full passes have been completed
        if not isinstance(self.input, Pipe): # TODO: Test this scenario
            self.iterator = iter(input)
    def reset(self):
        self.iteration = 0
        if not isinstance(self.input, Pipe):
            self.iterator = iter(self.input)
        else:
            try:
                self.recursive_call('reset')()
            except AttributeError: # The input may not implement reset
                pass
return self
def __iter__(self):
self.reset()
return self
def __next__(self):
if not isinstance(self.input, Pipe):
return self.iterator.__next__()
else:
try:
return self.recursive_call('__next__')()
except StopIteration:
self.iteration += 1
if self.iteration >= self.repetitions:
raise StopIteration
else:
self.recursive_call('reset')()
return self.__next__()
class ShufflerPipe(Pipe):
"""
    Given an input Pipe that implements __getitem__ and __len__, shuffles the indices so that iterating through
    the Pipe or calling __getitem__ returns elements in a randomized order.
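    For example (a minimal sketch; `cache` is assumed to implement __getitem__ and __len__):
    ::
        shuffler = ShufflerPipe(cache)
        shuffler.shuffle() # Permute the internal index mapping in place
        shuffler[0] # Returns the element of cache that index 0 now maps to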
"""
def __init__(self, input, *args, **kwargs):
super().__init__(input, *args, **kwargs)
self.check_input()
self.current_index = 0
        self.shuffle_indices = np.arange(len(self))
def __getitem__(self, index):
new_index = self.shuffle_indices[index]
return self.recursive_call('__getitem__')(new_index.tolist())
def __next__(self):
if self.current_index >= len(self):
raise StopIteration
message = self[self.current_index]
self.current_index += 1
return message
    def shuffle(self, order=None): # TODO: Add support for ordering
np.random.shuffle(self.shuffle_indices)
    def reset(self):
"""
Triggers a shuffle on reset.
"""
self.shuffle()
self.current_index = 0
try:
self.recursive_call('reset')()
except AttributeError:
pass
return self
def __iter__(self):
self.reset()
return self
class IndexMapperPipe(Pipe):
"""
    Given an input Pipe that implements __getitem__, maps each index in input_indices to the corresponding index in
    output_indices, so that self[i] returns input[output_indices[i]].
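    For example (a minimal sketch; `cache` is a hypothetical input Pipe):
    ::
        mapper = IndexMapperPipe([0, 1, 2], [12, 14, 16], input=cache)
        mapper[1] # Returns cache[14]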
"""
    def __init__(self, input_indices, output_indices, *args, **kwargs):
        super().__init__(*args, **kwargs)
self.check_input()
if len(input_indices) != len(output_indices):
raise ValueError("The number of input indices does not match the number of output indices.")
self.pointers = UnlimitedCache()
self.pointers[input_indices] = Message({'output':output_indices})
def __getitem__(self, index):
index = index_to_list(index)
return self.input[self.pointers[index]['output'].values]
def __len__(self):
return len(self.pointers)
class BatchingPipe(Pipe):
"""
Generates minibatches.
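    For example (a minimal sketch; `shuffler` is assumed to implement __getitem__ and __len__):
    ::
        batcher = BatchingPipe(shuffler, batch_size=50)
        for batch in batcher: # Each batch is a Message of 50 consecutive rows
            ...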
"""
def __init__(self, *args, batch_size=5, **kwargs):
        super().__init__(*args, **kwargs)
self.current_index = 0
self.batch_size = batch_size
def __iter__(self):
self.reset()
return self
    def reset(self):
self.current_index = 0
def __next__(self):
if self.current_index + self.batch_size > len(self):
raise StopIteration
try:
batch = self[self.current_index:self.current_index+self.batch_size]
self.current_index += self.batch_size
return batch
except IndexError:
raise StopIteration