Source code for gptwosample.twosample.twosample_base

'''
Classes to apply GPTwoSample to data
====================================

All classes handling two-sample tasks should extend the TwoSampleBase class below.

Created on Mar 18, 2011

@author: Max Zwiessele, Oliver Stegle
'''
from gptwosample.data.data_base import input_id, output_id, individual_id, \
    common_id, has_model_structure, get_model_structure, DataStructureError
import scipy
from pygp.gp.gp_base import GP
from pygp.gp.composite import GroupGP
from pygp.optimize.optimize_base import opt_hyper
import numpy
from pygp.plot.gpr_plot import plot_sausage, plot_training_data
from copy import deepcopy
from matplotlib import cm
from pygp.likelihood.likelihood_base import GaussLikISO

class TwoSampleBase(object):
    """
    Base class for two-sample tests. The covariance functions are supplied
    by subclasses (see TwoSampleShare and TwoSampleSeparate).
    """
    def __init__(self, learn_hyperparameters=True,
                 priors=None,
                 initial_hyperparameters=None,
                 **kwargs):
        """
        Set up a two-sample model. The covariance functions are provided by
        the subclasses.

        **Parameters**:

        learn_hyperparameters : bool
            Specifies whether or not to optimize the hyperparameters for the
            given data.

        priors : {'covar': priors for covar, ...}
            Default: None. The prior beliefs you provide for the
            hyperparameters of the covariance function.

        initial_hyperparameters : {'covar': logtheta, ...}
            Default: None. Hyperparameters to start the optimization from;
            if omitted, they are derived from the priors.
        """
        self._learn_hyperparameters = learn_hyperparameters
        if has_model_structure(priors):
            self._priors = priors
        else:
            self._priors = get_model_structure(priors, priors)
        self._models = dict()
        if initial_hyperparameters is None and priors is not None:
            self._initial_hyperparameters = get_model_structure({}, {})
            for name, prior in self._priors.iteritems():
                if prior.has_key('covar'):
                    # derive (log-scale) starting values for the covariance
                    # hyperparameters from the prior parameters
                    logtheta = scipy.array([p[1][0] * p[1][1] for p in prior['covar']],
                                           dtype='float')
                    self._initial_hyperparameters[name]['covar'] = logtheta
        elif has_model_structure(initial_hyperparameters):
            self._initial_hyperparameters = initial_hyperparameters
        elif initial_hyperparameters is not None:
            self._initial_hyperparameters = get_model_structure(initial_hyperparameters,
                                                                initial_hyperparameters)
        else:
            self._initial_hyperparameters = get_model_structure({})
        for name, hyper in self._initial_hyperparameters.iteritems():
            hyper['lik'] = numpy.log([0.1])
        self._invalidate_cache()

    def set_data_by_xy_data(self, x1, x2, y1, y2):
        """Set the training data directly from per-group input and output arrays."""
        X = numpy.array([x1, x2])
        Y = numpy.array([y1, y2])
        # set the individual model's data (one GP per group)
        self._models[individual_id].setData(X, Y)
        # set the common model's data (both groups pooled)
        self._models[common_id].setData(scipy.concatenate(X), scipy.concatenate(Y))
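
    # A minimal usage sketch for feeding data directly; `twosample`, `t1`,
    # `t2`, `y1` and `y2` are hypothetical names for an initialized subclass
    # instance and the per-group input/output arrays:
    #
    #   twosample.set_data_by_xy_data(t1, t2, y1, y2)
    #
    # Both groups are stacked into single numpy arrays, so their shapes
    # should match.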

    def set_data(self, training_data):
        """
        Set the training data for prediction.

        **Parameters:**

        training_data : dict
            The training data to learn from. Inputs are time values and
            outputs are expression values of e.g. a time series.
            training_data has the following structure::

                {'input' : {'group 1':[double] ... 'group n':[double]},
                 'output' : {'group 1':[double] ... 'group n':[double]}}
        """
        try:
            self.set_data_by_xy_data(training_data[input_id]['group_1'],
                                     training_data[input_id]['group_2'],
                                     training_data[output_id]['group_1'],
                                     training_data[output_id]['group_2'])
        except KeyError:
            raise DataStructureError("Please use gptwosample.data.data_base."
                                     "get_training_data_structure for data passing!")
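
    # Usage sketch: the expected dict is most easily built with the helper
    # named in the error message above (argument order assumed to mirror
    # set_data_by_xy_data):
    #
    #   from gptwosample.data.data_base import get_training_data_structure
    #   data = get_training_data_structure(t1, t2, y1, y2)
    #   twosample.set_data(data)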

    def predict_model_likelihoods(self, training_data=None,
                                  interval_indices=get_model_structure(),
                                  *args, **kwargs):
        """
        Predict the log marginal likelihoods of the models (individual and
        common) describing the data, optimizing the hyperparameters of each
        model if requested.

        **Parameters**:

        training_data : dict
            The training data to learn from. Inputs are time values and
            outputs are expression values of e.g. a time series. If not
            given, training data must have been set previously through
            :py:class:`gptwosample.twosample.basic.set_data`.

        interval_indices : :py:class:`gptwosample.data.data_base.get_model_structure()`
            Interval indices which assign data to the individual or common
            model, respectively.

        args : [..]
            see :py:class:`pygp.gpr.gp_base.GP`

        kwargs : {..}
            see :py:class:`pygp.gpr.gp_base.GP`
        """
        if training_data is not None:
            self.set_data(training_data)

        for name, model in self._models.iteritems():
            model.set_active_set_indices(interval_indices[name])
            try:
                if self._learn_hyperparameters:
                    opt_hyperparameters = opt_hyper(model,
                                                    self._initial_hyperparameters[name],
                                                    priors=self._priors[name],
                                                    *args, **kwargs)[0]
                    self._learned_hyperparameters[name] = opt_hyperparameters
                else:
                    self._learned_hyperparameters[name] = self._initial_hyperparameters[name]
            except ValueError as r:
                # fall back to the initial hyperparameters if optimization fails
                print "caught error:", r.message, "\r",
                self._learned_hyperparameters[name] = self._initial_hyperparameters[name]
            # evaluate the model evidence with this model's priors
            self._model_likelihoods[name] = model.LML(self._learned_hyperparameters[name],
                                                      priors=self._priors[name])
        return self._model_likelihoods
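
    # Sketch of a typical call, with `data` built as above; the result is a
    # model structure keyed by the individual and common model:
    #
    #   likelihoods = twosample.predict_model_likelihoods(data)
    #   likelihoods['individual'], likelihoods['common']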

    def predict_mean_variance(self, interpolation_interval,
                              hyperparams=None,
                              interval_indices=get_model_structure(),
                              *args, **kwargs):
        """
        Predict the mean and variance of both models.

        Returns::

            {'individual': {'mean': [pointwise mean], 'var': [pointwise variance]},
             'common': {'mean': [pointwise mean], 'var': [pointwise variance]}}

        **Parameters:**

        interpolation_interval : [double]
            The interval of inputs which shall be predicted.

        hyperparams : {'covar':logtheta, ...}
            Default: learned hyperparameters. Hyperparameters for the
            covariance function's prediction.

        interval_indices : {'common':[boolean], 'individual':[boolean]}
            Indices in which to predict, for each group, respectively.
        """
        if interpolation_interval.ndim < 2:
            interpolation_interval = interpolation_interval[:, None]
        if hyperparams is None:
            hyperparams = self._learned_hyperparameters
        self._predicted_mean_variance = get_model_structure()
        if not has_model_structure(interpolation_interval):
            interpolation_interval = get_model_structure(interpolation_interval,
                                                         interpolation_interval)
        for name, model in self._models.iteritems():
            model.set_active_set_indices(interval_indices[name])
            prediction = model.predict(hyperparams[name], interpolation_interval[name],
                                       var=True, *args, **kwargs)
            self._predicted_mean_variance[name] = {'mean': prediction[0],
                                                   'var': prediction[1]}
        self._interpolation_interval_cache = interpolation_interval
        return self._predicted_mean_variance
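
    # Sketch: predict on a dense grid spanning the (hypothetical) training
    # inputs t1:
    #
    #   interval = numpy.linspace(t1.min(), t1.max(), 100)
    #   mv = twosample.predict_mean_variance(interval)
    #   mv['individual']['mean']  # one pointwise mean per group
    #   mv['common']['var']       # pointwise predictive variance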

    def bayes_factor(self, model_likelihoods=None):
        """
        Return the log Bayes factor for the given log marginal likelihoods.

        **Parameters:**

        model_likelihoods : {'individual': *the individual likelihoods*, 'common': *the common likelihoods*}
            The likelihoods calculated by
            predict_model_likelihoods(training_data) for given training data.
        """
        if model_likelihoods is numpy.NaN:
            return numpy.NaN
        if model_likelihoods is None:
            model_likelihoods = self._model_likelihoods
        return model_likelihoods[common_id] - model_likelihoods[individual_id]
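
    # The score is a difference of log marginal likelihoods, i.e. a log Bayes
    # factor; the default plot title below reads it as log(p(H_I)/p(H_S)), so
    # large positive values favor the individual (differential) model. Sketch,
    # with 0 as an illustrative decision threshold:
    #
    #   score = twosample.bayes_factor()
    #   is_differential = score > 0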

    def get_covars(self):
        models = self._models
        return {individual_id: models[individual_id].covar,
                common_id: models[common_id].covar}

    def get_model_likelihoods(self):
        """
        Return all calculated likelihoods in model structure.
        If not yet calculated, the model structure contains None.
        """
        return self._model_likelihoods

    def get_learned_hyperparameters(self):
        """
        Return the learned hyperparameters in model structure, if already learned.
        """
        return self._learned_hyperparameters

    def get_predicted_mean_variance(self):
        """
        Get the predicted mean and variance as::

            {'individual': {'mean': [pointwise mean], 'var': [pointwise variance]},
             'common': {'mean': [pointwise mean], 'var': [pointwise variance]}}

        If not yet predicted, 'individual' and 'common' are returned empty.
        """
        return self._predicted_mean_variance

    def get_data(self, model=common_id, index=None):
        """
        Get the data of model `model` with group index `index`.
        If index is None, the whole model group will be returned.
        """
        if index is None:
            return self._models[model].getData()
        else:
            return self._models[model].getData()[index]

    def plot(self, xlabel="input", ylabel="output", title=None,
             interval_indices=None, alpha=None, legend=True,
             replicate_indices=None, shift=None, *args, **kwargs):
        """
        Plot the results given by the last prediction.

        Shows instance plots comparing the two groups to each other:

        **Differential Groups:**

        .. image:: ../images/plotGPTwoSampleDifferential.png
            :height: 8cm

        **Non-Differential Groups:**

        .. image:: ../images/plotGPTwoSampleSame.png
            :height: 8cm

        Returns:
            Proper rectangles for use in pylab.legend().
        """
        if self._predicted_mean_variance is None:
            print "Not yet predicted, or not predictable"
            return
        if interval_indices is None:
            interval_indices = get_model_structure(
                common=numpy.array(numpy.zeros_like(self.get_data(common_id)[0]), dtype='bool'),
                individual=numpy.array(numpy.ones_like(self.get_data(individual_id, 0)[0]), dtype='bool'))
        import pylab
        if title is None:
            title = r'Prediction result: $\log(p(\mathcal{H}_I)/p(\mathcal{H}_S)) = %.2f $' % (self.bayes_factor())
        legend_plots = []
        legend_names = []
        calc_replicate_indices = replicate_indices is None
        alpha_groups = alpha
        if alpha is not None:
            alpha_groups = 1 - alpha
        for name, value in self._predicted_mean_variance.iteritems():
            mean = value['mean']
            var = numpy.sqrt(value['var'])  # standard deviation for the sausage plot
            if len(mean.shape) > 1:
                # individual model: one prediction per group
                number_of_groups = mean.shape[0]
                for i in range(number_of_groups):
                    col_num = (i / (2. * number_of_groups))
                    col = cm.jet(col_num)  # @UndefinedVariable
                    x, y = self.get_data(name, i)
                    x, y = x.squeeze(), y.squeeze()
                    replicate_length = len(numpy.unique(x))
                    number_of_replicates = len(x) / replicate_length
                    if calc_replicate_indices:
                        # assume replicates are appended one after another
                        replicate_indices = []
                        curr = x[0] - 1
                        rep = 0
                        replicate_length = 0
                        for xi in x:
                            if xi < curr:
                                replicate_indices.append(numpy.repeat(rep, replicate_length))
                                rep += 1
                                replicate_length = 0
                            replicate_length += 1
                            curr = xi
                        replicate_indices.append(numpy.repeat(rep, replicate_length))
                        replicate_indices = numpy.concatenate(replicate_indices)
                    shifti = deepcopy(shift)
                    if shifti is not None:
                        shifti = shift[i * number_of_replicates:(i + 1) * number_of_replicates]
                        plot_sausage(self._interpolation_interval_cache[name] - numpy.mean(shifti),
                                     mean[i], var[i],
                                     format_fill={'alpha': 0.2, 'facecolor': col},
                                     format_line={'alpha': 1, 'color': col, 'lw': 3, 'ls': '--'},
                                     alpha=alpha_groups)
                    else:
                        plot_sausage(self._interpolation_interval_cache[name],
                                     mean[i], var[i],
                                     format_fill={'alpha': 0.2, 'facecolor': col},
                                     format_line={'alpha': 1, 'color': col, 'lw': 3, 'ls': '--'},
                                     alpha=alpha_groups)
                    plot_training_data(numpy.array(x), numpy.array(y),
                                       format_data={'alpha': .8, 'marker': '.',
                                                    'linestyle': '--', 'lw': 1,
                                                    'markersize': 6, 'color': col},
                                       replicate_indices=replicate_indices,
                                       shift=shifti, *args, **kwargs)
                    legend_plots.append(pylab.Rectangle((0, 0), 1, 1, alpha=.2,
                                                        fill=True, facecolor=col))
                    legend_names.append("%s %i" % (name, i + 1))
            else:
                # common model: one prediction for the pooled data
                col = cm.jet(1.)  # @UndefinedVariable
                legend_names.append("%s" % (name))
                plot_sausage(self._interpolation_interval_cache[name], mean, var,
                             format_fill={'alpha': 0.2, 'facecolor': col},
                             format_line={'alpha': 1, 'color': col, 'lw': 3, 'ls': '--'},
                             alpha=alpha)
                legend_plots.append(pylab.Rectangle((0, 0), 1, 1, alpha=.2,
                                                    fc=col, fill=True))
        if legend:
            pylab.legend(legend_plots, legend_names,
                         bbox_to_anchor=(0., 0., 1., 0.), loc=3,
                         ncol=2, mode="expand", borderaxespad=0.,
                         fancybox=False, frameon=False)
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.subplots_adjust(top=.88)
        pylab.title(title, fontsize=22)
        return legend_plots
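
    # End-to-end usage sketch (variable names hypothetical):
    #
    #   import pylab
    #   twosample.predict_model_likelihoods(data)
    #   twosample.predict_mean_variance(numpy.linspace(t1.min(), t1.max(), 100))
    #   twosample.plot()
    #   pylab.show()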

    ######### PRIVATE ##############

    def _invalidate_cache(self):
        # reset all cached results
        self._model_likelihoods = get_model_structure()
        self._learned_hyperparameters = get_model_structure()
        self._interpolation_interval_cache = None
        self._predicted_mean_variance = None

class TwoSampleShare(TwoSampleBase):
    """
    This class provides comparison of two timeline groups to each other.

    see :py:class:`gptwosample.twosample.twosample_base.TwoSampleBase` for a
    detailed description of the provided methods.
    """
    def __init__(self, covar, *args, **kwargs):
        """
        see :py:class:`gptwosample.twosample.twosample_base.TwoSampleBase`
        """
        if not kwargs.has_key('initial_hyperparameters'):
            kwargs['initial_hyperparameters'] = get_model_structure(
                individual={'covar': numpy.zeros(covar.get_number_of_parameters())},
                common={'covar': numpy.zeros(covar.get_number_of_parameters())})
        super(TwoSampleShare, self).__init__(*args, **kwargs)
        gpr1 = GP(deepcopy(covar), likelihood=GaussLikISO())
        gpr2 = GP(deepcopy(covar), likelihood=GaussLikISO())
        # set the models for this two-sample test: one GP per group for the
        # individual model, one pooled GP for the common model
        self._models = {individual_id: GroupGP([gpr1, gpr2]),
                        common_id: GP(deepcopy(covar), likelihood=GaussLikISO())}
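
# Construction sketch, assuming your pygp version exposes a squared-exponential
# ARD covariance as pygp.covar.se.SqexpCFARD:
#
#   from pygp.covar.se import SqexpCFARD
#   covar = SqexpCFARD(1)  # one input dimension
#   twosample = TwoSampleShare(covar)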

class TwoSampleSeparate(TwoSampleBase):
    """
    This class provides comparison of two timeline groups to one another,
    including time shifts in replicates, respectively.

    see :py:class:`gptwosample.twosample.twosample_base.TwoSampleBase` for a
    detailed description of the provided methods.

    Note that this model needs one covariance function for each model,
    respectively!
    """
    def __init__(self, covar_individual_1, covar_individual_2, covar_common, **kwargs):
        """
        see :py:class:`gptwosample.twosample.twosample_base.TwoSampleBase`
        """
        if not kwargs.has_key('initial_hyperparameters'):
            kwargs['initial_hyperparameters'] = get_model_structure(
                individual={'covar': numpy.zeros(covar_individual_1.get_number_of_parameters())},
                common={'covar': numpy.zeros(covar_common.get_number_of_parameters())})
        super(TwoSampleSeparate, self).__init__(**kwargs)
        gpr1 = GP(deepcopy(covar_individual_1), likelihood=GaussLikISO())
        gpr2 = GP(deepcopy(covar_individual_2), likelihood=GaussLikISO())
        # set the models for this two-sample test
        self._models = {individual_id: GroupGP([gpr1, gpr2]),
                        common_id: GP(deepcopy(covar_common), likelihood=GaussLikISO())}
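
# Construction sketch for the separate-covariance variant (imports assumed as
# above); the three covariance functions may differ, e.g. to model replicate
# time shifts in the individual models:
#
#   twosample = TwoSampleSeparate(SqexpCFARD(1), SqexpCFARD(1), SqexpCFARD(1))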