Source code for gptwosample.confounder.confounder

'''
Confounder Learning and Correction Module
-----------------------------------------

@author: Max Zwiessele
'''
import numpy
from pygp.covar.linear import LinearCFISO
from pygp.covar.combinators import SumCF, ProductCF
from pygp.covar.se import SqexpCFARD
from pygp.gp.gplvm import GPLVM
from pygp.optimize.optimize_base import opt_hyper
from pygp.covar.fixed import FixedCF
from pygp.likelihood.likelihood_base import GaussLikISO
from pygp.covar.bias import BiasCF
from pygp.util.pca import PCA
from gptwosample.twosample.twosample import TwoSample

class TwoSampleConfounder(TwoSample):
    """Run GPTwoSample on given Data

    **Parameters**:
        - T : TimePoints [n x r x t] [Samples x Replicates x Timepoints]
        - Y : ExpressionMatrix [n x r x t x d] [Samples x Replicates x Timepoints x Genes]
        - q : Number of Confounders to use
        - lvm_covariance : optional - set covariance to use in confounder learning
        - init : [random, pca]

    **Fields**:
        * T: Time Points [n x r x t] [Samples x Replicates x Timepoints]
        * Y: Expression [n x r x t x d] [Samples x Replicates x Timepoints x Genes]
        * X: Confounders [nrt x 1+q] [SamplesReplicatesTimepoints x T+q]
        * lvm_covariance: GPLVM covariance function used for confounder learning
        * n: Samples
        * r: Replicates
        * t: Timepoints
        * d: Genes
        * q: Confounder Components
    """
    def __init__(self, T, Y, q=4,
                 lvm_covariance=None,
                 init="random",
                 covar_common=None,
                 covar_individual_1=None,
                 covar_individual_2=None):
        super(TwoSampleConfounder, self).__init__(T, Y,
                                                  covar_common=covar_common,
                                                  covar_individual_1=covar_individual_1,
                                                  covar_individual_2=covar_individual_2)
        self.q = q
        self.init = init
        self.init_X(Y, init)
        if lvm_covariance is not None:
            self._lvm_covariance = lvm_covariance
        else:
            rt = self.r * self.t
            # One-hot indicator matrix marking which sample each row belongs to.
            self.X_s = numpy.zeros((self.n * rt, self.n))
            for i in xrange(self.n):
                self.X_s[i * rt:(i + 1) * rt, i] = 1
            # Linear kernel on the sample indicator columns of the GPLVM input.
            sam = LinearCFISO(dimension_indices=numpy.arange(1 + q, 1 + q + self.n))
            # Confounder covariance: a linear kernel over the q latent confounder
            # dimensions, plus sample structure, a sample-by-time interaction,
            # and a bias term.
            self._lvm_covariance = SumCF([LinearCFISO(dimension_indices=numpy.arange(1, 1 + q)),
                                          sam,
                                          ProductCF([sam, SqexpCFARD(dimension_indices=numpy.array([0]))]),
                                          BiasCF()])
        self._initialized = False
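    # A minimal, commented usage sketch (not part of the class): the shapes and
    # the q/init choices below are illustrative assumptions, not requirements.
    #
    #   T = numpy.tile(numpy.arange(0, 16, 2), (2, 3, 1))   # [n=2 x r=3 x t=8]
    #   Y = numpy.random.randn(2, 3, 8, 10)                 # [n x r x t x d=10]
    #   model = TwoSampleConfounder(T, Y, q=2, init="pca")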
    def learn_confounder_matrix(self,
                                ard_indices=None,
                                x=None,
                                messages=True,
                                gradient_tolerance=1E-12,
                                lvm_dimension_indices=None,
                                gradcheck=False,
                                maxiter=10000,
                                ):
        """
        Learn confounder matrix with this model.

        **Parameters**:
            x : array-like
                If you provided your own lvm_covariance, you have to specify the X to use within GPLVM.
            lvm_dimension_indices : [int]
                If you specified your own lvm_covariance, you have to specify the dimension indices for GPLVM.
            ard_indices : [indices]
                If you provided your own lvm_covariance, give the ARD indices of the covariance here, so the correct hyperparameters are used when calculating the confounder covariance matrix.
        """
        if lvm_dimension_indices is None:
            lvm_dimension_indices = numpy.arange(1, 1 + self.q)
        if ard_indices is None:
            ard_indices = slice(0, self.q)
        self._check_data()
        self.init_X(self.Y, self.init)
        if x is None:
            x = self._x()
        self._Xlvm = x
        self.gplvm = self._gplvm(lvm_dimension_indices)
        hyper = {
            'lik': numpy.log([.15]),
            'x': self.X,
            'covar': numpy.zeros(self._lvm_covariance.get_number_of_parameters()),
        }
        # Optimize GPLVM hyperparameters and latent confounders jointly.
        lvm_hyperparams, _ = opt_hyper(self.gplvm, hyper,
                                       Ifilter=None,
                                       maxiter=maxiter,
                                       gradcheck=gradcheck,
                                       bounds=None,
                                       messages=messages,
                                       gradient_tolerance=gradient_tolerance)
        self._Xlvm = self.gplvm.getData()[0]
        self._init_conf_matrix(lvm_hyperparams, ard_indices, lvm_dimension_indices)
        self.initialize_twosample_covariance()
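    # A commented sketch of running with a custom lvm_covariance (hedged: the
    # covariance choice below is an illustrative assumption, not the package's
    # recommended setup). As documented above, x, lvm_dimension_indices and
    # ard_indices must then be supplied consistently with that covariance:
    #
    #   q = 2
    #   custom_covar = SumCF([SqexpCFARD(dimension_indices=numpy.array([0])),
    #                         LinearCFISO(dimension_indices=numpy.arange(1, 1 + q)),
    #                         BiasCF()])
    #   model = TwoSampleConfounder(T, Y, q=q, lvm_covariance=custom_covar)
    #   x = numpy.concatenate((T.reshape(-1, 1), model.X), axis=1)
    #   model.learn_confounder_matrix(x=x,
    #                                 lvm_dimension_indices=numpy.arange(1, 1 + q),
    #                                 ard_indices=slice(0, q))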
    def predict_lvm(self):
        return self.gplvm.predict(self._lvm_hyperparams, self._Xlvm, numpy.arange(self.d))
    def initialize_twosample_covariance(self,
                                        covar_common=lambda x: SumCF([SqexpCFARD(1), x, BiasCF()]),
                                        covar_individual_1=lambda x: SumCF([SqexpCFARD(1), x, BiasCF()]),
                                        covar_individual_2=lambda x: SumCF([SqexpCFARD(1), x, BiasCF()]),
                                        ):
        """
        Initialize the twosample covariance with the function covariance(XX),
        where XX is a FixedCF holding the learned confounder matrix.

        Default is SumCF([SqexpCFARD(1), FixedCF(self.K_conf.copy()), BiasCF()]).
        """
        self.covar_comm = covar_common(FixedCF(self.K_conf.copy()))
        self.covar_ind1 = covar_individual_1(FixedCF(self.K_conf.copy()))
        self.covar_ind2 = covar_individual_2(FixedCF(self.K_conf.copy()))
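    # A commented customization sketch (hedged): each argument is a function
    # receiving the FixedCF built from the learned confounder matrix K_conf;
    # the product form below is an illustrative alternative to the default sum.
    #
    #   model.initialize_twosample_covariance(
    #       covar_common=lambda xx: SumCF([ProductCF([SqexpCFARD(1), xx]), BiasCF()]))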
    def init_X(self, Y, init):
        if init == 'pca':
            y = Y.reshape(-1, self.d)
            p = PCA(y)
            self.X = p.project(y, self.q)
            # Jitter the PCA projection to break symmetries in optimization.
            self.X += .1 * numpy.random.randn(*self.X.shape)
        elif init == 'random':
            self.X = numpy.random.randn(self.n * self.r * self.t, self.q)
        else:
            raise ValueError("init model {0!s} not known".format(init))

    def _init_conf_matrix(self, lvm_hyperparams, ard_indices, lvm_dimension_indices):
        # ard_indices is kept for the call signature; the confounder covariance
        # is read off the learned GPLVM covariance directly.
        self._initialized = True
        self.X = lvm_hyperparams['x']
        self._lvm_hyperparams = lvm_hyperparams
        if not hasattr(self, 'gplvm'):
            self.gplvm = self._gplvm(lvm_dimension_indices)
        self.K_conf = self._lvm_covariance.K(self._lvm_hyperparams['covar'],
                                             self._Xlvm, self._Xlvm,
                                             names=['XX'])

    def _gplvm(self, lvm_dimension_indices):
        self._Xlvm[:, lvm_dimension_indices] = self.X
        Y = self.Y.reshape(self.n * self.r * self.t, self.Y.shape[3])
        return GPLVM(gplvm_dimensions=lvm_dimension_indices,
                     covar_func=self._lvm_covariance,
                     likelihood=GaussLikISO(),
                     x=self._Xlvm, y=Y)

    def _x(self):
        # GPLVM input: [time | latent confounders | sample indicators].
        return numpy.concatenate((self.T.reshape(-1, 1),
                                  self.X,
                                  self.X_s), axis=1)
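    # Layout of the GPLVM input assembled by _x (an illustrative sketch with
    # assumed sizes n=2, r=3, t=8, q=4; the input is then [48 x 7]):
    #
    #   x[:, 0]      time points T
    #   x[:, 1:5]    latent confounders X (optimized during GPLVM learning)
    #   x[:, 5:7]    fixed one-hot sample indicators X_s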
if __name__ == '__main__':
    # Simulate a small two-sample time series dataset.
    Tt = numpy.arange(0, 16, 2)[:, None]
    Tr = numpy.tile(Tt, 3).T
    Ts = numpy.array([Tr, Tr])
    n, r, t, d = nrtd = Ts.shape + (12,)

    covar = SqexpCFARD(1)
    K = covar.K(covar.get_de_reparametrized_theta([1, 13]), Tt)
    m = numpy.zeros(t)

    try:
        from scikits.learn.mixture import sample_gaussian
    except ImportError:
        raise ImportError("scikits.learn is needed for this example")

    # Draw gene expression curves from the GP prior.
    y1 = sample_gaussian(m, K, cvtype='full', n_samples=d)
    y2 = sample_gaussian(m, K, cvtype='full', n_samples=d)

    # Append d/2 genes that are shared (non-differential) between the samples.
    Y1 = numpy.zeros((t, d + d / 2))
    Y2 = numpy.zeros((t, d + d / 2))
    Y1[:, :d] = y1
    Y2[:, :d] = y2
    sames = numpy.random.randint(0, d, size=d / 2)
    Y1[:, d:] = y2[:, sames]
    Y2[:, d:] = y1[:, sames]

    # Add replicate noise.
    Y = numpy.zeros((n, r, t, d + d / 2))
    sigma = .5
    Y[0, :, :, :] = Y1 + sigma * numpy.random.randn(r, t, d + d / 2)
    Y[1, :, :, :] = Y2 + sigma * numpy.random.randn(r, t, d + d / 2)

    c = TwoSampleConfounder(Ts, Y)
    c.learn_confounder_matrix()
    c.predict_likelihoods(Ts, Y)
    c.predict_means_variances(numpy.linspace(0, 24, 100))

    import pylab
    pylab.ion()
    pylab.figure()
    for _ in c.plot():
        raw_input("enter to continue")
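    # A hedged extension of the demo: after confounder learning, the fitted
    # GPLVM can also be queried for its reconstruction of the expression
    # matrix (assuming predict_lvm returns a (mean, variance) pair).
    mean_lvm, var_lvm = c.predict_lvm()
    print(mean_lvm.shape)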