
Source Code for Module csb.statistics.mixtures

"""
Mixture models for multi-dimensional data.

Reference: Hirsch M, Habeck M. - Bioinformatics. 2008 Oct 1;24(19):2184-92
"""
import numpy

from abc import ABCMeta, abstractmethod


class GaussianMixture(object):
    """
    Gaussian mixture model for multi-dimensional data.
    """
    _axis = None

    # prior for variance (inverse Gamma distribution)
    ALPHA_SIGMA = 0.0001
    BETA_SIGMA = 0.01
    MIN_SIGMA = 0.0

    use_cache = True

    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train the model immediately (run EM)
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        # integer division keeps the dimension an int under Python 3 as well
        self._dimension = numpy.prod(X.shape) // N

        # deterministic initial responsibilities: contiguous, equal-sized blocks
        c = numpy.linspace(0, K, N, False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()
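
    # Usage sketch (added annotation, not part of the original module): fit a
    # two-component mixture to synthetic one-dimensional data; the names below
    # are illustrative only.
    #
    # >>> import numpy
    # >>> X = numpy.concatenate([numpy.random.normal(-5, 1, (100, 1)),
    # ...                        numpy.random.normal(+5, 1, (100, 1))])
    # >>> gm = GaussianMixture(X, 2)    # trains via EM right away
    # >>> gm.means.ravel()              # close to [-5, 5], up to label order
    # >>> gm.membership                 # hard assignment of each sample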

    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of the component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

    def del_cache(self):
        """Clear the cached model parameters (force recalculation)"""
        self._w = None
        self._sigma = None
        self._delta = None

    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, means):
        if means.shape != self._means.shape:
            raise ValueError('shape mismatch')
        self._means = means
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, scales):
        if scales.shape != self._scales.shape:
            raise ValueError('shape mismatch')
        self._scales = scales
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            alpha = self.dimension * self.scales.sum(1) + self.ALPHA_SIGMA
            beta = (self.delta * self.scales.T).sum(0) + self.BETA_SIGMA
            self._sigma = numpy.sqrt(beta / alpha).clip(self.MIN_SIGMA)
        return self._sigma
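
    # Annotation (added, not in the original source): with the inverse gamma
    # prior IG(ALPHA_SIGMA, BETA_SIGMA) on the variance, the estimate above is
    #
    #   sigma_k^2 = (sum_n Z_nk * delta_nk + BETA_SIGMA)
    #             / (dim * sum_n Z_nk + ALPHA_SIGMA)
    #
    # i.e. a per-component mean squared deviation, regularized by the prior.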

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            # for each component k: square the deviations of all M samples,
            # move the component axis to the front and sum out everything
            # else, leaving one squared distance per position on that axis
            self._delta = numpy.transpose([[d.sum()
                          for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                                                   for m in range(self.M)], 0, self._axis)]
                          for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood_reduced(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp

        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()
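
    # Annotation (added): the quantity above is
    #
    #   sum_n log sum_k w_k N(x_n | mu_k, sigma_k^2 I)
    #
    # evaluated in log space via log_sum_exp for numerical stability.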

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum

        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)

        return sum(n * log(self.w)) - 0.5 * \
               (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

    def datapoint(self, m, k):
        """
        Training point number C{m}, as if it belonged to component C{k}
        @rtype: numpy array
        """
        return self._X[m]

    def estimate_means(self):
        """
        Update the means from the current model and samples
        """
        n = self.scales.sum(1)
        self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
                                             for m in range(self.M)], 0) / n[k]
                                  for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update the scales from the current model and samples
        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp, exp

        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        Z = (log(self.w) - 0.5 * (self.delta / s_sq + self.dimension * log(s_sq))) * beta
        self.scales = exp(Z.T - log_sum_exp(Z.T))
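
    # Annotation (added): this is the usual posterior responsibility update,
    #
    #   Z_nk  proportional to  [w_k N(x_n | mu_k, sigma_k^2 I)]^beta
    #
    # normalized over k (the constant factor cancels in the normalization);
    # beta < 1 flattens the posterior, which is what anneal() exploits.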

    def randomize_means(self):
        """
        Pick C{K} random samples from C{X} as means
        """
        import random
        # sample row indices rather than rows: random.sample() does not
        # accept numpy arrays on newer Python versions
        indices = random.sample(range(self.M), self.K)
        self.means = numpy.asarray(self._X[indices])
        self.estimate_scales()

    def randomize_scales(self, ordered=True):
        """
        Random C{scales} initialization

        @param ordered: assign contiguous blocks of the component axis to the
                        components (with random block sizes) instead of fully
                        random responsibilities
        @type ordered: bool
        """
        from numpy.random import random, multinomial

        if ordered:
            K, N = self.scales.shape
            Ks = numpy.arange(K)
            w = random(K) + (5. * K / N)  # with pseudocounts
            c = numpy.repeat(Ks, multinomial(N, w / w.sum()))
            self.scales = numpy.equal.outer(Ks, c).astype(float)
        else:
            s = random(self.scales.shape)
            self.scales = s / s.sum(0)

        self.estimate_means()

    def e_step(self, beta=1.0):
        """
        Expectation step for EM
        @param beta: inverse temperature
        @type beta: float
        """
        self.estimate_scales(beta)

    def m_step(self):
        """
        Maximization step for EM
        """
        self.estimate_means()

    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
        """
        LL_prev = -numpy.inf

        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

    def anneal(self, betas):
        """
        Deterministic annealing

        @param betas: sequence of inverse temperatures
        @type betas: iterable of floats
        """
        for beta in betas:
            self.m_step()
            self.e_step(beta)
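
    # Usage sketch (added): re-estimate with a slow heating schedule instead
    # of plain EM; the schedule below is illustrative, not prescribed by the
    # reference paper.
    #
    # >>> gm.randomize_scales()
    # >>> gm.anneal(numpy.linspace(0.1, 1.0, 50))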

    def increment_K(self, train=True):
        """
        Split the component with the largest sigma

        @param train: train the new model by EM
        @type train: bool

        @returns: new mixture instance with incremented C{K}
        @rtype: L{GaussianMixture} subclass
        """
        i = self.sigma.argmax()

        # duplicate the responsibility row of the widest component
        Z = numpy.vstack([self.scales, self.scales[i]])

        # mask two disjoint parts of equal responsibility mass
        mask = Z[i].cumsum() / Z[i].sum() > 0.5
        Z[i, mask] *= 0.0
        Z[-1, ~mask] *= 0.0

        new = type(self)(self._X, self.K + 1, False, self._axis)
        new.scales = Z
        new.m_step()
        if train:
            new.em()

        return new
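
    # Usage sketch (added): grow the model by one component and compare fits.
    #
    # >>> gm3 = gm.increment_K()
    # >>> gm3.K == gm.K + 1
    # True
    # >>> gm3.BIC < gm.BIC       # lower BIC indicates the better model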

    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator over mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
        """
        mixture = cls(X, start)
        yield mixture

        for _ in range(start + 1, stop):
            mixture = mixture.increment_K()
            yield mixture
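
    # Usage sketch (added): scan the candidate model sizes and report the BIC
    # of each trained mixture.
    #
    # >>> for candidate in GaussianMixture.series(X, 1, 6):
    # ...     print(candidate.K, candidate.BIC)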

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess the best C{K}
        according to L{BIC<GaussianMixture.BIC>}.

        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @param K: number of components (0 to select by BIC)
        @type K: int

        @return: mixture instance
        @rtype: L{GaussianMixture} subclass
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # increase K as long as the next candidate looks better
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture
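
    # Usage sketch (added): let the factory pick K via BIC.
    #
    # >>> gm = GaussianMixture.new(X)
    # >>> gm.K                   # smallest K before the BIC starts to rise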

    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        # weighted average of the component variances serves as the error variance
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Hard component membership of each position on the component axis
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)

    def overlap(self, other):
        """
        Similarity of two mixtures, measured as membership overlap

        @param other: mixture or membership array
        @type other: L{GaussianMixture} or sequence

        @return: segmentation overlap
        @rtype: float in interval [0.0, 1.0]
        """
        if isinstance(other, GaussianMixture):
            other_w = other.membership
            K = min(self.K, other.K)
        elif isinstance(other, (list, tuple, numpy.ndarray)):
            other_w = other
            K = min(self.K, len(set(other)))
        else:
            raise TypeError('unsupported type for other: {0}'.format(type(other).__name__))

        self_w = self.membership
        if len(self_w) != len(other_w):
            raise ValueError('self.N != other.N')

        # component labels might be permuted, so count equal membership pairs
        # and keep only the K most populated label combinations
        ww = tuple(zip(self_w, other_w))
        same = sum(sorted(ww.count(i) for i in set(ww))[-K:])

        return float(same) / len(ww)
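
    # Usage sketch (added): compare two independently trained segmentations of
    # an (M, N, 3) ensemble X; overlap() is invariant to the arbitrary
    # numbering of the components.
    #
    # >>> a = SegmentMixture(X, 2)
    # >>> b = SegmentMixture(X, 2)
    # >>> a.overlap(b)           # 1.0 if both found the same segmentation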


class AbstractStructureMixture(GaussianMixture):
    """
    Abstract mixture model for protein structure ensembles.
    """
    __metaclass__ = ABCMeta

    def __init__(self, X, K, *args, **kwargs):
        if len(X.shape) != 3 or X.shape[-1] != 3:
            raise ValueError('X must be an array of shape (M,N,3)')

        # per-structure, per-component superposition transforms
        self._R = numpy.zeros((len(X), K, 3, 3))
        self._t = numpy.zeros((len(X), K, 3))

        super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)

    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # structure m, superposed onto component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate the superposition transforms
        """
        raise NotImplementedError


class SegmentMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of segments

    If C{X} is the coordinate array of a protein structure ensemble which
    can be decomposed into 2 rigid segments, the segmentation will be found by:

    >>> mixture = SegmentMixture(X, 2)

    The segment membership of each atom is given by:

    >>> mixture.membership
    array([0, 0, 0, ..., 1, 1, 1])
    """
    _axis = 1

    def estimate_T(self):
        from csb.bio.utils import wfit

        # scale-weighted superposition of each structure onto each segment mean
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])

    def estimate_means(self):
        # superpositions are weighted, so do an unweighted mean here
        self.means = numpy.mean([[self.datapoint(m, k)
                                  for m in range(self.M)]
                                 for k in range(self.K)], 1)


class ConformerMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of conformers

    If C{mixture} is a trained model, the ensemble coordinate array of
    structures from C{X} which belong to conformation C{k} is given by:

    >>> indices = numpy.where(mixture.membership == k)[0]
    >>> conformer = [mixture.datapoint(m, k) for m in indices]
    """
    _axis = 0

    def estimate_T(self):
        from csb.bio.utils import fit

        # unweighted superposition of each structure onto each conformer mean
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = fit(self._X[m], self.means[k])
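
# Usage sketch (added, illustrative): classify an (M, N, 3) ensemble into
# conformers with K chosen by BIC, then collect the superposed members of
# conformer 0.
#
# >>> mixture = ConformerMixture.new(X)
# >>> members = numpy.where(mixture.membership == 0)[0]
# >>> conformer0 = [mixture.datapoint(m, 0) for m in members]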

# vi:expandtab:smarttab:sw=4