1 """
2 Mixture models for multi-dimensional data.
3
4 Reference: Hirsch M, Habeck M. - Bioinformatics. 2008 Oct 1;24(19):2184-92
5 """
6 import numpy
7
8 from abc import ABCMeta, abstractmethod
12 """
13 Gaussian mixture model for multi-dimensional data.
14 """
15 _axis = None
16
17
18 ALPHA_SIGMA = 0.0001
19 BETA_SIGMA = 0.01
20 MIN_SIGMA = 0.0
21
22 use_cache = True
23
    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi dimensional input vector with samples along first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train model
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        self._dimension = numpy.prod(X.shape) // N

        # evenly partition the component axis to initialize the indicators
        c = numpy.linspace(0, K, N, endpoint=False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()
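
    # Usage sketch (illustrative only; synthetic data, the names below are
    # not part of the API). Construction with train=True runs EM right away:
    #
    # >>> X = numpy.concatenate([numpy.random.standard_normal((50, 2)),
    # ...                        numpy.random.standard_normal((50, 2)) + 5.])
    # >>> gmm = GaussianMixture(X, K=2)
    # >>> (gmm.K, gmm.N, gmm.M)
    # (2, 100, 100)
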
    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

83 """Clear model parameter cache (force recalculation)"""
84 self._w = None
85 self._sigma = None
86 self._delta = None
87
    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, means):
        if numpy.shape(means) != self._means.shape:
            raise ValueError('shape mismatch')
        self._means = numpy.asarray(means)
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, scales):
        if numpy.shape(scales) != self._scales.shape:
            raise ValueError('shape mismatch')
        self._scales = numpy.asarray(scales)
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            # Bayesian estimate of the component variances, assuming an
            # inverse-gamma prior parameterized by ALPHA_SIGMA and BETA_SIGMA
            n = self.scales.sum(1)
            alpha = 2 * self.ALPHA_SIGMA + self.dimension * n
            beta = 2 * self.BETA_SIGMA + (self.scales * self.delta.T).sum(1)
            self._sigma = numpy.sqrt(beta / alpha).clip(min=self.MIN_SIGMA)
        return self._sigma

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            # squared deviation of each data point from each component mean,
            # summed over all axes except the component axis
            self._delta = numpy.transpose(
                [[d.sum()
                  for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                                           for m in range(self.M)], 0, self._axis)]
                 for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood_reduced(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp
        # sum_n log sum_k w_k N(x_n; mu_k, sigma_k**2)
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum
        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        return sum(n * log(self.w)) - 0.5 * \
               (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

187 """
188 Training point number C{m} as if it would belong to component C{k}
189 @rtype: numpy array
190 """
191 return self._X[m]
192
194 """
195 Update means from current model and samples
196 """
197 n = self.scales.sum(1)
198 self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
199 for m in range(self.M)], 0) / n[k]
200 for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update scales from current model and samples

        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp
        # tempered posterior responsibilities, normalized per data point;
        # consistent with the marginal likelihood above
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = beta * (log(self.w) - 0.5 * (self.delta / s_sq +
                                             self.dimension * log(2 * numpy.pi * s_sq)))
        self.scales = numpy.exp(log_p.T - log_sum_exp(log_p.T))

222 """
223 Random C{scales} initialization
224 """
225 from numpy.random import random, multinomial
226 if ordered:
227 K, N = self.scales.shape
228 Ks = numpy.arange(K)
229 w = random(K) + (5. * K / N)
230 c = numpy.repeat(Ks, multinomial(N, w / w.sum()))
231 self.scales = numpy.equal.outer(Ks, c).astype(float)
232 else:
233 s = random(self.scales.shape)
234 self.scales = s / s.sum(0)
235 self.estimate_means()
236
238 """
239 Expectation step for EM
240 @param beta: inverse temperature
241 @type beta: float
242 """
243 self.estimate_scales(beta)
244
246 """
247 Maximization step for EM
248 """
249 self.estimate_means()
250
    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
        """
        LL_prev = -numpy.inf
        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

273 """
274 Deterministic annealing
275
276 @param betas: sequence of inverse temperatures
277 @type betas: iterable of floats
278 """
279 for beta in betas:
280 self.m_step()
281 self.e_step(beta)
282
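    # Annealing sketch (illustrative schedule, not prescribed by the module):
    # ramp beta towards 1 to smooth the early responsibilities, then polish
    # with plain EM:
    #
    # >>> gmm.anneal(numpy.linspace(0.1, 1.0, 50))
    # >>> gmm.em()
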
284 """
285 Split component with largest sigma
286
287 @returns: new instance of mixture with incremented C{K}
288 @rtype: L{GaussianMixture} subclass
289 """
290 i = self.sigma.argmax()
291
292
293 Z = numpy.vstack([self.scales, self.scales[i]])
294
295
296 mask = Z[i].cumsum() / Z[i].sum() > 0.5
297 Z[i, mask] *= 0.0
298 Z[-1, ~mask] *= 0.0
299
300 new = type(self)(self._X, self.K + 1, False, self._axis)
301 new.scales = Z
302 new.m_step()
303 if train:
304 new.em()
305
306 return new
307
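    # Split-and-refit sketch (illustrative):
    #
    # >>> bigger = gmm.increment_K()
    # >>> bigger.K == gmm.K + 1
    # True
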
    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator with mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
        """
        mixture = cls(X, start)
        yield mixture

        for K in range(start + 1, stop):
            mixture = mixture.increment_K()
            yield mixture

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess best C{K} according
        to L{BIC<GaussianMixture.BIC>}.

        @param X: multi dimensional input vector with samples along first axis
        @type X: (M,...) numpy array

        @return: Mixture instance
        @rtype: L{GaussianMixture} subclass
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # grow K until the BIC stops improving
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture

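    # Model-selection sketch (illustrative): let the factory pick K by BIC:
    #
    # >>> gmm = GaussianMixture.new(X)
    # >>> gmm.K
    # 2
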
    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Membership array
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)

376 """
377 Similarity of two mixtures measured in membership overlap
378
379 @param other: Mixture or membership array
380 @type other: L{GaussianMixture} or sequence
381
382 @return: segmentation overlap
383 @rtype: float in interval [0.0, 1.0]
384 """
385 if isinstance(other, GaussianMixture):
386 other_w = other.membership
387 K = min(self.K, other.K)
388 elif isinstance(other, (list, tuple, numpy.ndarray)):
389 other_w = other
390 K = min(self.K, len(set(other)))
391 else:
392 raise TypeError('other')
393
394 self_w = self.membership
395 if len(self_w) != len(other_w):
396 raise ValueError('self.N != other.N')
397
398
399 ww = tuple(zip(self_w, other_w))
400 same = sum(sorted(ww.count(i) for i in set(ww))[-K:])
401
402 return float(same) / len(ww)
403
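# Overlap sketch (illustrative): compare a fitted mixture with a reference
# labeling of the same N data points:
#
# >>> gmm.overlap([0] * 100 + [1] * 100)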
405 """
406 Abstract mixture model for protein structure ensembles.
407 """
408 __metaclass__ = ABCMeta
409
410 - def __init__(self, X, K, *args, **kwargs):
411 if len(X.shape) != 3 or X.shape[-1] != 3:
412 raise ValueError('X must be array of shape (M,N,3)')
413
414 self._R = numpy.zeros((len(X), K, 3, 3))
415 self._t = numpy.zeros((len(X), K, 3))
416
417 super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)
418
    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # superimpose sample m onto the frame of component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate superpositions
        """
        raise NotImplementedError

450 """
451 Gaussian mixture model for protein structure ensembles using a set of segments
452
453 If C{X} is the coordinate array of a protein structure ensemble which
454 can be decomposed into 2 rigid segments, the segmentation will be found by:
455
456 >>> mixture = SegmentMixture(X, 2)
457
458 The segment membership of each atom is given by:
459
460 >>> mixture.membership
461 array([0, 0, 0, ..., 1, 1, 1])
462 """
463 _axis = 1
464
466 from csb.bio.utils import wfit
467 for m in range(self.M):
468 for k in range(self.K):
469 self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])
470
472
473 self.means = numpy.mean([[self.datapoint(m, k)
474 for m in range(self.M)]
475 for k in range(self.K)], 1)
476
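

if __name__ == '__main__':
    # Minimal smoke test (illustrative only; requires csb for the numeric
    # helpers imported above). Fit synthetic 1-D data drawn from two
    # well-separated Gaussians and let BIC pick the number of components.
    numpy.random.seed(0)
    X = numpy.concatenate([numpy.random.standard_normal((100, 1)) - 5.,
                           numpy.random.standard_normal((100, 1)) + 5.])

    gmm = GaussianMixture.new(X)
    print('K = {0}, BIC = {1:.2f}'.format(gmm.K, gmm.BIC))
    print('means: {0}'.format(gmm.means.ravel()))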