Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/kde.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#-------------------------------------------------------------------------------
2#
3# Define classes for (uni/multi)-variate kernel density estimation.
4#
5# Currently, only Gaussian kernels are implemented.
6#
7# Written by: Robert Kern
8#
9# Date: 2004-08-09
10#
11# Modified: 2005-02-10 by Robert Kern.
12# Contributed to SciPy
13# 2005-10-07 by Robert Kern.
14# Some fixes to match the new scipy_core
15#
16# Copyright 2004-2005 by Enthought, Inc.
17#
18#-------------------------------------------------------------------------------
20# Standard library imports.
21import warnings
23# SciPy imports.
24from scipy import linalg, special
25from scipy.special import logsumexp
26from scipy._lib._util import check_random_state
28from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, dot, exp, pi,
29 sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
30 ones, cov)
31import numpy as np
33# Local imports.
34from . import mvn
35from ._stats import gaussian_kernel_estimate
38__all__ = ['gaussian_kde']
41class gaussian_kde(object):
42 """Representation of a kernel-density estimate using Gaussian kernels.
44 Kernel density estimation is a way to estimate the probability density
45 function (PDF) of a random variable in a non-parametric way.
46 `gaussian_kde` works for both uni-variate and multi-variate data. It
47 includes automatic bandwidth determination. The estimation works best for
48 a unimodal distribution; bimodal or multi-modal distributions tend to be
49 oversmoothed.
51 Parameters
52 ----------
53 dataset : array_like
54 Datapoints to estimate from. In case of univariate data this is a 1-D
55 array, otherwise a 2-D array with shape (# of dims, # of data).
56 bw_method : str, scalar or callable, optional
57 The method used to calculate the estimator bandwidth. This can be
58 'scott', 'silverman', a scalar constant or a callable. If a scalar,
59 this will be used directly as `kde.factor`. If a callable, it should
60 take a `gaussian_kde` instance as only parameter and return a scalar.
61 If None (default), 'scott' is used. See Notes for more details.
62 weights : array_like, optional
63 weights of datapoints. This must be the same shape as dataset.
64 If None (default), the samples are assumed to be equally weighted
66 Attributes
67 ----------
68 dataset : ndarray
69 The dataset with which `gaussian_kde` was initialized.
70 d : int
71 Number of dimensions.
72 n : int
73 Number of datapoints.
74 neff : int
75 Effective number of datapoints.
77 .. versionadded:: 1.2.0
78 factor : float
79 The bandwidth factor, obtained from `kde.covariance_factor`, with which
80 the covariance matrix is multiplied.
81 covariance : ndarray
82 The covariance matrix of `dataset`, scaled by the calculated bandwidth
83 (`kde.factor`).
84 inv_cov : ndarray
85 The inverse of `covariance`.
87 Methods
88 -------
89 evaluate
90 __call__
91 integrate_gaussian
92 integrate_box_1d
93 integrate_box
94 integrate_kde
95 pdf
96 logpdf
97 resample
98 set_bandwidth
99 covariance_factor
101 Notes
102 -----
103 Bandwidth selection strongly influences the estimate obtained from the KDE
104 (much more so than the actual shape of the kernel). Bandwidth selection
105 can be done by a "rule of thumb", by cross-validation, by "plug-in
106 methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
107 uses a rule of thumb, the default is Scott's Rule.
109 Scott's Rule [1]_, implemented as `scotts_factor`, is::
111 n**(-1./(d+4)),
113 with ``n`` the number of data points and ``d`` the number of dimensions.
114 In the case of unequally weighted points, `scotts_factor` becomes::
116 neff**(-1./(d+4)),
118 with ``neff`` the effective number of datapoints.
119 Silverman's Rule [2]_, implemented as `silverman_factor`, is::
121 (n * (d + 2) / 4.)**(-1. / (d + 4)).
123 or in the case of unequally weighted points::
125 (neff * (d + 2) / 4.)**(-1. / (d + 4)).
127 Good general descriptions of kernel density estimation can be found in [1]_
128 and [2]_, the mathematics for this multi-dimensional implementation can be
129 found in [1]_.
131 With a set of weighted samples, the effective number of datapoints ``neff``
132 is defined by::
134 neff = sum(weights)^2 / sum(weights^2)
136 as detailed in [5]_.
138 References
139 ----------
140 .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
141 Visualization", John Wiley & Sons, New York, Chicester, 1992.
142 .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
143 Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
144 Chapman and Hall, London, 1986.
145 .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
146 Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
147 .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
148 conditional density estimation", Computational Statistics & Data
149 Analysis, Vol. 36, pp. 279-298, 2001.
150 .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
151 Series A (General), 132, 272
153 Examples
154 --------
155 Generate some random two-dimensional data:
157 >>> from scipy import stats
158 >>> def measure(n):
159 ... "Measurement model, return two coupled measurements."
160 ... m1 = np.random.normal(size=n)
161 ... m2 = np.random.normal(scale=0.5, size=n)
162 ... return m1+m2, m1-m2
164 >>> m1, m2 = measure(2000)
165 >>> xmin = m1.min()
166 >>> xmax = m1.max()
167 >>> ymin = m2.min()
168 >>> ymax = m2.max()
170 Perform a kernel density estimate on the data:
172 >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
173 >>> positions = np.vstack([X.ravel(), Y.ravel()])
174 >>> values = np.vstack([m1, m2])
175 >>> kernel = stats.gaussian_kde(values)
176 >>> Z = np.reshape(kernel(positions).T, X.shape)
178 Plot the results:
180 >>> import matplotlib.pyplot as plt
181 >>> fig, ax = plt.subplots()
182 >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
183 ... extent=[xmin, xmax, ymin, ymax])
184 >>> ax.plot(m1, m2, 'k.', markersize=2)
185 >>> ax.set_xlim([xmin, xmax])
186 >>> ax.set_ylim([ymin, ymax])
187 >>> plt.show()
189 """
190 def __init__(self, dataset, bw_method=None, weights=None):
191 self.dataset = atleast_2d(asarray(dataset))
192 if not self.dataset.size > 1:
193 raise ValueError("`dataset` input should have multiple elements.")
195 self.d, self.n = self.dataset.shape
197 if weights is not None:
198 self._weights = atleast_1d(weights).astype(float)
199 self._weights /= sum(self._weights)
200 if self.weights.ndim != 1:
201 raise ValueError("`weights` input should be one-dimensional.")
202 if len(self._weights) != self.n:
203 raise ValueError("`weights` input should be of length n")
204 self._neff = 1/sum(self._weights**2)
206 self.set_bandwidth(bw_method=bw_method)
208 def evaluate(self, points):
209 """Evaluate the estimated pdf on a set of points.
211 Parameters
212 ----------
213 points : (# of dimensions, # of points)-array
214 Alternatively, a (# of dimensions,) vector can be passed in and
215 treated as a single point.
217 Returns
218 -------
219 values : (# of points,)-array
220 The values at each point.
222 Raises
223 ------
224 ValueError : if the dimensionality of the input points is different than
225 the dimensionality of the KDE.
227 """
228 points = atleast_2d(asarray(points))
230 d, m = points.shape
231 if d != self.d:
232 if d == 1 and m == self.d:
233 # points was passed in as a row vector
234 points = reshape(points, (self.d, 1))
235 m = 1
236 else:
237 msg = "points have dimension %s, dataset has dimension %s" % (d,
238 self.d)
239 raise ValueError(msg)
241 output_dtype = np.common_type(self.covariance, points)
242 itemsize = np.dtype(output_dtype).itemsize
243 if itemsize == 4:
244 spec = 'float'
245 elif itemsize == 8:
246 spec = 'double'
247 elif itemsize in (12, 16):
248 spec = 'long double'
249 else:
250 raise TypeError('%s has unexpected item size %d' %
251 (output_dtype, itemsize))
252 result = gaussian_kernel_estimate[spec](self.dataset.T, self.weights[:, None],
253 points.T, self.inv_cov, output_dtype)
254 return result[:, 0]
256 __call__ = evaluate
258 def integrate_gaussian(self, mean, cov):
259 """
260 Multiply estimated density by a multivariate Gaussian and integrate
261 over the whole space.
263 Parameters
264 ----------
265 mean : aray_like
266 A 1-D array, specifying the mean of the Gaussian.
267 cov : array_like
268 A 2-D array, specifying the covariance matrix of the Gaussian.
270 Returns
271 -------
272 result : scalar
273 The value of the integral.
275 Raises
276 ------
277 ValueError
278 If the mean or covariance of the input Gaussian differs from
279 the KDE's dimensionality.
281 """
282 mean = atleast_1d(squeeze(mean))
283 cov = atleast_2d(cov)
285 if mean.shape != (self.d,):
286 raise ValueError("mean does not have dimension %s" % self.d)
287 if cov.shape != (self.d, self.d):
288 raise ValueError("covariance does not have dimension %s" % self.d)
290 # make mean a column vector
291 mean = mean[:, newaxis]
293 sum_cov = self.covariance + cov
295 # This will raise LinAlgError if the new cov matrix is not s.p.d
296 # cho_factor returns (ndarray, bool) where bool is a flag for whether
297 # or not ndarray is upper or lower triangular
298 sum_cov_chol = linalg.cho_factor(sum_cov)
300 diff = self.dataset - mean
301 tdiff = linalg.cho_solve(sum_cov_chol, diff)
303 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
304 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
306 energies = sum(diff * tdiff, axis=0) / 2.0
307 result = sum(exp(-energies)*self.weights, axis=0) / norm_const
309 return result
311 def integrate_box_1d(self, low, high):
312 """
313 Computes the integral of a 1D pdf between two bounds.
315 Parameters
316 ----------
317 low : scalar
318 Lower bound of integration.
319 high : scalar
320 Upper bound of integration.
322 Returns
323 -------
324 value : scalar
325 The result of the integral.
327 Raises
328 ------
329 ValueError
330 If the KDE is over more than one dimension.
332 """
333 if self.d != 1:
334 raise ValueError("integrate_box_1d() only handles 1D pdfs")
336 stdev = ravel(sqrt(self.covariance))[0]
338 normalized_low = ravel((low - self.dataset) / stdev)
339 normalized_high = ravel((high - self.dataset) / stdev)
341 value = np.sum(self.weights*(
342 special.ndtr(normalized_high) -
343 special.ndtr(normalized_low)))
344 return value
346 def integrate_box(self, low_bounds, high_bounds, maxpts=None):
347 """Computes the integral of a pdf over a rectangular interval.
349 Parameters
350 ----------
351 low_bounds : array_like
352 A 1-D array containing the lower bounds of integration.
353 high_bounds : array_like
354 A 1-D array containing the upper bounds of integration.
355 maxpts : int, optional
356 The maximum number of points to use for integration.
358 Returns
359 -------
360 value : scalar
361 The result of the integral.
363 """
364 if maxpts is not None:
365 extra_kwds = {'maxpts': maxpts}
366 else:
367 extra_kwds = {}
369 value, inform = mvn.mvnun_weighted(low_bounds, high_bounds,
370 self.dataset, self.weights,
371 self.covariance, **extra_kwds)
372 if inform:
373 msg = ('An integral in mvn.mvnun requires more points than %s' %
374 (self.d * 1000))
375 warnings.warn(msg)
377 return value
379 def integrate_kde(self, other):
380 """
381 Computes the integral of the product of this kernel density estimate
382 with another.
384 Parameters
385 ----------
386 other : gaussian_kde instance
387 The other kde.
389 Returns
390 -------
391 value : scalar
392 The result of the integral.
394 Raises
395 ------
396 ValueError
397 If the KDEs have different dimensionality.
399 """
400 if other.d != self.d:
401 raise ValueError("KDEs are not the same dimensionality")
403 # we want to iterate over the smallest number of points
404 if other.n < self.n:
405 small = other
406 large = self
407 else:
408 small = self
409 large = other
411 sum_cov = small.covariance + large.covariance
412 sum_cov_chol = linalg.cho_factor(sum_cov)
413 result = 0.0
414 for i in range(small.n):
415 mean = small.dataset[:, i, newaxis]
416 diff = large.dataset - mean
417 tdiff = linalg.cho_solve(sum_cov_chol, diff)
419 energies = sum(diff * tdiff, axis=0) / 2.0
420 result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
422 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
423 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
425 result /= norm_const
427 return result
429 def resample(self, size=None, seed=None):
430 """
431 Randomly sample a dataset from the estimated pdf.
433 Parameters
434 ----------
435 size : int, optional
436 The number of samples to draw. If not provided, then the size is
437 the same as the effective number of samples in the underlying
438 dataset.
439 seed : {None, int, `~np.random.RandomState`, `~np.random.Generator`}, optional
440 This parameter defines the object to use for drawing random
441 variates.
442 If `seed` is `None` the `~np.random.RandomState` singleton is used.
443 If `seed` is an int, a new ``RandomState`` instance is used, seeded
444 with seed.
445 If `seed` is already a ``RandomState`` or ``Generator`` instance,
446 then that object is used.
447 Default is None.
448 Specify `seed` for reproducible drawing of random variates.
450 Returns
451 -------
452 resample : (self.d, `size`) ndarray
453 The sampled dataset.
455 """
456 if size is None:
457 size = int(self.neff)
459 random_state = check_random_state(seed)
460 norm = transpose(random_state.multivariate_normal(
461 zeros((self.d,), float), self.covariance, size=size
462 ))
463 indices = random_state.choice(self.n, size=size, p=self.weights)
464 means = self.dataset[:, indices]
466 return means + norm
468 def scotts_factor(self):
469 """Compute Scott's factor.
471 Returns
472 -------
473 s : float
474 Scott's factor.
475 """
476 return power(self.neff, -1./(self.d+4))
478 def silverman_factor(self):
479 """Compute the Silverman factor.
481 Returns
482 -------
483 s : float
484 The silverman factor.
485 """
486 return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
488 # Default method to calculate bandwidth, can be overwritten by subclass
489 covariance_factor = scotts_factor
490 covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
491 multiplies the data covariance matrix to obtain the kernel covariance
492 matrix. The default is `scotts_factor`. A subclass can overwrite this
493 method to provide a different method, or set it through a call to
494 `kde.set_bandwidth`."""
496 def set_bandwidth(self, bw_method=None):
497 """Compute the estimator bandwidth with given method.
499 The new bandwidth calculated after a call to `set_bandwidth` is used
500 for subsequent evaluations of the estimated density.
502 Parameters
503 ----------
504 bw_method : str, scalar or callable, optional
505 The method used to calculate the estimator bandwidth. This can be
506 'scott', 'silverman', a scalar constant or a callable. If a
507 scalar, this will be used directly as `kde.factor`. If a callable,
508 it should take a `gaussian_kde` instance as only parameter and
509 return a scalar. If None (default), nothing happens; the current
510 `kde.covariance_factor` method is kept.
512 Notes
513 -----
514 .. versionadded:: 0.11
516 Examples
517 --------
518 >>> import scipy.stats as stats
519 >>> x1 = np.array([-7, -5, 1, 4, 5.])
520 >>> kde = stats.gaussian_kde(x1)
521 >>> xs = np.linspace(-10, 10, num=50)
522 >>> y1 = kde(xs)
523 >>> kde.set_bandwidth(bw_method='silverman')
524 >>> y2 = kde(xs)
525 >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
526 >>> y3 = kde(xs)
528 >>> import matplotlib.pyplot as plt
529 >>> fig, ax = plt.subplots()
530 >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
531 ... label='Data points (rescaled)')
532 >>> ax.plot(xs, y1, label='Scott (default)')
533 >>> ax.plot(xs, y2, label='Silverman')
534 >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
535 >>> ax.legend()
536 >>> plt.show()
538 """
539 if bw_method is None:
540 pass
541 elif bw_method == 'scott':
542 self.covariance_factor = self.scotts_factor
543 elif bw_method == 'silverman':
544 self.covariance_factor = self.silverman_factor
545 elif np.isscalar(bw_method) and not isinstance(bw_method, str):
546 self._bw_method = 'use constant'
547 self.covariance_factor = lambda: bw_method
548 elif callable(bw_method):
549 self._bw_method = bw_method
550 self.covariance_factor = lambda: self._bw_method(self)
551 else:
552 msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
553 "or a callable."
554 raise ValueError(msg)
556 self._compute_covariance()
558 def _compute_covariance(self):
559 """Computes the covariance matrix for each Gaussian kernel using
560 covariance_factor().
561 """
562 self.factor = self.covariance_factor()
563 # Cache covariance and inverse covariance of the data
564 if not hasattr(self, '_data_inv_cov'):
565 self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
566 bias=False,
567 aweights=self.weights))
568 self._data_inv_cov = linalg.inv(self._data_covariance)
570 self.covariance = self._data_covariance * self.factor**2
571 self.inv_cov = self._data_inv_cov / self.factor**2
572 self._norm_factor = sqrt(linalg.det(2*pi*self.covariance))
574 def pdf(self, x):
575 """
576 Evaluate the estimated pdf on a provided set of points.
578 Notes
579 -----
580 This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
581 docstring for more details.
583 """
584 return self.evaluate(x)
586 def logpdf(self, x):
587 """
588 Evaluate the log of the estimated pdf on a provided set of points.
589 """
591 points = atleast_2d(x)
593 d, m = points.shape
594 if d != self.d:
595 if d == 1 and m == self.d:
596 # points was passed in as a row vector
597 points = reshape(points, (self.d, 1))
598 m = 1
599 else:
600 msg = "points have dimension %s, dataset has dimension %s" % (d,
601 self.d)
602 raise ValueError(msg)
604 if m >= self.n:
605 # there are more points than data, so loop over data
606 energy = zeros((self.n, m), dtype=float)
607 for i in range(self.n):
608 diff = self.dataset[:, i, newaxis] - points
609 tdiff = dot(self.inv_cov, diff)
610 energy[i] = sum(diff*tdiff, axis=0) / 2.0
611 result = logsumexp(-energy.T,
612 b=self.weights / self._norm_factor, axis=1)
613 else:
614 # loop over points
615 result = zeros((m,), dtype=float)
616 for i in range(m):
617 diff = self.dataset - points[:, i, newaxis]
618 tdiff = dot(self.inv_cov, diff)
619 energy = sum(diff * tdiff, axis=0) / 2.0
620 result[i] = logsumexp(-energy, b=self.weights /
621 self._norm_factor)
623 return result
625 @property
626 def weights(self):
627 try:
628 return self._weights
629 except AttributeError:
630 self._weights = ones(self.n)/self.n
631 return self._weights
633 @property
634 def neff(self):
635 try:
636 return self._neff
637 except AttributeError:
638 self._neff = 1/sum(self.weights**2)
639 return self._neff