Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/morestats.py : 8%

1import math
2import warnings
3from collections import namedtuple
5import numpy as np
6from numpy import (isscalar, r_, log, around, unique, asarray,
7 zeros, arange, sort, amin, amax, any, atleast_1d,
8 sqrt, ceil, floor, array, compress,
9 pi, exp, ravel, count_nonzero, sin, cos, arctan2, hypot)
11from scipy import optimize
12from scipy import special
13from . import statlib
14from . import stats
15from .stats import find_repeats, _contains_nan
16from .contingency import chi2_contingency
17from . import distributions
18from ._distn_infrastructure import rv_generic
19from ._hypotests import _get_wilcoxon_distr
22__all__ = ['mvsdist',
23 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
24 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
25 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
26 'fligner', 'mood', 'wilcoxon', 'median_test',
27 'circmean', 'circvar', 'circstd', 'anderson_ksamp',
28 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
29 'yeojohnson_normplot'
30 ]
33Mean = namedtuple('Mean', ('statistic', 'minmax'))
34Variance = namedtuple('Variance', ('statistic', 'minmax'))
35Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
38def bayes_mvs(data, alpha=0.90):
39 r"""
40 Bayesian confidence intervals for the mean, var, and std.
42 Parameters
43 ----------
44 data : array_like
45 Input data; if multi-dimensional, it is flattened to 1-D by `bayes_mvs`.
46 Requires 2 or more data points.
47 alpha : float, optional
48 Probability that the returned confidence interval contains
49 the true parameter.
51 Returns
52 -------
53 mean_cntr, var_cntr, std_cntr : tuple
54 The three results are for the mean, variance and standard deviation,
55 respectively. Each result is a tuple of the form::
57 (center, (lower, upper))
59 with `center` the mean of the conditional pdf of the value given the
60 data, and `(lower, upper)` a confidence interval, centered on the
61 median, containing the estimate to a probability ``alpha``.
63 See Also
64 --------
65 mvsdist
67 Notes
68 -----
69 Each tuple of mean, variance, and standard deviation estimates represents
70 the (center, (lower, upper)), with center the mean of the conditional pdf
71 of the value given the data, and (lower, upper) a confidence interval
72 centered on the median, containing the estimate to a probability
73 ``alpha``.
75 Converts data to 1-D and assumes all data has the same mean and variance.
76 Uses Jeffreys prior for variance and std.
78 Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``
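As a small illustrative check of that equivalence (reusing the data list
from the Examples below):

>>> from scipy import stats
>>> data = [6, 9, 12, 7, 8, 8, 13]
>>> mean, var, std = stats.bayes_mvs(data, alpha=0.9)
>>> m, v, s = stats.mvsdist(data)
>>> np.allclose(mean.minmax, m.interval(0.9))
True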
80 References
81 ----------
82 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
83 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
84 2006.
86 Examples
87 --------
88 First a basic example to demonstrate the outputs:
90 >>> from scipy import stats
91 >>> data = [6, 9, 12, 7, 8, 8, 13]
92 >>> mean, var, std = stats.bayes_mvs(data)
93 >>> mean
94 Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
95 >>> var
96 Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
97 >>> std
98 Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))
100 Now we generate some normally distributed random data, and get estimates of
101 mean and standard deviation with 95% confidence intervals for those
102 estimates:
104 >>> n_samples = 100000
105 >>> data = stats.norm.rvs(size=n_samples)
106 >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
108 >>> import matplotlib.pyplot as plt
109 >>> fig = plt.figure()
110 >>> ax = fig.add_subplot(111)
111 >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
112 >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
113 >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
114 ... alpha=0.2, label=r'Estimated mean (95% limits)')
115 >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
116 >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
117 ... label=r'Estimated scale (95% limits)')
119 >>> ax.legend(fontsize=10)
120 >>> ax.set_xlim([-4, 4])
121 >>> ax.set_ylim([0, 0.5])
122 >>> plt.show()
124 """
125 m, v, s = mvsdist(data)
126 if alpha >= 1 or alpha <= 0:
127 raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
128 % alpha)
130 m_res = Mean(m.mean(), m.interval(alpha))
131 v_res = Variance(v.mean(), v.interval(alpha))
132 s_res = Std_dev(s.mean(), s.interval(alpha))
134 return m_res, v_res, s_res
137def mvsdist(data):
138 """
139 'Frozen' distributions for mean, variance, and standard deviation of data.
141 Parameters
142 ----------
143 data : array_like
144 Input array. Converted to 1-D using ravel.
145 Requires 2 or more data-points.
147 Returns
148 -------
149 mdist : "frozen" distribution object
150 Distribution object representing the mean of the data.
151 vdist : "frozen" distribution object
152 Distribution object representing the variance of the data.
153 sdist : "frozen" distribution object
154 Distribution object representing the standard deviation of the data.
156 See Also
157 --------
158 bayes_mvs
160 Notes
161 -----
162 The return values from ``bayes_mvs(data)`` are equivalent to
163 ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
165 In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
166 on the three distribution objects returned from this function will give
167 the same results that are returned from `bayes_mvs`.
169 References
170 ----------
171 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
172 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
173 2006.
175 Examples
176 --------
177 >>> from scipy import stats
178 >>> data = [6, 9, 12, 7, 8, 8, 13]
179 >>> mean, var, std = stats.mvsdist(data)
181 We now have frozen distribution objects "mean", "var" and "std" that we can
182 examine:
184 >>> mean.mean()
185 9.0
186 >>> mean.interval(0.95)
187 (6.6120585482655692, 11.387941451734431)
188 >>> mean.std()
189 1.1952286093343936
191 """
192 x = ravel(data)
193 n = len(x)
194 if n < 2:
195 raise ValueError("Need at least 2 data-points.")
196 xbar = x.mean()
197 C = x.var()
198 if n > 1000: # gaussian approximations for large n
199 mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
200 sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
201 vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
202 else:
203 nm1 = n - 1
204 fac = n * C / 2.
205 val = nm1 / 2.
206 mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
207 sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
208 vdist = distributions.invgamma(val, scale=fac)
209 return mdist, vdist, sdist
212def kstat(data, n=2):
213 r"""
214 Return the nth k-statistic (1<=n<=4 so far).
216 The nth k-statistic k_n is the unique symmetric unbiased estimator of the
217 nth cumulant kappa_n.
219 Parameters
220 ----------
221 data : array_like
222 Input array. Note that n-D input gets flattened.
223 n : int, {1, 2, 3, 4}, optional
224 Default is equal to 2.
226 Returns
227 -------
228 kstat : float
229 The nth k-statistic.
231 See Also
232 --------
233 kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
234 moment: Returns the n-th central moment about the mean for a sample.
236 Notes
237 -----
238 For a sample size n, the first few k-statistics are given by:
240 .. math::
242 k_{1} = \mu
243 k_{2} = \frac{n}{n-1} m_{2}
244 k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
245 k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}
247 where :math:`\mu` is the sample mean, :math:`m_2` is the sample
248 variance, and :math:`m_i` is the i-th sample central moment.
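For example, the relation :math:`k_{2} = \frac{n}{n-1} m_{2}` can be
checked numerically on an arbitrary sample (an illustrative sketch):

>>> from scipy import stats
>>> x = np.arange(10.)
>>> np.allclose(stats.kstat(x, 2), len(x) / (len(x) - 1) * stats.moment(x, 2))
True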
250 References
251 ----------
252 http://mathworld.wolfram.com/k-Statistic.html
254 http://mathworld.wolfram.com/Cumulant.html
256 Examples
257 --------
258 >>> from scipy import stats
259 >>> rndm = np.random.RandomState(1234)
261 As sample size increases, n-th moment and n-th k-statistic converge to the
262 same number (although they aren't identical). In the case of the normal
263 distribution, they converge to zero.
265 >>> for n in [2, 3, 4, 5, 6, 7]:
266 ... x = rndm.normal(size=10**n)
267 ... m, k = stats.moment(x, 3), stats.kstat(x, 3)
268 ... print("%.3g %.3g %.3g" % (m, k, m-k))
269 -0.631 -0.651 0.0194
270 0.0282 0.0283 -8.49e-05
271 -0.0454 -0.0454 1.36e-05
272 7.53e-05 7.53e-05 -2.26e-09
273 0.00166 0.00166 -4.99e-09
274 -2.88e-06 -2.88e-06 8.63e-13
275 """
276 if n > 4 or n < 1:
277 raise ValueError("k-statistics only supported for 1<=n<=4")
278 n = int(n)
279 S = np.zeros(n + 1, np.float64)
280 data = ravel(data)
281 N = data.size
283 # raise ValueError on empty input
284 if N == 0:
285 raise ValueError("Data input must not be empty")
287 # on nan input, return nan without warning
288 if np.isnan(np.sum(data)):
289 return np.nan
291 for k in range(1, n + 1):
292 S[k] = np.sum(data**k, axis=0)
293 if n == 1:
294 return S[1] * 1.0/N
295 elif n == 2:
296 return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
297 elif n == 3:
298 return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
299 elif n == 4:
300 return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
301 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
302 (N*(N-1.0)*(N-2.0)*(N-3.0)))
303 else:
304 raise ValueError("Should not be here.")
307def kstatvar(data, n=2):
308 r"""
309 Return an unbiased estimator of the variance of the k-statistic.
311 See `kstat` for more details of the k-statistic.
313 Parameters
314 ----------
315 data : array_like
316 Input array. Note that n-D input gets flattened.
317 n : int, {1, 2}, optional
318 Default is equal to 2.
320 Returns
321 -------
322 kstatvar : float
323 The nth k-statistic variance.
325 See Also
326 --------
327 kstat: Returns the n-th k-statistic.
328 moment: Returns the n-th central moment about the mean for a sample.
330 Notes
331 -----
332 The variances of the first few k-statistics are given by:
334 .. math::
336 var(k_{1}) = \frac{\kappa_2}{n}
337 var(k_{2}) = \frac{\kappa_4}{n} + \frac{2\kappa^2_{2}}{n - 1}
338 var(k_{3}) = \frac{\kappa_6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
339 \frac{9 \kappa^2_{3}}{n - 1} +
340 \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
341 var(k_{4}) = \frac{\kappa_8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
342 \frac{48 \kappa_{3} \kappa_5}{n - 1} +
343 \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
344 \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
345 \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}
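Examples
--------
An illustrative sketch of the ``n=1`` case, which is computed as
``kstat(data, 2) / len(data)``:

>>> from scipy import stats
>>> x = np.arange(20.)
>>> stats.kstatvar(x, 1) == stats.kstat(x, 2) / len(x)
True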
346 """
347 data = ravel(data)
348 N = len(data)
349 if n == 1:
350 return kstat(data, n=2) * 1.0/N
351 elif n == 2:
352 k2 = kstat(data, n=2)
353 k4 = kstat(data, n=4)
354 return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
355 else:
356 raise ValueError("Only n=1 or n=2 supported.")
359def _calc_uniform_order_statistic_medians(n):
360 """
361 Approximations of uniform order statistic medians.
363 Parameters
364 ----------
365 n : int
366 Sample size.
368 Returns
369 -------
370 v : 1d float array
371 Approximations of the order statistic medians.
373 References
374 ----------
375 .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
376 Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
378 Examples
379 --------
380 Order statistics of the uniform distribution on the unit interval
381 are marginally distributed according to beta distributions.
382 The expectations of these order statistics are evenly spaced across
383 the interval, but the distributions are skewed in a way that
384 pushes the medians slightly towards the endpoints of the unit interval:
386 >>> n = 4
387 >>> k = np.arange(1, n+1)
388 >>> from scipy.stats import beta
389 >>> a = k
390 >>> b = n-k+1
391 >>> beta.mean(a, b)
392 array([ 0.2, 0.4, 0.6, 0.8])
393 >>> beta.median(a, b)
394 array([ 0.15910358, 0.38572757, 0.61427243, 0.84089642])
396 The Filliben approximation uses the exact medians of the smallest
397 and greatest order statistics, and the remaining medians are approximated
398 by points spread evenly across a sub-interval of the unit interval:
400 >>> from scipy.stats.morestats import _calc_uniform_order_statistic_medians
401 >>> _calc_uniform_order_statistic_medians(n)
402 array([ 0.15910358, 0.38545246, 0.61454754, 0.84089642])
404 This plot shows the skewed distributions of the order statistics
405 of a sample of size four from a uniform distribution on the unit interval:
407 >>> import matplotlib.pyplot as plt
408 >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
409 >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
410 >>> plt.figure()
411 >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])
413 """
414 v = np.zeros(n, dtype=np.float64)
415 v[-1] = 0.5**(1.0 / n)
416 v[0] = 1 - v[-1]
417 i = np.arange(2, n)
418 v[1:-1] = (i - 0.3175) / (n + 0.365)
419 return v
422def _parse_dist_kw(dist, enforce_subclass=True):
423 """Parse `dist` keyword.
425 Parameters
426 ----------
427 dist : str or stats.distributions instance.
428 Several functions take `dist` as a keyword, hence this utility
429 function.
430 enforce_subclass : bool, optional
431 If True (default), `dist` needs to be a
432 `_distn_infrastructure.rv_generic` instance.
433 It can sometimes be useful to set this keyword to False, if a function
434 wants to accept objects that just look somewhat like such an instance
435 (for example, they have a ``ppf`` method).
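Examples
--------
A minimal sketch of both accepted forms (a distribution name and a
distribution instance):

>>> dist = _parse_dist_kw('norm')
>>> hasattr(dist, 'ppf')
True
>>> _parse_dist_kw(dist) is dist
True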
437 """
438 if isinstance(dist, rv_generic):
439 pass
440 elif isinstance(dist, str):
441 try:
442 dist = getattr(distributions, dist)
443 except AttributeError:
444 raise ValueError("%s is not a valid distribution name" % dist)
445 elif enforce_subclass:
446 msg = ("`dist` should be a stats.distributions instance or a string "
447 "with the name of such a distribution.")
448 raise ValueError(msg)
450 return dist
453def _add_axis_labels_title(plot, xlabel, ylabel, title):
454 """Helper function to add axes labels and a title to stats plots"""
455 try:
456 if hasattr(plot, 'set_title'):
457 # Matplotlib Axes instance or something that looks like it
458 plot.set_title(title)
459 plot.set_xlabel(xlabel)
460 plot.set_ylabel(ylabel)
461 else:
462 # matplotlib.pyplot module
463 plot.title(title)
464 plot.xlabel(xlabel)
465 plot.ylabel(ylabel)
466 except Exception:
467 # Not an MPL object or something that looks (enough) like it.
468 # Don't crash on adding labels or title
469 pass
472def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
473 """
474 Calculate quantiles for a probability plot, and optionally show the plot.
476 Generates a probability plot of sample data against the quantiles of a
477 specified theoretical distribution (the normal distribution by default).
478 `probplot` optionally calculates a best-fit line for the data and plots the
479 results using Matplotlib or a given plot function.
481 Parameters
482 ----------
483 x : array_like
484 Sample/response data from which `probplot` creates the plot.
485 sparams : tuple, optional
486 Distribution-specific shape parameters (shape parameters plus location
487 and scale).
488 dist : str or stats.distributions instance, optional
489 Distribution or distribution function name. The default is 'norm' for a
490 normal probability plot. Objects that look enough like a
491 stats.distributions instance (i.e. they have a ``ppf`` method) are also
492 accepted.
493 fit : bool, optional
494 Fit a least-squares regression (best-fit) line to the sample data if
495 True (default).
496 plot : object, optional
497 If given, plots the quantiles and least squares fit.
498 `plot` is an object that has to have methods "plot" and "text".
499 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
500 or a custom object with the same methods.
501 Default is None, which means that no plot is created.
503 Returns
504 -------
505 (osm, osr) : tuple of ndarrays
506 Tuple of theoretical quantiles (osm, or order statistic medians) and
507 ordered responses (osr). `osr` is simply sorted input `x`.
508 For details on how `osm` is calculated see the Notes section.
509 (slope, intercept, r) : tuple of floats, optional
510 Tuple containing the result of the least-squares fit, if that is
511 performed by `probplot`. `r` is the square root of the coefficient of
512 determination. If ``fit=False`` and ``plot=None``, this tuple is not
513 returned.
515 Notes
516 -----
517 Even if `plot` is given, the figure is not shown or saved by `probplot`;
518 ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
519 calling `probplot`.
521 `probplot` generates a probability plot, which should not be confused with
522 a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this
523 type, see ``statsmodels.api.ProbPlot``.
525 The formula used for the theoretical quantiles (horizontal axis of the
526 probability plot) is Filliben's estimate::
528 quantiles = dist.ppf(val), where
530 val = 0.5**(1/n), for i = n
531 val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1
532 val = 1 - 0.5**(1/n), for i = 1
534 where ``i`` indicates the i-th ordered value and ``n`` is the total number
535 of values.
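For instance, the theoretical quantiles of a five-point normal probability
plot can be reproduced by hand (an illustrative sketch of the formula
above):

>>> from scipy import stats
>>> n = 5
>>> val = (np.arange(1, n + 1) - 0.3175) / (n + 0.365)
>>> val[-1] = 0.5**(1 / n)
>>> val[0] = 1 - 0.5**(1 / n)
>>> (osm, osr), _ = stats.probplot(np.arange(5.))
>>> np.allclose(osm, stats.norm.ppf(val))
True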
537 Examples
538 --------
539 >>> from scipy import stats
540 >>> import matplotlib.pyplot as plt
541 >>> nsample = 100
542 >>> np.random.seed(7654321)
544 A t distribution with small degrees of freedom:
546 >>> ax1 = plt.subplot(221)
547 >>> x = stats.t.rvs(3, size=nsample)
548 >>> res = stats.probplot(x, plot=plt)
550 A t distribution with larger degrees of freedom:
552 >>> ax2 = plt.subplot(222)
553 >>> x = stats.t.rvs(25, size=nsample)
554 >>> res = stats.probplot(x, plot=plt)
556 A mixture of two normal distributions with broadcasting:
558 >>> ax3 = plt.subplot(223)
559 >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
560 ... size=(nsample//2,2)).ravel()
561 >>> res = stats.probplot(x, plot=plt)
563 A standard normal distribution:
565 >>> ax4 = plt.subplot(224)
566 >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample)
567 >>> res = stats.probplot(x, plot=plt)
569 Produce a new figure with a loggamma distribution, using the ``dist`` and
570 ``sparams`` keywords:
572 >>> fig = plt.figure()
573 >>> ax = fig.add_subplot(111)
574 >>> x = stats.loggamma.rvs(c=2.5, size=500)
575 >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
576 >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
578 Show the results with Matplotlib:
580 >>> plt.show()
582 """
583 x = np.asarray(x)
584 _perform_fit = fit or (plot is not None)
585 if x.size == 0:
586 if _perform_fit:
587 return (x, x), (np.nan, np.nan, 0.0)
588 else:
589 return x, x
591 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
592 dist = _parse_dist_kw(dist, enforce_subclass=False)
593 if sparams is None:
594 sparams = ()
595 if isscalar(sparams):
596 sparams = (sparams,)
597 if not isinstance(sparams, tuple):
598 sparams = tuple(sparams)
600 osm = dist.ppf(osm_uniform, *sparams)
601 osr = sort(x)
602 if _perform_fit:
603 # perform a linear least squares fit.
604 slope, intercept, r, prob, sterrest = stats.linregress(osm, osr)
606 if plot is not None:
607 plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-')
608 _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
609 ylabel='Ordered Values',
610 title='Probability Plot')
612 # Add R^2 value to the plot as text
613 if rvalue:
614 xmin = amin(osm)
615 xmax = amax(osm)
616 ymin = amin(x)
617 ymax = amax(x)
618 posx = xmin + 0.70 * (xmax - xmin)
619 posy = ymin + 0.01 * (ymax - ymin)
620 plot.text(posx, posy, "$R^2=%1.4f$" % r**2)
622 if fit:
623 return (osm, osr), (slope, intercept, r)
624 else:
625 return osm, osr
628def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
629 """
630 Calculate the shape parameter that maximizes the PPCC.
632 The probability plot correlation coefficient (PPCC) plot can be used to
633 determine the optimal shape parameter for a one-parameter family of
634 distributions. ppcc_max returns the shape parameter that would maximize the
635 probability plot correlation coefficient for the given data to a
636 one-parameter family of distributions.
638 Parameters
639 ----------
640 x : array_like
641 Input array.
642 brack : tuple, optional
643 Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
644 then they are assumed to be a starting interval for a downhill bracket
645 search (see `scipy.optimize.brent`).
646 dist : str or stats.distributions instance, optional
647 Distribution or distribution function name. Objects that look enough
648 like a stats.distributions instance (i.e. they have a ``ppf`` method)
649 are also accepted. The default is ``'tukeylambda'``.
651 Returns
652 -------
653 shape_value : float
654 The shape parameter at which the probability plot correlation
655 coefficient reaches its max value.
657 See Also
658 --------
659 ppcc_plot, probplot, boxcox
661 Notes
662 -----
663 The brack keyword serves as a starting point which is useful in corner
664 cases. One can use a plot to obtain a rough visual estimate of the location
665 for the maximum to start the search near it.
667 References
668 ----------
669 .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
670 Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
672 .. [2] https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm
674 Examples
675 --------
676 First we generate some random data from a Tukey-Lambda distribution,
677 with shape parameter -0.7:
679 >>> from scipy import stats
680 >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000,
681 ... random_state=1234567) + 1e4
683 Now we explore this data with a PPCC plot as well as the related
684 probability plot and Box-Cox normplot. A red line is drawn where we
685 expect the PPCC value to be maximal (at the shape parameter -0.7 used
686 above):
688 >>> import matplotlib.pyplot as plt
689 >>> fig = plt.figure(figsize=(8, 6))
690 >>> ax = fig.add_subplot(111)
691 >>> res = stats.ppcc_plot(x, -5, 5, plot=ax)
693 We calculate the value where the shape should reach its maximum and a red
694 line is drawn there. The line should coincide with the highest point in the
695 ppcc_plot.
697 >>> max = stats.ppcc_max(x)
698 >>> ax.vlines(max, 0, 1, colors='r', label='Expected shape value')
700 >>> plt.show()
702 """
703 dist = _parse_dist_kw(dist)
704 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
705 osr = sort(x)
707 # this function computes the x-axis values of the probability plot
708 # and computes a linear regression (including the correlation)
709 # and returns 1-r so that a minimization function maximizes the
710 # correlation
711 def tempfunc(shape, mi, yvals, func):
712 xvals = func(mi, shape)
713 r, prob = stats.pearsonr(xvals, yvals)
714 return 1 - r
716 return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf))
719def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
720 """
721 Calculate and optionally plot probability plot correlation coefficient.
723 The probability plot correlation coefficient (PPCC) plot can be used to
724 determine the optimal shape parameter for a one-parameter family of
725 distributions. It cannot be used for distributions without shape parameters
726 (like the normal distribution) or with multiple shape parameters.
728 By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
729 Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
730 distributions via an approximately normal one, and is therefore particularly
731 useful in practice.
733 Parameters
734 ----------
735 x : array_like
736 Input array.
737 a, b : scalar
738 Lower and upper bounds of the shape parameter to use.
739 dist : str or stats.distributions instance, optional
740 Distribution or distribution function name. Objects that look enough
741 like a stats.distributions instance (i.e. they have a ``ppf`` method)
742 are also accepted. The default is ``'tukeylambda'``.
743 plot : object, optional
744 If given, plots PPCC against the shape parameter.
745 `plot` is an object that has to have methods "plot" and "text".
746 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
747 or a custom object with the same methods.
748 Default is None, which means that no plot is created.
749 N : int, optional
750 Number of points on the horizontal axis (equally distributed from
751 `a` to `b`).
753 Returns
754 -------
755 svals : ndarray
756 The shape values for which `ppcc` was calculated.
757 ppcc : ndarray
758 The calculated probability plot correlation coefficient values.
760 See Also
761 --------
762 ppcc_max, probplot, boxcox_normplot, tukeylambda
764 References
765 ----------
766 J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
767 Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
769 Examples
770 --------
771 First we generate some random data from a Tukey-Lambda distribution,
772 with shape parameter -0.7:
774 >>> from scipy import stats
775 >>> import matplotlib.pyplot as plt
776 >>> np.random.seed(1234567)
777 >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000) + 1e4
779 Now we explore this data with a PPCC plot as well as the related
780 probability plot and Box-Cox normplot. A red line is drawn where we
781 expect the PPCC value to be maximal (at the shape parameter -0.7 used
782 above):
784 >>> fig = plt.figure(figsize=(12, 4))
785 >>> ax1 = fig.add_subplot(131)
786 >>> ax2 = fig.add_subplot(132)
787 >>> ax3 = fig.add_subplot(133)
788 >>> res = stats.probplot(x, plot=ax1)
789 >>> res = stats.boxcox_normplot(x, -5, 5, plot=ax2)
790 >>> res = stats.ppcc_plot(x, -5, 5, plot=ax3)
791 >>> ax3.vlines(-0.7, 0, 1, colors='r', label='Expected shape value')
792 >>> plt.show()
794 """
795 if b <= a:
796 raise ValueError("`b` has to be larger than `a`.")
798 svals = np.linspace(a, b, num=N)
799 ppcc = np.empty_like(svals)
800 for k, sval in enumerate(svals):
801 _, r2 = probplot(x, sval, dist=dist, fit=True)
802 ppcc[k] = r2[-1]
804 if plot is not None:
805 plot.plot(svals, ppcc, 'x')
806 _add_axis_labels_title(plot, xlabel='Shape Values',
807 ylabel='Prob Plot Corr. Coef.',
808 title='(%s) PPCC Plot' % dist)
810 return svals, ppcc
813def boxcox_llf(lmb, data):
814 r"""The boxcox log-likelihood function.
816 Parameters
817 ----------
818 lmb : scalar
819 Parameter for Box-Cox transformation. See `boxcox` for details.
820 data : array_like
821 Data to calculate Box-Cox log-likelihood for. If `data` is
822 multi-dimensional, the log-likelihood is calculated along the first
823 axis.
825 Returns
826 -------
827 llf : float or ndarray
828 Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`,
829 an array otherwise.
831 See Also
832 --------
833 boxcox, probplot, boxcox_normplot, boxcox_normmax
835 Notes
836 -----
837 The Box-Cox log-likelihood function is defined here as
839 .. math::
841 llf = (\lambda - 1) \sum_i(\log(x_i)) -
842 N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
844 where ``y`` is the Box-Cox transformed input data ``x``.
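For example, at :math:`\lambda = 1` the first term vanishes and, because the
constant shift of the transform does not change the variance, the expression
reduces to ``-N/2 * log(var(x))`` (an illustrative check):

>>> from scipy import stats
>>> x = np.arange(1., 6.)
>>> np.allclose(stats.boxcox_llf(1, x), -len(x) / 2 * np.log(np.var(x)))
True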
846 Examples
847 --------
848 >>> from scipy import stats
849 >>> import matplotlib.pyplot as plt
850 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
851 >>> np.random.seed(1245)
853 Generate some random variates and calculate Box-Cox log-likelihood values
854 for them for a range of ``lmbda`` values:
856 >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
857 >>> lmbdas = np.linspace(-2, 10)
858 >>> llf = np.zeros(lmbdas.shape, dtype=float)
859 >>> for ii, lmbda in enumerate(lmbdas):
860 ... llf[ii] = stats.boxcox_llf(lmbda, x)
862 Also find the optimal lmbda value with `boxcox`:
864 >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
866 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
867 horizontal line to check that that's really the optimum:
869 >>> fig = plt.figure()
870 >>> ax = fig.add_subplot(111)
871 >>> ax.plot(lmbdas, llf, 'b.-')
872 >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
873 >>> ax.set_xlabel('lmbda parameter')
874 >>> ax.set_ylabel('Box-Cox log-likelihood')
876 Now add some probability plots to show that where the log-likelihood is
877 maximized the data transformed with `boxcox` looks closest to normal:
879 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
880 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
881 ... xt = stats.boxcox(x, lmbda=lmbda)
882 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
883 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
884 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
885 ... ax_inset.set_xticklabels([])
886 ... ax_inset.set_yticklabels([])
887 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
889 >>> plt.show()
891 """
892 data = np.asarray(data)
893 N = data.shape[0]
894 if N == 0:
895 return np.nan
897 logdata = np.log(data)
899 # Compute the variance of the transformed data.
900 if lmb == 0:
901 variance = np.var(logdata, axis=0)
902 else:
903 # Transform without the constant offset 1/lmb. The offset does
904 # not affect the variance, and the subtraction of the offset can
905 # lead to loss of precision.
906 variance = np.var(data**lmb / lmb, axis=0)
908 return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)
911def _boxcox_conf_interval(x, lmax, alpha):
912 # Need to find the lambda for which
913 # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
914 fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
915 target = boxcox_llf(lmax, x) - fac
917 def rootfunc(lmbda, data, target):
918 return boxcox_llf(lmbda, data) - target
920 # Find positive endpoint of interval in which answer is to be found
921 newlm = lmax + 0.5
922 N = 0
923 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
924 newlm += 0.1
925 N += 1
927 if N == 500:
928 raise RuntimeError("Could not find endpoint.")
930 lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))
932 # Now find negative interval in the same way
933 newlm = lmax - 0.5
934 N = 0
935 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
936 newlm -= 0.1
937 N += 1
939 if N == 500:
940 raise RuntimeError("Could not find endpoint.")
942 lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
943 return lmminus, lmplus
946def boxcox(x, lmbda=None, alpha=None):
947 r"""
948 Return a dataset transformed by a Box-Cox power transformation.
950 Parameters
951 ----------
952 x : ndarray
953 Input array. Must be positive 1-dimensional. Must not be constant.
954 lmbda : {None, scalar}, optional
955 If `lmbda` is not None, do the transformation for that value.
957 If `lmbda` is None, find the lambda that maximizes the log-likelihood
958 function and return it as the second output argument.
959 alpha : {None, float}, optional
960 If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
961 interval for `lmbda` as the third output argument.
962 Must be between 0.0 and 1.0.
964 Returns
965 -------
966 boxcox : ndarray
967 Box-Cox power transformed array.
968 maxlog : float, optional
969 If the `lmbda` parameter is None, the second returned argument is
970 the lambda that maximizes the log-likelihood function.
971 (min_ci, max_ci) : tuple of float, optional
972 If `lmbda` parameter is None and ``alpha`` is not None, this returned
973 tuple of floats represents the minimum and maximum confidence limits
974 given ``alpha``.
976 See Also
977 --------
978 probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
980 Notes
981 -----
982 The Box-Cox transform is given by::
984 y = (x**lmbda - 1) / lmbda, for lmbda != 0
985 log(x), for lmbda = 0
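For instance, with ``lmbda=0`` the transform reduces to the natural
logarithm (a small illustrative check):

>>> from scipy import stats
>>> np.allclose(stats.boxcox([1., 2., 3.], lmbda=0), np.log([1., 2., 3.]))
True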
987 `boxcox` requires the input data to be positive. Sometimes a Box-Cox
988 transformation provides a shift parameter to achieve this; `boxcox` does
989 not. Such a shift parameter is equivalent to adding a positive constant to
990 `x` before calling `boxcox`.
992 The confidence limits returned when ``alpha`` is provided give the interval
993 where:
995 .. math::
997 llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
999 with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
1000 function.
1002 References
1003 ----------
1004 G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
1005 Royal Statistical Society B, 26, 211-252 (1964).
1007 Examples
1008 --------
1009 >>> from scipy import stats
1010 >>> import matplotlib.pyplot as plt
1012 We generate some random variates from a non-normal distribution and make a
1013 probability plot for it, to show it is non-normal in the tails:
1015 >>> fig = plt.figure()
1016 >>> ax1 = fig.add_subplot(211)
1017 >>> x = stats.loggamma.rvs(5, size=500) + 5
1018 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1019 >>> ax1.set_xlabel('')
1020 >>> ax1.set_title('Probplot against normal distribution')
1022 We now use `boxcox` to transform the data so it's closest to normal:
1024 >>> ax2 = fig.add_subplot(212)
1025 >>> xt, _ = stats.boxcox(x)
1026 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1027 >>> ax2.set_title('Probplot after Box-Cox transformation')
1029 >>> plt.show()
1031 """
1032 x = np.asarray(x)
1033 if x.ndim != 1:
1034 raise ValueError("Data must be 1-dimensional.")
1036 if x.size == 0:
1037 return x
1039 if np.all(x == x[0]):
1040 raise ValueError("Data must not be constant.")
1042 if any(x <= 0):
1043 raise ValueError("Data must be positive.")
1045 if lmbda is not None: # single transformation
1046 return special.boxcox(x, lmbda)
1048 # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
1049 lmax = boxcox_normmax(x, method='mle')
1050 y = boxcox(x, lmax)
1052 if alpha is None:
1053 return y, lmax
1054 else:
1055 # Find confidence interval
1056 interval = _boxcox_conf_interval(x, lmax, alpha)
1057 return y, lmax, interval
1060def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'):
1061 """Compute optimal Box-Cox transform parameter for input data.
1063 Parameters
1064 ----------
1065 x : array_like
1066 Input array.
1067 brack : 2-tuple, optional
1068 The starting interval for a downhill bracket search with
1069 `optimize.brent`. Note that this is in most cases not critical; the
1070 final result is allowed to be outside this bracket.
1071 method : str, optional
1072 The method to determine the optimal transform parameter (`boxcox`
1073 ``lmbda`` parameter). Options are:
1075 'pearsonr' (default)
1076 Maximizes the Pearson correlation coefficient between
1077 ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
1078 normally-distributed.
1080 'mle'
1081 Maximizes the log-likelihood `boxcox_llf`. This is the method used
1082 in `boxcox`.
1084 'all'
1085 Use all optimization methods available, and return all results.
1086 Useful to compare different methods.
1088 Returns
1089 -------
1090 maxlog : float or ndarray
1091 The optimal transform parameter found. An array instead of a scalar
1092 for ``method='all'``.
1094 See Also
1095 --------
1096 boxcox, boxcox_llf, boxcox_normplot
1098 Examples
1099 --------
1100 >>> from scipy import stats
1101 >>> import matplotlib.pyplot as plt
1102 >>> np.random.seed(1234) # make this example reproducible
1104 Generate some data and determine optimal ``lmbda`` in various ways:
1106 >>> x = stats.loggamma.rvs(5, size=30) + 5
1107 >>> y, lmax_mle = stats.boxcox(x)
1108 >>> lmax_pearsonr = stats.boxcox_normmax(x)
1110 >>> lmax_mle
1111 7.177...
1112 >>> lmax_pearsonr
1113 7.916...
1114 >>> stats.boxcox_normmax(x, method='all')
1115 array([ 7.91667384, 7.17718692])
1117 >>> fig = plt.figure()
1118 >>> ax = fig.add_subplot(111)
1119 >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
1120 >>> ax.axvline(lmax_mle, color='r')
1121 >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
1123 >>> plt.show()
1125 """
1127 def _pearsonr(x, brack):
1128 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
1129 xvals = distributions.norm.ppf(osm_uniform)
1131 def _eval_pearsonr(lmbda, xvals, samps):
1132 # This function computes the x-axis values of the probability plot
1133 # and computes a linear regression (including the correlation) and
1134 # returns ``1 - r`` so that a minimization function maximizes the
1135 # correlation.
1136 y = boxcox(samps, lmbda)
1137 yvals = np.sort(y)
1138 r, prob = stats.pearsonr(xvals, yvals)
1139 return 1 - r
1141 return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x))
1143 def _mle(x, brack):
1144 def _eval_mle(lmb, data):
1145 # function to minimize
1146 return -boxcox_llf(lmb, data)
1148 return optimize.brent(_eval_mle, brack=brack, args=(x,))
1150 def _all(x, brack):
1151 maxlog = np.zeros(2, dtype=float)
1152 maxlog[0] = _pearsonr(x, brack)
1153 maxlog[1] = _mle(x, brack)
1154 return maxlog
1156 methods = {'pearsonr': _pearsonr,
1157 'mle': _mle,
1158 'all': _all}
1159 if method not in methods.keys():
1160 raise ValueError("Method %s not recognized." % method)
1162 optimfunc = methods[method]
1163 return optimfunc(x, brack)
1166def _normplot(method, x, la, lb, plot=None, N=80):
1167 """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
1168 optionally show it. See `boxcox_normplot` or `yeojohnson_normplot` for
1169 details."""
1171 if method == 'boxcox':
1172 title = 'Box-Cox Normality Plot'
1173 transform_func = boxcox
1174 else:
1175 title = 'Yeo-Johnson Normality Plot'
1176 transform_func = yeojohnson
1178 x = np.asarray(x)
1179 if x.size == 0:
1180 return x
1182 if lb <= la:
1183 raise ValueError("`lb` has to be larger than `la`.")
1185 lmbdas = np.linspace(la, lb, num=N)
1186 ppcc = lmbdas * 0.0
1187 for i, val in enumerate(lmbdas):
1188 # Determine for each lmbda the square root of correlation coefficient
1189 # of transformed x
1190 z = transform_func(x, lmbda=val)
1191 _, (_, _, r) = probplot(z, dist='norm', fit=True)
1192 ppcc[i] = r
1194 if plot is not None:
1195 plot.plot(lmbdas, ppcc, 'x')
1196 _add_axis_labels_title(plot, xlabel='$\\lambda$',
1197 ylabel='Prob Plot Corr. Coef.',
1198 title=title)
1200 return lmbdas, ppcc
1203def boxcox_normplot(x, la, lb, plot=None, N=80):
1204 """Compute parameters for a Box-Cox normality plot, optionally show it.
1206 A Box-Cox normality plot shows graphically what the best transformation
1207 parameter is to use in `boxcox` to obtain a distribution that is close
1208 to normal.
1210 Parameters
1211 ----------
1212 x : array_like
1213 Input array.
1214 la, lb : scalar
1215 The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
1216 for Box-Cox transformations. These are also the limits of the
1217 horizontal axis of the plot if that is generated.
1218 plot : object, optional
1219 If given, plots the quantiles and least squares fit.
1220 `plot` is an object that has to have methods "plot" and "text".
1221 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1222 or a custom object with the same methods.
1223 Default is None, which means that no plot is created.
1224 N : int, optional
1225 Number of points on the horizontal axis (equally distributed from
1226 `la` to `lb`).
1228 Returns
1229 -------
1230 lmbdas : ndarray
1231 The ``lmbda`` values for which a Box-Cox transform was done.
1232 ppcc : ndarray
1233 Probability Plot Correlation Coefficient, as obtained from `probplot`
1234 when fitting the Box-Cox transformed input `x` against a normal
1235 distribution.
1237 See Also
1238 --------
1239 probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max
1241 Notes
1242 -----
1243 Even if `plot` is given, the figure is not shown or saved by
1244 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1245 should be used after calling `boxcox_normplot`.
1247 Examples
1248 --------
1249 >>> from scipy import stats
1250 >>> import matplotlib.pyplot as plt
1252 Generate some non-normally distributed data, and create a Box-Cox plot:
1254 >>> x = stats.loggamma.rvs(5, size=500) + 5
1255 >>> fig = plt.figure()
1256 >>> ax = fig.add_subplot(111)
1257 >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)
1259 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1260 the same plot:
1262 >>> _, maxlog = stats.boxcox(x)
1263 >>> ax.axvline(maxlog, color='r')
1265 >>> plt.show()
1267 """
1268 return _normplot('boxcox', x, la, lb, plot, N)
1271def yeojohnson(x, lmbda=None):
1272 r"""
1273 Return a dataset transformed by a Yeo-Johnson power transformation.
1275 Parameters
1276 ----------
1277 x : ndarray
1278 Input array. Should be 1-dimensional.
1279 lmbda : float, optional
1280 If ``lmbda`` is ``None``, find the lambda that maximizes the
1281 log-likelihood function and return it as the second output argument.
1282 Otherwise the transformation is done for the given value.
1284 Returns
1285 -------
1286 yeojohnson : ndarray
1287 Yeo-Johnson power transformed array.
1288 maxlog : float, optional
1289 If the `lmbda` parameter is None, the second returned argument is
1290 the lambda that maximizes the log-likelihood function.
1292 See Also
1293 --------
1294 probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox
1296 Notes
1297 -----
1298 The Yeo-Johnson transform is given by::
1300 y = ((x + 1)**lmbda - 1) / lmbda, for x >= 0, lmbda != 0
1301 log(x + 1), for x >= 0, lmbda = 0
1302 -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda), for x < 0, lmbda != 2
1303 -log(-x + 1), for x < 0, lmbda = 2
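For example, :math:`\lambda = 1` leaves the data unchanged, for positive and
negative values alike (a small illustrative check):

>>> from scipy import stats
>>> x = np.array([-2., -1., 0., 1., 2.])
>>> np.allclose(stats.yeojohnson(x, lmbda=1), x)
True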
1305 Unlike `boxcox`, `yeojohnson` does not require the input data to be
1306 positive.
1308 .. versionadded:: 1.2.0
1311 References
1312 ----------
1313 I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
1314 Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.
1317 Examples
1318 --------
1319 >>> from scipy import stats
1320 >>> import matplotlib.pyplot as plt
1322 We generate some random variates from a non-normal distribution and make a
1323 probability plot for it, to show it is non-normal in the tails:
1325 >>> fig = plt.figure()
1326 >>> ax1 = fig.add_subplot(211)
1327 >>> x = stats.loggamma.rvs(5, size=500) + 5
1328 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1329 >>> ax1.set_xlabel('')
1330 >>> ax1.set_title('Probplot against normal distribution')
1332 We now use `yeojohnson` to transform the data so it's closest to normal:
1334 >>> ax2 = fig.add_subplot(212)
1335 >>> xt, lmbda = stats.yeojohnson(x)
1336 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1337 >>> ax2.set_title('Probplot after Yeo-Johnson transformation')
1339 >>> plt.show()
1341 """
1343 x = np.asarray(x)
1344 if x.size == 0:
1345 return x
1347 if np.issubdtype(x.dtype, np.complexfloating):
1348 raise ValueError('Yeo-Johnson transformation is not defined for '
1349 'complex numbers.')
1351 if np.issubdtype(x.dtype, np.integer):
1352 x = x.astype(np.float64, copy=False)
1354 if lmbda is not None:
1355 return _yeojohnson_transform(x, lmbda)
1357 # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
1358 lmax = yeojohnson_normmax(x)
1359 y = _yeojohnson_transform(x, lmax)
1361 return y, lmax
1364def _yeojohnson_transform(x, lmbda):
1365 """Return x transformed by the Yeo-Johnson power transform with given
1366 parameter lmbda."""
1368 out = np.zeros_like(x)
1369 pos = x >= 0 # binary mask
1371 # when x >= 0
1372 if abs(lmbda) < np.spacing(1.):
1373 out[pos] = np.log1p(x[pos])
1374 else: # lmbda != 0
1375 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
1377 # when x < 0
1378 if abs(lmbda - 2) > np.spacing(1.):
1379 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
1380 else: # lmbda == 2
1381 out[~pos] = -np.log1p(-x[~pos])
1383 return out
1386def yeojohnson_llf(lmb, data):
1387 r"""The yeojohnson log-likelihood function.
1389 Parameters
1390 ----------
1391 lmb : scalar
1392 Parameter for Yeo-Johnson transformation. See `yeojohnson` for
1393 details.
1394 data : array_like
1395 Data to calculate Yeo-Johnson log-likelihood for. If `data` is
1396 multi-dimensional, the log-likelihood is calculated along the first
1397 axis.
1399 Returns
1400 -------
1401 llf : float
1402 Yeo-Johnson log-likelihood of `data` given `lmb`.
1404 See Also
1405 --------
1406 yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
1408 Notes
1409 -----
1410 The Yeo-Johnson log-likelihood function is defined here as
1412 .. math::
1414 llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
1415 \sum_i \text{ sign }(x_i)\log(|x_i| + 1)
1417 where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
1418 transformed input data ``x``.
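For example, at :math:`\lambda = 1` the transform is the identity, so the
expression reduces to ``-N/2 * log(var(x))`` (an illustrative check):

>>> from scipy import stats
>>> x = np.array([-2., -1., 0., 1., 3.])
>>> np.allclose(stats.yeojohnson_llf(1, x), -len(x) / 2 * np.log(np.var(x)))
True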
1420 .. versionadded:: 1.2.0
1422 Examples
1423 --------
1424 >>> from scipy import stats
1425 >>> import matplotlib.pyplot as plt
1426 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
1427 >>> np.random.seed(1245)
1429 Generate some random variates and calculate Yeo-Johnson log-likelihood
1430 values for them for a range of ``lmbda`` values:
1432 >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
1433 >>> lmbdas = np.linspace(-2, 10)
1434 >>> llf = np.zeros(lmbdas.shape, dtype=float)
1435 >>> for ii, lmbda in enumerate(lmbdas):
1436 ... llf[ii] = stats.yeojohnson_llf(lmbda, x)
1438 Also find the optimal lmbda value with `yeojohnson`:
1440 >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
1442 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
1443 horizontal line to check that that's really the optimum:
1445 >>> fig = plt.figure()
1446 >>> ax = fig.add_subplot(111)
1447 >>> ax.plot(lmbdas, llf, 'b.-')
1448 >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
1449 >>> ax.set_xlabel('lmbda parameter')
1450 >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
1452 Now add some probability plots to show that where the log-likelihood is
1453 maximized the data transformed with `yeojohnson` looks closest to normal:
1455 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
1456 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
1457 ... xt = stats.yeojohnson(x, lmbda=lmbda)
1458 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
1459 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
1460 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
1461 ... ax_inset.set_xticklabels([])
1462 ... ax_inset.set_yticklabels([])
1463 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
1465 >>> plt.show()
1467 """
1468 data = np.asarray(data)
1469 n_samples = data.shape[0]
1471 if n_samples == 0:
1472 return np.nan
1474 trans = _yeojohnson_transform(data, lmb)
1476 loglike = -n_samples / 2 * np.log(trans.var(axis=0))
1477 loglike += (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0)
1479 return loglike
1482def yeojohnson_normmax(x, brack=(-2, 2)):
1483 """
1484 Compute optimal Yeo-Johnson transform parameter.
1486 Compute optimal Yeo-Johnson transform parameter for input data, using
1487 maximum likelihood estimation.
1489 Parameters
1490 ----------
1491 x : array_like
1492 Input array.
1493 brack : 2-tuple, optional
1494 The starting interval for a downhill bracket search with
1495 `optimize.brent`. Note that this is in most cases not critical; the
1496 final result is allowed to be outside this bracket.
1498 Returns
1499 -------
1500 maxlog : float
1501 The optimal transform parameter found.
1503 See Also
1504 --------
1505 yeojohnson, yeojohnson_llf, yeojohnson_normplot
1507 Notes
1508 -----
1509 .. versionadded:: 1.2.0
1511 Examples
1512 --------
1513 >>> from scipy import stats
1514 >>> import matplotlib.pyplot as plt
1515 >>> np.random.seed(1234) # make this example reproducible
1517 Generate some data and determine optimal ``lmbda``
1519 >>> x = stats.loggamma.rvs(5, size=30) + 5
1520 >>> lmax = stats.yeojohnson_normmax(x)
1522 >>> fig = plt.figure()
1523 >>> ax = fig.add_subplot(111)
1524 >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
1525 >>> ax.axvline(lmax, color='r')
1527 >>> plt.show()
1529 """
1531 def _neg_llf(lmbda, data):
1532 return -yeojohnson_llf(lmbda, data)
1534 return optimize.brent(_neg_llf, brack=brack, args=(x,))
1537def yeojohnson_normplot(x, la, lb, plot=None, N=80):
1538 """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
1540 A Yeo-Johnson normality plot shows graphically what the best
1541 transformation parameter is to use in `yeojohnson` to obtain a
1542 distribution that is close to normal.
1544 Parameters
1545 ----------
1546 x : array_like
1547 Input array.
1548 la, lb : scalar
1549 The lower and upper bounds for the ``lmbda`` values to pass to
1550 `yeojohnson` for Yeo-Johnson transformations. These are also the
1551 limits of the horizontal axis of the plot if that is generated.
1552 plot : object, optional
1553 If given, plots the quantiles and least squares fit.
1554 `plot` is an object that has to have methods "plot" and "text".
1555 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1556 or a custom object with the same methods.
1557 Default is None, which means that no plot is created.
1558 N : int, optional
1559 Number of points on the horizontal axis (equally distributed from
1560 `la` to `lb`).
1562 Returns
1563 -------
1564 lmbdas : ndarray
1565 The ``lmbda`` values for which a Yeo-Johnson transform was done.
1566 ppcc : ndarray
1567 Probability Plot Correlation Coefficient, as obtained from `probplot`
1568 when fitting the Yeo-Johnson transformed input `x` against a normal
1569 distribution.
1571 See Also
1572 --------
1573 probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
1575 Notes
1576 -----
1577 Even if `plot` is given, the figure is not shown or saved by
1578 `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1579 should be used after calling `yeojohnson_normplot`.
1581 .. versionadded:: 1.2.0
1583 Examples
1584 --------
1585 >>> from scipy import stats
1586 >>> import matplotlib.pyplot as plt
1588 Generate some non-normally distributed data, and create a Yeo-Johnson plot:
1590 >>> x = stats.loggamma.rvs(5, size=500) + 5
1591 >>> fig = plt.figure()
1592 >>> ax = fig.add_subplot(111)
1593 >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
1595 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1596 the same plot:
1598 >>> _, maxlog = stats.yeojohnson(x)
1599 >>> ax.axvline(maxlog, color='r')
1601 >>> plt.show()
1603 """
1604 return _normplot('yeojohnson', x, la, lb, plot, N)
1607ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
1609def shapiro(x):
1610 """
1611 Perform the Shapiro-Wilk test for normality.
1613 The Shapiro-Wilk test tests the null hypothesis that the
1614 data was drawn from a normal distribution.
1616 Parameters
1617 ----------
1618 x : array_like
1619 Array of sample data.
1621 Returns
1622 -------
1623 statistic : float
1624 The test statistic.
1625 p-value : float
1626 The p-value for the hypothesis test.
1628 See Also
1629 --------
1630 anderson : The Anderson-Darling test for normality
1631 kstest : The Kolmogorov-Smirnov test for goodness of fit.
1633 Notes
1634 -----
1635 The algorithm used is described in [4]_ but censoring parameters as
1636 described are not implemented. For N > 5000 the W test statistic is accurate
1637 but the p-value may not be.
1639 The chance of rejecting the null hypothesis when it is true is close to 5%
1640 regardless of sample size.
1642 References
1643 ----------
1644 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1645 .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
1646 normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
1647 .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
1648 Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
1649 Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
1650 .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.
1652 Examples
1653 --------
1654 >>> from scipy import stats
1655 >>> np.random.seed(12345678)
1656 >>> x = stats.norm.rvs(loc=5, scale=3, size=100)
1657 >>> shapiro_test = stats.shapiro(x)
1658 >>> shapiro_test
1659 ShapiroResult(statistic=0.9772805571556091, pvalue=0.08144091814756393)
1660 >>> shapiro_test.statistic
1661 0.9772805571556091
1662 >>> shapiro_test.pvalue
1663 0.08144091814756393
1665 """
1666 x = np.ravel(x)
1668 N = len(x)
1669 if N < 3:
1670 raise ValueError("Data must be at least length 3.")
1672 a = zeros(N, 'f')
1673 init = 0
1675 y = sort(x)
1676 a, w, pw, ifault = statlib.swilk(y, a[:N//2], init)
1677 if ifault not in [0, 2]:
1678 warnings.warn("Input data for shapiro has range zero. The results "
1679 "may not be accurate.")
1680 if N > 5000:
1681 warnings.warn("p-value may not be accurate for N > 5000.")
1683 return ShapiroResult(w, pw)
1686# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
1687# Some Comparisons", Journal of the American Statistical
1688# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
1689_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
1690_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
1691# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
1692# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
1693_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
1694# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
1695# on the Empirical Distribution Function.", Biometrika,
1696# Vol. 66, Issue 3, Dec. 1979, pp 591-595.
1697_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
1700AndersonResult = namedtuple('AndersonResult', ('statistic',
1701 'critical_values',
1702 'significance_level'))
1705def anderson(x, dist='norm'):
1706 """
1707 Anderson-Darling test for data coming from a particular distribution.
1709 The Anderson-Darling test tests the null hypothesis that a sample is
1710 drawn from a population that follows a particular distribution.
1711 For the Anderson-Darling test, the critical values depend on
1712 which distribution is being tested against. This function works
1713 for normal, exponential, logistic, or Gumbel (Extreme Value
1714 Type I) distributions.
1716 Parameters
1717 ----------
1718 x : array_like
1719 Array of sample data.
1720 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r',
1721 'extreme1'}, optional
1722 The type of distribution to test against. The default is 'norm'.
1723 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
1724 same distribution.
1726 Returns
1727 -------
1728 statistic : float
1729 The Anderson-Darling test statistic.
1730 critical_values : list
1731 The critical values for this distribution.
1732 significance_level : list
1733 The significance levels for the corresponding critical values
1734 in percents. The function returns critical values for a
1735 differing set of significance levels depending on the
1736 distribution that is being tested against.
1738 See Also
1739 --------
1740 kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
1742 Notes
1743 -----
1744 Critical values provided are for the following significance levels:
1746 normal/exponential
1747 15%, 10%, 5%, 2.5%, 1%
1748 logistic
1749 25%, 10%, 5%, 2.5%, 1%, 0.5%
1750 Gumbel
1751 25%, 10%, 5%, 2.5%, 1%
1753 If the returned statistic is larger than these critical values then
1754 for the corresponding significance level, the null hypothesis that
1755 the data come from the chosen distribution can be rejected.
1756 The returned statistic is referred to as 'A2' in the references.
1758 References
1759 ----------
1760 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1761 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
1762 Some Comparisons, Journal of the American Statistical Association,
1763 Vol. 69, pp. 730-737.
1764 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
1765 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
1766 pp. 357-369.
1767 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
1768 Distribution, Biometrika, Vol. 64, pp. 583-588.
1769 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
1770 to Tests for Exponentiality, Technical Report No. 262,
1771 Department of Statistics, Stanford University, Stanford, CA.
1772 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
1773 Based on the Empirical Distribution Function, Biometrika, Vol. 66,
1774 pp. 591-595.
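Examples
--------
A minimal sketch of how the result can be interpreted, comparing the
statistic against the critical value for the 5% significance level (the
third entry in the normal case):

>>> from scipy import stats
>>> np.random.seed(1234)
>>> x = stats.norm.rvs(size=200)
>>> result = stats.anderson(x, dist='norm')
>>> reject_at_5pct = result.statistic > result.critical_values[2]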
1776 """
1777 if dist not in ['norm', 'expon', 'gumbel', 'gumbel_l',
1778 'gumbel_r', 'extreme1', 'logistic']:
1779 raise ValueError("Invalid distribution; dist must be 'norm', 'expon', "
1780 "'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1' or 'logistic'.")
1781 y = sort(x)
1782 xbar = np.mean(x, axis=0)
1783 N = len(y)
1784 if dist == 'norm':
1785 s = np.std(x, ddof=1, axis=0)
1786 w = (y - xbar) / s
1787 logcdf = distributions.norm.logcdf(w)
1788 logsf = distributions.norm.logsf(w)
1789 sig = array([15, 10, 5, 2.5, 1])
1790 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
1791 elif dist == 'expon':
1792 w = y / xbar
1793 logcdf = distributions.expon.logcdf(w)
1794 logsf = distributions.expon.logsf(w)
1795 sig = array([15, 10, 5, 2.5, 1])
1796 critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
1797 elif dist == 'logistic':
1798 def rootfunc(ab, xj, N):
1799 a, b = ab
1800 tmp = (xj - a) / b
1801 tmp2 = exp(tmp)
1802 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
1803 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
1804 return array(val)
1806 sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
1807 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
1808 w = (y - sol[0]) / sol[1]
1809 logcdf = distributions.logistic.logcdf(w)
1810 logsf = distributions.logistic.logsf(w)
1811 sig = array([25, 10, 5, 2.5, 1, 0.5])
1812 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
1813 elif dist == 'gumbel_r':
1814 xbar, s = distributions.gumbel_r.fit(x)
1815 w = (y - xbar) / s
1816 logcdf = distributions.gumbel_r.logcdf(w)
1817 logsf = distributions.gumbel_r.logsf(w)
1818 sig = array([25, 10, 5, 2.5, 1])
1819 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1820 else: # (dist == 'gumbel') or (dist == 'gumbel_l') or (dist == 'extreme1')
1821 xbar, s = distributions.gumbel_l.fit(x)
1822 w = (y - xbar) / s
1823 logcdf = distributions.gumbel_l.logcdf(w)
1824 logsf = distributions.gumbel_l.logsf(w)
1825 sig = array([25, 10, 5, 2.5, 1])
1826 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1828 i = arange(1, N + 1)
1829 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
1831 return AndersonResult(A2, critical, sig)
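# Illustrative usage sketch (added for exposition; not part of the original
# scipy source). `_example_anderson` is a hypothetical helper showing how the
# statistic returned by `anderson` is compared against its critical values,
# following the interpretation given in the docstring above.
def _example_anderson():
    np.random.seed(123)
    rvs = np.random.normal(loc=5.0, scale=2.0, size=200)
    result = anderson(rvs, dist='norm')
    for crit, sig in zip(result.critical_values, result.significance_level):
        # Reject normality at `sig` percent only if the statistic exceeds
        # the corresponding critical value.
        print("%5.1f%%  critical=%.3f  reject=%s"
              % (sig, crit, result.statistic > crit))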
1834def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
1835 """
1836 Compute A2akN equation 7 of Scholz and Stephens.
1838 Parameters
1839 ----------
1840 samples : sequence of 1-D array_like
1841 Array of sample arrays.
1842 Z : array_like
1843 Sorted array of all observations.
1844 Zstar : array_like
1845 Sorted array of unique observations.
1846 k : int
1847 Number of samples.
1848 n : array_like
1849 Number of observations in each sample.
1850 N : int
1851 Total number of observations.
1853 Returns
1854 -------
1855 A2aKN : float
1856 The A2aKN statistics of Scholz and Stephens 1987.
1857 """
1859 A2akN = 0.
1860 Z_ssorted_left = Z.searchsorted(Zstar, 'left')
1861 if N == Zstar.size:
1862 lj = 1.
1863 else:
1864 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
1865 Bj = Z_ssorted_left + lj / 2.
1866 for i in arange(0, k):
1867 s = np.sort(samples[i])
1868 s_ssorted_right = s.searchsorted(Zstar, side='right')
1869 Mij = s_ssorted_right.astype(float)
1870 fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
1871 Mij -= fij / 2.
1872 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
1873 A2akN += inner.sum() / n[i]
1874 A2akN *= (N - 1.) / N
1875 return A2akN
1878def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
1879 """
1880 Compute A2akN equation 6 of Scholz & Stephens.
1882 Parameters
1883 ----------
1884 samples : sequence of 1-D array_like
1885 Array of sample arrays.
1886 Z : array_like
1887 Sorted array of all observations.
1888 Zstar : array_like
1889 Sorted array of unique observations.
1890 k : int
1891 Number of samples.
1892 n : array_like
1893 Number of observations in each sample.
1894 N : int
1895 Total number of observations.
1897 Returns
1898 -------
1899 A2KN : float
1900 The A2KN statistics of Scholz and Stephens 1987.
1901 """
1903 A2kN = 0.
1904 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
1905 'left')
1906 Bj = lj.cumsum()
1907 for i in arange(0, k):
1908 s = np.sort(samples[i])
1909 Mij = s.searchsorted(Zstar[:-1], side='right')
1910 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
1911 A2kN += inner.sum() / n[i]
1912 return A2kN
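# Illustrative sketch (not part of the original source): `_example_ksamp_variants`
# is a hypothetical helper comparing the midrank (eq. 7) and right-continuous
# (eq. 6) statistics computed by the two private functions above. For
# continuous data without ties the two values are expected to be close.
def _example_ksamp_variants():
    np.random.seed(42)
    samples = [np.random.normal(size=m) for m in (30, 40)]
    Z = np.sort(np.hstack(samples))
    Zstar = np.unique(Z)
    n = np.array([s.size for s in samples])
    k, N = len(samples), Z.size
    print(_anderson_ksamp_midrank(samples, Z, Zstar, k, n, N),
          _anderson_ksamp_right(samples, Z, Zstar, k, n, N))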
1915Anderson_ksampResult = namedtuple('Anderson_ksampResult',
1916 ('statistic', 'critical_values',
1917 'significance_level'))
1920def anderson_ksamp(samples, midrank=True):
1921 """The Anderson-Darling test for k-samples.
1923 The k-sample Anderson-Darling test is a modification of the
1924 one-sample Anderson-Darling test. It tests the null hypothesis
1925 that k-samples are drawn from the same population without having
1926 to specify the distribution function of that population. The
1927 critical values depend on the number of samples.
1929 Parameters
1930 ----------
1931 samples : sequence of 1-D array_like
1932 Array of sample data in arrays.
1933 midrank : bool, optional
1934 Type of Anderson-Darling test which is computed. Default
1935 (True) is the midrank test applicable to continuous and
1936 discrete populations. If False, the right side empirical
1937 distribution is used.
1939 Returns
1940 -------
1941 statistic : float
1942 Normalized k-sample Anderson-Darling test statistic.
1943 critical_values : array
1944 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
1945 0.5%, 0.1%.
1946 significance_level : float
1947 An approximate significance level at which the null hypothesis for the
1948 provided samples can be rejected. The value is floored / capped at
1949 0.1% / 25%.
1951 Raises
1952 ------
1953 ValueError
1954 If less than 2 samples are provided, a sample is empty, or no
1955 distinct observations are in the samples.
1957 See Also
1958 --------
1959 ks_2samp : 2 sample Kolmogorov-Smirnov test
1960 anderson : 1 sample Anderson-Darling test
1962 Notes
1963 -----
1964 [1]_ defines three versions of the k-sample Anderson-Darling test:
1965 one for continuous distributions and two for discrete
1966 distributions, in which ties between samples may occur. The
1967 default of this routine is to compute the version based on the
1968 midrank empirical distribution function. This test is applicable
1969 to continuous and discrete data. If midrank is set to False, the
1970 right side empirical distribution is used for a test for discrete
1971 data. According to [1]_, the two discrete test statistics differ
1972 only slightly if a few collisions due to round-off errors occur in
1973 the test not adjusted for ties between samples.
1975 The critical values corresponding to the significance levels from 0.01
1976 to 0.25 are taken from [1]_. p-values are floored / capped
1977 at 0.1% / 25%. Since the range of critical values might be extended in
1978 future releases, it is recommended not to test ``p == 0.25``, but rather
1979 ``p >= 0.25`` (analogously for the lower bound).
1981 .. versionadded:: 0.14.0
1983 References
1984 ----------
1985 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
1986 Anderson-Darling Tests, Journal of the American Statistical
1987 Association, Vol. 82, pp. 918-924.
1989 Examples
1990 --------
1991 >>> from scipy import stats
1992 >>> np.random.seed(314159)
1994 The null hypothesis that the two random samples come from the same
1995 distribution can be rejected at the 5% level because the returned
1996 test value is greater than the critical value for 5% (1.961) but
1997 not at the 2.5% level. The interpolation gives an approximate
1998 significance level of 3.2%:
2000 >>> stats.anderson_ksamp([np.random.normal(size=50),
2001 ... np.random.normal(loc=0.5, size=30)])
2002 (2.4615796189876105,
2003 array([ 0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]),
2004 0.03176687568842282)
2007 The null hypothesis cannot be rejected for three samples from an
2008 identical distribution. The reported p-value (25%) has been capped and
2009 may not be very accurate (since it corresponds to the value 0.449
2010 whereas the statistic is -0.731):
2012 >>> stats.anderson_ksamp([np.random.normal(size=50),
2013 ... np.random.normal(size=30), np.random.normal(size=20)])
2014 (-0.73091722665244196,
2015 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856,
2016 4.07210043, 5.56419101]),
2017 0.25)
2019 """
2020 k = len(samples)
2021 if (k < 2):
2022 raise ValueError("anderson_ksamp needs at least two samples")
2024 samples = list(map(np.asarray, samples))
2025 Z = np.sort(np.hstack(samples))
2026 N = Z.size
2027 Zstar = np.unique(Z)
2028 if Zstar.size < 2:
2029 raise ValueError("anderson_ksamp needs more than one distinct "
2030 "observation")
2032 n = np.array([sample.size for sample in samples])
2033 if any(n == 0):
2034 raise ValueError("anderson_ksamp encountered sample without "
2035 "observations")
2037 if midrank:
2038 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
2039 else:
2040 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
2042 H = (1. / n).sum()
2043 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
2044 h = hs_cs[-1] + 1
2045 g = (hs_cs / arange(2, N)).sum()
2047 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
2048 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
2049 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
2050 d = (2*h + 6)*k**2 - 4*h*k
2051 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
2052 m = k - 1
2053 A2 = (A2kN - m) / math.sqrt(sigmasq)
2055 # The b_i values are the interpolation coefficients from Table 2
2056 # of Scholz and Stephens 1987
2057 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
2058 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
2059 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
2060 critical = b0 + b1 / math.sqrt(m) + b2 / m
2062 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
2063 if A2 < critical.min():
2064 p = sig.max()
2065 warnings.warn("p-value capped: true value larger than {}".format(p),
2066 stacklevel=2)
2067 elif A2 > critical.max():
2068 p = sig.min()
2069 warnings.warn("p-value floored: true value smaller than {}".format(p),
2070 stacklevel=2)
2071 else:
2072 # interpolation of probit of significance level
2073 pf = np.polyfit(critical, log(sig), 2)
2074 p = math.exp(np.polyval(pf, A2))
2076 return Anderson_ksampResult(A2, critical, p)
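# Illustrative sketch (not part of the original source): `_example_anderson_ksamp`
# is a hypothetical helper. Because the returned p-value is capped at 25% and
# floored at 0.1%, it is best compared with inequalities, as recommended in the
# docstring above.
def _example_anderson_ksamp():
    np.random.seed(314)
    samples = [np.random.normal(size=m) for m in (50, 40, 30)]
    statistic, critical_values, p = anderson_ksamp(samples)
    if p >= 0.25:
        print("p-value capped at 25%; no evidence against a common distribution")
    elif p <= 0.001:
        print("p-value floored at 0.1%")
    else:
        print("approximate significance level: %.3f" % p)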
2079AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
2082def ansari(x, y):
2083 """
2084 Perform the Ansari-Bradley test for equal scale parameters.
2086 The Ansari-Bradley test is a non-parametric test for the equality
2087 of the scale parameter of the distributions from which two
2088 samples were drawn.
2090 Parameters
2091 ----------
2092 x, y : array_like
2093 Arrays of sample data.
2095 Returns
2096 -------
2097 statistic : float
2098 The Ansari-Bradley test statistic.
2099 pvalue : float
2100 The p-value of the hypothesis test.
2102 See Also
2103 --------
2104 fligner : A non-parametric test for the equality of k variances
2105 mood : A non-parametric test for the equality of two scale parameters
2107 Notes
2108 -----
2109 The p-value given is exact when the sample sizes are both less than
2110 55 and there are no ties, otherwise a normal approximation for the
2111 p-value is used.
2113 References
2114 ----------
2115 .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical
2116 methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2.
2118 """
2119 x, y = asarray(x), asarray(y)
2120 n = len(x)
2121 m = len(y)
2122 if m < 1:
2123 raise ValueError("Not enough other observations.")
2124 if n < 1:
2125 raise ValueError("Not enough test observations.")
2127 N = m + n
2128 xy = r_[x, y] # combine
2129 rank = stats.rankdata(xy)
2130 symrank = amin(array((rank, N - rank + 1)), 0)
2131 AB = np.sum(symrank[:n], axis=0)
2132 uxy = unique(xy)
2133 repeats = (len(uxy) != len(xy))
2134 exact = ((m < 55) and (n < 55) and not repeats)
2135 if repeats and (m < 55 or n < 55):
2136 warnings.warn("Ties preclude use of exact statistic.")
2137 if exact:
2138 astart, a1, ifault = statlib.gscale(n, m)
2139 ind = AB - astart
2140 total = np.sum(a1, axis=0)
2141 if ind < len(a1)/2.0:
2142 cind = int(ceil(ind))
2143 if ind == cind:
2144 pval = 2.0 * np.sum(a1[:cind+1], axis=0) / total
2145 else:
2146 pval = 2.0 * np.sum(a1[:cind], axis=0) / total
2147 else:
2148 find = int(floor(ind))
2149 if ind == floor(ind):
2150 pval = 2.0 * np.sum(a1[find:], axis=0) / total
2151 else:
2152 pval = 2.0 * np.sum(a1[find+1:], axis=0) / total
2153 return AnsariResult(AB, min(1.0, pval))
2155 # otherwise compute normal approximation
2156 if N % 2: # N odd
2157 mnAB = n * (N+1.0)**2 / 4.0 / N
2158 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2)
2159 else:
2160 mnAB = n * (N+2.0) / 4.0
2161 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0)
2162 if repeats: # adjust variance estimates
2163 # compute np.sum(tj * rj**2,axis=0)
2164 fac = np.sum(symrank**2, axis=0)
2165 if N % 2: # N odd
2166 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
2167 else: # N even
2168 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
2170 z = (AB - mnAB) / sqrt(varAB)
2171 pval = distributions.norm.sf(abs(z)) * 2.0
2172 return AnsariResult(AB, pval)
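# Illustrative sketch (not part of the original source): `_example_ansari` is a
# hypothetical helper. With both sample sizes below 55 and no ties (almost
# surely the case for continuous data), the exact p-value branch above is used.
def _example_ansari():
    np.random.seed(0)
    x = np.random.normal(scale=1.0, size=40)
    y = np.random.normal(scale=3.0, size=45)
    statistic, pvalue = ansari(x, y)
    print(statistic, pvalue)  # a small p-value suggests unequal scales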
2175BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
2178def bartlett(*args):
2179 """
2180 Perform Bartlett's test for equal variances.
2182 Bartlett's test tests the null hypothesis that all input samples
2183 are from populations with equal variances. For samples
2184 from significantly non-normal populations, Levene's test
2185 `levene` is more robust.
2187 Parameters
2188 ----------
2189 sample1, sample2,... : array_like
2190 arrays of sample data. Only 1d arrays are accepted, they may have
2191 different lengths.
2193 Returns
2194 -------
2195 statistic : float
2196 The test statistic.
2197 pvalue : float
2198 The p-value of the test.
2200 See Also
2201 --------
2202 fligner : A non-parametric test for the equality of k variances
2203 levene : A robust parametric test for equality of k variances
2205 Notes
2206 -----
2207 Conover et al. (1981) examine many of the existing parametric and
2208 nonparametric tests by extensive simulations and they conclude that the
2209 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2210 superior in terms of robustness to departures from normality and power
2211 ([3]_).
2213 References
2214 ----------
2215 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
2217 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical
2218 Methods, Eighth Edition, Iowa State University Press.
2220 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2221 Hypothesis Testing based on Quadratic Inference Function. Technical
2222 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2223 University.
2225 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
2226 Tests. Proceedings of the Royal Society of London. Series A,
2227 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
2229 Examples
2230 --------
2231 Test whether or not the lists `a`, `b` and `c` come from populations
2232 with equal variances.
2234 >>> from scipy.stats import bartlett
2235 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2236 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2237 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2238 >>> stat, p = bartlett(a, b, c)
2239 >>> p
2240 1.1254782518834628e-05
2242 The very small p-value suggests that the populations do not have equal
2243 variances.
2245 This is not surprising, given that the sample variance of `b` is much
2246 larger than that of `a` and `c`:
2248 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2249 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2250 """
2251 # Handle empty input and input that is not 1d
2252 for a in args:
2253 if np.asanyarray(a).size == 0:
2254 return BartlettResult(np.nan, np.nan)
2255 if np.asanyarray(a).ndim > 1:
2256 raise ValueError('Samples must be one-dimensional.')
2258 k = len(args)
2259 if k < 2:
2260 raise ValueError("Must enter at least two input sample vectors.")
2261 Ni = zeros(k)
2262 ssq = zeros(k, 'd')
2263 for j in range(k):
2264 Ni[j] = len(args[j])
2265 ssq[j] = np.var(args[j], ddof=1)
2266 Ntot = np.sum(Ni, axis=0)
2267 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k))
2268 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
2269 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) -
2270 1.0/(Ntot - k))
2271 T = numer / denom
2272 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf
2274 return BartlettResult(T, pval)
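# Illustrative cross-check (not part of the original source): `_example_bartlett`
# is a hypothetical helper that recomputes the statistic from the closed form
# implemented above,
#   T = [(Ntot - k) * ln(sp^2) - sum_i (Ni - 1) * ln(si^2)]
#       / [1 + (1 / (3*(k - 1))) * (sum_i 1/(Ni - 1) - 1/(Ntot - k))],
# and checks it against the value returned by `bartlett`.
def _example_bartlett():
    np.random.seed(1)
    a = np.random.normal(size=30)
    b = np.random.normal(scale=2.0, size=40)
    T, _ = bartlett(a, b)
    n = np.array([len(a), len(b)], dtype=float)
    s2 = np.array([np.var(a, ddof=1), np.var(b, ddof=1)])
    k, Ntot = 2, n.sum()
    sp2 = ((n - 1) * s2).sum() / (Ntot - k)
    numer = (Ntot - k) * np.log(sp2) - ((n - 1) * np.log(s2)).sum()
    denom = 1.0 + 1.0 / (3 * (k - 1)) * ((1.0 / (n - 1)).sum() - 1.0 / (Ntot - k))
    assert np.allclose(T, numer / denom)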
2277LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
2280def levene(*args, **kwds):
2281 """
2282 Perform Levene test for equal variances.
2284 The Levene test tests the null hypothesis that all input samples
2285 are from populations with equal variances. Levene's test is an
2286 alternative to Bartlett's test `bartlett` in the case where
2287 there are significant deviations from normality.
2289 Parameters
2290 ----------
2291 sample1, sample2, ... : array_like
2292 The sample data, possibly with different lengths. Only one-dimensional
2293 samples are accepted.
2294 center : {'mean', 'median', 'trimmed'}, optional
2295 Which function of the data to use in the test. The default
2296 is 'median'.
2297 proportiontocut : float, optional
2298 When `center` is 'trimmed', this gives the proportion of data points
2299 to cut from each end. (See `scipy.stats.trim_mean`.)
2300 Default is 0.05.
2302 Returns
2303 -------
2304 statistic : float
2305 The test statistic.
2306 pvalue : float
2307 The p-value for the test.
2309 Notes
2310 -----
2311 Three variations of Levene's test are possible. The possibilities
2312 and their recommended usages are:
2314 * 'median' : Recommended for skewed (non-normal) distributions.
2315 * 'mean' : Recommended for symmetric, moderate-tailed distributions.
2316 * 'trimmed' : Recommended for heavy-tailed distributions.
2318 The test version using the mean was proposed in the original article
2319 of Levene ([2]_) while the median and trimmed mean have been studied by
2320 Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
2321 test.
2323 References
2324 ----------
2325 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
2326 .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
2327 Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
2328 Stanford University Press, pp. 278-292.
2329 .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
2330 Statistical Association, 69, 364-367
2332 Examples
2333 --------
2334 Test whether or not the lists `a`, `b` and `c` come from populations
2335 with equal variances.
2337 >>> from scipy.stats import levene
2338 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2339 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2340 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2341 >>> stat, p = levene(a, b, c)
2342 >>> p
2343 0.002431505967249681
2345 The small p-value suggests that the populations do not have equal
2346 variances.
2348 This is not surprising, given that the sample variance of `b` is much
2349 larger than that of `a` and `c`:
2351 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2352 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2353 """
2354 # Handle keyword arguments.
2355 center = 'median'
2356 proportiontocut = 0.05
2357 for kw, value in kwds.items():
2358 if kw not in ['center', 'proportiontocut']:
2359 raise TypeError("levene() got an unexpected keyword "
2360 "argument '%s'" % kw)
2361 if kw == 'center':
2362 center = value
2363 else:
2364 proportiontocut = value
2366 k = len(args)
2367 if k < 2:
2368 raise ValueError("Must enter at least two input sample vectors.")
2369 # check for 1d input
2370 for j in range(k):
2371 if np.asanyarray(args[j]).ndim > 1:
2372 raise ValueError('Samples must be one-dimensional.')
2374 Ni = zeros(k)
2375 Yci = zeros(k, 'd')
2377 if center not in ['mean', 'median', 'trimmed']:
2378 raise ValueError("Keyword argument <center> must be 'mean', 'median'"
2379 " or 'trimmed'.")
2381 if center == 'median':
2382 func = lambda x: np.median(x, axis=0)
2383 elif center == 'mean':
2384 func = lambda x: np.mean(x, axis=0)
2385 else: # center == 'trimmed'
2386 args = tuple(stats.trimboth(np.sort(arg), proportiontocut)
2387 for arg in args)
2388 func = lambda x: np.mean(x, axis=0)
2390 for j in range(k):
2391 Ni[j] = len(args[j])
2392 Yci[j] = func(args[j])
2393 Ntot = np.sum(Ni, axis=0)
2395 # compute Zij's
2396 Zij = [None] * k
2397 for i in range(k):
2398 Zij[i] = abs(asarray(args[i]) - Yci[i])
2400 # compute Zbari
2401 Zbari = zeros(k, 'd')
2402 Zbar = 0.0
2403 for i in range(k):
2404 Zbari[i] = np.mean(Zij[i], axis=0)
2405 Zbar += Zbari[i] * Ni[i]
2407 Zbar /= Ntot
2408 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0)
2410 # compute denom_variance
2411 dvar = 0.0
2412 for i in range(k):
2413 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0)
2415 denom = (k - 1.0) * dvar
2417 W = numer / denom
2418 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf
2419 return LeveneResult(W, pval)
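# Illustrative cross-check (not part of the original source): `_example_levene`
# is a hypothetical helper. With ``center='mean'``, Levene's W is the one-way
# ANOVA F statistic computed on the absolute deviations from each group mean,
# which the check below verifies numerically.
def _example_levene():
    from scipy.stats import f_oneway
    np.random.seed(2)
    a = np.random.normal(size=25)
    b = np.random.normal(scale=1.5, size=30)
    W, _ = levene(a, b, center='mean')
    F, _ = f_oneway(np.abs(a - a.mean()), np.abs(b - b.mean()))
    assert np.allclose(W, F)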
2422def binom_test(x, n=None, p=0.5, alternative='two-sided'):
2423 """
2424 Perform a test that the probability of success is p.
2426 This is an exact, two-sided test of the null hypothesis
2427 that the probability of success in a Bernoulli experiment
2428 is `p`.
2430 Parameters
2431 ----------
2432 x : int or array_like
2433 The number of successes, or if x has length 2, it is the
2434 number of successes and the number of failures.
2435 n : int
2436 The number of trials. This is ignored if x gives both the
2437 number of successes and failures.
2438 p : float, optional
2439 The hypothesized probability of success. ``0 <= p <= 1``. The
2440 default value is ``p = 0.5``.
2441 alternative : {'two-sided', 'greater', 'less'}, optional
2442 Indicates the alternative hypothesis. The default value is
2443 'two-sided'.
2445 Returns
2446 -------
2447 p-value : float
2448 The p-value of the hypothesis test.
2450 References
2451 ----------
2452 .. [1] https://en.wikipedia.org/wiki/Binomial_test
2454 Examples
2455 --------
2456 >>> from scipy import stats
2458 A car manufacturer claims that no more than 10% of their cars are unsafe.
2459 15 cars are inspected for safety, 3 were found to be unsafe. Test the
2460 manufacturer's claim:
2462 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater')
2463 0.18406106910639114
2465 The null hypothesis cannot be rejected at the 5% level of significance
2466 because the returned p-value is greater than the critical value of 5%.
2468 """
2469 x = atleast_1d(x).astype(np.int_)
2470 if len(x) == 2:
2471 n = x[1] + x[0]
2472 x = x[0]
2473 elif len(x) == 1:
2474 x = x[0]
2475 if n is None or n < x:
2476 raise ValueError("n must be >= x")
2477 n = np.int_(n)
2478 else:
2479 raise ValueError("Incorrect length for x.")
2481 if (p > 1.0) or (p < 0.0):
2482 raise ValueError("p must be in range [0,1]")
2484 if alternative not in ('two-sided', 'less', 'greater'):
2485 raise ValueError("alternative not recognized\n"
2486 "should be 'two-sided', 'less' or 'greater'")
2488 if alternative == 'less':
2489 pval = distributions.binom.cdf(x, n, p)
2490 return pval
2492 if alternative == 'greater':
2493 pval = distributions.binom.sf(x-1, n, p)
2494 return pval
2496 # if alternative was neither 'less' nor 'greater', then it's 'two-sided'
2497 d = distributions.binom.pmf(x, n, p)
2498 rerr = 1 + 1e-7
2499 if x == p * n:
2500 # special case as shortcut, would also be handled by `else` below
2501 pval = 1.
2502 elif x < p * n:
2503 i = np.arange(np.ceil(p * n), n+1)
2504 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2505 pval = (distributions.binom.cdf(x, n, p) +
2506 distributions.binom.sf(n - y, n, p))
2507 else:
2508 i = np.arange(np.floor(p*n) + 1)
2509 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2510 pval = (distributions.binom.cdf(y-1, n, p) +
2511 distributions.binom.sf(x-1, n, p))
2513 return min(1.0, pval)
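# Illustrative cross-check (not part of the original source): `_example_binom_test`
# is a hypothetical helper. For the one-sided alternatives the p-value is just a
# binomial tail probability, matching the cdf/sf branches above.
def _example_binom_test():
    from scipy.stats import binom
    x, n, p = 7, 20, 0.25
    assert np.allclose(binom_test(x, n, p, alternative='greater'),
                       binom.sf(x - 1, n, p))
    assert np.allclose(binom_test(x, n, p, alternative='less'),
                       binom.cdf(x, n, p))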
2516def _apply_func(x, g, func):
2517 # g is list of indices into x
2518 # separating x into different groups
2519 # func should be applied over the groups
2520 g = unique(r_[0, g, len(x)])
2521 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)]
2523 return asarray(output)
2526FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
2529def fligner(*args, **kwds):
2530 """
2531 Perform Fligner-Killeen test for equality of variance.
2533 Fligner's test tests the null hypothesis that all input samples
2534 are from populations with equal variances. Fligner-Killeen's test is
2535 distribution free when populations are identical [2]_.
2537 Parameters
2538 ----------
2539 sample1, sample2, ... : array_like
2540 Arrays of sample data. Need not be the same length.
2541 center : {'mean', 'median', 'trimmed'}, optional
2542 Keyword argument controlling which function of the data is used in
2543 computing the test statistic. The default is 'median'.
2544 proportiontocut : float, optional
2545 When `center` is 'trimmed', this gives the proportion of data points
2546 to cut from each end. (See `scipy.stats.trim_mean`.)
2547 Default is 0.05.
2549 Returns
2550 -------
2551 statistic : float
2552 The test statistic.
2553 pvalue : float
2554 The p-value for the hypothesis test.
2556 See Also
2557 --------
2558 bartlett : A parametric test for equality of k variances in normal samples
2559 levene : A robust parametric test for equality of k variances
2561 Notes
2562 -----
2563 As with Levene's test there are three variants of Fligner's test that
2564 differ by the measure of central tendency used in the test. See `levene`
2565 for more information.
2567 Conover et al. (1981) examine many of the existing parametric and
2568 nonparametric tests by extensive simulations and they conclude that the
2569 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2570 superior in terms of robustness to departures from normality and power [3]_.
2572 References
2573 ----------
2574 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2575 Hypothesis Testing based on Quadratic Inference Function. Technical
2576 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2577 University.
2578 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf
2580 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
2581 tests for scale. 'Journal of the American Statistical Association.'
2582 71(353), 210-213.
2589 .. [3] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
2590 comparative study of tests for homogeneity of variances, with
2591 applications to the outer continental shelf bidding data.
2592 Technometrics, 23(4), 351-361.
2594 Examples
2595 --------
2596 Test whether or not the lists `a`, `b` and `c` come from populations
2597 with equal variances.
2599 >>> from scipy.stats import fligner
2600 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2601 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2602 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2603 >>> stat, p = fligner(a, b, c)
2604 >>> p
2605 0.00450826080004775
2607 The small p-value suggests that the populations do not have equal
2608 variances.
2610 This is not surprising, given that the sample variance of `b` is much
2611 larger than that of `a` and `c`:
2613 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2614 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2615 """
2616 # Handle empty input
2617 for a in args:
2618 if np.asanyarray(a).size == 0:
2619 return FlignerResult(np.nan, np.nan)
2621 # Handle keyword arguments.
2622 center = 'median'
2623 proportiontocut = 0.05
2624 for kw, value in kwds.items():
2625 if kw not in ['center', 'proportiontocut']:
2626 raise TypeError("fligner() got an unexpected keyword "
2627 "argument '%s'" % kw)
2628 if kw == 'center':
2629 center = value
2630 else:
2631 proportiontocut = value
2633 k = len(args)
2634 if k < 2:
2635 raise ValueError("Must enter at least two input sample vectors.")
2637 if center not in ['mean', 'median', 'trimmed']:
2638 raise ValueError("Keyword argument <center> must be 'mean', 'median'"
2639 " or 'trimmed'.")
2641 if center == 'median':
2642 func = lambda x: np.median(x, axis=0)
2643 elif center == 'mean':
2644 func = lambda x: np.mean(x, axis=0)
2645 else: # center == 'trimmed'
2646 args = tuple(stats.trimboth(arg, proportiontocut) for arg in args)
2647 func = lambda x: np.mean(x, axis=0)
2649 Ni = asarray([len(args[j]) for j in range(k)])
2650 Yci = asarray([func(args[j]) for j in range(k)])
2651 Ntot = np.sum(Ni, axis=0)
2652 # compute Zij's
2653 Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)]
2654 allZij = []
2655 g = [0]
2656 for i in range(k):
2657 allZij.extend(list(Zij[i]))
2658 g.append(len(allZij))
2660 ranks = stats.rankdata(allZij)
2661 a = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5)
2663 # compute Aibar
2664 Aibar = _apply_func(a, g, np.sum) / Ni
2665 anbar = np.mean(a, axis=0)
2666 varsq = np.var(a, axis=0, ddof=1)
2667 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq
2668 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf
2669 return FlignerResult(Xsq, pval)
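# Illustrative sketch (not part of the original source): `_example_fligner` is
# a hypothetical helper applying both the rank-based Fligner-Killeen test and
# the parametric `bartlett` test to the same heavy-tailed samples; the former
# is the more robust choice for such data, per the notes above.
def _example_fligner():
    np.random.seed(3)
    a = np.random.standard_t(df=3, size=40)
    b = 2.0 * np.random.standard_t(df=3, size=50)
    print(fligner(a, b))
    print(bartlett(a, b))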
2672def mood(x, y, axis=0):
2673 """
2674 Perform Mood's test for equal scale parameters.
2676 Mood's two-sample test for scale parameters is a non-parametric
2677 test for the null hypothesis that two samples are drawn from the
2678 same distribution with the same scale parameter.
2680 Parameters
2681 ----------
2682 x, y : array_like
2683 Arrays of sample data.
2684 axis : int, optional
2685 The axis along which the samples are tested. `x` and `y` can be of
2686 different length along `axis`.
2687 If `axis` is None, `x` and `y` are flattened and the test is done on
2688 all values in the flattened arrays.
2690 Returns
2691 -------
2692 z : scalar or ndarray
2693 The z-score for the hypothesis test. For 1-D inputs a scalar is
2694 returned.
2695 p-value : scalar or ndarray
2696 The p-value for the hypothesis test.
2698 See Also
2699 --------
2700 fligner : A non-parametric test for the equality of k variances
2701 ansari : A non-parametric test for the equality of 2 variances
2702 bartlett : A parametric test for equality of k variances in normal samples
2703 levene : A parametric test for equality of k variances
2705 Notes
2706 -----
2707 The data are assumed to be drawn from probability distributions ``f(x)``
2708 and ``f(x/s) / s`` respectively, for some probability density function f.
2709 The null hypothesis is that ``s == 1``.
2711 For multi-dimensional arrays, if the inputs are of shapes
2712 ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
2713 resulting z and p values will have shape ``(n0, n2, n3)``. Note that
2714 ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
2716 Examples
2717 --------
2718 >>> from scipy import stats
2719 >>> np.random.seed(1234)
2720 >>> x2 = np.random.randn(2, 45, 6, 7)
2721 >>> x1 = np.random.randn(2, 30, 6, 7)
2722 >>> z, p = stats.mood(x1, x2, axis=1)
2723 >>> p.shape
2724 (2, 6, 7)
2726 Find the number of points where the difference in scale is not significant:
2728 >>> (p > 0.1).sum()
2729 74
2731 Perform the test with different scales:
2733 >>> x1 = np.random.randn(2, 30)
2734 >>> x2 = np.random.randn(2, 35) * 10.0
2735 >>> stats.mood(x1, x2, axis=1)
2736 (array([-5.7178125 , -5.25342163]), array([ 1.07904114e-08, 1.49299218e-07]))
2738 """
2739 x = np.asarray(x, dtype=float)
2740 y = np.asarray(y, dtype=float)
2742 if axis is None:
2743 x = x.flatten()
2744 y = y.flatten()
2745 axis = 0
2747 # Determine shape of the result arrays
2748 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis])
2749 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if
2750 ax != axis])):
2751 raise ValueError("Dimensions of x and y on all axes except `axis` "
2752 "should match")
2754 n = x.shape[axis]
2755 m = y.shape[axis]
2756 N = m + n
2757 if N < 3:
2758 raise ValueError("Not enough observations.")
2760 xy = np.concatenate((x, y), axis=axis)
2761 if axis != 0:
2762 xy = np.rollaxis(xy, axis)
2764 xy = xy.reshape(xy.shape[0], -1)
2766 # Generalized to the n-dimensional case by adding the axis argument, and
2767 # using for loops, since rankdata is not vectorized. For improving
2768 # performance consider vectorizing rankdata function.
2769 all_ranks = np.zeros_like(xy)
2770 for j in range(xy.shape[1]):
2771 all_ranks[:, j] = stats.rankdata(xy[:, j])
2773 Ri = all_ranks[:n]
2774 M = np.sum((Ri - (N + 1.0) / 2)**2, axis=0)
2775 # Approx stat.
2776 mnM = n * (N * N - 1.0) / 12
2777 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
2778 z = (M - mnM) / sqrt(varM)
2780 # sf for right tail, cdf for left tail. Factor 2 for two-sidedness
2781 z_pos = z > 0
2782 pval = np.zeros_like(z)
2783 pval[z_pos] = 2 * distributions.norm.sf(z[z_pos])
2784 pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos])
2786 if res_shape == ():
2787 # Return scalars, not 0-D arrays
2788 z = z[0]
2789 pval = pval[0]
2790 else:
2791 z.shape = res_shape
2792 pval.shape = res_shape
2794 return z, pval
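# Illustrative sketch (not part of the original source): `_example_mood` is a
# hypothetical helper. For 1-D inputs `mood` returns plain scalars; under the
# null hypothesis of equal scales the z-score is approximately standard normal.
def _example_mood():
    np.random.seed(6)
    z, p = mood(np.random.normal(size=40), np.random.normal(size=50))
    print(z, p)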
2797WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))
2800def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
2801 alternative="two-sided", mode='auto'):
2802 """
2803 Calculate the Wilcoxon signed-rank test.
2805 The Wilcoxon signed-rank test tests the null hypothesis that two
2806 related paired samples come from the same distribution. In particular,
2807 it tests whether the distribution of the differences x - y is symmetric
2808 about zero. It is a non-parametric version of the paired T-test.
2810 Parameters
2811 ----------
2812 x : array_like
2813 Either the first set of measurements (in which case `y` is the second
2814 set of measurements), or the differences between two sets of
2815 measurements (in which case `y` is not to be specified.) Must be
2816 one-dimensional.
2817 y : array_like, optional
2818 Either the second set of measurements (if `x` is the first set of
2819 measurements), or not specified (if `x` is the differences between
2820 two sets of measurements.) Must be one-dimensional.
2821 zero_method : {'pratt', 'wilcox', 'zsplit'}, optional
2822 The following options are available (default is 'wilcox'):
2824 * 'pratt': Includes zero-differences in the ranking process,
2825 but drops the ranks of the zeros, see [3]_ (more conservative).
2826 * 'wilcox': Discards all zero-differences, the default.
2827 * 'zsplit': Includes zero-differences in the ranking process and
2828 splits the ranks of the zeros between the positive and negative ones.
2829 correction : bool, optional
2830 If True, apply continuity correction by adjusting the Wilcoxon rank
2831 statistic by 0.5 towards the mean value when computing the
2832 z-statistic if a normal approximation is used. Default is False.
2833 alternative : {"two-sided", "greater", "less"}, optional
2834 The alternative hypothesis to be tested, see Notes. Default is
2835 "two-sided".
2836 mode : {"auto", "exact", "approx"}
2837 Method to calculate the p-value, see Notes. Default is "auto".
2839 Returns
2840 -------
2841 statistic : float
2842 If `alternative` is "two-sided", the sum of the ranks of the
2843 differences above or below zero, whichever is smaller.
2844 Otherwise the sum of the ranks of the differences above zero.
2845 pvalue : float
2846 The p-value for the test depending on `alternative` and `mode`.
2848 See Also
2849 --------
2850 kruskal, mannwhitneyu
2852 Notes
2853 -----
2854 The test has been introduced in [4]_. Given n independent samples
2855 (xi, yi) from a bivariate distribution (i.e. paired samples),
2856 it computes the differences di = xi - yi. One assumption of the test
2857 is that the differences are symmetric, see [2]_.
2858 The two-sided test has the null hypothesis that the median of the
2859 differences is zero against the alternative that it is different from
2860 zero. The one-sided test has the null hypothesis that the median is
2861 positive against the alternative that it is negative
2862 (``alternative == 'less'``), or vice versa (``alternative == 'greater'``).
2864 To derive the p-value, the exact distribution (``mode == 'exact'``)
2865 can be used for sample sizes of up to 25. The default ``mode == 'auto'``
2866 uses the exact distribution if there are at most 25 observations and no
2867 ties, otherwise a normal approximation is used (``mode == 'approx'``).
2869 The treatment of ties can be controlled by the parameter `zero_method`.
2870 If ``zero_method == 'pratt'``, the normal approximation is adjusted as in
2871 [5]_. A typical rule is to require that n > 20 ([2]_, p. 383).
2873 References
2874 ----------
2875 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
2876 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
2877 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
2878 Rank Procedures, Journal of the American Statistical Association,
2879 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
2880 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
2881 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
2882 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
2883 Sampling Distribution When Zero Differences are Present,
2884 Journal of the American Statistical Association, Vol. 62, 1967,
2885 pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
2887 Examples
2888 --------
2889 In [4]_, the differences in height between cross- and self-fertilized
2890 corn plants is given as follows:
2892 >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
2894 Cross-fertilized plants appear to be higher. To test the null
2895 hypothesis that there is no height difference, we can apply the
2896 two-sided test:
2898 >>> from scipy.stats import wilcoxon
2899 >>> w, p = wilcoxon(d)
2900 >>> w, p
2901 (24.0, 0.041259765625)
2903 Hence, we would reject the null hypothesis at a significance level of 5%,
2904 concluding that there is a difference in height between the groups.
2905 To confirm that the median of the differences can be assumed to be
2906 positive, we use:
2908 >>> w, p = wilcoxon(d, alternative='greater')
2909 >>> w, p
2910 (96.0, 0.0206298828125)
2912 This shows that the null hypothesis that the median is negative can be
2913 rejected at a significance level of 5% in favor of the alternative that
2914 the median is greater than zero. The p-values above are exact. Using the
2915 normal approximation gives very similar values:
2917 >>> w, p = wilcoxon(d, mode='approx')
2918 >>> w, p
2919 (24.0, 0.04088813291185591)
2921 Note that the statistic changed to 96 in the one-sided case (the sum
2922 of ranks of positive differences) whereas it is 24 in the two-sided
2923 case (the minimum of sum of ranks above and below zero).
2925 """
2926 if mode not in ["auto", "approx", "exact"]:
2927 raise ValueError("mode must be either 'auto', 'approx' or 'exact'")
2929 if zero_method not in ["wilcox", "pratt", "zsplit"]:
2930 raise ValueError("Zero method must be either 'wilcox' "
2931 "or 'pratt' or 'zsplit'")
2933 if alternative not in ["two-sided", "less", "greater"]:
2934 raise ValueError("Alternative must be either 'two-sided', "
2935 "'greater' or 'less'")
2937 if y is None:
2938 d = asarray(x)
2939 if d.ndim > 1:
2940 raise ValueError('Sample x must be one-dimensional.')
2941 else:
2942 x, y = map(asarray, (x, y))
2943 if x.ndim > 1 or y.ndim > 1:
2944 raise ValueError('Samples x and y must be one-dimensional.')
2945 if len(x) != len(y):
2946 raise ValueError('The samples x and y must have the same length.')
2947 d = x - y
2949 if mode == "auto":
2950 if len(d) <= 25:
2951 mode = "exact"
2952 else:
2953 mode = "approx"
2955 n_zero = np.sum(d == 0)
2956 if n_zero > 0 and mode == "exact":
2957 mode = "approx"
2958 warnings.warn("Exact p-value calculation does not work if there are "
2959 "ties. Switching to normal approximation.")
2961 if mode == "approx":
2962 if zero_method in ["wilcox", "pratt"]:
2963 if n_zero == len(d):
2964 raise ValueError("zero_method 'wilcox' and 'pratt' do not "
2965 "work if x - y is zero for all elements.")
2966 if zero_method == "wilcox":
2967 # Keep all non-zero differences
2968 d = compress(np.not_equal(d, 0), d)
2970 count = len(d)
2971 if count < 10 and mode == "approx":
2972 warnings.warn("Sample size too small for normal approximation.")
2974 r = stats.rankdata(abs(d))
2975 r_plus = np.sum((d > 0) * r)
2976 r_minus = np.sum((d < 0) * r)
2978 if zero_method == "zsplit":
2979 r_zero = np.sum((d == 0) * r)
2980 r_plus += r_zero / 2.
2981 r_minus += r_zero / 2.
2983 # return min for two-sided test, but r_plus for one-sided test
2984 # the literature is not consistent here
2985 # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
2986 # i.e. the sum of the ranks, so r_minus and the min can be inferred
2987 # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
2988 # [3] uses the r_plus for the one-sided test, keep min for two-sided test
2989 # to keep backwards compatibility
2990 if alternative == "two-sided":
2991 T = min(r_plus, r_minus)
2992 else:
2993 T = r_plus
2995 if mode == "approx":
2996 mn = count * (count + 1.) * 0.25
2997 se = count * (count + 1.) * (2. * count + 1.)
2999 if zero_method == "pratt":
3000 r = r[d != 0]
3001 # normal approximation needs to be adjusted, see Cureton (1967)
3002 mn -= n_zero * (n_zero + 1.) * 0.25
3003 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)
3005 replist, repnum = find_repeats(r)
3006 if repnum.size != 0:
3007 # Correction for repeated elements.
3008 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
3010 se = sqrt(se / 24)
3012 # apply continuity correction if applicable
3013 d = 0
3014 if correction:
3015 if alternative == "two-sided":
3016 d = 0.5 * np.sign(T - mn)
3017 elif alternative == "less":
3018 d = -0.5
3019 else:
3020 d = 0.5
3022 # compute statistic and p-value using normal approximation
3023 z = (T - mn - d) / se
3024 if alternative == "two-sided":
3025 prob = 2. * distributions.norm.sf(abs(z))
3026 elif alternative == "greater":
3027 # large T = r_plus indicates x is greater than y; i.e.
3028 # accept alternative in that case and return small p-value (sf)
3029 prob = distributions.norm.sf(z)
3030 else:
3031 prob = distributions.norm.cdf(z)
3032 elif mode == "exact":
3033 # get frequencies cnt of the possible positive ranksums r_plus
3034 cnt = _get_wilcoxon_distr(count)
3035 # note: r_plus is int (ties not allowed), need int for slices below
3036 r_plus = int(r_plus)
3037 if alternative == "two-sided":
3038 if r_plus == (len(cnt) - 1) // 2:
3039 # r_plus is the center of the distribution.
3040 prob = 1.0
3041 else:
3042 p_less = np.sum(cnt[:r_plus + 1]) / 2**count
3043 p_greater = np.sum(cnt[r_plus:]) / 2**count
3044 prob = 2*min(p_greater, p_less)
3045 elif alternative == "greater":
3046 prob = np.sum(cnt[r_plus:]) / 2**count
3047 else:
3048 prob = np.sum(cnt[:r_plus + 1]) / 2**count
3050 return WilcoxonResult(T, prob)
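# Illustrative sketch (not part of the original source): `_example_wilcoxon_modes`
# is a hypothetical helper. With 20 non-zero, untied differences the exact
# distribution is available; the normal approximation should give a similar,
# but not identical, p-value.
def _example_wilcoxon_modes():
    np.random.seed(5)
    d = np.random.normal(loc=0.3, size=20)
    print(wilcoxon(d, mode='exact'))
    print(wilcoxon(d, mode='approx'))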
3053def median_test(*args, **kwds):
3054 """
3055 Perform a Mood's median test.
3057 Test that two or more samples come from populations with the same median.
3059 Let ``n = len(args)`` be the number of samples. The "grand median" of
3060 all the data is computed, and a contingency table is formed by
3061 classifying the values in each sample as being above or below the grand
3062 median. The contingency table, along with `correction` and `lambda_`,
3063 are passed to `scipy.stats.chi2_contingency` to compute the test statistic
3064 and p-value.
3066 Parameters
3067 ----------
3068 sample1, sample2, ... : array_like
3069 The set of samples. There must be at least two samples.
3070 Each sample must be a one-dimensional sequence containing at least
3071 one value. The samples are not required to have the same length.
3072 ties : str, optional
3073 Determines how values equal to the grand median are classified in
3074 the contingency table. The string must be one of::
3076 "below":
3077 Values equal to the grand median are counted as "below".
3078 "above":
3079 Values equal to the grand median are counted as "above".
3080 "ignore":
3081 Values equal to the grand median are not counted.
3083 The default is "below".
3084 correction : bool, optional
3085 If True, *and* there are just two samples, apply Yates' correction
3086 for continuity when computing the test statistic associated with
3087 the contingency table. Default is True.
3088 lambda_ : float or str, optional
3089 By default, the statistic computed in this test is Pearson's
3090 chi-squared statistic. `lambda_` allows a statistic from the
3091 Cressie-Read power divergence family to be used instead. See
3092 `power_divergence` for details.
3093 Default is 1 (Pearson's chi-squared statistic).
3094 nan_policy : {'propagate', 'raise', 'omit'}, optional
3095 Defines how to handle when input contains nan. 'propagate' returns nan,
3096 'raise' throws an error, 'omit' performs the calculations ignoring nan
3097 values. Default is 'propagate'.
3099 Returns
3100 -------
3101 stat : float
3102 The test statistic. The statistic that is returned is determined by
3103 `lambda_`. The default is Pearson's chi-squared statistic.
3104 p : float
3105 The p-value of the test.
3106 m : float
3107 The grand median.
3108 table : ndarray
3109 The contingency table. The shape of the table is (2, n), where
3110 n is the number of samples. The first row holds the counts of the
3111 values above the grand median, and the second row holds the counts
3112 of the values below the grand median. The table allows further
3113 analysis with, for example, `scipy.stats.chi2_contingency`, or with
3114 `scipy.stats.fisher_exact` if there are two samples, without having
3115 to recompute the table. If ``nan_policy`` is "propagate" and there
3116 are nans in the input, the return value for ``table`` is ``None``.
3118 See Also
3119 --------
3120 kruskal : Compute the Kruskal-Wallis H-test for independent samples.
3121 mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y.
3123 Notes
3124 -----
3125 .. versionadded:: 0.15.0
3127 References
3128 ----------
3129 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
3130 (1950), pp. 394-399.
3131 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
3132 See Sections 8.12 and 10.15.
3134 Examples
3135 --------
3136 A biologist runs an experiment in which there are three groups of plants.
3137 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
3138 Each plant produces a number of seeds. The seed counts for each group
3139 are::
3141 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
3142 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
3143 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
3145 The following code applies Mood's median test to these samples.
3147 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
3148 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
3149 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
3150 >>> from scipy.stats import median_test
3151 >>> stat, p, med, tbl = median_test(g1, g2, g3)
3153 The median is
3155 >>> med
3156 34.0
3158 and the contingency table is
3160 >>> tbl
3161 array([[ 5, 10, 7],
3162 [11, 5, 10]])
3164 `p` is too large to conclude that the medians are not the same:
3166 >>> p
3167 0.12609082774093244
3169 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
3170 `median_test`.
3172 >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood")
3173 >>> p
3174 0.12224779737117837
3176 The median occurs several times in the data, so we'll get a different
3177 result if, for example, ``ties="above"`` is used:
3179 >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above")
3180 >>> p
3181 0.063873276069553273
3183 >>> tbl
3184 array([[ 5, 11, 9],
3185 [11, 4, 8]])
3187 This example demonstrates that if the data set is not large and there
3188 are values equal to the median, the p-value can be sensitive to the
3189 choice of `ties`.
3191 """
3192 ties = kwds.pop('ties', 'below')
3193 correction = kwds.pop('correction', True)
3194 lambda_ = kwds.pop('lambda_', None)
3195 nan_policy = kwds.pop('nan_policy', 'propagate')
3197 if len(kwds) > 0:
3198 bad_kwd = list(kwds.keys())[0]
3199 raise TypeError("median_test() got an unexpected keyword "
3200 "argument %r" % bad_kwd)
3202 if len(args) < 2:
3203 raise ValueError('median_test requires two or more samples.')
3205 ties_options = ['below', 'above', 'ignore']
3206 if ties not in ties_options:
3207 raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
3208 "of: %s" % (ties, str(ties_options)[1:-1]))
3210 data = [np.asarray(arg) for arg in args]
3212 # Validate the sizes and shapes of the arguments.
3213 for k, d in enumerate(data):
3214 if d.size == 0:
3215 raise ValueError("Sample %d is empty. All samples must "
3216 "contain at least one value." % (k + 1))
3217 if d.ndim != 1:
3218 raise ValueError("Sample %d has %d dimensions. All "
3219 "samples must be one-dimensional sequences." %
3220 (k + 1, d.ndim))
3222 cdata = np.concatenate(data)
3223 contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
3224 if contains_nan and nan_policy == 'propagate':
3225 return np.nan, np.nan, np.nan, None
3227 if contains_nan:
3228 grand_median = np.median(cdata[~np.isnan(cdata)])
3229 else:
3230 grand_median = np.median(cdata)
3231 # When the minimum version of numpy supported by scipy is 1.9.0,
3232 # the above if/else statement can be replaced by the single line:
3233 # grand_median = np.nanmedian(cdata)
3235 # Create the contingency table.
3236 table = np.zeros((2, len(data)), dtype=np.int64)
3237 for k, sample in enumerate(data):
3238 sample = sample[~np.isnan(sample)]
3240 nabove = count_nonzero(sample > grand_median)
3241 nbelow = count_nonzero(sample < grand_median)
3242 nequal = sample.size - (nabove + nbelow)
3243 table[0, k] += nabove
3244 table[1, k] += nbelow
3245 if ties == "below":
3246 table[1, k] += nequal
3247 elif ties == "above":
3248 table[0, k] += nequal
3250 # Check that no row or column of the table is all zero.
3251 # Such a table can not be given to chi2_contingency, because it would have
3252 # a zero in the table of expected frequencies.
3253 rowsums = table.sum(axis=1)
3254 if rowsums[0] == 0:
3255 raise ValueError("All values are below the grand median (%r)." %
3256 grand_median)
3257 if rowsums[1] == 0:
3258 raise ValueError("All values are above the grand median (%r)." %
3259 grand_median)
3260 if ties == "ignore":
3261 # We already checked that each sample has at least one value, but it
3262 # is possible that all those values equal the grand median. If `ties`
3263 # is "ignore", that would result in a column of zeros in `table`. We
3264 # check for that case here.
3265 zero_cols = np.nonzero((table == 0).all(axis=0))[0]
3266 if len(zero_cols) > 0:
3267 msg = ("All values in sample %d are equal to the grand "
3268 "median (%r), so they are ignored, resulting in an "
3269 "empty sample." % (zero_cols[0] + 1, grand_median))
3270 raise ValueError(msg)
3272 stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
3273 correction=correction)
3274 return stat, p, grand_median, table
3277def _circfuncs_common(samples, high, low, nan_policy='propagate'):
3278 # Ensure samples are array-like and size is not zero
3279 samples = np.asarray(samples)
3280 if samples.size == 0:
3281 return np.nan, np.asarray(np.nan), np.asarray(np.nan), None
3283 # Recast samples as radians that range between 0 and 2 pi and calculate
3284 # the sine and cosine
3285 sin_samp = sin((samples - low)*2.*pi / (high - low))
3286 cos_samp = cos((samples - low)*2.*pi / (high - low))
3288 # Apply the NaN policy
3289 contains_nan, nan_policy = _contains_nan(samples, nan_policy)
3290 if contains_nan and nan_policy == 'omit':
3291 mask = np.isnan(samples)
3292 # Set the sines and cosines that are NaN to zero
3293 sin_samp[mask] = 0.0
3294 cos_samp[mask] = 0.0
3295 else:
3296 mask = None
3298 return samples, sin_samp, cos_samp, mask
3301def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3302 """
3303 Compute the circular mean for samples in a range.
3305 Parameters
3306 ----------
3307 samples : array_like
3308 Input array.
3309 high : float or int, optional
3310 High boundary for circular mean range. Default is ``2*pi``.
3311 low : float or int, optional
3312 Low boundary for circular mean range. Default is 0.
3313 axis : int, optional
3314 Axis along which means are computed. The default is to compute
3315 the mean of the flattened array.
3316 nan_policy : {'propagate', 'raise', 'omit'}, optional
3317 Defines how to handle when input contains nan. 'propagate' returns nan,
3318 'raise' throws an error, 'omit' performs the calculations ignoring nan
3319 values. Default is 'propagate'.
3321 Returns
3322 -------
3323 circmean : float
3324 Circular mean.
3326 Examples
3327 --------
3328 >>> from scipy.stats import circmean
3329 >>> circmean([0.1, 2*np.pi+0.2, 6*np.pi+0.3])
3330 0.2
3332 >>> from scipy.stats import circmean
3333 >>> circmean([0.2, 1.4, 2.6], high = 1, low = 0)
3334 0.4
3336 """
3337 samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low,
3338 nan_policy=nan_policy)
3339 sin_sum = sin_samp.sum(axis=axis)
3340 cos_sum = cos_samp.sum(axis=axis)
3341 res = arctan2(sin_sum, cos_sum)
3343 mask_nan = ~np.isnan(res)
3344 if mask_nan.ndim > 0:
3345 mask = res[mask_nan] < 0
3346 else:
3347 mask = res < 0
3349 if mask.ndim > 0:
3350 mask_nan[mask_nan] = mask
3351 res[mask_nan] += 2*pi
3352 elif mask:
3353 res += 2*pi
3355 # Set output to NaN if no samples went into the mean
3356 if nmask is not None:
3357 if nmask.all():
3358 res = np.full(shape=res.shape, fill_value=np.nan)
3359 else:
3360 # Find out if any of the axes being averaged consist
3361 # entirely of NaN. If one exists, set the result (res) to NaN
3362 nshape = 0 if axis is None else axis
3363 smask = nmask.shape[nshape] == nmask.sum(axis=axis)
3364 if smask.any():
3365 res[smask] = np.nan
3367 return res*(high - low)/2.0/pi + low
3370def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3371 """
3372 Compute the circular variance for samples assumed to be in a range.
3374 Parameters
3375 ----------
3376 samples : array_like
3377 Input array.
3378 high : float or int, optional
3379 High boundary for circular variance range. Default is ``2*pi``.
3380 low : float or int, optional
3381 Low boundary for circular variance range. Default is 0.
3382 axis : int, optional
3383 Axis along which variances are computed. The default is to compute
3384 the variance of the flattened array.
3385 nan_policy : {'propagate', 'raise', 'omit'}, optional
3386 Defines how to handle when input contains nan. 'propagate' returns nan,
3387 'raise' throws an error, 'omit' performs the calculations ignoring nan
3388 values. Default is 'propagate'.
3390 Returns
3391 -------
3392 circvar : float
3393 Circular variance.
3395 Notes
3396 -----
3397 This uses a definition of circular variance that in the limit of small
3398 angles returns a number close to the 'linear' variance.
3400 Examples
3401 --------
3402 >>> from scipy.stats import circvar
3403 >>> circvar([0, 2*np.pi/3, 5*np.pi/3])
3404 2.19722457734
3406 """
3407 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3408 nan_policy=nan_policy)
3409 if mask is None:
3410 sin_mean = sin_samp.mean(axis=axis)
3411 cos_mean = cos_samp.mean(axis=axis)
3412 else:
3413 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3414 nsum[nsum == 0] = np.nan
3415 sin_mean = sin_samp.sum(axis=axis) / nsum
3416 cos_mean = cos_samp.sum(axis=axis) / nsum
3417 R = hypot(sin_mean, cos_mean)
3419 return ((high - low)/2.0/pi)**2 * 2 * log(1/R)
3422def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3423 """
3424 Compute the circular standard deviation for samples assumed to be in the
3425 range [low to high].
3427 Parameters
3428 ----------
3429 samples : array_like
3430 Input array.
3431 high : float or int, optional
3432 High boundary for circular standard deviation range.
3433 Default is ``2*pi``.
3434 low : float or int, optional
3435 Low boundary for circular standard deviation range. Default is 0.
3436 axis : int, optional
3437 Axis along which standard deviations are computed. The default is
3438 to compute the standard deviation of the flattened array.
3439 nan_policy : {'propagate', 'raise', 'omit'}, optional
3440 Defines how to handle when input contains nan. 'propagate' returns nan,
3441 'raise' throws an error, 'omit' performs the calculations ignoring nan
3442 values. Default is 'propagate'.
3444 Returns
3445 -------
3446 circstd : float
3447 Circular standard deviation.
3449 Notes
3450 -----
3451 This uses a definition of circular standard deviation that in the limit of
3452 small angles returns a number close to the 'linear' standard deviation.
3454 Examples
3455 --------
3456 >>> from scipy.stats import circstd
3457 >>> circstd([0, 0.1*np.pi/2, 0.001*np.pi, 0.03*np.pi/2])
3458 0.063564063306
3460 """
3461 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3462 nan_policy=nan_policy)
3463 if mask is None:
3464 sin_mean = sin_samp.mean(axis=axis)
3465 cos_mean = cos_samp.mean(axis=axis)
3466 else:
3467 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3468 nsum[nsum == 0] = np.nan
3469 sin_mean = sin_samp.sum(axis=axis) / nsum
3470 cos_mean = cos_samp.sum(axis=axis) / nsum
3471 R = hypot(sin_mean, cos_mean)
3473 return ((high - low)/2.0/pi) * sqrt(-2*log(R))
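# Illustrative cross-check (not part of the original source): `_example_circular`
# is a hypothetical helper. With the definitions used above, the squared
# circular standard deviation equals the circular variance, since both reduce
# to -2*log(R) times the same scale factor.
def _example_circular():
    angles = np.array([0.1, 0.25, 6.2, 0.05])
    assert np.allclose(circstd(angles) ** 2, circvar(angles))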