Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/stats.py : 9%

1# Copyright 2002 Gary Strangman. All rights reserved
2# Copyright 2002-2016 The SciPy Developers
3#
4# The original code from Gary Strangman was heavily adapted for
5# use in SciPy by Travis Oliphant. The original code came with the
6# following disclaimer:
7#
8# This software is provided "as-is". There are no expressed or implied
9# warranties of any kind, including, but not limited to, the warranties
10# of merchantability and fitness for a given application. In no event
11# shall Gary Strangman be liable for any direct, indirect, incidental,
12# special, exemplary or consequential damages (including, but not limited
13# to, loss of use, data or profits, or business interruption) however
14# caused and on any theory of liability, whether in contract, strict
15# liability or tort (including negligence or otherwise) arising in any way
16# out of the use of this software, even if advised of the possibility of
17# such damage.
19"""
20A collection of basic statistical functions for Python. The function
21names appear below.
23 Some scalar functions defined here are also available in the scipy.special
24 package where they work on arbitrarily sized arrays.
26Disclaimers: The function list is obviously incomplete and, worse, the
27functions are not optimized. All functions have been tested (some more
28so than others), but they are far from bulletproof. Thus, as with any
29free software, no warranty or guarantee is expressed or implied. :-) A
30few extra functions that don't appear in the list below can be found by
31interested treasure-hunters. These functions don't necessarily have
32both list and array versions but were deemed useful.
34Central Tendency
35----------------
36.. autosummary::
37 :toctree: generated/
39 gmean
40 hmean
41 mode
43Moments
44-------
45.. autosummary::
46 :toctree: generated/
48 moment
49 variation
50 skew
51 kurtosis
52 normaltest
54Altered Versions
55----------------
56.. autosummary::
57 :toctree: generated/
59 tmean
60 tvar
61 tstd
62 tsem
63 describe
65Frequency Stats
66---------------
67.. autosummary::
68 :toctree: generated/
70 itemfreq
71 scoreatpercentile
72 percentileofscore
73 cumfreq
74 relfreq
76Variability
77-----------
78.. autosummary::
79 :toctree: generated/
81 obrientransform
82 sem
83 zmap
84 zscore
85 gstd
86 iqr
87 median_abs_deviation
89Trimming Functions
90------------------
91.. autosummary::
92 :toctree: generated/
94 trimboth
95 trim1
97Correlation Functions
98---------------------
99.. autosummary::
100 :toctree: generated/
102 pearsonr
103 fisher_exact
104 spearmanr
105 pointbiserialr
106 kendalltau
107 weightedtau
108 linregress
109 theilslopes
110 multiscale_graphcorr
112Inferential Stats
113-----------------
114.. autosummary::
115 :toctree: generated/
117 ttest_1samp
118 ttest_ind
119 ttest_ind_from_stats
120 ttest_rel
121 chisquare
122 power_divergence
123 kstest
124 ks_1samp
125 ks_2samp
126 epps_singleton_2samp
127 mannwhitneyu
128 ranksums
129 wilcoxon
130 kruskal
131 friedmanchisquare
132 brunnermunzel
133 combine_pvalues
135Statistical Distances
136---------------------
137.. autosummary::
138 :toctree: generated/
140 wasserstein_distance
141 energy_distance
143ANOVA Functions
144---------------
145.. autosummary::
146 :toctree: generated/
148 f_oneway
150Support Functions
151-----------------
152.. autosummary::
153 :toctree: generated/
155 rankdata
156 rvs_ratio_uniforms
158References
159----------
160.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
161 Probability and Statistics Tables and Formulae. Chapman & Hall: New
162 York. 2000.
164"""
166import warnings
167import math
168from math import gcd
169from collections import namedtuple
171import numpy as np
172from numpy import array, asarray, ma
174from scipy.spatial.distance import cdist
175from scipy.ndimage import measurements
176from scipy._lib._util import (_lazywhere, check_random_state, MapWrapper,
177 rng_integers)
178import scipy.special as special
179from scipy import linalg
180from . import distributions
181from . import mstats_basic
182from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,
183 siegelslopes)
184from ._stats import (_kendall_dis, _toint64, _weightedrankedtau,
185 _local_correlations)
186from ._rvs_sampling import rvs_ratio_uniforms
187from ._hypotests import epps_singleton_2samp
190__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', 'tmean', 'tvar',
191 'tmin', 'tmax', 'tstd', 'tsem', 'moment', 'variation',
192 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
193 'normaltest', 'jarque_bera', 'itemfreq',
194 'scoreatpercentile', 'percentileofscore',
195 'cumfreq', 'relfreq', 'obrientransform',
196 'sem', 'zmap', 'zscore', 'iqr', 'gstd', 'median_absolute_deviation',
197 'median_abs_deviation',
198 'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
199 'f_oneway', 'F_onewayConstantInputWarning',
200 'F_onewayBadInputSizesWarning',
201 'PearsonRConstantInputWarning', 'PearsonRNearConstantInputWarning',
202 'pearsonr', 'fisher_exact', 'SpearmanRConstantInputWarning',
203 'spearmanr', 'pointbiserialr',
204 'kendalltau', 'weightedtau', 'multiscale_graphcorr',
205 'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
206 'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
207 'kstest', 'ks_1samp', 'ks_2samp',
208 'chisquare', 'power_divergence', 'mannwhitneyu',
209 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
210 'rankdata', 'rvs_ratio_uniforms',
211 'combine_pvalues', 'wasserstein_distance', 'energy_distance',
212 'brunnermunzel', 'epps_singleton_2samp']
215def _contains_nan(a, nan_policy='propagate'):
216 policies = ['propagate', 'raise', 'omit']
217 if nan_policy not in policies:
218 raise ValueError("nan_policy must be one of {%s}" %
219 ', '.join("'%s'" % s for s in policies))
220 try:
221 # Calling np.sum avoids creating a huge boolean array in memory,
222 # e.g. as np.isnan(a).any() would.
223 with np.errstate(invalid='ignore'):
224 contains_nan = np.isnan(np.sum(a))
225 except TypeError:
226 # This can happen when attempting to sum things which are not
227 # numbers (e.g. as in the function `mode`). Try an alternative method:
228 try:
229 contains_nan = np.nan in set(a.ravel())
230 except TypeError:
231 # Don't know what to do. Fall back to omitting nan values and
232 # issue a warning.
233 contains_nan = False
234 nan_policy = 'omit'
235 warnings.warn("The input array could not be properly checked for nan "
236 "values. nan values will be ignored.", RuntimeWarning)
238 if contains_nan and nan_policy == 'raise':
239 raise ValueError("The input contains nan values")
241 return contains_nan, nan_policy
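# Illustrative sketch (not part of the original module): the try-block above
# relies on np.sum propagating nan, so a single reduction replaces a full
# np.isnan(a).any() scan that would allocate a large temporary boolean array.
#
#     >>> import numpy as np
#     >>> np.isnan(np.sum(np.array([1.0, np.nan, 3.0])))
#     True
#     >>> np.isnan(np.sum(np.array([1.0, 2.0, 3.0])))
#     False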
244def _chk_asarray(a, axis):
245 if axis is None:
246 a = np.ravel(a)
247 outaxis = 0
248 else:
249 a = np.asarray(a)
250 outaxis = axis
252 if a.ndim == 0:
253 a = np.atleast_1d(a)
255 return a, outaxis
258def _chk2_asarray(a, b, axis):
259 if axis is None:
260 a = np.ravel(a)
261 b = np.ravel(b)
262 outaxis = 0
263 else:
264 a = np.asarray(a)
265 b = np.asarray(b)
266 outaxis = axis
268 if a.ndim == 0:
269 a = np.atleast_1d(a)
270 if b.ndim == 0:
271 b = np.atleast_1d(b)
273 return a, b, outaxis
276def _shape_with_dropped_axis(a, axis):
277 """
278 Given an array `a` and an integer `axis`, return the shape
279 of `a` with the `axis` dimension removed.
281 Examples
282 --------
283 >>> a = np.zeros((3, 5, 2))
284 >>> _shape_with_dropped_axis(a, 1)
285 (3, 2)
286 """
287 shp = list(a.shape)
288 try:
289 del shp[axis]
290 except IndexError:
291 raise np.AxisError(axis, a.ndim) from None
292 return tuple(shp)
295def _broadcast_shapes(shape1, shape2):
296 """
297 Given two shapes (i.e. tuples of integers), return the shape
298 that would result from broadcasting two arrays with the given
299 shapes.
301 Examples
302 --------
303 >>> _broadcast_shapes((2, 1), (4, 1, 3))
304 (4, 2, 3)
305 """
306 d = len(shape1) - len(shape2)
307 if d <= 0:
308 shp1 = (1,)*(-d) + shape1
309 shp2 = shape2
310 elif d > 0:
311 shp1 = shape1
312 shp2 = (1,)*d + shape2
313 shape = []
314 for n1, n2 in zip(shp1, shp2):
315 if n1 == 1:
316 n = n2
317 elif n2 == 1 or n1 == n2:
318 n = n1
319 else:
320 raise ValueError(f'shapes {shape1} and {shape2} could not be '
321 'broadcast together')
322 shape.append(n)
323 return tuple(shape)
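# Illustrative sketch (not part of the original module): the loop above applies
# the usual broadcasting rule dimension by dimension after left-padding the
# shorter shape with ones.
#
#     >>> _broadcast_shapes((2, 3, 1), (3, 5))
#     (2, 3, 5)
#     >>> _broadcast_shapes((3,), (4,))
#     Traceback (most recent call last):
#         ...
#     ValueError: shapes (3,) and (4,) could not be broadcast together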
326def _broadcast_shapes_with_dropped_axis(a, b, axis):
327 """
328 Given two arrays `a` and `b` and an integer `axis`, find the
329 shape of the broadcast result after dropping `axis` from the
330 shapes of `a` and `b`.
332 Examples
333 --------
334 >>> a = np.zeros((5, 2, 1))
335 >>> b = np.zeros((1, 9, 3))
336 >>> _broadcast_shapes_with_dropped_axis(a, b, 1)
337 (5, 3)
338 """
339 shp1 = _shape_with_dropped_axis(a, axis)
340 shp2 = _shape_with_dropped_axis(b, axis)
341 try:
342 shp = _broadcast_shapes(shp1, shp2)
343 except ValueError:
344 raise ValueError(f'non-axis shapes {shp1} and {shp2} could not be '
345 'broadcast together') from None
346 return shp
349def gmean(a, axis=0, dtype=None):
350 """
351 Compute the geometric mean along the specified axis.
353 Return the geometric average of the array elements.
354 That is: n-th root of (x1 * x2 * ... * xn)
356 Parameters
357 ----------
358 a : array_like
359 Input array or object that can be converted to an array.
360 axis : int or None, optional
361 Axis along which the geometric mean is computed. Default is 0.
362 If None, compute over the whole array `a`.
363 dtype : dtype, optional
364 Type of the returned array and of the accumulator in which the
365 elements are summed. If dtype is not specified, it defaults to the
366 dtype of a, unless a has an integer dtype with a precision less than
367 that of the default platform integer. In that case, the default
368 platform integer is used.
370 Returns
371 -------
372 gmean : ndarray
373 See `dtype` parameter above.
375 See Also
376 --------
377 numpy.mean : Arithmetic average
378 numpy.average : Weighted average
379 hmean : Harmonic mean
381 Notes
382 -----
383 The geometric average is computed over a single dimension of the input
384 array, axis=0 by default, or all values in the array if axis=None.
385 float64 intermediate and return values are used for integer inputs.
387 Use masked arrays to ignore any non-finite values in the input or any that
388 arise in the calculations (such as Not a Number and infinity); masked
389 arrays automatically mask any non-finite values.
391 Examples
392 --------
393 >>> from scipy.stats import gmean
394 >>> gmean([1, 4])
395 2.0
396 >>> gmean([1, 2, 3, 4, 5, 6, 7])
397 3.3800151591412964
399 """
400 if not isinstance(a, np.ndarray):
401 # if not an ndarray object attempt to convert it
402 log_a = np.log(np.array(a, dtype=dtype))
403 elif dtype:
404 # Must change the default dtype allowing array type
405 if isinstance(a, np.ma.MaskedArray):
406 log_a = np.log(np.ma.asarray(a, dtype=dtype))
407 else:
408 log_a = np.log(np.asarray(a, dtype=dtype))
409 else:
410 log_a = np.log(a)
411 return np.exp(log_a.mean(axis=axis))
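# Illustrative sketch (not part of the original module): the code above uses
# the identity gmean(x) = exp(mean(log(x))) rather than taking the n-th root
# of the product, which would overflow easily for long inputs.
#
#     >>> import numpy as np
#     >>> x = np.array([1.0, 4.0, 16.0])
#     >>> np.allclose(np.exp(np.log(x).mean()), 64.0 ** (1.0 / 3))
#     True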
414def hmean(a, axis=0, dtype=None):
415 """
416 Calculate the harmonic mean along the specified axis.
418 That is: n / (1/x1 + 1/x2 + ... + 1/xn)
420 Parameters
421 ----------
422 a : array_like
423 Input array, masked array or object that can be converted to an array.
424 axis : int or None, optional
425 Axis along which the harmonic mean is computed. Default is 0.
426 If None, compute over the whole array `a`.
427 dtype : dtype, optional
428 Type of the returned array and of the accumulator in which the
429 elements are summed. If `dtype` is not specified, it defaults to the
430 dtype of `a`, unless `a` has an integer `dtype` with a precision less
431 than that of the default platform integer. In that case, the default
432 platform integer is used.
434 Returns
435 -------
436 hmean : ndarray
437 See `dtype` parameter above.
439 See Also
440 --------
441 numpy.mean : Arithmetic average
442 numpy.average : Weighted average
443 gmean : Geometric mean
445 Notes
446 -----
447 The harmonic mean is computed over a single dimension of the input
448 array, axis=0 by default, or all values in the array if axis=None.
449 float64 intermediate and return values are used for integer inputs.
451 Use masked arrays to ignore any non-finite values in the input or that
452 arise in the calculations such as Not a Number and infinity.
454 Examples
455 --------
456 >>> from scipy.stats import hmean
457 >>> hmean([1, 4])
458 1.6000000000000001
459 >>> hmean([1, 2, 3, 4, 5, 6, 7])
460 2.6997245179063363
462 """
463 if not isinstance(a, np.ndarray):
464 a = np.array(a, dtype=dtype)
465 if np.all(a >= 0):
466 # Harmonic mean only defined if all elements are greater than or equal to zero.
467 if isinstance(a, np.ma.MaskedArray):
468 size = a.count(axis)
469 else:
470 if axis is None:
471 a = a.ravel()
472 size = a.shape[0]
473 else:
474 size = a.shape[axis]
475 with np.errstate(divide='ignore'):
476 return size / np.sum(1.0 / a, axis=axis, dtype=dtype)
477 else:
478 raise ValueError("Harmonic mean only defined if all elements greater "
479 "than or equal to zero")
482ModeResult = namedtuple('ModeResult', ('mode', 'count'))
485def mode(a, axis=0, nan_policy='propagate'):
486 """
487 Return an array of the modal (most common) value in the passed array.
489 If there is more than one such value, only the smallest is returned.
490 The bin-count for the modal bins is also returned.
492 Parameters
493 ----------
494 a : array_like
495 n-dimensional array of which to find mode(s).
496 axis : int or None, optional
497 Axis along which to operate. Default is 0. If None, compute over
498 the whole array `a`.
499 nan_policy : {'propagate', 'raise', 'omit'}, optional
500 Defines how to handle when input contains nan.
501 The following options are available (default is 'propagate'):
503 * 'propagate': returns nan
504 * 'raise': throws an error
505 * 'omit': performs the calculations ignoring nan values
507 Returns
508 -------
509 mode : ndarray
510 Array of modal values.
511 count : ndarray
512 Array of counts for each mode.
514 Examples
515 --------
516 >>> a = np.array([[6, 8, 3, 0],
517 ... [3, 2, 1, 7],
518 ... [8, 1, 8, 4],
519 ... [5, 3, 0, 5],
520 ... [4, 7, 5, 9]])
521 >>> from scipy import stats
522 >>> stats.mode(a)
523 ModeResult(mode=array([[3, 1, 0, 0]]), count=array([[1, 1, 1, 1]]))
525 To get mode of whole array, specify ``axis=None``:
527 >>> stats.mode(a, axis=None)
528 ModeResult(mode=array([3]), count=array([3]))
530 """
531 a, axis = _chk_asarray(a, axis)
532 if a.size == 0:
533 return ModeResult(np.array([]), np.array([]))
535 contains_nan, nan_policy = _contains_nan(a, nan_policy)
537 if contains_nan and nan_policy == 'omit':
538 a = ma.masked_invalid(a)
539 return mstats_basic.mode(a, axis)
541 if a.dtype == object and np.nan in set(a.ravel()):
542 # Fall back to a slower method since np.unique does not work with NaN
543 scores = set(np.ravel(a)) # get ALL unique values
544 testshape = list(a.shape)
545 testshape[axis] = 1
546 oldmostfreq = np.zeros(testshape, dtype=a.dtype)
547 oldcounts = np.zeros(testshape, dtype=int)
549 for score in scores:
550 template = (a == score)
551 counts = np.expand_dims(np.sum(template, axis), axis)
552 mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
553 oldcounts = np.maximum(counts, oldcounts)
554 oldmostfreq = mostfrequent
556 return ModeResult(mostfrequent, oldcounts)
558 def _mode1D(a):
559 vals, cnts = np.unique(a, return_counts=True)
560 return vals[cnts.argmax()], cnts.max()
562 # np.apply_along_axis would convert the _mode1D tuples to a numpy array,
563 # casting types in the process; this loop recreates the result without that issue.
564 # View of a, rotated so the requested axis is last
565 in_dims = list(range(a.ndim))
566 a_view = np.transpose(a, in_dims[:axis] + in_dims[axis+1:] + [axis])
568 inds = np.ndindex(a_view.shape[:-1])
569 modes = np.empty(a_view.shape[:-1], dtype=a.dtype)
570 counts = np.zeros(a_view.shape[:-1], dtype=np.int)
571 for ind in inds:
572 modes[ind], counts[ind] = _mode1D(a_view[ind])
573 newshape = list(a.shape)
574 newshape[axis] = 1
575 return ModeResult(modes.reshape(newshape), counts.reshape(newshape))
578def _mask_to_limits(a, limits, inclusive):
579 """Mask an array for values outside of given limits.
581 This is primarily a utility function.
583 Parameters
584 ----------
585 a : array
586 limits : (float or None, float or None)
587 A tuple consisting of the (lower limit, upper limit). Values in the
588 input array less than the lower limit or greater than the upper limit
589 will be masked out. None implies no limit.
590 inclusive : (bool, bool)
591 A tuple consisting of the (lower flag, upper flag). These flags
592 determine whether values exactly equal to lower or upper are allowed.
594 Returns
595 -------
596 A MaskedArray.
598 Raises
599 ------
600 A ValueError if there are no values within the given limits.
602 """
603 lower_limit, upper_limit = limits
604 lower_include, upper_include = inclusive
605 am = ma.MaskedArray(a)
606 if lower_limit is not None:
607 if lower_include:
608 am = ma.masked_less(am, lower_limit)
609 else:
610 am = ma.masked_less_equal(am, lower_limit)
612 if upper_limit is not None:
613 if upper_include:
614 am = ma.masked_greater(am, upper_limit)
615 else:
616 am = ma.masked_greater_equal(am, upper_limit)
618 if am.count() == 0:
619 raise ValueError("No array values within given limits")
621 return am
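# Illustrative sketch (not part of the original module): with an exclusive
# lower bound and an inclusive upper bound, values equal to the lower limit
# are masked out while values equal to the upper limit are kept.
#
#     >>> import numpy as np
#     >>> _mask_to_limits(np.arange(5), (1, 3), (False, True)).compressed()
#     array([2, 3])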
624def tmean(a, limits=None, inclusive=(True, True), axis=None):
625 """
626 Compute the trimmed mean.
628 This function finds the arithmetic mean of given values, ignoring values
629 outside the given `limits`.
631 Parameters
632 ----------
633 a : array_like
634 Array of values.
635 limits : None or (lower limit, upper limit), optional
636 Values in the input array less than the lower limit or greater than the
637 upper limit will be ignored. When limits is None (default), then all
638 values are used. Either of the limit values in the tuple can also be
639 None representing a half-open interval.
640 inclusive : (bool, bool), optional
641 A tuple consisting of the (lower flag, upper flag). These flags
642 determine whether values exactly equal to the lower or upper limits
643 are included. The default value is (True, True).
644 axis : int or None, optional
645 Axis along which to compute test. Default is None.
647 Returns
648 -------
649 tmean : float
650 Trimmed mean.
652 See Also
653 --------
654 trim_mean : Returns mean after trimming a proportion from both tails.
656 Examples
657 --------
658 >>> from scipy import stats
659 >>> x = np.arange(20)
660 >>> stats.tmean(x)
661 9.5
662 >>> stats.tmean(x, (3,17))
663 10.0
665 """
666 a = asarray(a)
667 if limits is None:
668 return np.mean(a, None)
670 am = _mask_to_limits(a.ravel(), limits, inclusive)
671 return am.mean(axis=axis)
674def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
675 """
676 Compute the trimmed variance.
678 This function computes the sample variance of an array of values,
679 while ignoring values which are outside of given `limits`.
681 Parameters
682 ----------
683 a : array_like
684 Array of values.
685 limits : None or (lower limit, upper limit), optional
686 Values in the input array less than the lower limit or greater than the
687 upper limit will be ignored. When limits is None, then all values are
688 used. Either of the limit values in the tuple can also be None
689 representing a half-open interval. The default value is None.
690 inclusive : (bool, bool), optional
691 A tuple consisting of the (lower flag, upper flag). These flags
692 determine whether values exactly equal to the lower or upper limits
693 are included. The default value is (True, True).
694 axis : int or None, optional
695 Axis along which to operate. Default is 0. If None, compute over the
696 whole array `a`.
697 ddof : int, optional
698 Delta degrees of freedom. Default is 1.
700 Returns
701 -------
702 tvar : float
703 Trimmed variance.
705 Notes
706 -----
707 `tvar` computes the unbiased sample variance, i.e. it uses a correction
708 factor ``n / (n - 1)``.
710 Examples
711 --------
712 >>> from scipy import stats
713 >>> x = np.arange(20)
714 >>> stats.tvar(x)
715 35.0
716 >>> stats.tvar(x, (3,17))
717 20.0
719 """
720 a = asarray(a)
721 a = a.astype(float)
722 if limits is None:
723 return a.var(ddof=ddof, axis=axis)
724 am = _mask_to_limits(a, limits, inclusive)
725 amnan = am.filled(fill_value=np.nan)
726 return np.nanvar(amnan, ddof=ddof, axis=axis)
729def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
730 """
731 Compute the trimmed minimum.
733 This function finds the minimum value of an array `a` along the
734 specified axis, but only considering values greater than a specified
735 lower limit.
737 Parameters
738 ----------
739 a : array_like
740 Array of values.
741 lowerlimit : None or float, optional
742 Values in the input array less than the given limit will be ignored.
743 When lowerlimit is None, then all values are used. The default value
744 is None.
745 axis : int or None, optional
746 Axis along which to operate. Default is 0. If None, compute over the
747 whole array `a`.
748 inclusive : {True, False}, optional
749 This flag determines whether values exactly equal to the lower limit
750 are included. The default value is True.
751 nan_policy : {'propagate', 'raise', 'omit'}, optional
752 Defines how to handle when input contains nan.
753 The following options are available (default is 'propagate'):
755 * 'propagate': returns nan
756 * 'raise': throws an error
757 * 'omit': performs the calculations ignoring nan values
759 Returns
760 -------
761 tmin : float, int or ndarray
762 Trimmed minimum.
764 Examples
765 --------
766 >>> from scipy import stats
767 >>> x = np.arange(20)
768 >>> stats.tmin(x)
769 0
771 >>> stats.tmin(x, 13)
772 13
774 >>> stats.tmin(x, 13, inclusive=False)
775 14
777 """
778 a, axis = _chk_asarray(a, axis)
779 am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False))
781 contains_nan, nan_policy = _contains_nan(am, nan_policy)
783 if contains_nan and nan_policy == 'omit':
784 am = ma.masked_invalid(am)
786 res = ma.minimum.reduce(am, axis).data
787 if res.ndim == 0:
788 return res[()]
789 return res
792def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
793 """
794 Compute the trimmed maximum.
796 This function computes the maximum value of an array along a given axis,
797 while ignoring values larger than a specified upper limit.
799 Parameters
800 ----------
801 a : array_like
802 Array of values.
803 upperlimit : None or float, optional
804 Values in the input array greater than the given limit will be ignored.
805 When upperlimit is None, then all values are used. The default value
806 is None.
807 axis : int or None, optional
808 Axis along which to operate. Default is 0. If None, compute over the
809 whole array `a`.
810 inclusive : {True, False}, optional
811 This flag determines whether values exactly equal to the upper limit
812 are included. The default value is True.
813 nan_policy : {'propagate', 'raise', 'omit'}, optional
814 Defines how to handle when input contains nan.
815 The following options are available (default is 'propagate'):
817 * 'propagate': returns nan
818 * 'raise': throws an error
819 * 'omit': performs the calculations ignoring nan values
821 Returns
822 -------
823 tmax : float, int or ndarray
824 Trimmed maximum.
826 Examples
827 --------
828 >>> from scipy import stats
829 >>> x = np.arange(20)
830 >>> stats.tmax(x)
831 19
833 >>> stats.tmax(x, 13)
834 13
836 >>> stats.tmax(x, 13, inclusive=False)
837 12
839 """
840 a, axis = _chk_asarray(a, axis)
841 am = _mask_to_limits(a, (None, upperlimit), (False, inclusive))
843 contains_nan, nan_policy = _contains_nan(am, nan_policy)
845 if contains_nan and nan_policy == 'omit':
846 am = ma.masked_invalid(am)
848 res = ma.maximum.reduce(am, axis).data
849 if res.ndim == 0:
850 return res[()]
851 return res
854def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
855 """
856 Compute the trimmed sample standard deviation.
858 This function finds the sample standard deviation of given values,
859 ignoring values outside the given `limits`.
861 Parameters
862 ----------
863 a : array_like
864 Array of values.
865 limits : None or (lower limit, upper limit), optional
866 Values in the input array less than the lower limit or greater than the
867 upper limit will be ignored. When limits is None, then all values are
868 used. Either of the limit values in the tuple can also be None
869 representing a half-open interval. The default value is None.
870 inclusive : (bool, bool), optional
871 A tuple consisting of the (lower flag, upper flag). These flags
872 determine whether values exactly equal to the lower or upper limits
873 are included. The default value is (True, True).
874 axis : int or None, optional
875 Axis along which to operate. Default is 0. If None, compute over the
876 whole array `a`.
877 ddof : int, optional
878 Delta degrees of freedom. Default is 1.
880 Returns
881 -------
882 tstd : float
883 Trimmed sample standard deviation.
885 Notes
886 -----
887 `tstd` computes the unbiased sample standard deviation, i.e. it uses a
888 correction factor ``n / (n - 1)``.
890 Examples
891 --------
892 >>> from scipy import stats
893 >>> x = np.arange(20)
894 >>> stats.tstd(x)
895 5.9160797830996161
896 >>> stats.tstd(x, (3,17))
897 4.4721359549995796
899 """
900 return np.sqrt(tvar(a, limits, inclusive, axis, ddof))
903def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
904 """
905 Compute the trimmed standard error of the mean.
907 This function finds the standard error of the mean for given
908 values, ignoring values outside the given `limits`.
910 Parameters
911 ----------
912 a : array_like
913 Array of values.
914 limits : None or (lower limit, upper limit), optional
915 Values in the input array less than the lower limit or greater than the
916 upper limit will be ignored. When limits is None, then all values are
917 used. Either of the limit values in the tuple can also be None
918 representing a half-open interval. The default value is None.
919 inclusive : (bool, bool), optional
920 A tuple consisting of the (lower flag, upper flag). These flags
921 determine whether values exactly equal to the lower or upper limits
922 are included. The default value is (True, True).
923 axis : int or None, optional
924 Axis along which to operate. Default is 0. If None, compute over the
925 whole array `a`.
926 ddof : int, optional
927 Delta degrees of freedom. Default is 1.
929 Returns
930 -------
931 tsem : float
932 Trimmed standard error of the mean.
934 Notes
935 -----
936 `tsem` uses unbiased sample standard deviation, i.e. it uses a
937 correction factor ``n / (n - 1)``.
939 Examples
940 --------
941 >>> from scipy import stats
942 >>> x = np.arange(20)
943 >>> stats.tsem(x)
944 1.3228756555322954
945 >>> stats.tsem(x, (3,17))
946 1.1547005383792515
948 """
949 a = np.asarray(a).ravel()
950 if limits is None:
951 return a.std(ddof=ddof) / np.sqrt(a.size)
953 am = _mask_to_limits(a, limits, inclusive)
954 sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis))
955 return sd / np.sqrt(am.count())
958#####################################
959# MOMENTS #
960#####################################
962def moment(a, moment=1, axis=0, nan_policy='propagate'):
963 r"""
964 Calculate the nth moment about the mean for a sample.
966 A moment is a specific quantitative measure of the shape of a set of
967 points. It is often used to calculate coefficients of skewness and kurtosis
968 due to its close relationship with them.
970 Parameters
971 ----------
972 a : array_like
973 Input array.
974 moment : int or array_like of ints, optional
975 Order of central moment that is returned. Default is 1.
976 axis : int or None, optional
977 Axis along which the central moment is computed. Default is 0.
978 If None, compute over the whole array `a`.
979 nan_policy : {'propagate', 'raise', 'omit'}, optional
980 Defines how to handle when input contains nan.
981 The following options are available (default is 'propagate'):
983 * 'propagate': returns nan
984 * 'raise': throws an error
985 * 'omit': performs the calculations ignoring nan values
987 Returns
988 -------
989 n-th central moment : ndarray or float
990 The appropriate moment along the given axis or over all values if axis
991 is None. The denominator for the moment calculation is the number of
992 observations; no degrees of freedom correction is done.
994 See Also
995 --------
996 kurtosis, skew, describe
998 Notes
999 -----
1000 The k-th central moment of a data sample is:
1002 .. math::
1004 m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k
1006 Where n is the number of samples and x-bar is the mean. This function uses
1007 exponentiation by squares [1]_ for efficiency.
1009 References
1010 ----------
1011 .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
1013 Examples
1014 --------
1015 >>> from scipy.stats import moment
1016 >>> moment([1, 2, 3, 4, 5], moment=1)
1017 0.0
1018 >>> moment([1, 2, 3, 4, 5], moment=2)
1019 2.0
1021 """
1022 a, axis = _chk_asarray(a, axis)
1024 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1026 if contains_nan and nan_policy == 'omit':
1027 a = ma.masked_invalid(a)
1028 return mstats_basic.moment(a, moment, axis)
1030 if a.size == 0:
1031 # empty array, return nan(s) with shape matching `moment`
1032 if np.isscalar(moment):
1033 return np.nan
1034 else:
1035 return np.full(np.asarray(moment).shape, np.nan, dtype=np.float64)
1037 # for array_like moment input, return a value for each.
1038 if not np.isscalar(moment):
1039 mmnt = [_moment(a, i, axis) for i in moment]
1040 return np.array(mmnt)
1041 else:
1042 return _moment(a, moment, axis)
1045def _moment(a, moment, axis):
1046 if np.abs(moment - np.round(moment)) > 0:
1047 raise ValueError("All moment parameters must be integers")
1049 if moment == 0:
1050 # When moment equals 0, the result is 1, by definition.
1051 shape = list(a.shape)
1052 del shape[axis]
1053 if shape:
1054 # return an actual array of the appropriate shape
1055 return np.ones(shape, dtype=float)
1056 else:
1057 # the input was 1D, so return a scalar instead of a rank-0 array
1058 return 1.0
1060 elif moment == 1:
1061 # By definition the first moment about the mean is 0.
1062 shape = list(a.shape)
1063 del shape[axis]
1064 if shape:
1065 # return an actual array of the appropriate shape
1066 return np.zeros(shape, dtype=float)
1067 else:
1068 # the input was 1D, so return a scalar instead of a rank-0 array
1069 return np.float64(0.0)
1070 else:
1071 # Exponentiation by squares: form exponent sequence
1072 n_list = [moment]
1073 current_n = moment
1074 while current_n > 2:
1075 if current_n % 2:
1076 current_n = (current_n - 1) / 2
1077 else:
1078 current_n /= 2
1079 n_list.append(current_n)
1081 # Starting point for exponentiation by squares
1082 a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)
1083 if n_list[-1] == 1:
1084 s = a_zero_mean.copy()
1085 else:
1086 s = a_zero_mean**2
1088 # Perform multiplications
1089 for n in n_list[-2::-1]:
1090 s = s**2
1091 if n % 2:
1092 s *= a_zero_mean
1093 return np.mean(s, axis)
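# Illustrative walk-through (not part of the original module): for moment=5 the
# exponent sequence n_list built above is [5, 2.0]; the loop therefore squares
# (x - mean)**2 once and multiplies by (x - mean) for the odd step, i.e. it
# forms (x - mean)**5 before the final np.mean.
#
#     >>> import numpy as np
#     >>> a = np.array([1.0, 2.0, 4.0])
#     >>> np.allclose(_moment(a, 5, 0), np.mean((a - a.mean()) ** 5))
#     True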
1096def variation(a, axis=0, nan_policy='propagate'):
1097 """
1098 Compute the coefficient of variation.
1100 The coefficient of variation is the ratio of the biased standard
1101 deviation to the mean.
1103 Parameters
1104 ----------
1105 a : array_like
1106 Input array.
1107 axis : int or None, optional
1108 Axis along which to calculate the coefficient of variation. Default
1109 is 0. If None, compute over the whole array `a`.
1110 nan_policy : {'propagate', 'raise', 'omit'}, optional
1111 Defines how to handle when input contains nan.
1112 The following options are available (default is 'propagate'):
1114 * 'propagate': returns nan
1115 * 'raise': throws an error
1116 * 'omit': performs the calculations ignoring nan values
1118 Returns
1119 -------
1120 variation : ndarray
1121 The calculated variation along the requested axis.
1123 References
1124 ----------
1125 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1126 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1127 York. 2000.
1129 Examples
1130 --------
1131 >>> from scipy.stats import variation
1132 >>> variation([1, 2, 3, 4, 5])
1133 0.47140452079103173
1135 """
1136 a, axis = _chk_asarray(a, axis)
1138 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1140 if contains_nan and nan_policy == 'omit':
1141 a = ma.masked_invalid(a)
1142 return mstats_basic.variation(a, axis)
1144 return a.std(axis) / a.mean(axis)
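# Illustrative sketch (not part of the original module): the coefficient of
# variation returned above is the biased (ddof=0) standard deviation divided
# by the mean, so the docstring value can be reproduced directly.
#
#     >>> import numpy as np
#     >>> a = np.array([1, 2, 3, 4, 5])
#     >>> np.isclose(a.std() / a.mean(), 0.47140452079103173)
#     True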
1147def skew(a, axis=0, bias=True, nan_policy='propagate'):
1148 r"""
1149 Compute the sample skewness of a data set.
1151 For normally distributed data, the skewness should be about zero. For
1152 unimodal continuous distributions, a skewness value greater than zero means
1153 that there is more weight in the right tail of the distribution. The
1154 function `skewtest` can be used to determine if the skewness value
1155 is close enough to zero, statistically speaking.
1157 Parameters
1158 ----------
1159 a : ndarray
1160 Input array.
1161 axis : int or None, optional
1162 Axis along which skewness is calculated. Default is 0.
1163 If None, compute over the whole array `a`.
1164 bias : bool, optional
1165 If False, then the calculations are corrected for statistical bias.
1166 nan_policy : {'propagate', 'raise', 'omit'}, optional
1167 Defines how to handle when input contains nan.
1168 The following options are available (default is 'propagate'):
1170 * 'propagate': returns nan
1171 * 'raise': throws an error
1172 * 'omit': performs the calculations ignoring nan values
1174 Returns
1175 -------
1176 skewness : ndarray
1177 The skewness of values along an axis, returning 0 where all values are
1178 equal.
1180 Notes
1181 -----
1182 The sample skewness is computed as the Fisher-Pearson coefficient
1183 of skewness, i.e.
1185 .. math::
1187 g_1=\frac{m_3}{m_2^{3/2}}
1189 where
1191 .. math::
1193 m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i
1195 is the biased sample :math:`i\texttt{th}` central moment, and :math:`\bar{x}` is
1196 the sample mean. If ``bias`` is False, the calculations are
1197 corrected for bias and the value computed is the adjusted
1198 Fisher-Pearson standardized moment coefficient, i.e.
1200 .. math::
1202 G_1=\frac{k_3}{k_2^{3/2}}=
1203 \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}.
1205 References
1206 ----------
1207 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1208 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1209 York. 2000.
1210 Section 2.2.24.1
1212 Examples
1213 --------
1214 >>> from scipy.stats import skew
1215 >>> skew([1, 2, 3, 4, 5])
1216 0.0
1217 >>> skew([2, 8, 0, 4, 1, 9, 9, 0])
1218 0.2650554122698573
1220 """
1221 a, axis = _chk_asarray(a, axis)
1222 n = a.shape[axis]
1224 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1226 if contains_nan and nan_policy == 'omit':
1227 a = ma.masked_invalid(a)
1228 return mstats_basic.skew(a, axis, bias)
1230 m2 = moment(a, 2, axis)
1231 m3 = moment(a, 3, axis)
1232 zero = (m2 == 0)
1233 vals = _lazywhere(~zero, (m2, m3),
1234 lambda m2, m3: m3 / m2**1.5,
1235 0.)
1236 if not bias:
1237 can_correct = (n > 2) & (m2 > 0)
1238 if can_correct.any():
1239 m2 = np.extract(can_correct, m2)
1240 m3 = np.extract(can_correct, m3)
1241 nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5
1242 np.place(vals, can_correct, nval)
1244 if vals.ndim == 0:
1245 return vals.item()
1247 return vals
1250def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
1251 """
1252 Compute the kurtosis (Fisher or Pearson) of a dataset.
1254 Kurtosis is the fourth central moment divided by the square of the
1255 variance. If Fisher's definition is used, then 3.0 is subtracted from
1256 the result to give 0.0 for a normal distribution.
1258 If bias is False, then the kurtosis is calculated using k statistics to
1259 eliminate bias coming from biased moment estimators.
1261 Use `kurtosistest` to see if result is close enough to normal.
1263 Parameters
1264 ----------
1265 a : array
1266 Data for which the kurtosis is calculated.
1267 axis : int or None, optional
1268 Axis along which the kurtosis is calculated. Default is 0.
1269 If None, compute over the whole array `a`.
1270 fisher : bool, optional
1271 If True, Fisher's definition is used (normal ==> 0.0). If False,
1272 Pearson's definition is used (normal ==> 3.0).
1273 bias : bool, optional
1274 If False, then the calculations are corrected for statistical bias.
1275 nan_policy : {'propagate', 'raise', 'omit'}, optional
1276 Defines how to handle when input contains nan. 'propagate' returns nan,
1277 'raise' throws an error, 'omit' performs the calculations ignoring nan
1278 values. Default is 'propagate'.
1280 Returns
1281 -------
1282 kurtosis : array
1283 The kurtosis of values along an axis. If all values are equal,
1284 return -3 for Fisher's definition and 0 for Pearson's definition.
1286 References
1287 ----------
1288 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1289 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1290 York. 2000.
1292 Examples
1293 --------
1294 In Fisher's definition, the kurtosis of the normal distribution is zero.
1295 In the following example, the kurtosis is close to zero, because it was
1296 calculated from the dataset, not from the continuous distribution.
1298 >>> from scipy.stats import norm, kurtosis
1299 >>> data = norm.rvs(size=1000, random_state=3)
1300 >>> kurtosis(data)
1301 -0.06928694200380558
1303 The distribution with a higher kurtosis has a heavier tail.
1304 The zero valued kurtosis of the normal distribution in Fisher's definition
1305 can serve as a reference point.
1307 >>> import matplotlib.pyplot as plt
1308 >>> import scipy.stats as stats
1309 >>> from scipy.stats import kurtosis
1311 >>> x = np.linspace(-5, 5, 100)
1312 >>> ax = plt.subplot()
1313 >>> distnames = ['laplace', 'norm', 'uniform']
1315 >>> for distname in distnames:
1316 ... if distname == 'uniform':
1317 ... dist = getattr(stats, distname)(loc=-2, scale=4)
1318 ... else:
1319 ... dist = getattr(stats, distname)
1320 ... data = dist.rvs(size=1000)
1321 ... kur = kurtosis(data, fisher=True)
1322 ... y = dist.pdf(x)
1323 ... ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3)))
1324 ... ax.legend()
1326 The Laplace distribution has a heavier tail than the normal distribution.
1327 The uniform distribution (which has negative kurtosis) has the thinnest
1328 tail.
1330 """
1331 a, axis = _chk_asarray(a, axis)
1333 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1335 if contains_nan and nan_policy == 'omit':
1336 a = ma.masked_invalid(a)
1337 return mstats_basic.kurtosis(a, axis, fisher, bias)
1339 n = a.shape[axis]
1340 m2 = moment(a, 2, axis)
1341 m4 = moment(a, 4, axis)
1342 zero = (m2 == 0)
1343 with np.errstate(all='ignore'):
1344 vals = np.where(zero, 0, m4 / m2**2.0)
1346 if not bias:
1347 can_correct = (n > 3) & (m2 > 0)
1348 if can_correct.any():
1349 m2 = np.extract(can_correct, m2)
1350 m4 = np.extract(can_correct, m4)
1351 nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
1352 np.place(vals, can_correct, nval + 3.0)
1354 if vals.ndim == 0:
1355 vals = vals.item() # array scalar
1357 return vals - 3 if fisher else vals
1360DescribeResult = namedtuple('DescribeResult',
1361 ('nobs', 'minmax', 'mean', 'variance', 'skewness',
1362 'kurtosis'))
1365def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'):
1366 """
1367 Compute several descriptive statistics of the passed array.
1369 Parameters
1370 ----------
1371 a : array_like
1372 Input data.
1373 axis : int or None, optional
1374 Axis along which statistics are calculated. Default is 0.
1375 If None, compute over the whole array `a`.
1376 ddof : int, optional
1377 Delta degrees of freedom (only for variance). Default is 1.
1378 bias : bool, optional
1379 If False, then the skewness and kurtosis calculations are corrected for
1380 statistical bias.
1381 nan_policy : {'propagate', 'raise', 'omit'}, optional
1382 Defines how to handle when input contains nan.
1383 The following options are available (default is 'propagate'):
1385 * 'propagate': returns nan
1386 * 'raise': throws an error
1387 * 'omit': performs the calculations ignoring nan values
1389 Returns
1390 -------
1391 nobs : int or ndarray of ints
1392 Number of observations (length of data along `axis`).
1393 When 'omit' is chosen as nan_policy, each column is counted separately.
1394 minmax: tuple of ndarrays or floats
1395 Minimum and maximum value of data array.
1396 mean : ndarray or float
1397 Arithmetic mean of data along axis.
1398 variance : ndarray or float
1399 Unbiased variance of the data along axis, denominator is number of
1400 observations minus one.
1401 skewness : ndarray or float
1402 Skewness, based on moment calculations with denominator equal to
1403 the number of observations, i.e. no degrees of freedom correction.
1404 kurtosis : ndarray or float
1405 Kurtosis (Fisher). The kurtosis is normalized so that it is
1406 zero for the normal distribution. No degrees of freedom are used.
1408 See Also
1409 --------
1410 skew, kurtosis
1412 Examples
1413 --------
1414 >>> from scipy import stats
1415 >>> a = np.arange(10)
1416 >>> stats.describe(a)
1417 DescribeResult(nobs=10, minmax=(0, 9), mean=4.5, variance=9.166666666666666,
1418 skewness=0.0, kurtosis=-1.2242424242424244)
1419 >>> b = [[1, 2], [3, 4]]
1420 >>> stats.describe(b)
1421 DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])),
1422 mean=array([2., 3.]), variance=array([2., 2.]),
1423 skewness=array([0., 0.]), kurtosis=array([-2., -2.]))
1425 """
1426 a, axis = _chk_asarray(a, axis)
1428 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1430 if contains_nan and nan_policy == 'omit':
1431 a = ma.masked_invalid(a)
1432 return mstats_basic.describe(a, axis, ddof, bias)
1434 if a.size == 0:
1435 raise ValueError("The input must not be empty.")
1436 n = a.shape[axis]
1437 mm = (np.min(a, axis=axis), np.max(a, axis=axis))
1438 m = np.mean(a, axis=axis)
1439 v = np.var(a, axis=axis, ddof=ddof)
1440 sk = skew(a, axis, bias=bias)
1441 kurt = kurtosis(a, axis, bias=bias)
1443 return DescribeResult(n, mm, m, v, sk, kurt)
1445#####################################
1446# NORMALITY TESTS #
1447#####################################
1450SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
1453def skewtest(a, axis=0, nan_policy='propagate'):
1454 """
1455 Test whether the skew is different from the normal distribution.
1457 This function tests the null hypothesis that the skewness of
1458 the population that the sample was drawn from is the same
1459 as that of a corresponding normal distribution.
1461 Parameters
1462 ----------
1463 a : array
1464 The data to be tested.
1465 axis : int or None, optional
1466 Axis along which statistics are calculated. Default is 0.
1467 If None, compute over the whole array `a`.
1468 nan_policy : {'propagate', 'raise', 'omit'}, optional
1469 Defines how to handle when input contains nan.
1470 The following options are available (default is 'propagate'):
1472 * 'propagate': returns nan
1473 * 'raise': throws an error
1474 * 'omit': performs the calculations ignoring nan values
1476 Returns
1477 -------
1478 statistic : float
1479 The computed z-score for this test.
1480 pvalue : float
1481 Two-sided p-value for the hypothesis test.
1483 Notes
1484 -----
1485 The sample size must be at least 8.
1487 References
1488 ----------
1489 .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr.,
1490 "A suggestion for using powerful and informative tests of
1491 normality", American Statistician 44, pp. 316-321, 1990.
1493 Examples
1494 --------
1495 >>> from scipy.stats import skewtest
1496 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8])
1497 SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897)
1498 >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0])
1499 SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459)
1500 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000])
1501 SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133)
1502 >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101])
1503 SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634)
1505 """
1506 a, axis = _chk_asarray(a, axis)
1508 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1510 if contains_nan and nan_policy == 'omit':
1511 a = ma.masked_invalid(a)
1512 return mstats_basic.skewtest(a, axis)
1514 if axis is None:
1515 a = np.ravel(a)
1516 axis = 0
1517 b2 = skew(a, axis)
1518 n = a.shape[axis]
1519 if n < 8:
1520 raise ValueError(
1521 "skewtest is not valid with less than 8 samples; %i samples"
1522 " were given." % int(n))
1523 y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
1524 beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) /
1525 ((n-2.0) * (n+5) * (n+7) * (n+9)))
1526 W2 = -1 + math.sqrt(2 * (beta2 - 1))
1527 delta = 1 / math.sqrt(0.5 * math.log(W2))
1528 alpha = math.sqrt(2.0 / (W2 - 1))
1529 y = np.where(y == 0, 1, y)
1530 Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1))
1532 return SkewtestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))
1535KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
1538def kurtosistest(a, axis=0, nan_policy='propagate'):
1539 """
1540 Test whether a dataset has normal kurtosis.
1542 This function tests the null hypothesis that the kurtosis
1543 of the population from which the sample was drawn is that
1544 of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``.
1546 Parameters
1547 ----------
1548 a : array
1549 Array of the sample data.
1550 axis : int or None, optional
1551 Axis along which to compute test. Default is 0. If None,
1552 compute over the whole array `a`.
1553 nan_policy : {'propagate', 'raise', 'omit'}, optional
1554 Defines how to handle when input contains nan.
1555 The following options are available (default is 'propagate'):
1557 * 'propagate': returns nan
1558 * 'raise': throws an error
1559 * 'omit': performs the calculations ignoring nan values
1561 Returns
1562 -------
1563 statistic : float
1564 The computed z-score for this test.
1565 pvalue : float
1566 The two-sided p-value for the hypothesis test.
1568 Notes
1569 -----
1570 Valid only for n>20. This function uses the method described in [1]_.
1572 References
1573 ----------
1574 .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis
1575 statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983.
1577 Examples
1578 --------
1579 >>> from scipy.stats import kurtosistest
1580 >>> kurtosistest(list(range(20)))
1581 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348)
1583 >>> np.random.seed(28041990)
1584 >>> s = np.random.normal(0, 1, 1000)
1585 >>> kurtosistest(s)
1586 KurtosistestResult(statistic=1.2317590987707365, pvalue=0.21803908613450895)
1588 """
1589 a, axis = _chk_asarray(a, axis)
1591 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1593 if contains_nan and nan_policy == 'omit':
1594 a = ma.masked_invalid(a)
1595 return mstats_basic.kurtosistest(a, axis)
1597 n = a.shape[axis]
1598 if n < 5:
1599 raise ValueError(
1600 "kurtosistest requires at least 5 observations; %i observations"
1601 " were given." % int(n))
1602 if n < 20:
1603 warnings.warn("kurtosistest only valid for n>=20 ... continuing "
1604 "anyway, n=%i" % int(n))
1605 b2 = kurtosis(a, axis, fisher=False)
1607 E = 3.0*(n-1) / (n+1)
1608 varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) # [1]_ Eq. 1
1609 x = (b2-E) / np.sqrt(varb2) # [1]_ Eq. 4
1610 # [1]_ Eq. 2:
1611 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
1612 (n*(n-2)*(n-3)))
1613 # [1]_ Eq. 3:
1614 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
1615 term1 = 1 - 2/(9.0*A)
1616 denom = 1 + x*np.sqrt(2/(A-4.0))
1617 term2 = np.sign(denom) * np.where(denom == 0.0, np.nan,
1618 np.power((1-2.0/A)/np.abs(denom), 1/3.0))
1619 if np.any(denom == 0):
1620 msg = "Test statistic not defined in some cases due to division by " \
1621 "zero. Return nan in that case..."
1622 warnings.warn(msg, RuntimeWarning)
1624 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) # [1]_ Eq. 5
1625 if Z.ndim == 0:
1626 Z = Z[()]
1628 # zprob uses upper tail, so Z needs to be positive
1629 return KurtosistestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))
1632NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
1635def normaltest(a, axis=0, nan_policy='propagate'):
1636 """
1637 Test whether a sample differs from a normal distribution.
1639 This function tests the null hypothesis that a sample comes
1640 from a normal distribution. It is based on D'Agostino and
1641 Pearson's [1]_, [2]_ test that combines skew and kurtosis to
1642 produce an omnibus test of normality.
1644 Parameters
1645 ----------
1646 a : array_like
1647 The array containing the sample to be tested.
1648 axis : int or None, optional
1649 Axis along which to compute test. Default is 0. If None,
1650 compute over the whole array `a`.
1651 nan_policy : {'propagate', 'raise', 'omit'}, optional
1652 Defines how to handle when input contains nan.
1653 The following options are available (default is 'propagate'):
1655 * 'propagate': returns nan
1656 * 'raise': throws an error
1657 * 'omit': performs the calculations ignoring nan values
1659 Returns
1660 -------
1661 statistic : float or array
1662 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
1663 ``k`` is the z-score returned by `kurtosistest`.
1664 pvalue : float or array
1665 A 2-sided chi squared probability for the hypothesis test.
1667 References
1668 ----------
1669 .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for
1670 moderate and large sample size", Biometrika, 58, 341-348
1672 .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from
1673 normality", Biometrika, 60, 613-622
1675 Examples
1676 --------
1677 >>> from scipy import stats
1678 >>> pts = 1000
1679 >>> np.random.seed(28041990)
1680 >>> a = np.random.normal(0, 1, size=pts)
1681 >>> b = np.random.normal(2, 1, size=pts)
1682 >>> x = np.concatenate((a, b))
1683 >>> k2, p = stats.normaltest(x)
1684 >>> alpha = 1e-3
1685 >>> print("p = {:g}".format(p))
1686 p = 3.27207e-11
1687 >>> if p < alpha: # null hypothesis: x comes from a normal distribution
1688 ... print("The null hypothesis can be rejected")
1689 ... else:
1690 ... print("The null hypothesis cannot be rejected")
1691 The null hypothesis can be rejected
1693 """
1694 a, axis = _chk_asarray(a, axis)
1696 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1698 if contains_nan and nan_policy == 'omit':
1699 a = ma.masked_invalid(a)
1700 return mstats_basic.normaltest(a, axis)
1702 s, _ = skewtest(a, axis)
1703 k, _ = kurtosistest(a, axis)
1704 k2 = s*s + k*k
1706 return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
1709Jarque_beraResult = namedtuple('Jarque_beraResult', ('statistic', 'pvalue'))
1712def jarque_bera(x):
1713 """
1714 Perform the Jarque-Bera goodness of fit test on sample data.
1716 The Jarque-Bera test tests whether the sample data has the skewness and
1717 kurtosis matching a normal distribution.
1719 Note that this test only works for a large enough number of data samples
1720 (>2000) as the test statistic asymptotically has a Chi-squared distribution
1721 with 2 degrees of freedom.
1723 Parameters
1724 ----------
1725 x : array_like
1726 Observations of a random variable.
1728 Returns
1729 -------
1730 jb_value : float
1731 The test statistic.
1732 p : float
1733 The p-value for the hypothesis test.
1735 References
1736 ----------
1737 .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality,
1738 homoscedasticity and serial independence of regression residuals",
1739 6 Econometric Letters 255-259.
1741 Examples
1742 --------
1743 >>> from scipy import stats
1744 >>> np.random.seed(987654321)
1745 >>> x = np.random.normal(0, 1, 100000)
1746 >>> jarque_bera_test = stats.jarque_bera(x)
1747 >>> jarque_bera_test
1748 Jarque_beraResult(statistic=4.716570798957913, pvalue=0.0945822550304295)
1749 >>> jarque_bera_test.statistic
1750 4.716570798957913
1751 >>> jarque_bera_test.pvalue
1752 0.0945822550304295
1754 """
1755 x = np.asarray(x)
1756 n = x.size
1757 if n == 0:
1758 raise ValueError('At least one observation is required.')
1760 mu = x.mean()
1761 diffx = x - mu
1762 skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.)
1763 kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2
1764 jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4)
1765 p = 1 - distributions.chi2.cdf(jb_value, 2)
1767 return Jarque_beraResult(jb_value, p)
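# Illustrative sketch (not part of the original module): the statistic above is
# JB = n/6 * (S**2 + (K - 3)**2 / 4), where S is the biased sample skewness and
# K the biased Pearson kurtosis; under normality JB is asymptotically
# chi-squared with 2 degrees of freedom, hence the chi2 p-value.
#
#     >>> import numpy as np
#     >>> np.random.seed(0)
#     >>> x = np.random.normal(size=500)
#     >>> jb = len(x) / 6 * (skew(x) ** 2 + (kurtosis(x, fisher=False) - 3) ** 2 / 4)
#     >>> np.isclose(jb, jarque_bera(x).statistic)
#     True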
1770#####################################
1771# FREQUENCY FUNCTIONS #
1772#####################################
1774# deindent to work around numpy/gh-16202
1775@np.deprecate(
1776 message="`itemfreq` is deprecated and will be removed in a "
1777 "future version. Use instead `np.unique(..., return_counts=True)`")
1778def itemfreq(a):
1779 """
1780Return a 2-D array of item frequencies.
1782Parameters
1783----------
1784a : (N,) array_like
1785 Input array.
1787Returns
1788-------
1789itemfreq : (K, 2) ndarray
1790 A 2-D frequency table. Column 1 contains sorted, unique values from
1791 `a`, column 2 contains their respective counts.
1793Examples
1794--------
1795>>> from scipy import stats
1796>>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4])
1797>>> stats.itemfreq(a)
1798array([[ 0., 2.],
1799 [ 1., 4.],
1800 [ 2., 2.],
1801 [ 4., 1.],
1802 [ 5., 1.]])
1803>>> np.bincount(a)
1804array([2, 4, 2, 0, 1, 1])
1806>>> stats.itemfreq(a/10.)
1807array([[ 0. , 2. ],
1808 [ 0.1, 4. ],
1809 [ 0.2, 2. ],
1810 [ 0.4, 1. ],
1811 [ 0.5, 1. ]])
1812"""
1813 items, inv = np.unique(a, return_inverse=True)
1814 freq = np.bincount(inv)
1815 return np.array([items, freq]).T
1818def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
1819 axis=None):
1820 """
1821 Calculate the score at a given percentile of the input sequence.
1823 For example, the score at `per=50` is the median. If the desired quantile
1824 lies between two data points, we interpolate between them, according to
1825 the value of `interpolation`. If the parameter `limit` is provided, it
1826 should be a tuple (lower, upper) of two values.
1828 Parameters
1829 ----------
1830 a : array_like
1831 A 1-D array of values from which to extract score.
1832 per : array_like
1833 Percentile(s) at which to extract score. Values should be in range
1834 [0,100].
1835 limit : tuple, optional
1836 Tuple of two scalars, the lower and upper limits within which to
1837 compute the percentile. Values of `a` outside
1838 this (closed) interval will be ignored.
1839 interpolation_method : {'fraction', 'lower', 'higher'}, optional
1840 Specifies the interpolation method to use,
1841 when the desired quantile lies between two data points `i` and `j`
1842 The following options are available (default is 'fraction'):
1844 * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the
1845 fractional part of the index surrounded by ``i`` and ``j``
1846 * 'lower': ``i``
1847 * 'higher': ``j``
1849 axis : int, optional
1850 Axis along which the percentiles are computed. Default is None. If
1851 None, compute over the whole array `a`.
1853 Returns
1854 -------
1855 score : float or ndarray
1856 Score at percentile(s).
1858 See Also
1859 --------
1860 percentileofscore, numpy.percentile
1862 Notes
1863 -----
1864 This function will become obsolete in the future.
1865 For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality
1866 that `scoreatpercentile` provides, and it is significantly faster.
1867 Therefore, users with numpy >= 1.9 are recommended to use
1868 `numpy.percentile` instead.
1870 Examples
1871 --------
1872 >>> from scipy import stats
1873 >>> a = np.arange(100)
1874 >>> stats.scoreatpercentile(a, 50)
1875 49.5
1877 """
1878 # adapted from NumPy's percentile function. When we require numpy >= 1.8,
1879 # the implementation of this function can be replaced by np.percentile.
1880 a = np.asarray(a)
1881 if a.size == 0:
1882 # empty array, return nan(s) with shape matching `per`
1883 if np.isscalar(per):
1884 return np.nan
1885 else:
1886 return np.full(np.asarray(per).shape, np.nan, dtype=np.float64)
1888 if limit:
1889 a = a[(limit[0] <= a) & (a <= limit[1])]
1891 sorted_ = np.sort(a, axis=axis)
1892 if axis is None:
1893 axis = 0
1895 return _compute_qth_percentile(sorted_, per, interpolation_method, axis)
1898# handle sequence of per's without calling sort multiple times
1899def _compute_qth_percentile(sorted_, per, interpolation_method, axis):
1900 if not np.isscalar(per):
1901 score = [_compute_qth_percentile(sorted_, i,
1902 interpolation_method, axis)
1903 for i in per]
1904 return np.array(score)
1906 if not (0 <= per <= 100):
1907 raise ValueError("percentile must be in the range [0, 100]")
1909 indexer = [slice(None)] * sorted_.ndim
1910 idx = per / 100. * (sorted_.shape[axis] - 1)
1912 if int(idx) != idx:
1913 # round fractional indices according to interpolation method
1914 if interpolation_method == 'lower':
1915 idx = int(np.floor(idx))
1916 elif interpolation_method == 'higher':
1917 idx = int(np.ceil(idx))
1918 elif interpolation_method == 'fraction':
1919 pass # keep idx as fraction and interpolate
1920 else:
1921 raise ValueError("interpolation_method can only be 'fraction', "
1922 "'lower' or 'higher'")
1924 i = int(idx)
1925 if i == idx:
1926 indexer[axis] = slice(i, i + 1)
1927 weights = array(1)
1928 sumval = 1.0
1929 else:
1930 indexer[axis] = slice(i, i + 2)
1931 j = i + 1
1932 weights = array([(j - idx), (idx - i)], float)
1933 wshape = [1] * sorted_.ndim
1934 wshape[axis] = 2
1935 weights.shape = wshape
1936 sumval = weights.sum()
1938 # Use np.add.reduce (== np.sum but a little faster) to coerce data type
1939 return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval
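# Illustrative sketch, not part of the original module: how the 'fraction'
# interpolation above turns a fractional index into a weighted average of
# the two neighbouring order statistics. The helper name and data are
# hypothetical.
def _fraction_interpolation_sketch():
    import numpy as np
    sorted_ = np.array([10., 20., 30., 40.])
    per = 25
    idx = per / 100. * (sorted_.shape[0] - 1)       # 0.75
    i, j = int(np.floor(idx)), int(np.ceil(idx))    # 0, 1
    fraction = idx - i                              # 0.75
    # Same weighting as above: (j - idx) on sorted_[i], (idx - i) on sorted_[j].
    return sorted_[i] + (sorted_[j] - sorted_[i]) * fraction  # 17.5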
1942def percentileofscore(a, score, kind='rank'):
1943 """
1944 Compute the percentile rank of a score relative to a list of scores.
1946 A `percentileofscore` of, for example, 80% means that 80% of the
1947 scores in `a` are below the given score. In the case of gaps or
1948 ties, the exact definition depends on the optional keyword, `kind`.
1950 Parameters
1951 ----------
1952 a : array_like
1953 Array of scores to which `score` is compared.
1954 score : int or float
1955 Score that is compared to the elements in `a`.
1956 kind : {'rank', 'weak', 'strict', 'mean'}, optional
1957 Specifies the interpretation of the resulting score.
1958 The following options are available (default is 'rank'):
1960 * 'rank': Average percentage ranking of score. In case of multiple
1961 matches, average the percentage rankings of all matching scores.
1962 * 'weak': This kind corresponds to the definition of a cumulative
1963 distribution function. A percentileofscore of 80% means that 80%
1964 of values are less than or equal to the provided score.
1965 * 'strict': Similar to "weak", except that only values that are
1966 strictly less than the given score are counted.
1967 * 'mean': The average of the "weak" and "strict" scores, often used
1968 in testing. See https://en.wikipedia.org/wiki/Percentile_rank
1970 Returns
1971 -------
1972 pcos : float
1973 Percentile-position of score (0-100) relative to `a`.
1975 See Also
1976 --------
1977 numpy.percentile
1979 Examples
1980 --------
1981 Three-quarters of the given values lie below a given score:
1983 >>> from scipy import stats
1984 >>> stats.percentileofscore([1, 2, 3, 4], 3)
1985 75.0
1987 With multiple matches, note how the scores of the two matches, 0.6
1988 and 0.8 respectively, are averaged:
1990 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
1991 70.0
1993 Only 2/5 values are strictly less than 3:
1995 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
1996 40.0
1998 But 4/5 values are less than or equal to 3:
2000 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
2001 80.0
2003 The average between the weak and the strict scores is:
2005 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
2006 60.0
2008 """
2009 if np.isnan(score):
2010 return np.nan
2011 a = np.asarray(a)
2012 n = len(a)
2013 if n == 0:
2014 return 100.0
2016 if kind == 'rank':
2017 left = np.count_nonzero(a < score)
2018 right = np.count_nonzero(a <= score)
2019 pct = (right + left + (1 if right > left else 0)) * 50.0/n
2020 return pct
2021 elif kind == 'strict':
2022 return np.count_nonzero(a < score) / n * 100
2023 elif kind == 'weak':
2024 return np.count_nonzero(a <= score) / n * 100
2025 elif kind == 'mean':
2026 pct = (np.count_nonzero(a < score) + np.count_nonzero(a <= score)) / n * 50
2027 return pct
2028 else:
2029 raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
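# Illustrative sketch, not part of the original module: the four `kind`
# options above reduce to two counts, "strictly below" and "at or below"
# the score. The values reproduce the docstring examples; the helper name
# is hypothetical.
def _percentileofscore_kinds_sketch():
    import numpy as np
    a = np.asarray([1, 2, 3, 3, 4])
    score, n = 3, 5
    left = np.count_nonzero(a < score)     # 2 values strictly below
    right = np.count_nonzero(a <= score)   # 4 values at or below
    strict = left / n * 100                                         # 40.0
    weak = right / n * 100                                          # 80.0
    mean = (strict + weak) / 2                                      # 60.0
    rank = (left + right + (1 if right > left else 0)) * 50.0 / n   # 70.0
    return strict, weak, mean, rank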
2032HistogramResult = namedtuple('HistogramResult',
2033 ('count', 'lowerlimit', 'binsize', 'extrapoints'))
2036def _histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False):
2037 """
2038 Create a histogram.
2040 Separate the range into several bins and return the number of instances
2041 in each bin.
2043 Parameters
2044 ----------
2045 a : array_like
2046 Array of scores which will be put into bins.
2047 numbins : int, optional
2048 The number of bins to use for the histogram. Default is 10.
2049 defaultlimits : tuple (lower, upper), optional
2050 The lower and upper values for the range of the histogram.
2051 If no value is given, a range slightly larger than the range of the
2052 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2053 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2054 weights : array_like, optional
2055 The weights for each value in `a`. Default is None, which gives each
2056 value a weight of 1.0
2057 printextras : bool, optional
2058 If True, if there are extra points (i.e. the points that fall outside
2059 the bin limits) a warning is raised saying how many of those points
2060 there are. Default is False.
2062 Returns
2063 -------
2064 count : ndarray
2065 Number of points (or sum of weights) in each bin.
2066 lowerlimit : float
2067 Lowest value of histogram, the lower limit of the first bin.
2068 binsize : float
2069 The size of the bins (all bins have the same size).
2070 extrapoints : int
2071 The number of points outside the range of the histogram.
2073 See Also
2074 --------
2075 numpy.histogram
2077 Notes
2078 -----
2079 This histogram is based on numpy's histogram but has a larger range by
2080 default if `defaultlimits` is not set.
2082 """
2083 a = np.ravel(a)
2084 if defaultlimits is None:
2085 if a.size == 0:
2086 # handle empty arrays. Undetermined range, so use 0-1.
2087 defaultlimits = (0, 1)
2088 else:
2089 # no range given, so use values in `a`
2090 data_min = a.min()
2091 data_max = a.max()
2092 # Have bins extend past min and max values slightly
2093 s = (data_max - data_min) / (2. * (numbins - 1.))
2094 defaultlimits = (data_min - s, data_max + s)
2096 # use numpy's histogram method to compute bins
2097 hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
2098 weights=weights)
2099 # hist is not always float; convert to stay consistent with the old output
2100 hist = np.array(hist, dtype=float)
2101 # fixed width for bins is assumed, as numpy's histogram gives
2102 # fixed width bins for int values for 'bins'
2103 binsize = bin_edges[1] - bin_edges[0]
2104 # calculate number of extra points
2105 extrapoints = len([v for v in a
2106 if defaultlimits[0] > v or v > defaultlimits[1]])
2107 if extrapoints > 0 and printextras:
2108 warnings.warn("Points outside given histogram range = %s"
2109 % extrapoints)
2111 return HistogramResult(hist, defaultlimits[0], binsize, extrapoints)
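# Illustrative sketch, not part of the original module: how the default
# limits above widen the data range by half a bin width on each side, so
# the extreme values land inside the outermost bins. The helper name is
# hypothetical.
def _histogram_default_limits_sketch():
    import numpy as np
    a = np.array([1., 4., 2., 1., 3., 1.])
    numbins = 4
    s = (a.max() - a.min()) / (2. * (numbins - 1.))   # 0.5
    defaultlimits = (a.min() - s, a.max() + s)        # (0.5, 4.5)
    hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits)
    return defaultlimits, hist                        # ((0.5, 4.5), [3, 1, 1, 1])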
2114CumfreqResult = namedtuple('CumfreqResult',
2115 ('cumcount', 'lowerlimit', 'binsize',
2116 'extrapoints'))
2119def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
2120 """
2121 Return a cumulative frequency histogram, using the histogram function.
2123 A cumulative histogram is a mapping that counts the cumulative number of
2124 observations in all of the bins up to the specified bin.
2126 Parameters
2127 ----------
2128 a : array_like
2129 Input array.
2130 numbins : int, optional
2131 The number of bins to use for the histogram. Default is 10.
2132 defaultreallimits : tuple (lower, upper), optional
2133 The lower and upper values for the range of the histogram.
2134 If no value is given, a range slightly larger than the range of the
2135 values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``,
2136 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2137 weights : array_like, optional
2138 The weights for each value in `a`. Default is None, which gives each
2139 value a weight of 1.0
2141 Returns
2142 -------
2143 cumcount : ndarray
2144 Binned values of cumulative frequency.
2145 lowerlimit : float
2146 Lower real limit.
2147 binsize : float
2148 Width of each bin.
2149 extrapoints : int
2150 Number of points outside the range of the histogram.
2152 Examples
2153 --------
2154 >>> import matplotlib.pyplot as plt
2155 >>> from scipy import stats
2156 >>> x = [1, 4, 2, 1, 3, 1]
2157 >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
2158 >>> res.cumcount
2159 array([ 1., 2., 3., 3.])
2160 >>> res.extrapoints
2161 3
2163 Create a normal distribution with 1000 random values
2165 >>> rng = np.random.RandomState(seed=12345)
2166 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2168 Calculate cumulative frequencies
2170 >>> res = stats.cumfreq(samples, numbins=25)
2172 Calculate space of values for x
2174 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size,
2175 ... res.cumcount.size)
2177 Plot histogram and cumulative histogram
2179 >>> fig = plt.figure(figsize=(10, 4))
2180 >>> ax1 = fig.add_subplot(1, 2, 1)
2181 >>> ax2 = fig.add_subplot(1, 2, 2)
2182 >>> ax1.hist(samples, bins=25)
2183 >>> ax1.set_title('Histogram')
2184 >>> ax2.bar(x, res.cumcount, width=res.binsize)
2185 >>> ax2.set_title('Cumulative histogram')
2186 >>> ax2.set_xlim([x.min(), x.max()])
2188 >>> plt.show()
2190 """
2191 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2192 cumhist = np.cumsum(h * 1, axis=0)
2193 return CumfreqResult(cumhist, l, b, e)
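# Illustrative sketch, not part of the original module: `cumfreq` is the
# running sum of the per-bin counts produced by `_histogram`. The numbers
# follow the docstring example (x = [1, 4, 2, 1, 3, 1], numbins=4,
# defaultreallimits=(1.5, 5)); the helper name is hypothetical.
def _cumfreq_relation_sketch():
    import numpy as np
    counts = np.array([1., 1., 1., 0.])   # per-bin counts within (1.5, 5)
    cumcount = np.cumsum(counts)          # array([1., 2., 3., 3.])
    extrapoints = 3                       # the three 1's fall below 1.5
    return cumcount, extrapoints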
2196RelfreqResult = namedtuple('RelfreqResult',
2197 ('frequency', 'lowerlimit', 'binsize',
2198 'extrapoints'))
2201def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
2202 """
2203 Return a relative frequency histogram, using the histogram function.
2205 A relative frequency histogram is a mapping of the number of
2206 observations in each of the bins relative to the total of observations.
2208 Parameters
2209 ----------
2210 a : array_like
2211 Input array.
2212 numbins : int, optional
2213 The number of bins to use for the histogram. Default is 10.
2214 defaultreallimits : tuple (lower, upper), optional
2215 The lower and upper values for the range of the histogram.
2216 If no value is given, a range slightly larger than the range of the
2217 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2218 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2219 weights : array_like, optional
2220 The weights for each value in `a`. Default is None, which gives each
2221 value a weight of 1.0
2223 Returns
2224 -------
2225 frequency : ndarray
2226 Binned values of relative frequency.
2227 lowerlimit : float
2228 Lower real limit.
2229 binsize : float
2230 Width of each bin.
2231 extrapoints : int
2232 Number of points outside the range of the histogram.
2234 Examples
2235 --------
2236 >>> import matplotlib.pyplot as plt
2237 >>> from scipy import stats
2238 >>> a = np.array([2, 4, 1, 2, 3, 2])
2239 >>> res = stats.relfreq(a, numbins=4)
2240 >>> res.frequency
2241 array([ 0.16666667, 0.5 , 0.16666667, 0.16666667])
2242 >>> np.sum(res.frequency) # relative frequencies should add up to 1
2243 1.0
2245 Create a normal distribution with 1000 random values
2247 >>> rng = np.random.RandomState(seed=12345)
2248 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2250 Calculate relative frequencies
2252 >>> res = stats.relfreq(samples, numbins=25)
2254 Calculate space of values for x
2256 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size,
2257 ... res.frequency.size)
2259 Plot relative frequency histogram
2261 >>> fig = plt.figure(figsize=(5, 4))
2262 >>> ax = fig.add_subplot(1, 1, 1)
2263 >>> ax.bar(x, res.frequency, width=res.binsize)
2264 >>> ax.set_title('Relative frequency histogram')
2265 >>> ax.set_xlim([x.min(), x.max()])
2267 >>> plt.show()
2269 """
2270 a = np.asanyarray(a)
2271 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2272 h = h / a.shape[0]
2274 return RelfreqResult(h, l, b, e)
2277#####################################
2278# VARIABILITY FUNCTIONS #
2279#####################################
2281def obrientransform(*args):
2282 """
2283 Compute the O'Brien transform on input data (any number of arrays).
2285 Used to test for homogeneity of variance prior to running one-way stats.
2286 Each array in ``*args`` is one level of a factor.
2287 If `f_oneway` is run on the transformed data and found significant,
2288 the variances are unequal. From Maxwell and Delaney [1]_, p.112.
2290 Parameters
2291 ----------
2292 args : tuple of array_like
2293 Any number of arrays.
2295 Returns
2296 -------
2297 obrientransform : ndarray
2298 Transformed data for use in an ANOVA. The first dimension
2299 of the result corresponds to the sequence of transformed
2300 arrays. If the arrays given are all 1-D of the same length,
2301 the return value is a 2-D array; otherwise it is a 1-D array
2302 of type object, with each element being an ndarray.
2304 References
2305 ----------
2306 .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
2307 Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.
2309 Examples
2310 --------
2311 We'll test the following data sets for differences in their variance.
2313 >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
2314 >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]
2316 Apply the O'Brien transform to the data.
2318 >>> from scipy.stats import obrientransform
2319 >>> tx, ty = obrientransform(x, y)
2321 Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
2322 transformed data.
2324 >>> from scipy.stats import f_oneway
2325 >>> F, p = f_oneway(tx, ty)
2326 >>> p
2327 0.1314139477040335
2329 If we require that ``p < 0.05`` for significance, we cannot conclude
2330 that the variances are different.
2332 """
2333 TINY = np.sqrt(np.finfo(float).eps)
2335 # `arrays` will hold the transformed arguments.
2336 arrays = []
2337 sLast = None
2339 for arg in args:
2340 a = np.asarray(arg)
2341 n = len(a)
2342 mu = np.mean(a)
2343 sq = (a - mu)**2
2344 sumsq = sq.sum()
2346 # The O'Brien transform.
2347 t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))
2349 # Check that the mean of the transformed data is equal to the
2350 # original variance.
2351 var = sumsq / (n - 1)
2352 if abs(var - np.mean(t)) > TINY:
2353 raise ValueError('Lack of convergence in obrientransform.')
2355 arrays.append(t)
2356 sLast = a.shape
2358 if sLast:
2359 for arr in arrays[:-1]:
2360 if sLast != arr.shape:
2361 return np.array(arrays, dtype=object)
2362 return np.array(arrays)
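# Illustrative sketch, not part of the original module: the O'Brien
# transform applied directly to one sample, together with the consistency
# check used above (the mean of the transformed values equals the unbiased
# sample variance of the original data). The helper name is hypothetical.
def _obrientransform_property_sketch():
    import numpy as np
    a = np.array([10., 11., 13., 9., 7., 12., 12., 9., 10.])
    n, mu = len(a), a.mean()
    sq = (a - mu) ** 2
    t = ((n - 1.5) * n * sq - 0.5 * sq.sum()) / ((n - 1) * (n - 2))
    # np.isclose(t.mean(), a.var(ddof=1)) is True.
    return t.mean(), a.var(ddof=1)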
2365def sem(a, axis=0, ddof=1, nan_policy='propagate'):
2366 """
2367 Compute standard error of the mean.
2369 Calculate the standard error of the mean (or standard error of
2370 measurement) of the values in the input array.
2372 Parameters
2373 ----------
2374 a : array_like
2375 An array containing the values for which the standard error is
2376 returned.
2377 axis : int or None, optional
2378 Axis along which to operate. Default is 0. If None, compute over
2379 the whole array `a`.
2380 ddof : int, optional
2381 Delta degrees-of-freedom. How many degrees of freedom to adjust
2382 for bias in limited samples relative to the population estimate
2383 of variance. Defaults to 1.
2384 nan_policy : {'propagate', 'raise', 'omit'}, optional
2385 Defines how to handle when input contains nan.
2386 The following options are available (default is 'propagate'):
2388 * 'propagate': returns nan
2389 * 'raise': throws an error
2390 * 'omit': performs the calculations ignoring nan values
2392 Returns
2393 -------
2394 s : ndarray or float
2395 The standard error of the mean in the sample(s), along the input axis.
2397 Notes
2398 -----
2399 The default value for `ddof` is different from the default (0) used by other
2400 ddof-containing routines, such as np.std and np.nanstd.
2402 Examples
2403 --------
2404 Find standard error along the first axis:
2406 >>> from scipy import stats
2407 >>> a = np.arange(20).reshape(5,4)
2408 >>> stats.sem(a)
2409 array([ 2.8284, 2.8284, 2.8284, 2.8284])
2411 Find standard error across the whole array, using n degrees of freedom:
2413 >>> stats.sem(a, axis=None, ddof=0)
2414 1.2893796958227628
2416 """
2417 a, axis = _chk_asarray(a, axis)
2419 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2421 if contains_nan and nan_policy == 'omit':
2422 a = ma.masked_invalid(a)
2423 return mstats_basic.sem(a, axis, ddof)
2425 n = a.shape[axis]
2426 s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n)
2427 return s
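# Illustrative sketch, not part of the original module: the core of the
# computation above, the sample standard deviation (ddof=1 by default)
# divided by the square root of the number of observations along the
# axis. The helper name is hypothetical.
def _sem_by_hand_sketch():
    import numpy as np
    a = np.arange(20).reshape(5, 4)
    n = a.shape[0]
    # Matches stats.sem(a): array([2.8284, 2.8284, 2.8284, 2.8284])
    return np.std(a, axis=0, ddof=1) / np.sqrt(n)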
2430def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
2431 """
2432 Compute the z score.
2434 Compute the z score of each value in the sample, relative to the
2435 sample mean and standard deviation.
2437 Parameters
2438 ----------
2439 a : array_like
2440 An array like object containing the sample data.
2441 axis : int or None, optional
2442 Axis along which to operate. Default is 0. If None, compute over
2443 the whole array `a`.
2444 ddof : int, optional
2445 Degrees of freedom correction in the calculation of the
2446 standard deviation. Default is 0.
2447 nan_policy : {'propagate', 'raise', 'omit'}, optional
2448 Defines how to handle when input contains nan. 'propagate' returns nan,
2449 'raise' throws an error, 'omit' performs the calculations ignoring nan
2450 values. Default is 'propagate'.
2452 Returns
2453 -------
2454 zscore : array_like
2455 The z-scores, standardized by mean and standard deviation of
2456 input array `a`.
2458 Notes
2459 -----
2460 This function preserves ndarray subclasses, and works also with
2461 matrices and masked arrays (it uses `asanyarray` instead of
2462 `asarray` for parameters).
2464 Examples
2465 --------
2466 >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091,
2467 ... 0.1954, 0.6307, 0.6599, 0.1065, 0.0508])
2468 >>> from scipy import stats
2469 >>> stats.zscore(a)
2470 array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786,
2471 0.6748, -1.1488, -1.3324])
2473 Computing along a specified axis, using n-1 degrees of freedom
2474 (``ddof=1``) to calculate the standard deviation:
2476 >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608],
2477 ... [ 0.7149, 0.0775, 0.6072, 0.9656],
2478 ... [ 0.6341, 0.1403, 0.9759, 0.4064],
2479 ... [ 0.5918, 0.6948, 0.904 , 0.3721],
2480 ... [ 0.0921, 0.2481, 0.1188, 0.1366]])
2481 >>> stats.zscore(b, axis=1, ddof=1)
2482 array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358],
2483 [ 0.33048416, -1.37380874, 0.04251374, 1.00081084],
2484 [ 0.26796377, -1.12598418, 1.23283094, -0.37481053],
2485 [-0.22095197, 0.24468594, 1.19042819, -1.21416216],
2486 [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]])
2488 """
2489 a = np.asanyarray(a)
2491 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2493 if contains_nan and nan_policy == 'omit':
2494 mns = np.nanmean(a=a, axis=axis, keepdims=True)
2495 sstd = np.nanstd(a=a, axis=axis, ddof=ddof, keepdims=True)
2496 else:
2497 mns = a.mean(axis=axis, keepdims=True)
2498 sstd = a.std(axis=axis, ddof=ddof, keepdims=True)
2500 return (a - mns) / sstd
2503def zmap(scores, compare, axis=0, ddof=0):
2504 """
2505 Calculate the relative z-scores.
2507 Return an array of z-scores, i.e., scores that are standardized to
2508 zero mean and unit variance, where mean and variance are calculated
2509 from the comparison array.
2511 Parameters
2512 ----------
2513 scores : array_like
2514 The input for which z-scores are calculated.
2515 compare : array_like
2516 The input from which the mean and standard deviation of the
2517 normalization are taken; assumed to have the same dimension as
2518 `scores`.
2519 axis : int or None, optional
2520 Axis over which mean and variance of `compare` are calculated.
2521 Default is 0. If None, compute over the whole array `scores`.
2522 ddof : int, optional
2523 Degrees of freedom correction in the calculation of the
2524 standard deviation. Default is 0.
2526 Returns
2527 -------
2528 zscore : array_like
2529 Z-scores, in the same shape as `scores`.
2531 Notes
2532 -----
2533 This function preserves ndarray subclasses, and works also with
2534 matrices and masked arrays (it uses `asanyarray` instead of
2535 `asarray` for parameters).
2537 Examples
2538 --------
2539 >>> from scipy.stats import zmap
2540 >>> a = [0.5, 2.0, 2.5, 3]
2541 >>> b = [0, 1, 2, 3, 4]
2542 >>> zmap(a, b)
2543 array([-1.06066017, 0. , 0.35355339, 0.70710678])
2545 """
2546 scores, compare = map(np.asanyarray, [scores, compare])
2547 mns = compare.mean(axis=axis, keepdims=True)
2548 sstd = compare.std(axis=axis, ddof=ddof, keepdims=True)
2549 return (scores - mns) / sstd
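# Illustrative sketch, not part of the original module: `zmap` is the
# z-score computation with the mean and standard deviation taken from a
# separate comparison array instead of from the scores themselves. The
# helper name is hypothetical; the data follow the docstring example.
def _zmap_by_hand_sketch():
    import numpy as np
    scores = np.array([0.5, 2.0, 2.5, 3.0])
    compare = np.array([0., 1., 2., 3., 4.])
    mns, sstd = compare.mean(), compare.std(ddof=0)   # 2.0, sqrt(2)
    # Matches zmap(scores, compare):
    # array([-1.06066017, 0.        , 0.35355339, 0.70710678])
    return (scores - mns) / sstd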
2552def gstd(a, axis=0, ddof=1):
2553 """
2554 Calculate the geometric standard deviation of an array.
2556 The geometric standard deviation describes the spread of a set of numbers
2557 where the geometric mean is preferred. It is a multiplicative factor, and
2558 so a dimensionless quantity.
2560 It is defined as the exponent of the standard deviation of ``log(a)``.
2561 Mathematically the population geometric standard deviation can be
2562 evaluated as::
2564 gstd = exp(std(log(a)))
2566 .. versionadded:: 1.3.0
2568 Parameters
2569 ----------
2570 a : array_like
2571 An array like object containing the sample data.
2572 axis : int, tuple or None, optional
2573 Axis along which to operate. Default is 0. If None, compute over
2574 the whole array `a`.
2575 ddof : int, optional
2576 Degree of freedom correction in the calculation of the
2577 geometric standard deviation. Default is 1.
2579 Returns
2580 -------
2581 ndarray or float
2582 An array of the geometric standard deviation. If `axis` is None or `a`
2583 is a 1d array, a float is returned.
2585 Notes
2586 -----
2587 As the calculation requires the use of logarithms the geometric standard
2588 deviation only supports strictly positive values. Any non-positive or
2589 infinite values will raise a `ValueError`.
2590 The geometric standard deviation is sometimes confused with the exponent of
2591 the standard deviation, ``exp(std(a))``. Instead the geometric standard
2592 deviation is ``exp(std(log(a)))``.
2593 The default value for `ddof` is different from the default value (0) used
2594 by other ddof-containing functions, such as ``np.std`` and ``np.nanstd``.
2596 Examples
2597 --------
2598 Find the geometric standard deviation of a log-normally distributed sample.
2599 Note that the standard deviation of the distribution is one; on a
2600 log scale this evaluates to approximately ``exp(1)``.
2602 >>> from scipy.stats import gstd
2603 >>> np.random.seed(123)
2604 >>> sample = np.random.lognormal(mean=0, sigma=1, size=1000)
2605 >>> gstd(sample)
2606 2.7217860664589946
2608 Compute the geometric standard deviation of a multidimensional array and
2609 of a given axis.
2611 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2612 >>> gstd(a, axis=None)
2613 2.2944076136018947
2614 >>> gstd(a, axis=2)
2615 array([[1.82424757, 1.22436866, 1.13183117],
2616 [1.09348306, 1.07244798, 1.05914985]])
2617 >>> gstd(a, axis=(1,2))
2618 array([2.12939215, 1.22120169])
2620 The geometric standard deviation further handles masked arrays.
2622 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2623 >>> ma = np.ma.masked_where(a > 16, a)
2624 >>> ma
2625 masked_array(
2626 data=[[[1, 2, 3, 4],
2627 [5, 6, 7, 8],
2628 [9, 10, 11, 12]],
2629 [[13, 14, 15, 16],
2630 [--, --, --, --],
2631 [--, --, --, --]]],
2632 mask=[[[False, False, False, False],
2633 [False, False, False, False],
2634 [False, False, False, False]],
2635 [[False, False, False, False],
2636 [ True, True, True, True],
2637 [ True, True, True, True]]],
2638 fill_value=999999)
2639 >>> gstd(ma, axis=2)
2640 masked_array(
2641 data=[[1.8242475707663655, 1.2243686572447428, 1.1318311657788478],
2642 [1.0934830582350938, --, --]],
2643 mask=[[False, False, False],
2644 [False, True, True]],
2645 fill_value=999999)
2647 """
2648 a = np.asanyarray(a)
2649 log = ma.log if isinstance(a, ma.MaskedArray) else np.log
2651 try:
2652 with warnings.catch_warnings():
2653 warnings.simplefilter("error", RuntimeWarning)
2654 return np.exp(np.std(log(a), axis=axis, ddof=ddof))
2655 except RuntimeWarning as w:
2656 if np.isinf(a).any():
2657 raise ValueError(
2658 'Infinite value encountered. The geometric standard deviation '
2659 'is defined for strictly positive values only.')
2660 a_nan = np.isnan(a)
2661 a_nan_any = a_nan.any()
2662 # exclude NaN's from negativity check, but
2663 # avoid expensive masking for arrays with no NaN
2664 if ((a_nan_any and np.less_equal(np.nanmin(a), 0)) or
2665 (not a_nan_any and np.less_equal(a, 0).any())):
2666 raise ValueError(
2667 'Non positive value encountered. The geometric standard '
2668 'deviation is defined for strictly positive values only.')
2669 elif 'Degrees of freedom <= 0 for slice' == str(w):
2670 raise ValueError(w)
2671 else:
2672 # Remaining warnings don't need to be exceptions.
2673 return np.exp(np.std(log(a, where=~a_nan), axis=axis, ddof=ddof))
2674 except TypeError:
2675 raise ValueError(
2676 'Invalid array input. The inputs could not be '
2677 'safely coerced to any supported types')
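# Illustrative sketch, not part of the original module: the definition
# used above, the exponential of the (ddof=1) standard deviation of the
# logarithms. The helper name is hypothetical.
def _gstd_by_hand_sketch():
    import numpy as np
    a = np.arange(1, 25).reshape(2, 3, 4)
    # Matches gstd(a, axis=None): approximately 2.2944
    return np.exp(np.std(np.log(a), ddof=1))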
2680# Private dictionary initialized only once at module level
2681# See https://en.wikipedia.org/wiki/Robust_measures_of_scale
2682_scale_conversions = {'raw': 1.0,
2683 'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)}
2686def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate',
2687 interpolation='linear', keepdims=False):
2688 r"""
2689 Compute the interquartile range of the data along the specified axis.
2691 The interquartile range (IQR) is the difference between the 75th and
2692 25th percentile of the data. It is a measure of the dispersion
2693 similar to standard deviation or variance, but is much more robust
2694 against outliers [2]_.
2696 The ``rng`` parameter allows this function to compute other
2697 percentile ranges than the actual IQR. For example, setting
2698 ``rng=(0, 100)`` is equivalent to `numpy.ptp`.
2700 The IQR of an empty array is `np.nan`.
2702 .. versionadded:: 0.18.0
2704 Parameters
2705 ----------
2706 x : array_like
2707 Input array or object that can be converted to an array.
2708 axis : int or sequence of int, optional
2709 Axis along which the range is computed. The default is to
2710 compute the IQR for the entire array.
2711 rng : Two-element sequence containing floats in range of [0, 100], optional
2712 Percentiles over which to compute the range. Each must be
2713 between 0 and 100, inclusive. The default is the true IQR:
2714 `(25, 75)`. The order of the elements is not important.
2715 scale : scalar or str, optional
2716 The numerical value of scale will be divided out of the final
2717 result. The following string values are recognized:
2719 * 'raw' : No scaling, just return the raw IQR.
2720 **Deprecated!** Use `scale=1` instead.
2721 * 'normal' : Scale by
2722 :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`.
2724 The default is 1.0. The use of scale='raw' is deprecated.
2725 Array-like scale is also allowed, as long
2726 as it broadcasts correctly to the output such that
2727 ``out / scale`` is a valid operation. The output dimensions
2728 depend on the input array, `x`, the `axis` argument, and the
2729 `keepdims` flag.
2730 nan_policy : {'propagate', 'raise', 'omit'}, optional
2731 Defines how to handle when input contains nan.
2732 The following options are available (default is 'propagate'):
2734 * 'propagate': returns nan
2735 * 'raise': throws an error
2736 * 'omit': performs the calculations ignoring nan values
2737 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
2738 Specifies the interpolation method to use when the percentile
2739 boundaries lie between two data points `i` and `j`.
2740 The following options are available (default is 'linear'):
2742 * 'linear': `i + (j - i) * fraction`, where `fraction` is the
2743 fractional part of the index surrounded by `i` and `j`.
2744 * 'lower': `i`.
2745 * 'higher': `j`.
2746 * 'nearest': `i` or `j` whichever is nearest.
2747 * 'midpoint': `(i + j) / 2`.
2749 keepdims : bool, optional
2750 If this is set to `True`, the reduced axes are left in the
2751 result as dimensions with size one. With this option, the result
2752 will broadcast correctly against the original array `x`.
2754 Returns
2755 -------
2756 iqr : scalar or ndarray
2757 If ``axis=None``, a scalar is returned. If the input contains
2758 integers or floats of smaller precision than ``np.float64``, then the
2759 output data-type is ``np.float64``. Otherwise, the output data-type is
2760 the same as that of the input.
2762 See Also
2763 --------
2764 numpy.std, numpy.var
2766 Notes
2767 -----
2768 This function is heavily dependent on the version of `numpy` that is
2769 installed. Versions greater than 1.11.0b3 are highly recommended, as they
2770 include a number of enhancements and fixes to `numpy.percentile` and
2771 `numpy.nanpercentile` that affect the operation of this function. The
2772 following modifications apply:
2774 Below 1.10.0 : `nan_policy` is poorly defined.
2775 The default behavior of `numpy.percentile` is used for 'propagate'. This
2776 is a hybrid of 'omit' and 'propagate' that mostly yields a skewed
2777 version of 'omit' since NaNs are sorted to the end of the data. A
2778 warning is raised if there are NaNs in the data.
2779 Below 1.9.0: `numpy.nanpercentile` does not exist.
2780 This means that `numpy.percentile` is used regardless of `nan_policy`
2781 and a warning is issued. See previous item for a description of the
2782 behavior.
2783 Below 1.9.0: `keepdims` and `interpolation` are not supported.
2784 The keywords get ignored with a warning if supplied with non-default
2785 values. However, multiple axes are still supported.
2787 References
2788 ----------
2789 .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range
2790 .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale
2791 .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile
2793 Examples
2794 --------
2795 >>> from scipy.stats import iqr
2796 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
2797 >>> x
2798 array([[10, 7, 4],
2799 [ 3, 2, 1]])
2800 >>> iqr(x)
2801 4.0
2802 >>> iqr(x, axis=0)
2803 array([ 3.5, 2.5, 1.5])
2804 >>> iqr(x, axis=1)
2805 array([ 3., 1.])
2806 >>> iqr(x, axis=1, keepdims=True)
2807 array([[ 3.],
2808 [ 1.]])
2810 """
2811 x = asarray(x)
2813 # This check prevents percentile from raising an error later. Also, it is
2814 # consistent with `np.var` and `np.std`.
2815 if not x.size:
2816 return np.nan
2818 # An error may be raised here, so fail-fast, before doing lengthy
2819 # computations, even though `scale` is not used until later
2820 if isinstance(scale, str):
2821 scale_key = scale.lower()
2822 if scale_key not in _scale_conversions:
2823 raise ValueError("{0} not a valid scale for `iqr`".format(scale))
2824 if scale_key == 'raw':
2825 warnings.warn(
2826 "use of scale='raw' is deprecated, use scale=1.0 instead",
2827 np.VisibleDeprecationWarning
2828 )
2829 scale = _scale_conversions[scale_key]
2831 # Select the percentile function to use based on nans and policy
2832 contains_nan, nan_policy = _contains_nan(x, nan_policy)
2834 if contains_nan and nan_policy == 'omit':
2835 percentile_func = np.nanpercentile
2836 else:
2837 percentile_func = np.percentile
2839 if len(rng) != 2:
2840 raise TypeError("quantile range must be two element sequence")
2842 if np.isnan(rng).any():
2843 raise ValueError("range must not contain NaNs")
2845 rng = sorted(rng)
2846 pct = percentile_func(x, rng, axis=axis, interpolation=interpolation,
2847 keepdims=keepdims)
2848 out = np.subtract(pct[1], pct[0])
2850 if scale != 1.0:
2851 out /= scale
2853 return out
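# Illustrative sketch, not part of the original module: the IQR computed
# directly with `np.percentile`, and the 'normal' scale constant
# 2*sqrt(2)*erfinv(1/2) (about 1.349) stored in `_scale_conversions`
# above. The helper name is hypothetical; the data follow the docstring
# example.
def _iqr_by_hand_sketch():
    import numpy as np
    from scipy import special
    x = np.array([[10, 7, 4], [3, 2, 1]])
    q75, q25 = np.percentile(x, [75, 25])
    raw_iqr = q75 - q25                                      # 4.0
    normal_scale = special.erfinv(0.5) * 2.0 * np.sqrt(2.0)  # ~1.3490
    return raw_iqr, raw_iqr / normal_scale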
2856def _mad_1d(x, center, nan_policy):
2857 # Median absolute deviation for 1-d array x.
2858 # This is a helper function for `median_abs_deviation`; it assumes its
2859 # arguments have been validated already. In particular, x must be a
2860 # 1-d numpy array, center must be callable, and if nan_policy is not
2861 # 'propagate', it is assumed to be 'omit', because 'raise' is handled
2862 # in `median_abs_deviation`.
2863 # No warning is generated if x is empty or all nan.
2864 isnan = np.isnan(x)
2865 if isnan.any():
2866 if nan_policy == 'propagate':
2867 return np.nan
2868 x = x[~isnan]
2869 if x.size == 0:
2870 # MAD of an empty array is nan.
2871 return np.nan
2872 # Edge cases have been handled, so do the basic MAD calculation.
2873 med = center(x)
2874 mad = np.median(np.abs(x - med))
2875 return mad
2878def median_abs_deviation(x, axis=0, center=np.median, scale=1.0,
2879 nan_policy='propagate'):
2880 r"""
2881 Compute the median absolute deviation of the data along the given axis.
2883 The median absolute deviation (MAD, [1]_) computes the median over the
2884 absolute deviations from the median. It is a measure of dispersion
2885 similar to the standard deviation but more robust to outliers [2]_.
2887 The MAD of an empty array is ``np.nan``.
2889 .. versionadded:: 1.5.0
2891 Parameters
2892 ----------
2893 x : array_like
2894 Input array or object that can be converted to an array.
2895 axis : int or None, optional
2896 Axis along which the range is computed. Default is 0. If None, compute
2897 the MAD over the entire array.
2898 center : callable, optional
2899 A function that will return the central value. The default is to use
2900 np.median. Any user defined function used will need to have the
2901 function signature ``func(arr, axis)``.
2902 scale : scalar or str, optional
2903 The numerical value of scale will be divided out of the final
2904 result. The default is 1.0. The string "normal" is also accepted,
2905 and results in `scale` being the inverse of the standard normal
2906 quantile function at 0.75, which is approximately 0.67449.
2907 Array-like scale is also allowed, as long as it broadcasts correctly
2908 to the output such that ``out / scale`` is a valid operation. The
2909 output dimensions depend on the input array, `x`, and the `axis`
2910 argument.
2911 nan_policy : {'propagate', 'raise', 'omit'}, optional
2912 Defines how to handle when input contains nan.
2913 The following options are available (default is 'propagate'):
2915 * 'propagate': returns nan
2916 * 'raise': throws an error
2917 * 'omit': performs the calculations ignoring nan values
2919 Returns
2920 -------
2921 mad : scalar or ndarray
2922 If ``axis=None``, a scalar is returned. If the input contains
2923 integers or floats of smaller precision than ``np.float64``, then the
2924 output data-type is ``np.float64``. Otherwise, the output data-type is
2925 the same as that of the input.
2927 See Also
2928 --------
2929 numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
2930 scipy.stats.tstd, scipy.stats.tvar
2932 Notes
2933 -----
2934 The `center` argument only affects the calculation of the central value
2935 around which the MAD is calculated. That is, passing in ``center=np.mean``
2936 will calculate the MAD around the mean - it will not calculate the *mean*
2937 absolute deviation.
2939 The input array may contain `inf`, but if `center` returns `inf`, the
2940 corresponding MAD for that data will be `nan`.
2942 References
2943 ----------
2944 .. [1] "Median absolute deviation",
2945 https://en.wikipedia.org/wiki/Median_absolute_deviation
2946 .. [2] "Robust measures of scale",
2947 https://en.wikipedia.org/wiki/Robust_measures_of_scale
2949 Examples
2950 --------
2951 When comparing the behavior of `median_abs_deviation` with ``np.std``,
2952 the latter is affected when we change a single value of an array to have an
2953 outlier value while the MAD hardly changes:
2955 >>> from scipy import stats
2956 >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
2957 >>> x.std()
2958 0.9973906394005013
2959 >>> stats.median_abs_deviation(x)
2960 0.82832610097857
2961 >>> x[0] = 345.6
2962 >>> x.std()
2963 34.42304872314415
2964 >>> stats.median_abs_deviation(x)
2965 0.8323442311590675
2967 Axis handling example:
2969 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
2970 >>> x
2971 array([[10, 7, 4],
2972 [ 3, 2, 1]])
2973 >>> stats.median_abs_deviation(x)
2974 array([3.5, 2.5, 1.5])
2975 >>> stats.median_abs_deviation(x, axis=None)
2976 2.0
2978 Scale normal example:
2980 >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456)
2981 >>> stats.median_abs_deviation(x)
2982 1.3487398527041636
2983 >>> stats.median_abs_deviation(x, scale='normal')
2984 1.9996446978061115
2986 """
2987 if not callable(center):
2988 raise TypeError("The argument 'center' must be callable. The given "
2989 f"value {repr(center)} is not callable.")
2991 # An error may be raised here, so fail-fast, before doing lengthy
2992 # computations, even though `scale` is not used until later
2993 if isinstance(scale, str):
2994 if scale.lower() == 'normal':
2995 scale = 0.6744897501960817 # special.ndtri(0.75)
2996 else:
2997 raise ValueError(f"{scale} is not a valid scale value.")
2999 x = asarray(x)
3001 # Consistent with `np.var` and `np.std`.
3002 if not x.size:
3003 if axis is None:
3004 return np.nan
3005 nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis)
3006 if nan_shape == ():
3007 # Return nan, not array(nan)
3008 return np.nan
3009 return np.full(nan_shape, np.nan)
3011 contains_nan, nan_policy = _contains_nan(x, nan_policy)
3013 if contains_nan:
3014 if axis is None:
3015 mad = _mad_1d(x.ravel(), center, nan_policy)
3016 else:
3017 mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy)
3018 else:
3019 if axis is None:
3020 med = center(x, axis=None)
3021 mad = np.median(np.abs(x - med))
3022 else:
3023 # Wrap the call to center() in expand_dims() so it acts like
3024 # keepdims=True was used.
3025 med = np.expand_dims(center(x, axis=axis), axis)
3026 mad = np.median(np.abs(x - med), axis=axis)
3028 return mad / scale
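# Illustrative sketch, not part of the original module: the unscaled MAD
# computed directly, plus the scale='normal' convention above, which
# divides by the 0.75 quantile of the standard normal (~0.6745) so the
# result is comparable to a standard deviation for Gaussian data. The
# helper name is hypothetical; the data follow the docstring example.
def _mad_by_hand_sketch():
    import numpy as np
    x = np.array([[10., 7., 4.], [3., 2., 1.]])
    med = np.median(x)                     # median over the whole array
    mad = np.median(np.abs(x - med))       # 2.0, as with axis=None above
    return mad, mad / 0.6744897501960817   # unscaled and 'normal'-scaled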
3031# Keep the top newline so that the message does not show up on the stats page
3032_median_absolute_deviation_deprec_msg = """
3033To preserve the existing default behavior, use
3034`scipy.stats.median_abs_deviation(..., scale=1/1.4826)`.
3035The value 1.4826 is not numerically precise for scaling
3036with a normal distribution. For a numerically precise value, use
3037`scipy.stats.median_abs_deviation(..., scale='normal')`.
3038"""
3041# Due to numpy/gh-16349 we need to unindent the entire docstring
3042@np.deprecate(old_name='median_absolute_deviation',
3043 new_name='median_abs_deviation',
3044 message=_median_absolute_deviation_deprec_msg)
3045def median_absolute_deviation(x, axis=0, center=np.median, scale=1.4826,
3046 nan_policy='propagate'):
3047 r"""
3048Compute the median absolute deviation of the data along the given axis.
3050The median absolute deviation (MAD, [1]_) computes the median over the
3051absolute deviations from the median. It is a measure of dispersion
3052similar to the standard deviation but more robust to outliers [2]_.
3054The MAD of an empty array is ``np.nan``.
3056.. versionadded:: 1.3.0
3058Parameters
3059----------
3060x : array_like
3061 Input array or object that can be converted to an array.
3062axis : int or None, optional
3063 Axis along which the range is computed. Default is 0. If None, compute
3064 the MAD over the entire array.
3065center : callable, optional
3066 A function that will return the central value. The default is to use
3067 np.median. Any user defined function used will need to have the function
3068 signature ``func(arr, axis)``.
3069scale : scalar or str, optional
3070 The scaling factor applied to the MAD. The default scale (1.4826)
3071 ensures consistency with the standard deviation for normally distributed
3072 data.
3073nan_policy : {'propagate', 'raise', 'omit'}, optional
3074 Defines how to handle when input contains nan.
3075 The following options are available (default is 'propagate'):
3077 * 'propagate': returns nan
3078 * 'raise': throws an error
3079 * 'omit': performs the calculations ignoring nan values
3081Returns
3082-------
3083mad : scalar or ndarray
3084 If ``axis=None``, a scalar is returned. If the input contains
3085 integers or floats of smaller precision than ``np.float64``, then the
3086 output data-type is ``np.float64``. Otherwise, the output data-type is
3087 the same as that of the input.
3089See Also
3090--------
3091numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
3092scipy.stats.tstd, scipy.stats.tvar
3094Notes
3095-----
3096The `center` argument only affects the calculation of the central value
3097around which the MAD is calculated. That is, passing in ``center=np.mean``
3098will calculate the MAD around the mean - it will not calculate the *mean*
3099absolute deviation.
3101References
3102----------
3103.. [1] "Median absolute deviation",
3104 https://en.wikipedia.org/wiki/Median_absolute_deviation
3105.. [2] "Robust measures of scale",
3106 https://en.wikipedia.org/wiki/Robust_measures_of_scale
3108Examples
3109--------
3110When comparing the behavior of `median_absolute_deviation` with ``np.std``,
3111the latter is affected when we change a single value of an array to have an
3112outlier value while the MAD hardly changes:
3114>>> from scipy import stats
3115>>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
3116>>> x.std()
31170.9973906394005013
3118>>> stats.median_absolute_deviation(x)
31191.2280762773108278
3120>>> x[0] = 345.6
3121>>> x.std()
312234.42304872314415
3123>>> stats.median_absolute_deviation(x)
31241.2340335571164334
3126Axis handling example:
3128>>> x = np.array([[10, 7, 4], [3, 2, 1]])
3129>>> x
3130array([[10, 7, 4],
3131 [ 3, 2, 1]])
3132>>> stats.median_absolute_deviation(x)
3133array([5.1891, 3.7065, 2.2239])
3134>>> stats.median_absolute_deviation(x, axis=None)
31352.9652
3136"""
3137 if isinstance(scale, str):
3138 if scale.lower() == 'raw':
3139 warnings.warn(
3140 "use of scale='raw' is deprecated, use scale=1.0 instead",
3141 np.VisibleDeprecationWarning
3142 )
3143 scale = 1.0
3145 if not isinstance(scale, str):
3146 scale = 1 / scale
3148 return median_abs_deviation(x, axis=axis, center=center, scale=scale,
3149 nan_policy=nan_policy)
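# Illustrative sketch, not part of the original module: how the deprecated
# wrapper above maps its multiplicative `scale` onto the divisive `scale`
# of `median_abs_deviation`; the old default of 1.4826 corresponds to
# `scale=1/1.4826` in the new function. The helper name is hypothetical.
def _mad_deprecation_mapping_sketch():
    import numpy as np
    x = np.array([10., 7., 4., 3., 2., 1.])
    unscaled = np.median(np.abs(x - np.median(x)))   # 2.0
    old_result = unscaled * 1.4826                   # deprecated behaviour
    new_equivalent = unscaled / (1 / 1.4826)         # same value, 2.9652
    return old_result, new_equivalent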
3151#####################################
3152# TRIMMING FUNCTIONS #
3153#####################################
3156SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper'))
3159def sigmaclip(a, low=4., high=4.):
3160 """
3161 Perform iterative sigma-clipping of array elements.
3163 Starting from the full sample, all elements outside the critical range are
3164 removed, i.e. all elements of the input array (denoted ``c`` below) that
3165 satisfy either of the following conditions::
3167 c < mean(c) - std(c)*low
3168 c > mean(c) + std(c)*high
3170 The iteration continues with the updated sample until no
3171 elements are outside the (updated) range.
3173 Parameters
3174 ----------
3175 a : array_like
3176 Data array, will be raveled if not 1-D.
3177 low : float, optional
3178 Lower bound factor of sigma clipping. Default is 4.
3179 high : float, optional
3180 Upper bound factor of sigma clipping. Default is 4.
3182 Returns
3183 -------
3184 clipped : ndarray
3185 Input array with clipped elements removed.
3186 lower : float
3187 Lower threshold value used for clipping.
3188 upper : float
3189 Upper threshold value used for clipping.
3191 Examples
3192 --------
3193 >>> from scipy.stats import sigmaclip
3194 >>> a = np.concatenate((np.linspace(9.5, 10.5, 31),
3195 ... np.linspace(0, 20, 5)))
3196 >>> fact = 1.5
3197 >>> c, low, upp = sigmaclip(a, fact, fact)
3198 >>> c
3199 array([ 9.96666667, 10. , 10.03333333, 10. ])
3200 >>> c.var(), c.std()
3201 (0.00055555555555555165, 0.023570226039551501)
3202 >>> low, c.mean() - fact*c.std(), c.min()
3203 (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
3204 >>> upp, c.mean() + fact*c.std(), c.max()
3205 (10.035355339059327, 10.035355339059327, 10.033333333333333)
3207 >>> a = np.concatenate((np.linspace(9.5, 10.5, 11),
3208 ... np.linspace(-100, -50, 3)))
3209 >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
3210 >>> (c == np.linspace(9.5, 10.5, 11)).all()
3211 True
3213 """
3214 c = np.asarray(a).ravel()
3215 delta = 1
3216 while delta:
3217 c_std = c.std()
3218 c_mean = c.mean()
3219 size = c.size
3220 critlower = c_mean - c_std * low
3221 critupper = c_mean + c_std * high
3222 c = c[(c >= critlower) & (c <= critupper)]
3223 delta = size - c.size
3225 return SigmaclipResult(c, critlower, critupper)
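# Illustrative sketch, not part of the original module: a single pass of
# the clipping loop above; `sigmaclip` repeats this until no element is
# removed. The helper name is hypothetical.
def _sigmaclip_one_pass_sketch():
    import numpy as np
    c = np.concatenate((np.linspace(9.5, 10.5, 31), np.linspace(0, 20, 5)))
    low = high = 1.5
    critlower = c.mean() - c.std() * low
    critupper = c.mean() + c.std() * high
    kept = c[(c >= critlower) & (c <= critupper)]
    return c.size - kept.size   # number of elements clipped in this pass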
3228def trimboth(a, proportiontocut, axis=0):
3229 """
3230 Slice off a proportion of items from both ends of an array.
3232 Slice off the passed proportion of items from both ends of the passed
3233 array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
3234 rightmost 10% of scores). The trimmed values are the lowest and
3235 highest ones.
3236 Slice off less if proportion results in a non-integer slice index (i.e.
3237 conservatively slices off `proportiontocut`).
3239 Parameters
3240 ----------
3241 a : array_like
3242 Data to trim.
3243 proportiontocut : float
3244 Proportion (in range 0-1) of total data set to trim of each end.
3245 axis : int or None, optional
3246 Axis along which to trim data. Default is 0. If None, compute over
3247 the whole array `a`.
3249 Returns
3250 -------
3251 out : ndarray
3252 Trimmed version of array `a`. The order of the trimmed content
3253 is undefined.
3255 See Also
3256 --------
3257 trim_mean
3259 Examples
3260 --------
3261 >>> from scipy import stats
3262 >>> a = np.arange(20)
3263 >>> b = stats.trimboth(a, 0.1)
3264 >>> b.shape
3265 (16,)
3267 """
3268 a = np.asarray(a)
3270 if a.size == 0:
3271 return a
3273 if axis is None:
3274 a = a.ravel()
3275 axis = 0
3277 nobs = a.shape[axis]
3278 lowercut = int(proportiontocut * nobs)
3279 uppercut = nobs - lowercut
3280 if (lowercut >= uppercut):
3281 raise ValueError("Proportion too big.")
3283 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3285 sl = [slice(None)] * atmp.ndim
3286 sl[axis] = slice(lowercut, uppercut)
3287 return atmp[tuple(sl)]
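# Illustrative sketch, not part of the original module: the cut-index
# arithmetic above for a 1-D input; with 20 observations and
# proportiontocut=0.1, int(0.1 * 20) = 2 values are dropped from each
# end, leaving 16. The helper name is hypothetical.
def _trimboth_indices_sketch():
    import numpy as np
    a = np.arange(20)
    proportiontocut = 0.1
    nobs = a.shape[0]
    lowercut = int(proportiontocut * nobs)       # 2
    uppercut = nobs - lowercut                   # 18
    return np.sort(a)[lowercut:uppercut].shape   # (16,), as in the example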
3290def trim1(a, proportiontocut, tail='right', axis=0):
3291 """
3292 Slice off a proportion from ONE end of the passed array distribution.
3294 If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
3295 10% of scores. The lowest or highest values are trimmed (depending on
3296 the tail).
3297 Slice off less if proportion results in a non-integer slice index
3298 (i.e. conservatively slices off `proportiontocut` ).
3300 Parameters
3301 ----------
3302 a : array_like
3303 Input array.
3304 proportiontocut : float
3305 Fraction to cut off of 'left' or 'right' of distribution.
3306 tail : {'left', 'right'}, optional
3307 Defaults to 'right'.
3308 axis : int or None, optional
3309 Axis along which to trim data. Default is 0. If None, compute over
3310 the whole array `a`.
3312 Returns
3313 -------
3314 trim1 : ndarray
3315 Trimmed version of array `a`. The order of the trimmed content is
3316 undefined.
3318 """
3319 a = np.asarray(a)
3320 if axis is None:
3321 a = a.ravel()
3322 axis = 0
3324 nobs = a.shape[axis]
3326 # avoid possible corner case
3327 if proportiontocut >= 1:
3328 return []
3330 if tail.lower() == 'right':
3331 lowercut = 0
3332 uppercut = nobs - int(proportiontocut * nobs)
3334 elif tail.lower() == 'left':
3335 lowercut = int(proportiontocut * nobs)
3336 uppercut = nobs
3338 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3340 return atmp[lowercut:uppercut]
3343def trim_mean(a, proportiontocut, axis=0):
3344 """
3345 Return mean of array after trimming distribution from both tails.
3347 If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
3348 scores. The input is sorted before slicing. Slices off less if proportion
3349 results in a non-integer slice index (i.e., conservatively slices off
3350 `proportiontocut` ).
3352 Parameters
3353 ----------
3354 a : array_like
3355 Input array.
3356 proportiontocut : float
3357 Fraction to cut off of both tails of the distribution.
3358 axis : int or None, optional
3359 Axis along which the trimmed means are computed. Default is 0.
3360 If None, compute over the whole array `a`.
3362 Returns
3363 -------
3364 trim_mean : ndarray
3365 Mean of trimmed array.
3367 See Also
3368 --------
3369 trimboth
3370 tmean : Compute the trimmed mean ignoring values outside given `limits`.
3372 Examples
3373 --------
3374 >>> from scipy import stats
3375 >>> x = np.arange(20)
3376 >>> stats.trim_mean(x, 0.1)
3377 9.5
3378 >>> x2 = x.reshape(5, 4)
3379 >>> x2
3380 array([[ 0, 1, 2, 3],
3381 [ 4, 5, 6, 7],
3382 [ 8, 9, 10, 11],
3383 [12, 13, 14, 15],
3384 [16, 17, 18, 19]])
3385 >>> stats.trim_mean(x2, 0.25)
3386 array([ 8., 9., 10., 11.])
3387 >>> stats.trim_mean(x2, 0.25, axis=1)
3388 array([ 1.5, 5.5, 9.5, 13.5, 17.5])
3390 """
3391 a = np.asarray(a)
3393 if a.size == 0:
3394 return np.nan
3396 if axis is None:
3397 a = a.ravel()
3398 axis = 0
3400 nobs = a.shape[axis]
3401 lowercut = int(proportiontocut * nobs)
3402 uppercut = nobs - lowercut
3403 if (lowercut > uppercut):
3404 raise ValueError("Proportion too big.")
3406 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3408 sl = [slice(None)] * atmp.ndim
3409 sl[axis] = slice(lowercut, uppercut)
3410 return np.mean(atmp[tuple(sl)], axis=axis)
3413F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
3416class F_onewayConstantInputWarning(RuntimeWarning):
3417 """
3418 Warning generated by `f_oneway` when an input is constant, e.g.
3419 each of the samples provided is a constant array.
3420 """
3422 def __init__(self, msg=None):
3423 if msg is None:
3424 msg = ("Each of the input arrays is constant;"
3425 "the F statistic is not defined or infinite")
3426 self.args = (msg,)
3429class F_onewayBadInputSizesWarning(RuntimeWarning):
3430 """
3431 Warning generated by `f_oneway` when an input has length 0,
3432 or if all the inputs have length 1.
3433 """
3434 pass
3437def _create_f_oneway_nan_result(shape, axis):
3438 """
3439 This is a helper function for f_oneway for creating the return values
3440 in certain degenerate conditions. It creates return values that are
3441 all nan with the appropriate shape for the given `shape` and `axis`.
3442 """
3443 axis = np.core.multiarray.normalize_axis_index(axis, len(shape))
3444 shp = shape[:axis] + shape[axis+1:]
3445 if shp == ():
3446 f = np.nan
3447 prob = np.nan
3448 else:
3449 f = np.full(shp, fill_value=np.nan)
3450 prob = f.copy()
3451 return F_onewayResult(f, prob)
3454def _first(arr, axis):
3455 """
3456 Return arr[..., 0:1, ...] where 0:1 is in the `axis` position.
3457 """
3458 # When the oldest version of numpy supported by scipy is at
3459 # least 1.15.0, this function can be replaced by np.take_along_axis
3460 # (with appropriately configured arguments).
3461 axis = np.core.multiarray.normalize_axis_index(axis, arr.ndim)
3462 return arr[tuple(slice(None) if k != axis else slice(0, 1)
3463 for k in range(arr.ndim))]
3466def f_oneway(*args, axis=0):
3467 """
3468 Perform one-way ANOVA.
3470 The one-way ANOVA tests the null hypothesis that two or more groups have
3471 the same population mean. The test is applied to samples from two or
3472 more groups, possibly with differing sizes.
3474 Parameters
3475 ----------
3476 sample1, sample2, ... : array_like
3477 The sample measurements for each group. There must be at least
3478 two arguments. If the arrays are multidimensional, then all the
3479 dimensions of the array must be the same except for `axis`.
3480 axis : int, optional
3481 Axis of the input arrays along which the test is applied.
3482 Default is 0.
3484 Returns
3485 -------
3486 statistic : float
3487 The computed F statistic of the test.
3488 pvalue : float
3489 The associated p-value from the F distribution.
3491 Warns
3492 -----
3493 F_onewayConstantInputWarning
3494 Raised if each of the input arrays is a constant array.
3495 In this case the F statistic is either infinite or isn't defined,
3496 so ``np.inf`` or ``np.nan`` is returned.
3498 F_onewayBadInputSizesWarning
3499 Raised if the length of any input array is 0, or if all the input
3500 arrays have length 1. ``np.nan`` is returned for the F statistic
3501 and the p-value in these cases.
3503 Notes
3504 -----
3505 The ANOVA test has important assumptions that must be satisfied in order
3506 for the associated p-value to be valid.
3508 1. The samples are independent.
3509 2. Each sample is from a normally distributed population.
3510 3. The population standard deviations of the groups are all equal. This
3511 property is known as homoscedasticity.
3513 If these assumptions are not true for a given set of data, it may still
3514 be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`)
3515 although with some loss of power.
3517 The length of each group must be at least one, and there must be at
3518 least one group with length greater than one. If these conditions
3519 are not satisfied, a warning is generated and (``np.nan``, ``np.nan``)
3520 is returned.
3522 If each group contains constant values, and there exist at least two
3523 groups with different values, the function generates a warning and
3524 returns (``np.inf``, 0).
3526 If all values in all groups are the same, the function generates a warning
3527 and returns (``np.nan``, ``np.nan``).
3529 The algorithm is from Heiman [2]_, pp.394-7.
3531 References
3532 ----------
3533 .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics",
3534 Chapter 14, 2014, http://vassarstats.net/textbook/
3536 .. [2] G.W. Heiman, "Understanding research methods and statistics: An
3537 integrated introduction for psychology", Houghton, Mifflin and
3538 Company, 2001.
3540 .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA.
3541 http://www.biostathandbook.com/onewayanova.html
3543 Examples
3544 --------
3545 >>> from scipy.stats import f_oneway
3547 Here are some data [3]_ on a shell measurement (the length of the anterior
3548 adductor muscle scar, standardized by dividing by length) in the mussel
3549 Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
3550 Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a
3551 much larger data set used in McDonald et al. (1991).
3553 >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
3554 ... 0.0659, 0.0923, 0.0836]
3555 >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
3556 ... 0.0725]
3557 >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
3558 >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
3559 ... 0.0689]
3560 >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
3561 >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne)
3562 F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)
3564 `f_oneway` accepts multidimensional input arrays. When the inputs
3565 are multidimensional and `axis` is not given, the test is performed
3566 along the first axis of the input arrays. For the following data, the
3567 test is performed three times, once for each column.
3569 >>> a = np.array([[9.87, 9.03, 6.81],
3570 ... [7.18, 8.35, 7.00],
3571 ... [8.39, 7.58, 7.68],
3572 ... [7.45, 6.33, 9.35],
3573 ... [6.41, 7.10, 9.33],
3574 ... [8.00, 8.24, 8.44]])
3575 >>> b = np.array([[6.35, 7.30, 7.16],
3576 ... [6.65, 6.68, 7.63],
3577 ... [5.72, 7.73, 6.72],
3578 ... [7.01, 9.19, 7.41],
3579 ... [7.75, 7.87, 8.30],
3580 ... [6.90, 7.97, 6.97]])
3581 >>> c = np.array([[3.31, 8.77, 1.01],
3582 ... [8.25, 3.24, 3.62],
3583 ... [6.32, 8.81, 5.19],
3584 ... [7.48, 8.83, 8.91],
3585 ... [8.59, 6.01, 6.07],
3586 ... [3.07, 9.72, 7.48]])
3587 >>> F, p = f_oneway(a, b, c)
3588 >>> F
3589 array([1.75676344, 0.03701228, 3.76439349])
3590 >>> p
3591 array([0.20630784, 0.96375203, 0.04733157])
3593 """
3594 if len(args) < 2:
3595 raise TypeError(f'at least two inputs are required; got {len(args)}.')
3597 args = [np.asarray(arg, dtype=float) for arg in args]
3599 # ANOVA on N groups, each in its own array
3600 num_groups = len(args)
3602 # We haven't explicitly validated axis, but if it is bad, this call of
3603 # np.concatenate will raise np.AxisError. The call will raise ValueError
3604 # if the dimensions of all the arrays, except the axis dimension, are not
3605 # the same.
3606 alldata = np.concatenate(args, axis=axis)
3607 bign = alldata.shape[axis]
3609 # Check this after forming alldata, so shape errors are detected
3610 # and reported before checking for 0 length inputs.
3611 if any(arg.shape[axis] == 0 for arg in args):
3612 warnings.warn(F_onewayBadInputSizesWarning('at least one input '
3613 'has length 0'))
3614 return _create_f_oneway_nan_result(alldata.shape, axis)
3616 # Must have at least one group with length greater than 1.
3617 if all(arg.shape[axis] == 1 for arg in args):
3618 msg = ('all input arrays have length 1. f_oneway requires that at '
3619 'least one input has length greater than 1.')
3620 warnings.warn(F_onewayBadInputSizesWarning(msg))
3621 return _create_f_oneway_nan_result(alldata.shape, axis)
3623 # Check if the values within each group are constant, and if the common
3624 # value in at least one group is different from that in another group.
3625 # Based on https://github.com/scipy/scipy/issues/11669
3627 # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ...,
3628 # then is_const is a boolean array with shape (num_groups, ...).
3629 # It is True if the groups along the axis slice are each constant.
3630 # In the typical case where each input array is 1-d, is_const is a
3631 # 1-d array with length num_groups.
3632 is_const = np.concatenate([(_first(a, axis) == a).all(axis=axis,
3633 keepdims=True)
3634 for a in args], axis=axis)
3636 # all_const is a boolean array with shape (...) (see previous comment).
3637 # It is True if the values within each group along the axis slice are
3638 # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]).
3639 all_const = is_const.all(axis=axis)
3640 if all_const.any():
3641 warnings.warn(F_onewayConstantInputWarning())
3643 # all_same_const is True if all the values in the groups along the axis=0
3644 # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]).
3645 all_same_const = (_first(alldata, axis) == alldata).all(axis=axis)
3647 # Determine the mean of the data, and subtract that from all inputs to a
3648 # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
3649 # to a shift in location, and centering all data around zero vastly
3650 # improves numerical stability.
3651 offset = alldata.mean(axis=axis, keepdims=True)
3652 alldata -= offset
3654 normalized_ss = _square_of_sums(alldata, axis=axis) / bign
3656 sstot = _sum_of_squares(alldata, axis=axis) - normalized_ss
3658 ssbn = 0
3659 for a in args:
3660 ssbn += _square_of_sums(a - offset, axis=axis) / a.shape[axis]
3662 # Naming: variables ending in bn/b are for "between treatments", wn/w are
3663 # for "within treatments"
3664 ssbn -= normalized_ss
3665 sswn = sstot - ssbn
3666 dfbn = num_groups - 1
3667 dfwn = bign - num_groups
3668 msb = ssbn / dfbn
3669 msw = sswn / dfwn
3670 with np.errstate(divide='ignore', invalid='ignore'):
3671 f = msb / msw
3673 prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf
3675 # Fix any f values that should be inf or nan because the corresponding
3676 # inputs were constant.
3677 if np.isscalar(f):
3678 if all_same_const:
3679 f = np.nan
3680 prob = np.nan
3681 elif all_const:
3682 f = np.inf
3683 prob = 0.0
3684 else:
3685 f[all_const] = np.inf
3686 prob[all_const] = 0.0
3687 f[all_same_const] = np.nan
3688 prob[all_same_const] = np.nan
3690 return F_onewayResult(f, prob)
3693class PearsonRConstantInputWarning(RuntimeWarning):
3694 """Warning generated by `pearsonr` when an input is constant."""
3696 def __init__(self, msg=None):
3697 if msg is None:
3698 msg = ("An input array is constant; the correlation coefficent "
3699 "is not defined.")
3700 self.args = (msg,)
3703class PearsonRNearConstantInputWarning(RuntimeWarning):
3704 """Warning generated by `pearsonr` when an input is nearly constant."""
3706 def __init__(self, msg=None):
3707 if msg is None:
3708 msg = ("An input array is nearly constant; the computed "
3709 "correlation coefficent may be inaccurate.")
3710 self.args = (msg,)
3713def pearsonr(x, y):
3714 r"""
3715 Pearson correlation coefficient and p-value for testing non-correlation.
3717 The Pearson correlation coefficient [1]_ measures the linear relationship
3718 between two datasets. The calculation of the p-value relies on the
3719 assumption that each dataset is normally distributed. (See Kowalski [3]_
3720 for a discussion of the effects of non-normality of the input on the
3721 distribution of the correlation coefficient.) Like other correlation
3722 coefficients, this one varies between -1 and +1 with 0 implying no
3723 correlation. Correlations of -1 or +1 imply an exact linear relationship.
3724 Positive correlations imply that as x increases, so does y. Negative
3725 correlations imply that as x increases, y decreases.
3727 The p-value roughly indicates the probability of an uncorrelated system
3728 producing datasets that have a Pearson correlation at least as extreme
3729 as the one computed from these datasets.
3731 Parameters
3732 ----------
3733 x : (N,) array_like
3734 Input array.
3735 y : (N,) array_like
3736 Input array.
3738 Returns
3739 -------
3740 r : float
3741 Pearson's correlation coefficient.
3742 p-value : float
3743 Two-tailed p-value.
3745 Warns
3746 -----
3747 PearsonRConstantInputWarning
3748 Raised if an input is a constant array. The correlation coefficient
3749 is not defined in this case, so ``np.nan`` is returned.
3751 PearsonRNearConstantInputWarning
3752 Raised if an input is "nearly" constant. The array ``x`` is considered
3753 nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
3754 Numerical errors in the calculation ``x - mean(x)`` in this case might
3755 result in an inaccurate calculation of r.
3757 See Also
3758 --------
3759 spearmanr : Spearman rank-order correlation coefficient.
3760 kendalltau : Kendall's tau, a correlation measure for ordinal data.
3762 Notes
3763 -----
3764 The correlation coefficient is calculated as follows:
3766 .. math::
3768 r = \frac{\sum (x - m_x) (y - m_y)}
3769 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
3771 where :math:`m_x` is the mean of the vector :math:`x` and :math:`m_y` is
3772 the mean of the vector :math:`y`.
3774 Under the assumption that x and y are drawn from independent normal
3775 distributions (so the population correlation coefficient is 0), the
3776 probability density function of the sample correlation coefficient r
3777 is ([1]_, [2]_)::
3779 (1 - r**2)**(n/2 - 2)
3780 f(r) = ---------------------
3781 B(1/2, n/2 - 1)
3783 where n is the number of samples, and B is the beta function. This
3784 is sometimes referred to as the exact distribution of r. This is
3785 the distribution that is used in `pearsonr` to compute the p-value.
3786 The distribution is a beta distribution on the interval [-1, 1],
3787 with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
3788 implementation of the beta distribution, the distribution of r is::
3790 dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
3792 The p-value returned by `pearsonr` is a two-sided p-value. For a
3793 given sample with correlation coefficient r, the p-value is
3794 the probability that abs(r') of a random sample x' and y' drawn from
3795 the population with zero correlation would be greater than or equal
3796 to abs(r). In terms of the object ``dist`` shown above, the p-value
3797 for a given r and length n can be computed as::
3799 p = 2*dist.cdf(-abs(r))
3801 When n is 2, the above continuous distribution is not well-defined.
3802 One can interpret the limit of the beta distribution as the shape
3803 parameters a and b approach a = b = 0 as a discrete distribution with
3804 equal probability masses at r = 1 and r = -1. More directly, one
3805 can observe that, given the data x = [x1, x2] and y = [y1, y2], and
3806 assuming x1 != x2 and y1 != y2, the only possible values for r are 1
3807 and -1. Because abs(r') for any sample x' and y' with length 2 will
3808 be 1, the two-sided p-value for a sample of length 2 is always 1.
3810 References
3811 ----------
3812 .. [1] "Pearson correlation coefficient", Wikipedia,
3813 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
3814 .. [2] Student, "Probable error of a correlation coefficient",
3815 Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
3816 .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
3817 of the Sample Product-Moment Correlation Coefficient"
3818 Journal of the Royal Statistical Society. Series C (Applied
3819 Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
3821 Examples
3822 --------
3823 >>> from scipy import stats
3824 >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
3825 >>> b = np.arange(7)
3826 >>> stats.pearsonr(a, b)
3827 (0.8660254037844386, 0.011724811003954649)
3829 >>> stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
3830 (-0.7426106572325057, 0.1505558088534455)
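 A minimal check (not part of the original examples) of the beta-distribution
 formula for the p-value described in the Notes above:

 >>> x = [1, 2, 3, 4, 5]
 >>> y = [10, 9, 2.5, 6, 4]
 >>> r, p = stats.pearsonr(x, y)
 >>> n = len(x)
 >>> dist = stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
 >>> np.allclose(2*dist.cdf(-abs(r)), p)
 True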
3832 """
3833 n = len(x)
3834 if n != len(y):
3835 raise ValueError('x and y must have the same length.')
3837 if n < 2:
3838 raise ValueError('x and y must have length at least 2.')
3840 x = np.asarray(x)
3841 y = np.asarray(y)
3843 # If an input is constant, the correlation coefficient is not defined.
3844 if (x == x[0]).all() or (y == y[0]).all():
3845 warnings.warn(PearsonRConstantInputWarning())
3846 return np.nan, np.nan
3848 # dtype is the data type for the calculations. This expression ensures
3849 # that the data type is at least 64 bit floating point. It might have
3850 # more precision if the input is, for example, np.longdouble.
3851 dtype = type(1.0 + x[0] + y[0])
3853 if n == 2:
3854 return dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0])), 1.0
3856 xmean = x.mean(dtype=dtype)
3857 ymean = y.mean(dtype=dtype)
3859 # By using `astype(dtype)`, we ensure that the intermediate calculations
3860 # use at least 64 bit floating point.
3861 xm = x.astype(dtype) - xmean
3862 ym = y.astype(dtype) - ymean
3864 # Unlike np.linalg.norm or the expression sqrt((xm*xm).sum()),
3865 # scipy.linalg.norm(xm) does not overflow if xm is, for example,
3866 # [-5e210, 5e210, 3e200, -3e200]
3867 normxm = linalg.norm(xm)
3868 normym = linalg.norm(ym)
3870 threshold = 1e-13
3871 if normxm < threshold*abs(xmean) or normym < threshold*abs(ymean):
3872 # If all the values in x (likewise y) are very close to the mean,
3873 # the loss of precision that occurs in the subtraction xm = x - xmean
3874 # might result in large errors in r.
3875 warnings.warn(PearsonRNearConstantInputWarning())
3877 r = np.dot(xm/normxm, ym/normym)
3879 # Presumably, if abs(r) > 1, then it is only some small artifact of
3880 # floating point arithmetic.
3881 r = max(min(r, 1.0), -1.0)
3883 # As explained in the docstring, the p-value can be computed as
3884 # p = 2*dist.cdf(-abs(r))
3885 # where dist is the beta distribution on [-1, 1] with shape parameters
3886 # a = b = n/2 - 1. `special.btdtr` is the CDF for the beta distribution
3887 # on [0, 1]. To use it, we make the transformation x = (r + 1)/2; the
3888 # shape parameters do not change. Then -abs(r) used in `cdf(-abs(r))`
3889 # becomes x = (-abs(r) + 1)/2 = 0.5*(1 - abs(r)). (r is cast to float64
3890 # to avoid a TypeError raised by btdtr when r is higher precision.)
3891 ab = n/2 - 1
3892 prob = 2*special.btdtr(ab, ab, 0.5*(1 - abs(np.float64(r))))
3894 return r, prob
3897def fisher_exact(table, alternative='two-sided'):
3898 """
3899 Perform a Fisher exact test on a 2x2 contingency table.
3901 Parameters
3902 ----------
3903 table : array_like of ints
3904 A 2x2 contingency table. Elements should be non-negative integers.
3905 alternative : {'two-sided', 'less', 'greater'}, optional
3906 Defines the alternative hypothesis.
3907 The following options are available (default is 'two-sided'):
3909 * 'two-sided'
3910 * 'less': one-sided
3911 * 'greater': one-sided
3913 Returns
3914 -------
3915 oddsratio : float
3916 This is the prior odds ratio and not a posterior estimate.
3917 p_value : float
3918 P-value, the probability of obtaining a distribution at least as
3919 extreme as the one that was actually observed, assuming that the
3920 null hypothesis is true.
3922 See Also
3923 --------
3924 chi2_contingency : Chi-square test of independence of variables in a
3925 contingency table.
3927 Notes
3928 -----
3929 The calculated odds ratio is different from the one R uses. This scipy
3930 implementation returns the (more common) "unconditional Maximum
3931 Likelihood Estimate", while R uses the "conditional Maximum Likelihood
3932 Estimate".
3934 For tables with large numbers, the (inexact) chi-square test implemented
3935 in the function `chi2_contingency` can also be used.
3937 Examples
3938 --------
3939 Say we spend a few days counting whales and sharks in the Atlantic and
3940 Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the
3941 Indian ocean 2 whales and 5 sharks. Then our contingency table is::
3943 Atlantic Indian
3944 whales 8 2
3945 sharks 1 5
3947 We use this table to find the p-value:
3949 >>> import scipy.stats as stats
3950 >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]])
3951 >>> pvalue
3952 0.0349...
3954 The probability that we would observe this or an even more imbalanced ratio
3955 by chance is about 3.5%. A commonly used significance level is 5%--if we
3956 adopt that, we can therefore conclude that our observed imbalance is
3957 statistically significant; whales prefer the Atlantic while sharks prefer
3958 the Indian ocean.
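 The one-sided 'less' p-value is the hypergeometric CDF evaluated at the
 top-left cell; a minimal check (not part of the original examples), using the
 row sums 10 and 6 and the first-column sum 9 of the table above:

 >>> from scipy.stats import hypergeom
 >>> _, p_less = stats.fisher_exact([[8, 2], [1, 5]], alternative='less')
 >>> np.allclose(p_less, hypergeom.cdf(8, 16, 10, 9))
 True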
3960 """
3961 hypergeom = distributions.hypergeom
3962 c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm
3963 if not c.shape == (2, 2):
3964 raise ValueError("The input `table` must be of shape (2, 2).")
3966 if np.any(c < 0):
3967 raise ValueError("All values in `table` must be nonnegative.")
3969 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
3970 # If both values in a row or column are zero, the p-value is 1 and
3971 # the odds ratio is NaN.
3972 return np.nan, 1.0
3974 if c[1, 0] > 0 and c[0, 1] > 0:
3975 oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1])
3976 else:
3977 oddsratio = np.inf
3979 n1 = c[0, 0] + c[0, 1]
3980 n2 = c[1, 0] + c[1, 1]
3981 n = c[0, 0] + c[1, 0]
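# Under the null hypothesis, c[0, 0] is hypergeometric: a population of size
# n1 + n2 contains n1 marked items, from which n are drawn. This is the
# (M, n, N) parametrization used in the hypergeom calls below.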
3983 def binary_search(n, n1, n2, side):
3984 """Binary search for where to begin halves in two-sided test."""
3985 if side == "upper":
3986 minval = mode
3987 maxval = n
3988 else:
3989 minval = 0
3990 maxval = mode
3991 guess = -1
3992 while maxval - minval > 1:
3993 if maxval == minval + 1 and guess == minval:
3994 guess = maxval
3995 else:
3996 guess = (maxval + minval) // 2
3997 pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
3998 if side == "upper":
3999 ng = guess - 1
4000 else:
4001 ng = guess + 1
4002 if pguess <= pexact < hypergeom.pmf(ng, n1 + n2, n1, n):
4003 break
4004 elif pguess < pexact:
4005 maxval = guess
4006 else:
4007 minval = guess
4008 if guess == -1:
4009 guess = minval
4010 if side == "upper":
4011 while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
4012 guess -= 1
4013 while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
4014 guess += 1
4015 else:
4016 while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
4017 guess += 1
4018 while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
4019 guess -= 1
4020 return guess
4022 if alternative == 'less':
4023 pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4024 elif alternative == 'greater':
4025 # Same formula as the 'less' case, but with the second column.
4026 pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1])
4027 elif alternative == 'two-sided':
4028 mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2))
4029 pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
4030 pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
4032 epsilon = 1 - 1e-4
4033 if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= 1 - epsilon:
4034 return oddsratio, 1.
4036 elif c[0, 0] < mode:
4037 plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4038 if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
4039 return oddsratio, plower
4041 guess = binary_search(n, n1, n2, "upper")
4042 pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
4043 else:
4044 pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
4045 if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
4046 return oddsratio, pupper
4048 guess = binary_search(n, n1, n2, "lower")
4049 pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
4050 else:
4051 msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
4052 raise ValueError(msg)
4054 pvalue = min(pvalue, 1.0)
4056 return oddsratio, pvalue
4059class SpearmanRConstantInputWarning(RuntimeWarning):
4060 """Warning generated by `spearmanr` when an input is constant."""
4062 def __init__(self, msg=None):
4063 if msg is None:
4064 msg = ("An input array is constant; the correlation coefficent "
4065 "is not defined.")
4066 self.args = (msg,)
4069SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
4072def spearmanr(a, b=None, axis=0, nan_policy='propagate'):
4073 """
4074 Calculate a Spearman correlation coefficient with associated p-value.
4076 The Spearman rank-order correlation coefficient is a nonparametric measure
4077 of the monotonicity of the relationship between two datasets. Unlike the
4078 Pearson correlation, the Spearman correlation does not assume that both
4079 datasets are normally distributed. Like other correlation coefficients,
4080 this one varies between -1 and +1 with 0 implying no correlation.
4081 Correlations of -1 or +1 imply an exact monotonic relationship. Positive
4082 correlations imply that as x increases, so does y. Negative correlations
4083 imply that as x increases, y decreases.
4085 The p-value roughly indicates the probability of an uncorrelated system
4086 producing datasets that have a Spearman correlation at least as extreme
4087 as the one computed from these datasets. The p-values are not entirely
4088 reliable but are probably reasonable for datasets larger than 500 or so.
4090 Parameters
4091 ----------
4092 a, b : 1D or 2D array_like, b is optional
4093 One or two 1-D or 2-D arrays containing multiple variables and
4094 observations. When these are 1-D, each represents a vector of
4095 observations of a single variable. For the behavior in the 2-D case,
4096 see under ``axis``, below.
4097 Both arrays need to have the same length in the ``axis`` dimension.
4098 axis : int or None, optional
4099 If axis=0 (default), then each column represents a variable, with
4100 observations in the rows. If axis=1, the relationship is transposed:
4101 each row represents a variable, while the columns contain observations.
4102 If axis=None, then both arrays will be raveled.
4103 nan_policy : {'propagate', 'raise', 'omit'}, optional
4104 Defines how to handle when input contains nan.
4105 The following options are available (default is 'propagate'):
4107 * 'propagate': returns nan
4108 * 'raise': throws an error
4109 * 'omit': performs the calculations ignoring nan values
4111 Returns
4112 -------
4113 correlation : float or ndarray (2-D square)
4114 Spearman correlation matrix or correlation coefficient (if only 2
4115 variables are given as parameters). The correlation matrix is square,
4116 with length equal to the total number of variables (columns or rows) in ``a``
4117 and ``b`` combined.
4118 pvalue : float
4119 The two-sided p-value for a hypothesis test whose null hypothesis is
4120 that two sets of data are uncorrelated. It has the same dimension as rho.
4122 References
4123 ----------
4124 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
4125 Probability and Statistics Tables and Formulae. Chapman & Hall: New
4126 York. 2000.
4127 Section 14.7
4129 Examples
4130 --------
4131 >>> from scipy import stats
4132 >>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
4133 (0.82078268166812329, 0.088587005313543798)
4134 >>> np.random.seed(1234321)
4135 >>> x2n = np.random.randn(100, 2)
4136 >>> y2n = np.random.randn(100, 2)
4137 >>> stats.spearmanr(x2n)
4138 (0.059969996999699973, 0.55338590803773591)
4139 >>> stats.spearmanr(x2n[:,0], x2n[:,1])
4140 (0.059969996999699973, 0.55338590803773591)
4141 >>> rho, pval = stats.spearmanr(x2n, y2n)
4142 >>> rho
4143 array([[ 1. , 0.05997 , 0.18569457, 0.06258626],
4144 [ 0.05997 , 1. , 0.110003 , 0.02534653],
4145 [ 0.18569457, 0.110003 , 1. , 0.03488749],
4146 [ 0.06258626, 0.02534653, 0.03488749, 1. ]])
4147 >>> pval
4148 array([[ 0. , 0.55338591, 0.06435364, 0.53617935],
4149 [ 0.55338591, 0. , 0.27592895, 0.80234077],
4150 [ 0.06435364, 0.27592895, 0. , 0.73039992],
4151 [ 0.53617935, 0.80234077, 0.73039992, 0. ]])
4152 >>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
4153 >>> rho
4154 array([[ 1. , 0.05997 , 0.18569457, 0.06258626],
4155 [ 0.05997 , 1. , 0.110003 , 0.02534653],
4156 [ 0.18569457, 0.110003 , 1. , 0.03488749],
4157 [ 0.06258626, 0.02534653, 0.03488749, 1. ]])
4158 >>> stats.spearmanr(x2n, y2n, axis=None)
4159 (0.10816770419260482, 0.1273562188027364)
4160 >>> stats.spearmanr(x2n.ravel(), y2n.ravel())
4161 (0.10816770419260482, 0.1273562188027364)
4163 >>> xint = np.random.randint(10, size=(100, 2))
4164 >>> stats.spearmanr(xint)
4165 (0.052760927029710199, 0.60213045837062351)
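 The Spearman correlation is the Pearson correlation computed on ranks; a
 minimal check (not part of the original examples):

 >>> from scipy.stats import rankdata
 >>> x = [1, 2, 3, 4, 5]
 >>> y = [5, 6, 7, 8, 7]
 >>> rho, _ = stats.spearmanr(x, y)
 >>> r, _ = stats.pearsonr(rankdata(x), rankdata(y))
 >>> np.allclose(rho, r)
 True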
4167 """
4168 if axis is not None and axis > 1:
4169 raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, please use only values 0, 1 or None for axis".format(axis))
4171 a, axisout = _chk_asarray(a, axis)
4172 if a.ndim > 2:
4173 raise ValueError("spearmanr only handles 1-D or 2-D arrays")
4175 if b is None:
4176 if a.ndim < 2:
4177 raise ValueError("`spearmanr` needs at least 2 variables to compare")
4178 else:
4179 # Concatenate a and b, so that we now only have to handle the case
4180 # of a 2-D `a`.
4181 b, _ = _chk_asarray(b, axis)
4182 if axisout == 0:
4183 a = np.column_stack((a, b))
4184 else:
4185 a = np.row_stack((a, b))
4187 n_vars = a.shape[1 - axisout]
4188 n_obs = a.shape[axisout]
4189 if n_obs <= 1:
4190 # Handle empty arrays or single observations.
4191 return SpearmanrResult(np.nan, np.nan)
4193 if axisout == 0:
4194 if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all():
4195 # If an input is constant, the correlation coefficient is not defined.
4196 warnings.warn(SpearmanRConstantInputWarning())
4197 return SpearmanrResult(np.nan, np.nan)
4198 else: # case when axisout == 1, because a is at most 2-D
4199 if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all():
4200 # If an input is constant, the correlation coefficient is not defined.
4201 warnings.warn(SpearmanRConstantInputWarning())
4202 return SpearmanrResult(np.nan, np.nan)
4204 a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
4205 variable_has_nan = np.zeros(n_vars, dtype=bool)
4206 if a_contains_nan:
4207 if nan_policy == 'omit':
4208 return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy)
4209 elif nan_policy == 'propagate':
4210 if a.ndim == 1 or n_vars <= 2:
4211 return SpearmanrResult(np.nan, np.nan)
4212 else:
4213 # Keep track of variables with NaNs, set the outputs to NaN
4214 # only for those variables
4215 variable_has_nan = np.isnan(a).any(axis=axisout)
4217 a_ranked = np.apply_along_axis(rankdata, axisout, a)
4218 rs = np.corrcoef(a_ranked, rowvar=axisout)
4219 dof = n_obs - 2 # degrees of freedom
4221 # rs can have elements equal to 1, so avoid zero division warnings
4222 with np.errstate(divide='ignore'):
4223 # clip the small negative values possibly caused by rounding
4224 # errors before taking the square root
4225 t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
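# t is the usual t statistic for testing zero correlation,
# t = r * sqrt(dof / (1 - r**2)); its two-sided p-value below comes from
# the t distribution with `dof` degrees of freedom.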
4227 prob = 2 * distributions.t.sf(np.abs(t), dof)
4229 # For backwards compatibility, return scalars when comparing 2 columns
4230 if rs.shape == (2, 2):
4231 return SpearmanrResult(rs[1, 0], prob[1, 0])
4232 else:
4233 rs[variable_has_nan, :] = np.nan
4234 rs[:, variable_has_nan] = np.nan
4235 return SpearmanrResult(rs, prob)
4238PointbiserialrResult = namedtuple('PointbiserialrResult',
4239 ('correlation', 'pvalue'))
4242def pointbiserialr(x, y):
4243 r"""
4244 Calculate a point biserial correlation coefficient and its p-value.
4246 The point biserial correlation is used to measure the relationship
4247 between a binary variable, x, and a continuous variable, y. Like other
4248 correlation coefficients, this one varies between -1 and +1 with 0
4249 implying no correlation. Correlations of -1 or +1 imply a determinative
4250 relationship.
4252 This function uses a shortcut formula but produces the same result as
4253 `pearsonr`.
4255 Parameters
4256 ----------
4257 x : array_like of bools
4258 Input array.
4259 y : array_like
4260 Input array.
4262 Returns
4263 -------
4264 correlation : float
4265 R value.
4266 pvalue : float
4267 Two-sided p-value.
4269 Notes
4270 -----
4271 `pointbiserialr` uses a t-test with ``n-2`` degrees of freedom.
4272 It is equivalent to `pearsonr`.
4274 The value of the point-biserial correlation can be calculated from:
4276 .. math::
4278 r_{pb} = \frac{\overline{Y_{1}} -
4279 \overline{Y_{0}}}{s_{y}}\sqrt{\frac{N_{0} N_{1}}{N (N - 1)}}
4281 Where :math:`Y_{0}` and :math:`Y_{1}` are the means of the metric
4282 observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}`
4283 are the numbers of observations coded 0 and 1 respectively; :math:`N` is the
4284 total number of observations and :math:`s_{y}` is the standard
4285 deviation of all the metric observations.
4287 A value of :math:`r_{pb}` that is significantly different from zero is
4288 completely equivalent to a significant difference in means between the two
4289 groups. Thus, an independent-groups t-test with :math:`N-2` degrees of
4290 freedom may be used to test whether :math:`r_{pb}` is nonzero. The
4291 relation between the t-statistic for comparing two independent groups and
4292 :math:`r_{pb}` is given by:
4294 .. math::
4296 t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}}
4298 References
4299 ----------
4300 .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math.
4301 Statist., Vol. 20, no.1, pp. 125-126, 1949.
4303 .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous
4304 Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25,
4305 no. 3, pp. 603-607, 1954.
4307 .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef:
4308 Statistics Reference Online (eds N. Balakrishnan, et al.), 2014.
4309 https://doi.org/10.1002/9781118445112.stat06227
4311 Examples
4312 --------
4313 >>> from scipy import stats
4314 >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
4315 >>> b = np.arange(7)
4316 >>> stats.pointbiserialr(a, b)
4317 (0.8660254037844386, 0.011724811003954652)
4318 >>> stats.pearsonr(a, b)
4319 (0.86602540378443871, 0.011724811003954626)
4320 >>> np.corrcoef(a, b)
4321 array([[ 1. , 0.8660254],
4322 [ 0.8660254, 1. ]])
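 A minimal check (not part of the original examples) of the formula in the
 Notes, reusing ``a`` and ``b`` from above:

 >>> y0, y1 = b[a == 0], b[a == 1]
 >>> n0, n1, n = len(y0), len(y1), len(b)
 >>> r_pb = (y1.mean() - y0.mean()) / b.std(ddof=1) * np.sqrt(n0 * n1 / (n * (n - 1)))
 >>> np.allclose(r_pb, stats.pointbiserialr(a, b)[0])
 True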
4324 """
4325 rpb, prob = pearsonr(x, y)
4326 return PointbiserialrResult(rpb, prob)
4329KendalltauResult = namedtuple('KendalltauResult', ('correlation', 'pvalue'))
4332def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto'):
4333 """
4334 Calculate Kendall's tau, a correlation measure for ordinal data.
4336 Kendall's tau is a measure of the correspondence between two rankings.
4337 Values close to 1 indicate strong agreement, values close to -1 indicate
4338 strong disagreement. This is the 1945 "tau-b" version of Kendall's
4339 tau [2]_, which can account for ties and which reduces to the 1938 "tau-a"
4340 version [1]_ in the absence of ties.
4342 Parameters
4343 ----------
4344 x, y : array_like
4345 Arrays of rankings, of the same shape. If arrays are not 1-D, they will
4346 be flattened to 1-D.
4347 initial_lexsort : bool, optional
4348 Unused (deprecated).
4349 nan_policy : {'propagate', 'raise', 'omit'}, optional
4350 Defines how to handle when input contains nan.
4351 The following options are available (default is 'propagate'):
4353 * 'propagate': returns nan
4354 * 'raise': throws an error
4355 * 'omit': performs the calculations ignoring nan values
4356 method : {'auto', 'asymptotic', 'exact'}, optional
4357 Defines which method is used to calculate the p-value [5]_.
4358 The following options are available (default is 'auto'):
4360 * 'auto': selects the appropriate method based on a trade-off between
4361 speed and accuracy
4362 * 'asymptotic': uses a normal approximation valid for large samples
4363 * 'exact': computes the exact p-value, but can only be used if no ties
4364 are present
4366 Returns
4367 -------
4368 correlation : float
4369 The tau statistic.
4370 pvalue : float
4371 The two-sided p-value for a hypothesis test whose null hypothesis is
4372 an absence of association, tau = 0.
4374 See Also
4375 --------
4376 spearmanr : Calculates a Spearman rank-order correlation coefficient.
4377 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
4378 weightedtau : Computes a weighted version of Kendall's tau.
4380 Notes
4381 -----
4382 The definition of Kendall's tau that is used is [2]_::
4384 tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
4386 where P is the number of concordant pairs, Q the number of discordant
4387 pairs, T the number of ties only in `x`, and U the number of ties only in
4388 `y`. If a tie occurs for the same pair in both `x` and `y`, it is not
4389 added to either T or U.
4391 References
4392 ----------
4393 .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
4394 Vol. 30, No. 1/2, pp. 81-93, 1938.
4395 .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
4396 Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
4397 .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
4398 Wiley & Sons, 1967.
4399 .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
4400 tables", Software: Practice and Experience, Vol. 24, No. 3,
4401 pp. 327-336, 1994.
4402 .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
4403 Charles Griffin & Co., 1970.
4405 Examples
4406 --------
4407 >>> from scipy import stats
4408 >>> x1 = [12, 2, 1, 12, 2]
4409 >>> x2 = [1, 4, 7, 1, 0]
4410 >>> tau, p_value = stats.kendalltau(x1, x2)
4411 >>> tau
4412 -0.47140452079103173
4413 >>> p_value
4414 0.2827454599327748
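 A brute-force check (not part of the original examples) of the tau-b formula
 given in the Notes, reusing ``x1``, ``x2`` and ``tau`` from above:

 >>> from itertools import combinations
 >>> P = Q = T = U = 0
 >>> for (xi, yi), (xj, yj) in combinations(zip(x1, x2), 2):
 ...     if xi == xj and yi == yj:
 ...         pass                             # tied in both: counted in neither
 ...     elif xi == xj:
 ...         T += 1                           # tie only in x
 ...     elif yi == yj:
 ...         U += 1                           # tie only in y
 ...     elif (xi - xj) * (yi - yj) > 0:
 ...         P += 1                           # concordant pair
 ...     else:
 ...         Q += 1                           # discordant pair
 >>> np.allclose((P - Q) / np.sqrt((P + Q + T) * (P + Q + U)), tau)
 True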
4416 """
4417 x = np.asarray(x).ravel()
4418 y = np.asarray(y).ravel()
4420 if x.size != y.size:
4421 raise ValueError("All inputs to `kendalltau` must be of the same size, "
4422 "found x-size %s and y-size %s" % (x.size, y.size))
4423 elif not x.size or not y.size:
4424 return KendalltauResult(np.nan, np.nan) # Return NaN if arrays are empty
4426 # check both x and y
4427 cnx, npx = _contains_nan(x, nan_policy)
4428 cny, npy = _contains_nan(y, nan_policy)
4429 contains_nan = cnx or cny
4430 if npx == 'omit' or npy == 'omit':
4431 nan_policy = 'omit'
4433 if contains_nan and nan_policy == 'propagate':
4434 return KendalltauResult(np.nan, np.nan)
4436 elif contains_nan and nan_policy == 'omit':
4437 x = ma.masked_invalid(x)
4438 y = ma.masked_invalid(y)
4439 return mstats_basic.kendalltau(x, y, method=method)
4441 if initial_lexsort is not None: # deprecate to drop!
4442 warnings.warn('"initial_lexsort" is gone!')
4444 def count_rank_tie(ranks):
4445 cnt = np.bincount(ranks).astype('int64', copy=False)
4446 cnt = cnt[cnt > 1]
4447 return ((cnt * (cnt - 1) // 2).sum(),
4448 (cnt * (cnt - 1.) * (cnt - 2)).sum(),
4449 (cnt * (cnt - 1.) * (2*cnt + 5)).sum())
4451 size = x.size
4452 perm = np.argsort(y) # sort on y and convert y to dense ranks
4453 x, y = x[perm], y[perm]
4454 y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
4456 # stable sort on x and convert x to dense ranks
4457 perm = np.argsort(x, kind='mergesort')
4458 x, y = x[perm], y[perm]
4459 x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
4461 dis = _kendall_dis(x, y) # discordant pairs
4463 obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
4464 cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
4466 ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
4467 xtie, x0, x1 = count_rank_tie(x) # ties in x, stats
4468 ytie, y0, y1 = count_rank_tie(y) # ties in y, stats
4470 tot = (size * (size - 1)) // 2
4472 if xtie == tot or ytie == tot:
4473 return KendalltauResult(np.nan, np.nan)
4475 # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
4476 # = con + dis + xtie + ytie - ntie
4477 con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
4478 tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
4479 # Limit range to fix computational errors
4480 tau = min(1., max(-1., tau))
4482 if method == 'exact' and (xtie != 0 or ytie != 0):
4483 raise ValueError("Ties found, exact method cannot be used.")
4485 if method == 'auto':
4486 if (xtie == 0 and ytie == 0) and (size <= 33 or min(dis, tot-dis) <= 1):
4487 method = 'exact'
4488 else:
4489 method = 'asymptotic'
4491 if xtie == 0 and ytie == 0 and method == 'exact':
4492 # Exact p-value, see p. 68 of Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), Charles Griffin & Co., 1970.
4493 c = min(dis, tot-dis)
4494 if size <= 0:
4495 raise ValueError
4496 elif c < 0 or 2*c > size*(size-1):
4497 raise ValueError
4498 elif size == 1:
4499 pvalue = 1.0
4500 elif size == 2:
4501 pvalue = 1.0
4502 elif c == 0:
4503 pvalue = 2.0/math.factorial(size) if size < 171 else 0.0
4504 elif c == 1:
4505 pvalue = 2.0/math.factorial(size-1) if (size-1) < 171 else 0.0
4506 elif 2*c == tot:
4507 pvalue = 1.0
4508 else:
4509 new = [0.0]*(c+1)
4510 new[0] = 1.0
4511 new[1] = 1.0
4512 for j in range(3,size+1):
4513 old = new[:]
4514 for k in range(1,min(j,c+1)):
4515 new[k] += new[k-1]
4516 for k in range(j,c+1):
4517 new[k] += new[k-1] - old[k-j]
4519 pvalue = 2.0*sum(new)/math.factorial(size) if size < 171 else 0.0
4521 elif method == 'asymptotic':
4522 # con_minus_dis is approx normally distributed with this variance [3]_
4523 var = (size * (size - 1) * (2.*size + 5) - x1 - y1) / 18. + (
4524 2. * xtie * ytie) / (size * (size - 1)) + x0 * y0 / (9. *
4525 size * (size - 1) * (size - 2))
4526 pvalue = special.erfc(np.abs(con_minus_dis) / np.sqrt(var) / np.sqrt(2))
4527 else:
4528 raise ValueError("Unknown method "+str(method)+" specified, please use auto, exact or asymptotic.")
4530 return KendalltauResult(tau, pvalue)
4533WeightedTauResult = namedtuple('WeightedTauResult', ('correlation', 'pvalue'))
4536def weightedtau(x, y, rank=True, weigher=None, additive=True):
4537 r"""
4538 Compute a weighted version of Kendall's :math:`\tau`.
4540 The weighted :math:`\tau` is a weighted version of Kendall's
4541 :math:`\tau` in which exchanges of high weight are more influential than
4542 exchanges of low weight. The default parameters compute the additive
4543 hyperbolic version of the index, :math:`\tau_\mathrm h`, which has
4544 been shown to provide the best balance between important and
4545 unimportant elements [1]_.
4547 The weighting is defined by means of a rank array, which assigns a
4548 nonnegative rank to each element, and a weigher function, which
4549 assigns a weight based on the rank to each element. The weight of an
4550 exchange is then the sum or the product of the weights of the ranks of
4551 the exchanged elements. The default parameters compute
4552 :math:`\tau_\mathrm h`: an exchange between elements with rank
4553 :math:`r` and :math:`s` (starting from zero) has weight
4554 :math:`1/(r+1) + 1/(s+1)`.
4556 Specifying a rank array is meaningful only if you have in mind an
4557 external criterion of importance. If, as is usually the case, you do
4558 not have a specific rank in mind, the weighted :math:`\tau` is
4559 defined by averaging the values obtained using the decreasing
4560 lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the
4561 behavior with default parameters.
4563 Note that if you are computing the weighted :math:`\tau` on arrays of
4564 ranks, rather than of scores (i.e., a larger value implies a lower
4565 rank) you must negate the ranks, so that elements of higher rank are
4566 associated with a larger value.
4568 Parameters
4569 ----------
4570 x, y : array_like
4571 Arrays of scores, of the same shape. If arrays are not 1-D, they will
4572 be flattened to 1-D.
4573 rank : array_like of ints or bool, optional
4574 A nonnegative rank assigned to each element. If it is None, the
4575 decreasing lexicographical rank by (`x`, `y`) will be used: elements of
4576 higher rank will be those with larger `x`-values, using `y`-values to
4577 break ties (in particular, swapping `x` and `y` will give a different
4578 result). If it is False, the element indices will be used
4579 directly as ranks. The default is True, in which case this
4580 function returns the average of the values obtained using the
4581 decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`).
4582 weigher : callable, optional
4583 The weigher function. Must map nonnegative integers (zero
4584 representing the most important element) to a nonnegative weight.
4585 The default, None, provides hyperbolic weighting, that is,
4586 rank :math:`r` is mapped to weight :math:`1/(r+1)`.
4587 additive : bool, optional
4588 If True, the weight of an exchange is computed by adding the
4589 weights of the ranks of the exchanged elements; otherwise, the weights
4590 are multiplied. The default is True.
4592 Returns
4593 -------
4594 correlation : float
4595 The weighted :math:`\tau` correlation index.
4596 pvalue : float
4597 Presently ``np.nan``, as the null distribution of the statistic is unknown (even in the
4598 additive hyperbolic case).
4600 See Also
4601 --------
4602 kendalltau : Calculates Kendall's tau.
4603 spearmanr : Calculates a Spearman rank-order correlation coefficient.
4604 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
4606 Notes
4607 -----
4608 This function uses an :math:`O(n \log n)`, mergesort-based algorithm
4609 [1]_ that is a weighted extension of Knight's algorithm for Kendall's
4610 :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_
4611 between rankings without ties (i.e., permutations) by setting
4612 `additive` and `rank` to False, as the definition given in [1]_ is a
4613 generalization of Shieh's.
4615 NaNs are considered the smallest possible score.
4617 .. versionadded:: 0.19.0
4619 References
4620 ----------
4621 .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with
4622 ties", Proceedings of the 24th international conference on World
4623 Wide Web, pp. 1166-1176, ACM, 2015.
4624 .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
4625 Ungrouped Data", Journal of the American Statistical Association,
4626 Vol. 61, No. 314, Part 1, pp. 436-439, 1966.
4627 .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics &
4628 Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998.
4630 Examples
4631 --------
4632 >>> from scipy import stats
4633 >>> x = [12, 2, 1, 12, 2]
4634 >>> y = [1, 4, 7, 1, 0]
4635 >>> tau, p_value = stats.weightedtau(x, y)
4636 >>> tau
4637 -0.56694968153682723
4638 >>> p_value
4639 nan
4640 >>> tau, p_value = stats.weightedtau(x, y, additive=False)
4641 >>> tau
4642 -0.62205716951801038
4644 NaNs are considered the smallest possible score:
4646 >>> x = [12, 2, 1, 12, 2]
4647 >>> y = [1, 4, 7, 1, np.nan]
4648 >>> tau, _ = stats.weightedtau(x, y)
4649 >>> tau
4650 -0.56694968153682723
4652 This is exactly Kendall's tau:
4654 >>> x = [12, 2, 1, 12, 2]
4655 >>> y = [1, 4, 7, 1, 0]
4656 >>> tau, _ = stats.weightedtau(x, y, weigher=lambda x: 1)
4657 >>> tau
4658 -0.47140452079103173
4660 >>> x = [12, 2, 1, 12, 2]
4661 >>> y = [1, 4, 7, 1, 0]
4662 >>> stats.weightedtau(x, y, rank=None)
4663 WeightedTauResult(correlation=-0.4157652301037516, pvalue=nan)
4664 >>> stats.weightedtau(y, x, rank=None)
4665 WeightedTauResult(correlation=-0.7181341329699028, pvalue=nan)
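 The default hyperbolic weigher can also be passed explicitly; a minimal check
 (not part of the original examples, ignoring tiny floating-point differences):

 >>> t_default, _ = stats.weightedtau(x, y)
 >>> t_explicit, _ = stats.weightedtau(x, y, weigher=lambda r: 1/(r + 1))
 >>> np.allclose(t_default, t_explicit)
 True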
4667 """
4668 x = np.asarray(x).ravel()
4669 y = np.asarray(y).ravel()
4671 if x.size != y.size:
4672 raise ValueError("All inputs to `weightedtau` must be of the same size, "
4673 "found x-size %s and y-size %s" % (x.size, y.size))
4674 if not x.size:
4675 return WeightedTauResult(np.nan, np.nan) # Return NaN if arrays are empty
4677 # If there are NaNs we apply _toint64()
4678 if np.isnan(np.sum(x)):
4679 x = _toint64(x)
4680 if np.isnan(np.sum(y)):
4681 y = _toint64(y)
4683 # Reduce unsupported types to ranks
4684 if x.dtype != y.dtype:
4685 if x.dtype != np.int64:
4686 x = _toint64(x)
4687 if y.dtype != np.int64:
4688 y = _toint64(y)
4689 else:
4690 if x.dtype not in (np.int32, np.int64, np.float32, np.float64):
4691 x = _toint64(x)
4692 y = _toint64(y)
4694 if rank is True:
4695 return WeightedTauResult((
4696 _weightedrankedtau(x, y, None, weigher, additive) +
4697 _weightedrankedtau(y, x, None, weigher, additive)
4698 ) / 2, np.nan)
4700 if rank is False:
4701 rank = np.arange(x.size, dtype=np.intp)
4702 elif rank is not None:
4703 rank = np.asarray(rank).ravel()
4704 if rank.size != x.size:
4705 raise ValueError("All inputs to `weightedtau` must be of the same size, "
4706 "found x-size %s and rank-size %s" % (x.size, rank.size))
4708 return WeightedTauResult(_weightedrankedtau(x, y, rank, weigher, additive), np.nan)
4711# FROM MGCPY: https://github.com/neurodata/mgcpy
4713class _ParallelP(object):
4714 """
4715 Helper class for calculating the permutation p-value in parallel.
4716 """
4717 def __init__(self, x, y, compute_distance, random_states):
4718 self.x = x
4719 self.y = y
4720 self.compute_distance = compute_distance
4721 self.random_states = random_states
4723 def __call__(self, index):
4724 permx = self.random_states[index].permutation(self.x)
4725 permy = self.random_states[index].permutation(self.y)
4727 # calculate permuted stats, store in null distribution
4728 perm_stat = _mgc_stat(permx, permy, self.compute_distance)[0]
4730 return perm_stat
4733def _perm_test(x, y, stat, compute_distance, reps=1000, workers=-1,
4734 random_state=None):
4735 r"""
4736 Helper function that calculates the p-value. See below for uses.
4738 Parameters
4739 ----------
4740 x, y : ndarray
4741 `x` and `y` have shapes `(n, p)` and `(n, q)`.
4742 stat : float
4743 The sample test statistic.
4744 compute_distance : callable
4745 A function that computes the distance or similarity among the samples
4746 within each data matrix. Set to `None` if `x` and `y` are already
4747 distance matrices.
4748 reps : int, optional
4749 The number of replications used to estimate the null when using the
4750 permutation test. The default is 1000 replications.
4751 workers : int or map-like callable, optional
4752 If `workers` is an int the population is subdivided into `workers`
4753 sections and evaluated in parallel (uses
4754 `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
4755 available to the Process. Alternatively supply a map-like callable,
4756 such as `multiprocessing.Pool.map` for evaluating the population in
4757 parallel. This evaluation is carried out as `workers(func, iterable)`.
4758 Requires that `func` be pickleable.
4759 random_state : int or np.random.RandomState instance, optional
4760 If already a RandomState instance, use it.
4761 If seed is an int, return a new RandomState instance seeded with seed.
4762 If None, use np.random.RandomState. Default is None.
4764 Returns
4765 -------
4766 pvalue : float
4767 The sample test p-value.
4768 null_dist : list
4769 The approximated null distribution.
4770 """
4771 # generate seeds for each rep (change to new parallel random number
4772 # capabilities in numpy >= 1.17+)
4773 random_state = check_random_state(random_state)
4774 random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
4775 size=4, dtype=np.uint32)) for _ in range(reps)]
4777 # parallelizes with specified workers over number of reps and set seeds
4778 mapwrapper = MapWrapper(workers)
4779 parallelp = _ParallelP(x=x, y=y, compute_distance=compute_distance,
4780 random_states=random_states)
4781 null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
4783 # calculate p-value and significant permutation map through list
4784 pvalue = (null_dist >= stat).sum() / reps
4786 # correct for a p-value of 0. With a finite number of permutations, an
4787 # estimated p-value of exactly 0 is too optimistic, so use 1/reps as a floor
4788 if pvalue == 0:
4789 pvalue = 1 / reps
4791 return pvalue, null_dist
4794def _euclidean_dist(x):
4795 return cdist(x, x)
4798MGCResult = namedtuple('MGCResult', ('stat', 'pvalue', 'mgc_dict'))
4801def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
4802 workers=1, is_twosamp=False, random_state=None):
4803 r"""
4804 Computes the Multiscale Graph Correlation (MGC) test statistic.
4806 Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
4807 one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
4808 the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
4809 called the "scale". A priori, however, it is not known which scales will be
4810 most informative. So, MGC computes all distance pairs, and then efficiently
4811 computes the distance correlations for all scales. The local correlations
4812 illustrate which scales are relatively informative about the relationship.
4813 The key, therefore, to successfully discover and decipher relationships
4814 between disparate data modalities is to adaptively determine which scales
4815 are the most informative, and the geometric implication for the most
4816 informative scales. Doing so not only provides an estimate of whether the
4817 modalities are related, but also provides insight into how the
4818 determination was made. This is especially important in high-dimensional
4819 data, where simple visualizations do not reveal relationships to the
4820 unaided human eye. Characterizations of this implementation in particular
4821 have been derived from and benchmarked in [2]_.
4823 Parameters
4824 ----------
4825 x, y : ndarray
4826 If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
4827 the number of samples and `p` and `q` are the number of dimensions,
4828 then the MGC independence test will be run. Alternatively, ``x`` and
4829 ``y`` can have shapes ``(n, n)`` if they are distance or similarity
4830 matrices, and ``compute_distance`` must be set to ``None``. If ``x``
4831 and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
4832 two-sample MGC test will be run.
4833 compute_distance : callable, optional
4834 A function that computes the distance or similarity among the samples
4835 within each data matrix. Set to ``None`` if ``x`` and ``y`` are
4836 already distance matrices. The default uses the euclidean norm metric.
4837 If you are calling a custom function, either create the distance
4838 matrix before-hand or create a function of the form
4839 ``compute_distance(x)`` where `x` is the data matrix for which
4840 pairwise distances are calculated.
4841 reps : int, optional
4842 The number of replications used to estimate the null when using the
4843 permutation test. The default is ``1000``.
4844 workers : int or map-like callable, optional
4845 If ``workers`` is an int the population is subdivided into ``workers``
4846 sections and evaluated in parallel (uses ``multiprocessing.Pool
4847 <multiprocessing>``). Supply ``-1`` to use all cores available to the
4848 Process. Alternatively supply a map-like callable, such as
4849 ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
4850 This evaluation is carried out as ``workers(func, iterable)``.
4851 Requires that `func` be pickleable. The default is ``1``.
4852 is_twosamp : bool, optional
4853 If `True`, a two sample test will be run. If ``x`` and ``y`` have
4854 shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
4855 set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
4856 ``(n, p)`` and a two sample test is desired. The default is ``False``.
4857 random_state : int or np.random.RandomState instance, optional
4858 If already a RandomState instance, use it.
4859 If seed is an int, return a new RandomState instance seeded with seed.
4860 If None, use np.random.RandomState. Default is None.
4862 Returns
4863 -------
4864 stat : float
4865 The sample MGC test statistic within `[-1, 1]`.
4866 pvalue : float
4867 The p-value obtained via permutation.
4868 mgc_dict : dict
4869 Contains additional useful returns containing the following
4870 keys:
4872 - mgc_map : ndarray
4873 A 2D representation of the latent geometry of the relationship.
4875 - opt_scale : (int, int)
4876 The estimated optimal scale as a `(x, y)` pair.
4877 - null_dist : list
4878 The null distribution derived from the permuted matrices
4880 See Also
4881 --------
4882 pearsonr : Pearson correlation coefficient and p-value for testing
4883 non-correlation.
4884 kendalltau : Calculates Kendall's tau.
4885 spearmanr : Calculates a Spearman rank-order correlation coefficient.
4887 Notes
4888 -----
4889 A description of the process of MGC and applications on neuroscience data
4890 can be found in [1]_. It is performed using the following steps:
4892 #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
4893 modified to be mean zero columnwise. This results in two
4894 :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
4895 centering and unbiased modification) [3]_.
4897 #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
4899 * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
4900 are calculated for each property. Here, :math:`G_k (i, j)` indicates
4901 the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
4902 and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
4903 the :math:`i`-th row of :math:`B`
4905 * Let :math:`\circ` denote the entry-wise matrix product; then local
4906 correlations are summed and normalized using the following statistic:
4908 .. math::
4910 c^{kl} = \frac{\sum_{ij} A G_k B H_l}
4911 {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
4913 #. The MGC test statistic is the smoothed optimal local correlation of
4914 :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
4915 (which essentially sets all isolated large correlations to 0 and leaves
4916 connected large correlations the same as before, see [3]_). MGC is,
4918 .. math::
4920 MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
4921 \right)
4923 The test statistic lies in the interval :math:`(-1, 1)` since it is
4924 normalized.
4926 The p-value returned is calculated using a permutation test. This process
4927 is completed by first randomly permuting :math:`y` to estimate the null
4928 distribution and then calculating the probability of observing a test
4929 statistic, under the null, at least as extreme as the observed test
4930 statistic.
4932 MGC requires at least 5 samples to run with reliable results. It can also
4933 handle high-dimensional data sets.
4935 In addition, by manipulating the input data matrices, the two-sample
4936 testing problem can be reduced to the independence testing problem [4]_.
4937 Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
4938 :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
4939 follows:
4941 .. math::
4943 X = [U | V] \in \mathcal{R}^{p \times (n + m)}
4945 Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
4947 Then, the MGC statistic can be calculated as normal. This methodology can
4948 be extended to similar tests such as distance correlation [4]_.
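 As an illustration of this reduction (a sketch only, with made-up arrays ``u``
 and ``v``; this is not the internal implementation, and this function stores
 observations in rows rather than columns):

 >>> u = np.random.rand(10, 3)   # first sample, 10 observations
 >>> v = np.random.rand(7, 3)    # second sample, 7 observations
 >>> xx = np.concatenate([u, v])                      # stacked data, shape (17, 3)
 >>> yy = np.concatenate([np.zeros(10), np.ones(7)])  # group labels
 >>> # multiscale_graphcorr(xx, yy) then tests independence of the data and labels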
4950 .. versionadded:: 1.4.0
4952 References
4953 ----------
4954 .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
4955 Maggioni, M., & Shen, C. (2019). Discovering and deciphering
4956 relationships across disparate data modalities. ELife.
4957 .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
4958 Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
4959 mgcpy: A Comprehensive High Dimensional Independence Testing Python
4960 Package. ArXiv:1907.02088 [Cs, Stat].
4961 .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
4962 correlation to multiscale graph correlation. Journal of the American
4963 Statistical Association.
4964 .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
4965 Distance and Kernel Methods for Hypothesis Testing. ArXiv:1806.05514
4966 [Cs, Stat].
4968 Examples
4969 --------
4970 >>> from scipy.stats import multiscale_graphcorr
4971 >>> x = np.arange(100)
4972 >>> y = x
4973 >>> stat, pvalue, _ = multiscale_graphcorr(x, y, workers=-1)
4974 >>> '%.1f, %.3f' % (stat, pvalue)
4975 '1.0, 0.001'
4977 Alternatively,
4979 >>> x = np.arange(100)
4980 >>> y = x
4981 >>> mgc = multiscale_graphcorr(x, y)
4982 >>> '%.1f, %.3f' % (mgc.stat, mgc.pvalue)
4983 '1.0, 0.001'
4985 To run an unpaired two-sample test,
4987 >>> x = np.arange(100)
4988 >>> y = np.arange(79)
4989 >>> mgc = multiscale_graphcorr(x, y, random_state=1)
4990 >>> '%.3f, %.2f' % (mgc.stat, mgc.pvalue)
4991 '0.033, 0.02'
4993 or, if shape of the inputs are the same,
4995 >>> x = np.arange(100)
4996 >>> y = x
4997 >>> mgc = multiscale_graphcorr(x, y, is_twosamp=True)
4998 >>> '%.3f, %.1f' % (mgc.stat, mgc.pvalue)
4999 '-0.008, 1.0'
5000 """
5001 if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
5002 raise ValueError("x and y must be ndarrays")
5004 # convert arrays of type (n,) to (n, 1)
5005 if x.ndim == 1:
5006 x = x[:, np.newaxis]
5007 elif x.ndim != 2:
5008 raise ValueError("Expected a 2-D array `x`, found shape "
5009 "{}".format(x.shape))
5010 if y.ndim == 1:
5011 y = y[:, np.newaxis]
5012 elif y.ndim != 2:
5013 raise ValueError("Expected a 2-D array `y`, found shape "
5014 "{}".format(y.shape))
5016 nx, px = x.shape
5017 ny, py = y.shape
5019 # check for NaNs
5020 _contains_nan(x, nan_policy='raise')
5021 _contains_nan(y, nan_policy='raise')
5023 # check for positive or negative infinity and raise error
5024 if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
5025 raise ValueError("Inputs contain infinities")
5027 if nx != ny:
5028 if px == py:
5029 # reshape x and y for two sample testing
5030 is_twosamp = True
5031 else:
5032 raise ValueError("Shape mismatch, x and y must have shape [n, p] "
5033 "and [n, q] or have shape [n, p] and [m, p].")
5035 if nx < 5 or ny < 5:
5036 raise ValueError("MGC requires at least 5 samples to give reasonable "
5037 "results.")
5039 # convert x and y to float
5040 x = x.astype(np.float64)
5041 y = y.astype(np.float64)
5043 # check that compute_distance is a callable
5044 if not callable(compute_distance) and compute_distance is not None:
5045 raise ValueError("Compute_distance must be a function.")
5047 # check that the number of reps is a non-negative integer (fewer than
5048 # 1000 raises a warning)
5049 if not isinstance(reps, int) or reps < 0:
5050 raise ValueError("Number of reps must be an integer greater than 0.")
5051 elif reps < 1000:
5052 msg = ("The number of replications is low (under 1000), and p-value "
5053 "calculations may be unreliable. Use the p-value result, with "
5054 "caution!")
5055 warnings.warn(msg, RuntimeWarning)
5057 if is_twosamp:
5058 x, y = _two_sample_transform(x, y)
5060 # calculate MGC stat
5061 stat, stat_dict = _mgc_stat(x, y, compute_distance)
5062 stat_mgc_map = stat_dict["stat_mgc_map"]
5063 opt_scale = stat_dict["opt_scale"]
5065 # calculate permutation MGC p-value
5066 pvalue, null_dist = _perm_test(x, y, stat, compute_distance, reps=reps,
5067 workers=workers, random_state=random_state)
5069 # save all stats (other than stat/p-value) in dictionary
5070 mgc_dict = {"mgc_map": stat_mgc_map,
5071 "opt_scale": opt_scale,
5072 "null_dist": null_dist}
5074 return MGCResult(stat, pvalue, mgc_dict)
5077def _mgc_stat(x, y, compute_distance):
5078 r"""
5079 Helper function that calculates the MGC stat. See above for use.
5081 Parameters
5082 ----------
5083 x, y : ndarray
5084 `x` and `y` have shapes `(n, p)` and `(n, q)` or `(n, n)` and `(n, n)`
5085 if distance matrices.
5086 compute_distance : callable
5087 A function that computes the distance or similarity among the samples
5088 within each data matrix. Set to `None` if `x` and `y` are already
5089 distance matrices.
5091 Returns
5092 -------
5093 stat : float
5094 The sample MGC test statistic within `[-1, 1]`.
5095 stat_dict : dict
5096 Contains additional useful returns containing the following
5097 keys:
5098 - stat_mgc_map : ndarray
5099 MGC-map of the statistics.
5100 - opt_scale : (float, float)
5101 The estimated optimal scale as a `(x, y)` pair.
5102 """
5103 # set distx and disty to x and y when compute_distance = None
5104 distx = x
5105 disty = y
5107 if compute_distance is not None:
5108 # compute distance matrices for x and y
5109 distx = compute_distance(x)
5110 disty = compute_distance(y)
5112 # calculate MGC map and optimal scale
5113 stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
5115 n, m = stat_mgc_map.shape
5116 if m == 1 or n == 1:
5117 # the global scale is the statistic calculated at the maximal nearest
5118 # neighbors. There is not enough local scale to search over, so
5119 # default to global scale
5120 stat = stat_mgc_map[m - 1][n - 1]
5121 opt_scale = m * n
5122 else:
5123 samp_size = len(distx) - 1
5125 # threshold to find connected region of significant local correlations
5126 sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
5128 # maximum within the significant region
5129 stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
5131 stat_dict = {"stat_mgc_map": stat_mgc_map,
5132 "opt_scale": opt_scale}
5134 return stat, stat_dict
5137def _threshold_mgc_map(stat_mgc_map, samp_size):
5138 r"""
5139 Finds a connected region of significance in the MGC-map by thresholding.
5141 Parameters
5142 ----------
5143 stat_mgc_map : ndarray
5144 All local correlations within `[-1,1]`.
5145 samp_size : int
5146 The sample size of original data.
5148 Returns
5149 -------
5150 sig_connect : ndarray
5151 A binary matrix with 1's indicating the significant region.
5152 """
5153 m, n = stat_mgc_map.shape
5155 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
5156 # with varying levels of performance. Threshold is based on a beta
5157 # approximation.
5158 per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant
5159 threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation
5160 threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
5162 # the global scale is the statistic calculated at the maximal nearest
5163 # neighbors. The threshold is the maximum over the global and local scales
5164 threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
5166 # find the largest connected component of significant correlations
5167 sig_connect = stat_mgc_map > threshold
5168 if np.sum(sig_connect) > 0:
5169 sig_connect, _ = measurements.label(sig_connect)
5170 _, label_counts = np.unique(sig_connect, return_counts=True)
5172 # skip the first element in label_counts, as it is count(zeros)
5173 max_label = np.argmax(label_counts[1:]) + 1
5174 sig_connect = sig_connect == max_label
5175 else:
5176 sig_connect = np.array([[False]])
5178 return sig_connect
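# Rough numeric sketch of the thresholding above (my own illustration): for a
# sample size of 100,
#   per_sig   = 1 - 0.02 / 100                     = 0.9998
#   a = b     = 100 * (100 - 3) / 4 - 1 / 2        = 2424.5
#   threshold = 2 * beta.ppf(0.9998, a, b) - 1     ~= 0.05
# so only local correlations exceeding roughly 0.05 (or the global
# correlation, whichever is larger) can enter the significant region.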
5181def _smooth_mgc_map(sig_connect, stat_mgc_map):
5182 """
5183 Finds the smoothed maximum within the significant region R.
5185 If the area of R is too small, it returns the last local correlation. Otherwise,
5186 it returns the maximum within the significant connected region.
5188 Parameters
5189 ----------
5190 sig_connect: ndarray
5191 A binary matrix with 1's indicating the significant region.
5192 stat_mgc_map: ndarray
5193 All local correlations within `[-1, 1]`.
5195 Returns
5196 -------
5197 stat : float
5198 The sample MGC statistic within `[-1, 1]`.
5199 opt_scale: (float, float)
5200 The estimated optimal scale as an `(x, y)` pair.
5201 """
5203 m, n = stat_mgc_map.shape
5205 # the global scale is the statistic calculated at the maximal nearest
5206 # neighbors. By default, statistic and optimal scale are global.
5207 stat = stat_mgc_map[m - 1][n - 1]
5208 opt_scale = [m, n]
5210 if np.linalg.norm(sig_connect) != 0:
5211 # proceed only when the connected region's area is sufficiently large
5212 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
5213 # with varying levels of performance
5214 if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
5215 max_corr = max(stat_mgc_map[sig_connect])
5217 # find all scales within significant_connected_region that maximize
5218 # the local correlation
5219 max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
5221 if max_corr >= stat:
5222 stat = max_corr
5224 k, l = max_corr_index
5225 one_d_indices = k * n + l # 2D to 1D indexing
5226 k = np.max(one_d_indices) // n
5227 l = np.max(one_d_indices) % n
5228 opt_scale = [k+1, l+1] # adding 1s to match R indexing
5230 return stat, opt_scale
5233def _two_sample_transform(u, v):
5234 """
5235 Helper function that concatenates x and y for two sample MGC stat. See
5236 above for use.
5238 Parameters
5239 ----------
5240 u, v : ndarray
5241 `u` and `v` have shapes `(n, p)` and `(m, p)`.
5243 Returns
5244 -------
5245 x : ndarray
5246 Concatenation of `u` and `v` along ``axis=0``. `x` thus has shape
5247 `(n + m, p)`.
5248 y : ndarray
5249 Label matrix for `x` where 0 refers to samples that come from `u` and
5250 1 refers to samples that come from `v`. `y` thus has shape `(n + m, 1)`.
5251 """
5252 nx = u.shape[0]
5253 ny = v.shape[0]
5254 x = np.concatenate([u, v], axis=0)
5255 y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
5256 return x, y
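# Illustrative sketch (my own example, not part of the library): with
# u of shape (3, 2) and v of shape (2, 2),
#
#   u = np.zeros((3, 2))
#   v = np.ones((2, 2))
#   x, y = _two_sample_transform(u, v)
#   # x.shape == (5, 2) and y.ravel() == array([0., 0., 0., 1., 1.])
#
# so the two-sample comparison becomes an independence test between the
# pooled observations `x` and the group labels `y`.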
5259#####################################
5260# INFERENTIAL STATISTICS #
5261#####################################
5263Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
5266def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'):
5267 """
5268 Calculate the T-test for the mean of ONE group of scores.
5270 This is a two-sided test for the null hypothesis that the expected value
5271 (mean) of a sample of independent observations `a` is equal to the given
5272 population mean, `popmean`.
5274 Parameters
5275 ----------
5276 a : array_like
5277 Sample observation.
5278 popmean : float or array_like
5279 Expected value in null hypothesis. If array_like, then it must have the
5280 same shape as `a` excluding the axis dimension.
5281 axis : int or None, optional
5282 Axis along which to compute test. If None, compute over the whole
5283 array `a`.
5284 nan_policy : {'propagate', 'raise', 'omit'}, optional
5285 Defines how to handle when input contains nan.
5286 The following options are available (default is 'propagate'):
5288 * 'propagate': returns nan
5289 * 'raise': throws an error
5290 * 'omit': performs the calculations ignoring nan values
5292 Returns
5293 -------
5294 statistic : float or array
5295 t-statistic.
5296 pvalue : float or array
5297 Two-sided p-value.
5299 Examples
5300 --------
5301 >>> from scipy import stats
5303 >>> np.random.seed(7654567) # fix seed to get the same result
5304 >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2))
5306 Test if mean of random sample is equal to true mean, and different mean.
5307 We reject the null hypothesis in the second case and don't reject it in
5308 the first case.
5310 >>> stats.ttest_1samp(rvs,5.0)
5311 (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674]))
5312 >>> stats.ttest_1samp(rvs,0.0)
5313 (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999]))
5315 Examples using axis and non-scalar dimension for population mean.
5317 >>> stats.ttest_1samp(rvs,[5.0,0.0])
5318 (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04]))
5319 >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1)
5320 (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04]))
5321 >>> stats.ttest_1samp(rvs,[[5.0],[0.0]])
5322 (array([[-0.68014479, -0.04323899],
5323 [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01],
5324 [ 7.89094663e-03, 1.49986458e-04]]))
5326 """
5327 a, axis = _chk_asarray(a, axis)
5329 contains_nan, nan_policy = _contains_nan(a, nan_policy)
5331 if contains_nan and nan_policy == 'omit':
5332 a = ma.masked_invalid(a)
5333 return mstats_basic.ttest_1samp(a, popmean, axis)
5335 n = a.shape[axis]
5336 df = n - 1
5338 d = np.mean(a, axis) - popmean
5339 v = np.var(a, axis, ddof=1)
5340 denom = np.sqrt(v / n)
5342 with np.errstate(divide='ignore', invalid='ignore'):
5343 t = np.divide(d, denom)
5344 t, prob = _ttest_finish(df, t)
5346 return Ttest_1sampResult(t, prob)
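# Hand computation sketch mirroring the arithmetic above (my own example):
# for a = [1., 2., 3., 4., 5.] and popmean = 2.5,
#   d     = mean(a) - popmean   = 3.0 - 2.5       = 0.5
#   v     = var(a, ddof=1)      = 2.5
#   denom = sqrt(v / n)         = sqrt(2.5 / 5)   ~= 0.7071
#   t     = d / denom           ~= 0.7071         with df = n - 1 = 4
# which is what Ttest_1sampResult reports for this input.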
5349def _ttest_finish(df, t):
5350 """Common code between all 3 t-test functions."""
5351 prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail
5352 if t.ndim == 0:
5353 t = t[()]
5355 return t, prob
5358def _ttest_ind_from_stats(mean1, mean2, denom, df):
5360 d = mean1 - mean2
5361 with np.errstate(divide='ignore', invalid='ignore'):
5362 t = np.divide(d, denom)
5363 t, prob = _ttest_finish(df, t)
5365 return (t, prob)
5368def _unequal_var_ttest_denom(v1, n1, v2, n2):
5369 vn1 = v1 / n1
5370 vn2 = v2 / n2
5371 with np.errstate(divide='ignore', invalid='ignore'):
5372 df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
5374 # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
5375 # Hence it doesn't matter what df is as long as it's not NaN.
5376 df = np.where(np.isnan(df), 1, df)
5377 denom = np.sqrt(vn1 + vn2)
5378 return df, denom
5381def _equal_var_ttest_denom(v1, n1, v2, n2):
5382 df = n1 + n2 - 2.0
5383 svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
5384 denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
5385 return df, denom
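# Small numerical check of the two denominator helpers above (my own sketch):
# with equal sample sizes and equal variances the pooled (equal_var=True) and
# Welch (equal_var=False) formulas coincide, and they diverge otherwise.
#
#   df_eq, den_eq = _equal_var_ttest_denom(4.0, 10, 4.0, 10)
#   df_w,  den_w  = _unequal_var_ttest_denom(4.0, 10, 4.0, 10)
#   # den_eq == den_w == sqrt(4/10 + 4/10) ~= 0.8944
#   # df_eq == 18.0, and the Welch-Satterthwaite df also evaluates to 18.0
#   # in this symmetric case.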
5388Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
5391def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2,
5392 equal_var=True):
5393 r"""
5394 T-test for means of two independent samples from descriptive statistics.
5396 This is a two-sided test for the null hypothesis that two independent
5397 samples have identical average (expected) values.
5399 Parameters
5400 ----------
5401 mean1 : array_like
5402 The mean(s) of sample 1.
5403 std1 : array_like
5404 The standard deviation(s) of sample 1.
5405 nobs1 : array_like
5406 The number(s) of observations of sample 1.
5407 mean2 : array_like
5408 The mean(s) of sample 2.
5409 std2 : array_like
5410 The standard deviation(s) of sample 2.
5411 nobs2 : array_like
5412 The number(s) of observations of sample 2.
5413 equal_var : bool, optional
5414 If True (default), perform a standard independent 2 sample test
5415 that assumes equal population variances [1]_.
5416 If False, perform Welch's t-test, which does not assume equal
5417 population variance [2]_.
5419 Returns
5420 -------
5421 statistic : float or array
5422 The calculated t-statistics.
5423 pvalue : float or array
5424 The two-tailed p-value.
5426 See Also
5427 --------
5428 scipy.stats.ttest_ind
5430 Notes
5431 -----
5432 .. versionadded:: 0.16.0
5434 References
5435 ----------
5436 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
5438 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
5440 Examples
5441 --------
5442 Suppose we have the summary data for two samples, as follows::
5444                 Sample   Sample
5445          Size    Mean    Variance
5446 Sample 1   13    15.0      87.5
5447 Sample 2   11    12.0      39.0
5449 Apply the t-test to this data (with the assumption that the population
5450 variances are equal):
5452 >>> from scipy.stats import ttest_ind_from_stats
5453 >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
5454 ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11)
5455 Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487)
5457 For comparison, here is the data from which those summary statistics
5458 were taken. With this data, we can compute the same result using
5459 `scipy.stats.ttest_ind`:
5461 >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26])
5462 >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21])
5463 >>> from scipy.stats import ttest_ind
5464 >>> ttest_ind(a, b)
5465 Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486)
5467 Suppose we instead have binary data and would like to apply a t-test to
5468 compare the proportion of 1s in two independent groups::
5470              Number of   Sample   Sample
5471       Size     ones       Mean    Variance
5472 Sample 1   150     30       0.2      0.16
5473 Sample 2   200     45       0.225    0.174375
5475 The sample mean :math:`\hat{p}` is the proportion of ones in the sample
5476 and the variance for a binary observation is estimated by
5477 :math:`\hat{p}(1-\hat{p})`.
5479 >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.16), nobs1=150,
5480 ... mean2=0.225, std2=np.sqrt(0.17437), nobs2=200)
5481 Ttest_indResult(statistic=-0.564327545549774, pvalue=0.5728947691244874)
5483 For comparison, we could compute the t statistic and p-value using
5484 arrays of 0s and 1s and `scipy.stats.ttest_ind`, as above.
5486 >>> group1 = np.array([1]*30 + [0]*(150-30))
5487 >>> group2 = np.array([1]*45 + [0]*(200-45))
5488 >>> ttest_ind(group1, group2)
5489 Ttest_indResult(statistic=-0.5627179589855622, pvalue=0.573989277115258)
5491 """
5492 if equal_var:
5493 df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)
5494 else:
5495 df, denom = _unequal_var_ttest_denom(std1**2, nobs1,
5496 std2**2, nobs2)
5498 res = _ttest_ind_from_stats(mean1, mean2, denom, df)
5499 return Ttest_indResult(*res)
5502def _ttest_nans(a, b, axis, namedtuple_type):
5503 """
5504 Generate an array of `nan`, with shape determined by `a`, `b` and `axis`.
5506 This function is used by ttest_ind and ttest_rel to create the return
5507 value when one of the inputs has size 0.
5509 The shapes of the arrays are determined by dropping `axis` from the
5510 shapes of `a` and `b` and broadcasting what is left.
5512 The return value is a named tuple of the type given in `namedtuple_type`.
5514 Examples
5515 --------
5516 >>> a = np.zeros((9, 2))
5517 >>> b = np.zeros((5, 1))
5518 >>> _ttest_nans(a, b, 0, Ttest_indResult)
5519 Ttest_indResult(statistic=array([nan, nan]), pvalue=array([nan, nan]))
5521 >>> a = np.zeros((3, 0, 9))
5522 >>> b = np.zeros((1, 10))
5523 >>> stat, p = _ttest_nans(a, b, -1, Ttest_indResult)
5524 >>> stat
5525 array([], shape=(3, 0), dtype=float64)
5526 >>> p
5527 array([], shape=(3, 0), dtype=float64)
5529 >>> a = np.zeros(10)
5530 >>> b = np.zeros(7)
5531 >>> _ttest_nans(a, b, 0, Ttest_indResult)
5532 Ttest_indResult(statistic=nan, pvalue=nan)
5533 """
5534 shp = _broadcast_shapes_with_dropped_axis(a, b, axis)
5535 if len(shp) == 0:
5536 t = np.nan
5537 p = np.nan
5538 else:
5539 t = np.full(shp, fill_value=np.nan)
5540 p = t.copy()
5541 return namedtuple_type(t, p)
5544def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate'):
5545 """
5546 Calculate the T-test for the means of *two independent* samples of scores.
5548 This is a two-sided test for the null hypothesis that 2 independent samples
5549 have identical average (expected) values. This test assumes that the
5550 populations have identical variances by default.
5552 Parameters
5553 ----------
5554 a, b : array_like
5555 The arrays must have the same shape, except in the dimension
5556 corresponding to `axis` (the first, by default).
5557 axis : int or None, optional
5558 Axis along which to compute test. If None, compute over the whole
5559 arrays, `a`, and `b`.
5560 equal_var : bool, optional
5561 If True (default), perform a standard independent 2 sample test
5562 that assumes equal population variances [1]_.
5563 If False, perform Welch's t-test, which does not assume equal
5564 population variance [2]_.
5566 .. versionadded:: 0.11.0
5567 nan_policy : {'propagate', 'raise', 'omit'}, optional
5568 Defines how to handle when input contains nan.
5569 The following options are available (default is 'propagate'):
5571 * 'propagate': returns nan
5572 * 'raise': throws an error
5573 * 'omit': performs the calculations ignoring nan values
5575 Returns
5576 -------
5577 statistic : float or array
5578 The calculated t-statistic.
5579 pvalue : float or array
5580 The two-tailed p-value.
5582 Notes
5583 -----
5584 We can use this test, if we observe two independent samples from
5585 the same or different population, e.g. exam scores of boys and
5586 girls or of two ethnic groups. The test measures whether the
5587 average (expected) value differs significantly across samples. If
5588 we observe a large p-value, for example larger than 0.05 or 0.1,
5589 then we cannot reject the null hypothesis of identical average scores.
5590 If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%,
5591 then we reject the null hypothesis of equal averages.
5593 References
5594 ----------
5595 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
5597 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
5599 Examples
5600 --------
5601 >>> from scipy import stats
5602 >>> np.random.seed(12345678)
5604 Test with sample with identical means:
5606 >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500)
5607 >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500)
5608 >>> stats.ttest_ind(rvs1,rvs2)
5609 (0.26833823296239279, 0.78849443369564776)
5610 >>> stats.ttest_ind(rvs1,rvs2, equal_var = False)
5611 (0.26833823296239279, 0.78849452749500748)
5613 `ttest_ind` underestimates p for unequal variances:
5615 >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500)
5616 >>> stats.ttest_ind(rvs1, rvs3)
5617 (-0.46580283298287162, 0.64145827413436174)
5618 >>> stats.ttest_ind(rvs1, rvs3, equal_var = False)
5619 (-0.46580283298287162, 0.64149646246569292)
5621 When n1 != n2, the equal variance t-statistic is no longer equal to the
5622 unequal variance t-statistic:
5624 >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100)
5625 >>> stats.ttest_ind(rvs1, rvs4)
5626 (-0.99882539442782481, 0.3182832709103896)
5627 >>> stats.ttest_ind(rvs1, rvs4, equal_var = False)
5628 (-0.69712570584654099, 0.48716927725402048)
5630 T-test with different means, variance, and n:
5632 >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100)
5633 >>> stats.ttest_ind(rvs1, rvs5)
5634 (-1.4679669854490653, 0.14263895620529152)
5635 >>> stats.ttest_ind(rvs1, rvs5, equal_var = False)
5636 (-0.94365973617132992, 0.34744170334794122)
5638 """
5639 a, b, axis = _chk2_asarray(a, b, axis)
5641 # check both a and b
5642 cna, npa = _contains_nan(a, nan_policy)
5643 cnb, npb = _contains_nan(b, nan_policy)
5644 contains_nan = cna or cnb
5645 if npa == 'omit' or npb == 'omit':
5646 nan_policy = 'omit'
5648 if contains_nan and nan_policy == 'omit':
5649 a = ma.masked_invalid(a)
5650 b = ma.masked_invalid(b)
5651 return mstats_basic.ttest_ind(a, b, axis, equal_var)
5653 if a.size == 0 or b.size == 0:
5654 return _ttest_nans(a, b, axis, Ttest_indResult)
5656 v1 = np.var(a, axis, ddof=1)
5657 v2 = np.var(b, axis, ddof=1)
5658 n1 = a.shape[axis]
5659 n2 = b.shape[axis]
5661 if equal_var:
5662 df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
5663 else:
5664 df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)
5666 res = _ttest_ind_from_stats(np.mean(a, axis), np.mean(b, axis), denom, df)
5668 return Ttest_indResult(*res)
5671def _get_len(a, axis, msg):
5672 try:
5673 n = a.shape[axis]
5674 except IndexError:
5675 raise np.AxisError(axis, a.ndim, msg) from None
5676 return n
5679Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
5682def ttest_rel(a, b, axis=0, nan_policy='propagate'):
5683 """
5684 Calculate the t-test on TWO RELATED samples of scores, a and b.
5686 This is a two-sided test for the null hypothesis that 2 related or
5687 repeated samples have identical average (expected) values.
5689 Parameters
5690 ----------
5691 a, b : array_like
5692 The arrays must have the same shape.
5693 axis : int or None, optional
5694 Axis along which to compute test. If None, compute over the whole
5695 arrays, `a`, and `b`.
5696 nan_policy : {'propagate', 'raise', 'omit'}, optional
5697 Defines how to handle when input contains nan.
5698 The following options are available (default is 'propagate'):
5700 * 'propagate': returns nan
5701 * 'raise': throws an error
5702 * 'omit': performs the calculations ignoring nan values
5704 Returns
5705 -------
5706 statistic : float or array
5707 t-statistic.
5708 pvalue : float or array
5709 Two-sided p-value.
5711 Notes
5712 -----
5713 Examples for use are scores of the same set of students in
5714 different exams, or repeated sampling from the same units. The
5715 test measures whether the average score differs significantly
5716 across samples (e.g. exams). If we observe a large p-value, for
5717 example greater than 0.05 or 0.1 then we cannot reject the null
5718 hypothesis of identical average scores. If the p-value is smaller
5719 than the threshold, e.g. 1%, 5% or 10%, then we reject the null
5720 hypothesis of equal averages. Small p-values are associated with
5721 large t-statistics.
5723 References
5724 ----------
5725 https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples
5727 Examples
5728 --------
5729 >>> from scipy import stats
5730 >>> np.random.seed(12345678) # fix random seed to get same numbers
5732 >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500)
5733 >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) +
5734 ... stats.norm.rvs(scale=0.2,size=500))
5735 >>> stats.ttest_rel(rvs1,rvs2)
5736 (0.24101764965300962, 0.80964043445811562)
5737 >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) +
5738 ... stats.norm.rvs(scale=0.2,size=500))
5739 >>> stats.ttest_rel(rvs1,rvs3)
5740 (-3.9995108708727933, 7.3082402191726459e-005)
5742 """
5743 a, b, axis = _chk2_asarray(a, b, axis)
5745 cna, npa = _contains_nan(a, nan_policy)
5746 cnb, npb = _contains_nan(b, nan_policy)
5747 contains_nan = cna or cnb
5748 if npa == 'omit' or npb == 'omit':
5749 nan_policy = 'omit'
5751 if contains_nan and nan_policy == 'omit':
5752 a = ma.masked_invalid(a)
5753 b = ma.masked_invalid(b)
5754 m = ma.mask_or(ma.getmask(a), ma.getmask(b))
5755 aa = ma.array(a, mask=m, copy=True)
5756 bb = ma.array(b, mask=m, copy=True)
5757 return mstats_basic.ttest_rel(aa, bb, axis)
5759 na = _get_len(a, axis, "first argument")
5760 nb = _get_len(b, axis, "second argument")
5761 if na != nb:
5762 raise ValueError('unequal length arrays')
5764 if na == 0:
5765 return _ttest_nans(a, b, axis, Ttest_relResult)
5767 n = a.shape[axis]
5768 df = n - 1
5770 d = (a - b).astype(np.float64)
5771 v = np.var(d, axis, ddof=1)
5772 dm = np.mean(d, axis)
5773 denom = np.sqrt(v / n)
5775 with np.errstate(divide='ignore', invalid='ignore'):
5776 t = np.divide(dm, denom)
5777 t, prob = _ttest_finish(df, t)
5779 return Ttest_relResult(t, prob)
5782# Map from names to lambda_ values used in power_divergence().
5783_power_div_lambda_names = {
5784 "pearson": 1,
5785 "log-likelihood": 0,
5786 "freeman-tukey": -0.5,
5787 "mod-log-likelihood": -1,
5788 "neyman": -2,
5789 "cressie-read": 2/3,
5790}
5793def _count(a, axis=None):
5794 """
5795 Count the number of non-masked elements of an array.
5797 This function behaves like np.ma.count(), but is much faster
5798 for ndarrays.
5799 """
5800 if hasattr(a, 'count'):
5801 num = a.count(axis=axis)
5802 if isinstance(num, np.ndarray) and num.ndim == 0:
5803 # In some cases, the `count` method returns a scalar array (e.g.
5804 # np.array(3)), but we want a plain integer.
5805 num = int(num)
5806 else:
5807 if axis is None:
5808 num = a.size
5809 else:
5810 num = a.shape[axis]
5811 return num
5814Power_divergenceResult = namedtuple('Power_divergenceResult',
5815 ('statistic', 'pvalue'))
5818def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
5819 """
5820 Cressie-Read power divergence statistic and goodness of fit test.
5822 This function tests the null hypothesis that the categorical data
5823 has the given frequencies, using the Cressie-Read power divergence
5824 statistic.
5826 Parameters
5827 ----------
5828 f_obs : array_like
5829 Observed frequencies in each category.
5830 f_exp : array_like, optional
5831 Expected frequencies in each category. By default the categories are
5832 assumed to be equally likely.
5833 ddof : int, optional
5834 "Delta degrees of freedom": adjustment to the degrees of freedom
5835 for the p-value. The p-value is computed using a chi-squared
5836 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
5837 is the number of observed frequencies. The default value of `ddof`
5838 is 0.
5839 axis : int or None, optional
5840 The axis of the broadcast result of `f_obs` and `f_exp` along which to
5841 apply the test. If axis is None, all values in `f_obs` are treated
5842 as a single data set. Default is 0.
5843 lambda_ : float or str, optional
5844 The power in the Cressie-Read power divergence statistic. The default
5845 is 1. For convenience, `lambda_` may be assigned one of the following
5846 strings, in which case the corresponding numerical value is used::
5848 String                Value   Description
5849 "pearson"               1     Pearson's chi-squared statistic.
5850                               In this case, the function is
5851                               equivalent to `stats.chisquare`.
5852 "log-likelihood"        0     Log-likelihood ratio. Also known as
5853                               the G-test [3]_.
5854 "freeman-tukey"       -1/2    Freeman-Tukey statistic.
5855 "mod-log-likelihood"   -1     Modified log-likelihood ratio.
5856 "neyman"               -2     Neyman's statistic.
5857 "cressie-read"         2/3    The power recommended in [5]_.
5859 Returns
5860 -------
5861 statistic : float or ndarray
5862 The Cressie-Read power divergence test statistic. The value is
5863 a float if `axis` is None or if `f_obs` and `f_exp` are 1-D.
5864 pvalue : float or ndarray
5865 The p-value of the test. The value is a float if `ddof` and the
5866 return value `stat` are scalars.
5868 See Also
5869 --------
5870 chisquare
5872 Notes
5873 -----
5874 This test is invalid when the observed or expected frequencies in each
5875 category are too small. A typical rule is that all of the observed
5876 and expected frequencies should be at least 5.
5878 When `lambda_` is less than zero, the formula for the statistic involves
5879 dividing by `f_obs`, so a warning or error may be generated if any value
5880 in `f_obs` is 0.
5882 Similarly, a warning or error may be generated if any value in `f_exp` is
5883 zero when `lambda_` >= 0.
5885 The default degrees of freedom, k-1, are for the case when no parameters
5886 of the distribution are estimated. If p parameters are estimated by
5887 efficient maximum likelihood then the correct degrees of freedom are
5888 k-1-p. If the parameters are estimated in a different way, then the
5889 dof can be between k-1-p and k-1. However, it is also possible that
5890 the asymptotic distribution is not a chisquare, in which case this
5891 test is not appropriate.
5893 This function handles masked arrays. If an element of `f_obs` or `f_exp`
5894 is masked, then data at that position is ignored, and does not count
5895 towards the size of the data set.
5897 .. versionadded:: 0.13.0
5899 References
5900 ----------
5901 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
5902 Statistics". Chapter 8.
5903 https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
5904 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
5905 .. [3] "G-test", https://en.wikipedia.org/wiki/G-test
5906 .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
5907 practice of statistics in biological research", New York: Freeman
5908 (1981)
5909 .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
5910 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
5911 pp. 440-464.
5913 Examples
5914 --------
5915 (See `chisquare` for more examples.)
5917 When just `f_obs` is given, it is assumed that the expected frequencies
5918 are uniform and given by the mean of the observed frequencies. Here we
5919 perform a G-test (i.e. use the log-likelihood ratio statistic):
5921 >>> from scipy.stats import power_divergence
5922 >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood')
5923 (2.006573162632538, 0.84823476779463769)
5925 The expected frequencies can be given with the `f_exp` argument:
5927 >>> power_divergence([16, 18, 16, 14, 12, 12],
5928 ... f_exp=[16, 16, 16, 16, 16, 8],
5929 ... lambda_='log-likelihood')
5930 (3.3281031458963746, 0.6495419288047497)
5932 When `f_obs` is 2-D, by default the test is applied to each column.
5934 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
5935 >>> obs.shape
5936 (6, 2)
5937 >>> power_divergence(obs, lambda_="log-likelihood")
5938 (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225]))
5940 By setting ``axis=None``, the test is applied to all data in the array,
5941 which is equivalent to applying the test to the flattened array.
5943 >>> power_divergence(obs, axis=None)
5944 (23.31034482758621, 0.015975692534127565)
5945 >>> power_divergence(obs.ravel())
5946 (23.31034482758621, 0.015975692534127565)
5948 `ddof` is the change to make to the default degrees of freedom.
5950 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1)
5951 (2.0, 0.73575888234288467)
5953 The calculation of the p-values is done by broadcasting the
5954 test statistic with `ddof`.
5956 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
5957 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
5959 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
5960 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
5961 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
5962 statistics, we must use ``axis=1``:
5964 >>> power_divergence([16, 18, 16, 14, 12, 12],
5965 ... f_exp=[[16, 16, 16, 16, 16, 8],
5966 ... [8, 20, 20, 16, 12, 12]],
5967 ... axis=1)
5968 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
5970 """
5971 # Convert the input argument `lambda_` to a numerical value.
5972 if isinstance(lambda_, str):
5973 if lambda_ not in _power_div_lambda_names:
5974 names = repr(list(_power_div_lambda_names.keys()))[1:-1]
5975 raise ValueError("invalid string for lambda_: {0!r}. Valid strings "
5976 "are {1}".format(lambda_, names))
5977 lambda_ = _power_div_lambda_names[lambda_]
5978 elif lambda_ is None:
5979 lambda_ = 1
5981 f_obs = np.asanyarray(f_obs)
5983 if f_exp is not None:
5984 f_exp = np.asanyarray(f_exp)
5985 else:
5986 # Ignore 'invalid' errors so the edge case of a data set with length 0
5987 # is handled without spurious warnings.
5988 with np.errstate(invalid='ignore'):
5989 f_exp = f_obs.mean(axis=axis, keepdims=True)
5991 # `terms` is the array of terms that are summed along `axis` to create
5992 # the test statistic. We use some specialized code for a few special
5993 # cases of lambda_.
5994 if lambda_ == 1:
5995 # Pearson's chi-squared statistic
5996 terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp
5997 elif lambda_ == 0:
5998 # Log-likelihood ratio (i.e. G-test)
5999 terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
6000 elif lambda_ == -1:
6001 # Modified log-likelihood ratio
6002 terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
6003 else:
6004 # General Cressie-Read power divergence.
6005 terms = f_obs * ((f_obs / f_exp)**lambda_ - 1)
6006 terms /= 0.5 * lambda_ * (lambda_ + 1)
6008 stat = terms.sum(axis=axis)
6010 num_obs = _count(terms, axis=axis)
6011 ddof = asarray(ddof)
6012 p = distributions.chi2.sf(stat, num_obs - 1 - ddof)
6014 return Power_divergenceResult(stat, p)
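# Quick self-consistency sketch (my own check, not library code): for
# lambda_ = 1 the general Cressie-Read branch above reduces algebraically to
# Pearson's chi-squared, so the two code paths agree numerically.
#
#   f_obs = np.array([16., 18., 16., 14., 12., 12.])
#   f_exp = np.full_like(f_obs, f_obs.mean())
#   pearson = ((f_obs - f_exp) ** 2 / f_exp).sum()
#   lam = 1.0
#   general = (f_obs * ((f_obs / f_exp) ** lam - 1)).sum() / (0.5 * lam * (lam + 1))
#   # pearson == general == 2.0, matching chisquare([16, 18, 16, 14, 12, 12]).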
6017def chisquare(f_obs, f_exp=None, ddof=0, axis=0):
6018 """
6019 Calculate a one-way chi-square test.
6021 The chi-square test tests the null hypothesis that the categorical data
6022 has the given frequencies.
6024 Parameters
6025 ----------
6026 f_obs : array_like
6027 Observed frequencies in each category.
6028 f_exp : array_like, optional
6029 Expected frequencies in each category. By default the categories are
6030 assumed to be equally likely.
6031 ddof : int, optional
6032 "Delta degrees of freedom": adjustment to the degrees of freedom
6033 for the p-value. The p-value is computed using a chi-squared
6034 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
6035 is the number of observed frequencies. The default value of `ddof`
6036 is 0.
6037 axis : int or None, optional
6038 The axis of the broadcast result of `f_obs` and `f_exp` along which to
6039 apply the test. If axis is None, all values in `f_obs` are treated
6040 as a single data set. Default is 0.
6042 Returns
6043 -------
6044 chisq : float or ndarray
6045 The chi-squared test statistic. The value is a float if `axis` is
6046 None or `f_obs` and `f_exp` are 1-D.
6047 p : float or ndarray
6048 The p-value of the test. The value is a float if `ddof` and the
6049 return value `chisq` are scalars.
6051 See Also
6052 --------
6053 scipy.stats.power_divergence
6055 Notes
6056 -----
6057 This test is invalid when the observed or expected frequencies in each
6058 category are too small. A typical rule is that all of the observed
6059 and expected frequencies should be at least 5.
6061 The default degrees of freedom, k-1, are for the case when no parameters
6062 of the distribution are estimated. If p parameters are estimated by
6063 efficient maximum likelihood then the correct degrees of freedom are
6064 k-1-p. If the parameters are estimated in a different way, then the
6065 dof can be between k-1-p and k-1. However, it is also possible that
6066 the asymptotic distribution is not chi-square, in which case this test
6067 is not appropriate.
6069 References
6070 ----------
6071 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
6072 Statistics". Chapter 8.
6073 https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html
6074 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
6076 Examples
6077 --------
6078 When just `f_obs` is given, it is assumed that the expected frequencies
6079 are uniform and given by the mean of the observed frequencies.
6081 >>> from scipy.stats import chisquare
6082 >>> chisquare([16, 18, 16, 14, 12, 12])
6083 (2.0, 0.84914503608460956)
6085 With `f_exp` the expected frequencies can be given.
6087 >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
6088 (3.5, 0.62338762774958223)
6090 When `f_obs` is 2-D, by default the test is applied to each column.
6092 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
6093 >>> obs.shape
6094 (6, 2)
6095 >>> chisquare(obs)
6096 (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415]))
6098 By setting ``axis=None``, the test is applied to all data in the array,
6099 which is equivalent to applying the test to the flattened array.
6101 >>> chisquare(obs, axis=None)
6102 (23.31034482758621, 0.015975692534127565)
6103 >>> chisquare(obs.ravel())
6104 (23.31034482758621, 0.015975692534127565)
6106 `ddof` is the change to make to the default degrees of freedom.
6108 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1)
6109 (2.0, 0.73575888234288467)
6111 The calculation of the p-values is done by broadcasting the
6112 chi-squared statistic with `ddof`.
6114 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
6115 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
6117 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
6118 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
6119 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
6120 statistics, we use ``axis=1``:
6122 >>> chisquare([16, 18, 16, 14, 12, 12],
6123 ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]],
6124 ... axis=1)
6125 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
6127 """
6128 return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
6129 lambda_="pearson")
6132KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue'))
6135def _compute_dplus(cdfvals):
6136 """Computes D+ as used in the Kolmogorov-Smirnov test.
6138 Parameters
6139 ----------
6140 cdfvals: array_like
6141 Sorted array of CDF values between 0 and 1
6143 Returns
6144 -------
6145 Maximum distance of the CDF values below Uniform(0, 1)
6146"""
6147 n = len(cdfvals)
6148 return (np.arange(1.0, n + 1) / n - cdfvals).max()
6151def _compute_dminus(cdfvals):
6152 """Computes D- as used in the Kolmogorov-Smirnov test.
6154 Parameters
6155 ----------
6156 cdfvals: array_like
6157 Sorted array of CDF values between 0 and 1
6159 Returns
6160 -------
6161 Maximum distance of the CDF values above Uniform(0, 1)
6162 """
6163 n = len(cdfvals)
6164 return (cdfvals - np.arange(0.0, n)/n).max()
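# Tiny worked sketch for the two helpers above (my own example): with three
# sorted observations whose model CDF values are [0.1, 0.4, 0.7], the
# empirical CDF steps through 1/3, 2/3, 1, so
#
#   cdfvals = np.array([0.1, 0.4, 0.7])
#   _compute_dplus(cdfvals)    # max(i/n - cdf_i)     = 1.0 - 0.7 = 0.3
#   _compute_dminus(cdfvals)   # max(cdf_i - (i-1)/n) = 0.1 - 0.0 = 0.1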
6167def ks_1samp(x, cdf, args=(), alternative='two-sided', mode='auto'):
6168 """
6169 Performs the Kolmogorov-Smirnov test for goodness of fit.
6171 This performs a test of the distribution F(x) of an observed
6172 random variable against a given distribution G(x). Under the null
6173 hypothesis, the two distributions are identical, F(x)=G(x). The
6174 alternative hypothesis can be either 'two-sided' (default), 'less'
6175 or 'greater'. The KS test is only valid for continuous distributions.
6177 Parameters
6178 ----------
6179 x : array_like
6180 a 1-D array of observations of iid random variables.
6181 cdf : callable
6182 callable used to calculate the cdf.
6183 args : tuple, sequence, optional
6184 Distribution parameters, used with `cdf`.
6185 alternative : {'two-sided', 'less', 'greater'}, optional
6186 Defines the alternative hypothesis.
6187 The following options are available (default is 'two-sided'):
6189 * 'two-sided'
6190 * 'less': one-sided, see explanation in Notes
6191 * 'greater': one-sided, see explanation in Notes
6192 mode : {'auto', 'exact', 'approx', 'asymp'}, optional
6193 Defines the distribution used for calculating the p-value.
6194 The following options are available (default is 'auto'):
6196 * 'auto' : selects one of the other options.
6197 * 'exact' : uses the exact distribution of test statistic.
6198 * 'approx' : approximates the two-sided probability with twice the one-sided probability
6199 * 'asymp': uses asymptotic distribution of test statistic
6201 Returns
6202 -------
6203 statistic : float
6204 KS test statistic, either D, D+ or D- (depending on the value of 'alternative')
6205 pvalue : float
6206 One-tailed or two-tailed p-value.
6208 See Also
6209 --------
6210 ks_2samp, kstest
6212 Notes
6213 -----
6214 In the one-sided test, the alternative is that the empirical
6215 cumulative distribution function of the random variable is "less"
6216 or "greater" than the cumulative distribution function G(x) of the
6217 hypothesis, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``.
6219 Examples
6220 --------
6221 >>> from scipy import stats
6223 >>> x = np.linspace(-15, 15, 9)
6224 >>> stats.ks_1samp(x, stats.norm.cdf)
6225 (0.44435602715924361, 0.038850142705171065)
6227 >>> np.random.seed(987654321) # set random seed to get the same result
6228 >>> stats.ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf)
6229 (0.058352892479417884, 0.8653960860778898)
6231 *Test against one-sided alternative hypothesis*
6233 Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``:
6235 >>> np.random.seed(987654321)
6236 >>> x = stats.norm.rvs(loc=0.2, size=100)
6237 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less')
6238 (0.12464329735846891, 0.040989164077641749)
6240 Reject equal distribution against alternative hypothesis: less
6242 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='greater')
6243 (0.0072115233216311081, 0.98531158590396395)
6245 Don't reject equal distribution against alternative hypothesis: greater
6247 >>> stats.ks_1samp(x, stats.norm.cdf)
6248 (0.12464329735846891, 0.08197335233541582)
6250 Don't reject equal distribution against alternative hypothesis: two-sided
6252 *Testing t distributed random variables against normal distribution*
6254 With 100 degrees of freedom the t distribution looks close to the normal
6255 distribution, and the K-S test does not reject the hypothesis that the
6256 sample came from the normal distribution:
6258 >>> np.random.seed(987654321)
6259 >>> stats.ks_1samp(stats.t.rvs(100,size=100), stats.norm.cdf)
6260 (0.072018929165471257, 0.6505883498379312)
6262 With 3 degrees of freedom the t distribution looks sufficiently different
6263 from the normal distribution, that we can reject the hypothesis that the
6264 sample came from the normal distribution at the 10% level:
6266 >>> np.random.seed(987654321)
6267 >>> stats.ks_1samp(stats.t.rvs(3,size=100), stats.norm.cdf)
6268 (0.131016895759829, 0.058826222555312224)
6270 """
6271 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
6272 alternative.lower()[0], alternative)
6273 if alternative not in ['two-sided', 'greater', 'less']:
6274 raise ValueError("Unexpected alternative %s" % alternative)
6275 if np.ma.is_masked(x):
6276 x = x.compressed()
6278 N = len(x)
6279 x = np.sort(x)
6280 cdfvals = cdf(x, *args)
6282 if alternative == 'greater':
6283 Dplus = _compute_dplus(cdfvals)
6284 return KstestResult(Dplus, distributions.ksone.sf(Dplus, N))
6286 if alternative == 'less':
6287 Dminus = _compute_dminus(cdfvals)
6288 return KstestResult(Dminus, distributions.ksone.sf(Dminus, N))
6290 # alternative == 'two-sided':
6291 Dplus = _compute_dplus(cdfvals)
6292 Dminus = _compute_dminus(cdfvals)
6293 D = np.max([Dplus, Dminus])
6294 if mode == 'auto': # Always select exact
6295 mode = 'exact'
6296 if mode == 'exact':
6297 prob = distributions.kstwo.sf(D, N)
6298 elif mode == 'asymp':
6299 prob = distributions.kstwobign.sf(D * np.sqrt(N))
6300 else:
6301 # mode == 'approx'
6302 prob = 2 * distributions.ksone.sf(D, N)
6303 prob = np.clip(prob, 0, 1)
6304 return KstestResult(D, prob)
6307Ks_2sampResult = KstestResult
6310def _compute_prob_inside_method(m, n, g, h):
6311 """
6312 Count the proportion of paths that stay strictly inside two diagonal lines.
6314 Parameters
6315 ----------
6316 m : integer
6317 m > 0
6318 n : integer
6319 n > 0
6320 g : integer
6321 g is greatest common divisor of m and n
6322 h : integer
6323 0 <= h <= lcm(m,n)
6325 Returns
6326 -------
6327 p : float
6328 The proportion of paths that stay inside the two lines.
6331 Count the integer lattice paths from (0, 0) to (m, n) which satisfy
6332 |x/m - y/n| < h / lcm(m, n).
6333 The paths make steps of size +1 in either positive x or positive y directions.
6335 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
6336 Hodges, J.L. Jr.,
6337 "The Significance Probability of the Smirnov Two-Sample Test,"
6338 Arkiv för Matematik, 3, No. 43 (1958), 469-86.
6340 """
6341 # Probability is symmetrical in m, n. Computation below uses m >= n.
6342 if m < n:
6343 m, n = n, m
6344 mg = m // g
6345 ng = n // g
6347 # Count the integer lattice paths from (0, 0) to (m, n) which satisfy
6348 # |nx/g - my/g| < h.
6349 # Compute matrix A such that:
6350 # A(x, 0) = A(0, y) = 1
6351 # A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that
6352 # A(x, y) = 0 if |x/m - y/n|>= h
6353 # Probability is A(m, n)/binom(m+n, n)
6354 # Optimizations exist for m==n, m==n*p.
6355 # Only need to preserve a single column of A, and only a sliding window of it.
6356 # minj keeps track of the slide.
6357 minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1)
6358 curlen = maxj - minj
6359 # Make a vector long enough to hold maximum window needed.
6360 lenA = min(2 * maxj + 2, n + 1)
6361 # This is an integer calculation, but the entries are essentially
6362 # binomial coefficients, hence grow quickly.
6363 # Scaling after each column is computed avoids dividing by a
6364 # large binomial coefficient at the end, but is not sufficient to avoid
6365 # the large dynamic range which appears during the calculation.
6366 # Instead we rescale based on the magnitude of the rightmost term in
6367 # the column and keep track of an exponent separately and apply
6368 # it at the end of the calculation. Similarly when multiplying by
6369 # the binomial coefficient.
6370 dtype = np.float64
6371 A = np.zeros(lenA, dtype=dtype)
6372 # Initialize the first column
6373 A[minj:maxj] = 1
6374 expnt = 0
6375 for i in range(1, m + 1):
6376 # Generate the next column.
6377 # First calculate the sliding window
6378 lastminj, lastlen = minj, curlen
6379 minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0)
6380 minj = min(minj, n)
6381 maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1)
6382 if maxj <= minj:
6383 return 0
6384 # Now fill in the values
6385 A[0:maxj - minj] = np.cumsum(A[minj - lastminj:maxj - lastminj])
6386 curlen = maxj - minj
6387 if lastlen > curlen:
6388 # Set some carried-over elements to 0
6389 A[maxj - minj:maxj - minj + (lastlen - curlen)] = 0
6390 # Rescale if the right most value is over 2**900
6391 val = A[maxj - minj - 1]
6392 _, valexpt = math.frexp(val)
6393 if valexpt > 900:
6394 # Scaling to bring down to about 2**800 appears
6395 # sufficient for sizes under 10000.
6396 valexpt -= 800
6397 A = np.ldexp(A, -valexpt)
6398 expnt += valexpt
6400 val = A[maxj - minj - 1]
6401 # Now divide by the binomial (m+n)!/m!/n!
6402 for i in range(1, n + 1):
6403 val = (val * i) / (m + i)
6404 _, valexpt = math.frexp(val)
6405 if valexpt < -128:
6406 val = np.ldexp(val, -valexpt)
6407 expnt += valexpt
6408 # Finally scale if needed.
6409 return np.ldexp(val, expnt)
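# Brute-force cross-check sketch (my own code, practical only for tiny m, n):
# enumerate lattice paths from (0, 0) to (m, n) whose every vertex satisfies
# |x/m - y/n| < h/lcm(m, n), i.e. |x*n - y*m| < h*g, and divide by
# binom(m + n, n).
#
#   from math import gcd
#   from scipy.special import comb
#
#   def brute_prob_inside(m, n, h):
#       g = gcd(m, n)
#       ok = lambda x, y: abs(x * n - y * m) < h * g
#       paths = {(0, 0): 1}
#       for x in range(m + 1):
#           for y in range(n + 1):
#               if (x, y) == (0, 0) or not ok(x, y):
#                   continue
#               paths[(x, y)] = paths.get((x - 1, y), 0) + paths.get((x, y - 1), 0)
#       return paths.get((m, n), 0) / comb(m + n, n, exact=True)
#
#   # brute_prob_inside(4, 3, 6) == 12/35 ~= 0.3429, which agrees with
#   # _compute_prob_inside_method(4, 3, 1, 6).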
6412def _compute_prob_outside_square(n, h):
6413 """
6414 Compute the proportion of paths that pass outside the two diagonal lines.
6416 Parameters
6417 ----------
6418 n : integer
6419 n > 0
6420 h : integer
6421 0 <= h <= n
6423 Returns
6424 -------
6425 p : float
6426 The proportion of paths that pass outside the lines x-y = +/-h.
6428 """
6429 # Compute Pr(D_{n,n} >= h/n)
6430 # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... ) / binom(2n, n)
6431 # This formulation exhibits subtractive cancellation.
6432 # Instead divide each term by binom(2n, n), then factor common terms
6433 # and use a Horner-like algorithm
6434 # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
6436 P = 0.0
6437 k = int(np.floor(n / h))
6438 while k >= 0:
6439 p1 = 1.0
6440 # Each of the Ai terms has numerator and denominator with h simple terms.
6441 for j in range(h):
6442 p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
6443 P = p1 * (1.0 - P)
6444 k -= 1
6445 return 2 * P
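# Worked sketch of the recursion above (my own example): for n = 2, h = 1,
# every lattice path from (0, 0) to (2, 2) leaves the band |x - y| < 1 on its
# very first step, so Pr(D_{2,2} >= 1/2) should be 1. The code reproduces
# this: k runs 2, 1, 0 giving P = 0, then 0.25, then 0.5, and the function
# returns 2 * 0.5 = 1.0.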
6448def _count_paths_outside_method(m, n, g, h):
6449 """
6450 Count the number of paths that pass outside the specified diagonal.
6452 Parameters
6453 ----------
6454 m : integer
6455 m > 0
6456 n : integer
6457 n > 0
6458 g : integer
6459 g is greatest common divisor of m and n
6460 h : integer
6461 0 <= h <= lcm(m,n)
6463 Returns
6464 -------
6465 p : float
6466 The number of paths that fall below the specified diagonal.
6467 The calculation may overflow - check for a finite answer.
6469 Raises
6470 ------
6471 FloatingPointError
6472 Raised if the intermediate computation goes outside the range of a float.
6474 Notes
6475 -----
6476 Count the integer lattice paths from (0, 0) to (m, n), which at some
6477 point (x, y) along the path, satisfy:
6478 m*y <= n*x - h*g
6479 The paths make steps of size +1 in either positive x or positive y directions.
6481 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
6482 Hodges, J.L. Jr.,
6483 "The Significance Probability of the Smirnov Two-Sample Test,"
6484 Arkiv för Matematik, 3, No. 43 (1958), 469-86.
6486 """
6487 # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
6488 # B(x, y) = #{paths from (0,0) to (x,y) without previously crossing the boundary}
6489 # = binom(x, y) - #{paths which already reached the boundary}
6490 # Multiply by the number of path extensions going from (x, y) to (m, n)
6491 # Sum.
6493 # Probability is symmetrical in m, n. Computation below assumes m >= n.
6494 if m < n:
6495 m, n = n, m
6496 mg = m // g
6497 ng = n // g
6499 # Not every x needs to be considered.
6500 # xj holds the list of x values to be checked.
6501 # Wherever n*x/m + ng*h crosses an integer
6502 lxj = n + (mg-h)//mg
6503 xj = [(h + mg * j + ng-1)//ng for j in range(lxj)]
6504 # B is an array just holding a few values of B(x,y), the ones needed.
6505 # B[j] == B(x_j, j)
6506 if lxj == 0:
6507 return np.round(special.binom(m + n, n))
6508 B = np.zeros(lxj)
6509 B[0] = 1
6510 # Compute the B(x, y) terms
6511 # The binomial coefficient is an integer, but special.binom() may return a float.
6512 # Round it to the nearest integer.
6513 for j in range(1, lxj):
6514 Bj = np.round(special.binom(xj[j] + j, j))
6515 if not np.isfinite(Bj):
6516 raise FloatingPointError()
6517 for i in range(j):
6518 bin = np.round(special.binom(xj[j] - xj[i] + j - i, j-i))
6519 Bj -= bin * B[i]
6520 B[j] = Bj
6521 if not np.isfinite(Bj):
6522 raise FloatingPointError()
6523 # Compute the number of path extensions...
6524 num_paths = 0
6525 for j in range(lxj):
6526 bin = np.round(special.binom((m-xj[j]) + (n - j), n-j))
6527 term = B[j] * bin
6528 if not np.isfinite(term):
6529 raise FloatingPointError()
6530 num_paths += term
6531 return np.round(num_paths)
6534def _attempt_exact_2kssamp(n1, n2, g, d, alternative):
6535 """Attempts to compute the exact 2sample probability.
6537 n1, n2 are the sample sizes
6538 g is the gcd(n1, n2)
6539 d is the computed max difference in ECDFs
6541 Returns (success, d, probability)
6542 """
6543 lcm = (n1 // g) * n2
6544 h = int(np.round(d * lcm))
6545 d = h * 1.0 / lcm
6546 if h == 0:
6547 return True, d, 1.0
6548 saw_fp_error, prob = False, np.nan
6549 try:
6550 if alternative == 'two-sided':
6551 if n1 == n2:
6552 prob = _compute_prob_outside_square(n1, h)
6553 else:
6554 prob = 1 - _compute_prob_inside_method(n1, n2, g, h)
6555 else:
6556 if n1 == n2:
6557 # prob = binom(2n, n-h) / binom(2n, n)
6558 # Evaluating in that form incurs roundoff errors
6559 # from special.binom. Instead calculate directly
6560 jrange = np.arange(h)
6561 prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0))
6562 else:
6563 num_paths = _count_paths_outside_method(n1, n2, g, h)
6564 bin = special.binom(n1 + n2, n1)
6565 if not np.isfinite(bin) or not np.isfinite(num_paths) or num_paths > bin:
6566 saw_fp_error = True
6567 else:
6568 prob = num_paths / bin
6570 except FloatingPointError:
6571 saw_fp_error = True
6573 if saw_fp_error:
6574 return False, d, np.nan
6575 if not (0 <= prob <= 1):
6576 return False, d, prob
6577 return True, d, prob
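# Small sketch of the rounding performed above (my own example): the ECDF
# differences live on a grid of 1/lcm(n1, n2), so an observed `d` is snapped
# to the nearest multiple of that grid before the exact path-counting
# routines are called.
#
#   n1, n2 = 4, 6
#   g = gcd(n1, n2)                 # 2
#   lcm = (n1 // g) * n2            # 12
#   d = 0.26
#   h = int(np.round(d * lcm))      # 3
#   d_exact = h * 1.0 / lcm         # 0.25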
6580def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
6581 """
6582 Compute the Kolmogorov-Smirnov statistic on 2 samples.
6584 This is a two-sided test for the null hypothesis that 2 independent samples
6585 are drawn from the same continuous distribution. The alternative hypothesis
6586 can be either 'two-sided' (default), 'less' or 'greater'.
6588 Parameters
6589 ----------
6590 data1, data2 : array_like, 1-Dimensional
6591 Two arrays of sample observations assumed to be drawn from a continuous
6592 distribution, sample sizes can be different.
6593 alternative : {'two-sided', 'less', 'greater'}, optional
6594 Defines the alternative hypothesis.
6595 The following options are available (default is 'two-sided'):
6597 * 'two-sided'
6598 * 'less': one-sided, see explanation in Notes
6599 * 'greater': one-sided, see explanation in Notes
6600 mode : {'auto', 'exact', 'asymp'}, optional
6601 Defines the method used for calculating the p-value.
6602 The following options are available (default is 'auto'):
6604 * 'auto' : use 'exact' for small size arrays, 'asymp' for large
6605 * 'exact' : use exact distribution of test statistic
6606 * 'asymp' : use asymptotic distribution of test statistic
6608 Returns
6609 -------
6610 statistic : float
6611 KS statistic.
6612 pvalue : float
6613 Two-tailed p-value.
6615 See Also
6616 --------
6617 kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp
6619 Notes
6620 -----
6621 This tests whether 2 samples are drawn from the same distribution. Note
6622 that, like in the case of the one-sample KS test, the distribution is
6623 assumed to be continuous.
6625 In the one-sided test, the alternative is that the empirical
6626 cumulative distribution function F(x) of the data1 variable is "less"
6627 or "greater" than the empirical cumulative distribution function G(x)
6628 of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``.
6630 If the KS statistic is small or the p-value is high, then we cannot
6631 reject the hypothesis that the distributions of the two samples
6632 are the same.
6634 If the mode is 'auto', the computation is exact if the sample sizes are
6635 less than 10000. For larger sizes, the computation uses the
6636 Kolmogorov-Smirnov distributions to compute an approximate value.
6638 The 'two-sided' 'exact' computation computes the complementary probability
6639 and then subtracts from 1. As such, the minimum probability it can return
6640 is about 1e-16. While the algorithm itself is exact, numerical
6641 errors may accumulate for large sample sizes. It is most suited to
6642 situations in which one of the sample sizes is only a few thousand.
6644 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.
6646 References
6647 ----------
6648 .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov
6649 Two-Sample Test," Arkiv för Matematik, 3, No. 43 (1958), 469-86.
6652 Examples
6653 --------
6654 >>> from scipy import stats
6655 >>> np.random.seed(12345678) #fix random seed to get the same result
6656 >>> n1 = 200 # size of first sample
6657 >>> n2 = 300 # size of second sample
6659 For a different distribution, we can reject the null hypothesis since the
6660 pvalue is below 1%:
6662 >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
6663 >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
6664 >>> stats.ks_2samp(rvs1, rvs2)
6665 (0.20833333333333334, 5.129279597781977e-05)
6667 For a slightly different distribution, we cannot reject the null hypothesis
6668 at a 10% or lower alpha since the p-value at 0.144 is higher than 10%
6670 >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
6671 >>> stats.ks_2samp(rvs1, rvs3)
6672 (0.10333333333333333, 0.14691437867433876)
6674 For an identical distribution, we cannot reject the null hypothesis since
6675 the p-value is high, 41%:
6677 >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
6678 >>> stats.ks_2samp(rvs1, rvs4)
6679 (0.07999999999999996, 0.41126949729859719)
6681 """
6682 if mode not in ['auto', 'exact', 'asymp']:
6683 raise ValueError(f'Invalid value for mode: {mode}')
6684 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
6685 alternative.lower()[0], alternative)
6686 if alternative not in ['two-sided', 'less', 'greater']:
6687 raise ValueError(f'Invalid value for alternative: {alternative}')
6688 MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N
6689 if np.ma.is_masked(data1):
6690 data1 = data1.compressed()
6691 if np.ma.is_masked(data2):
6692 data2 = data2.compressed()
6693 data1 = np.sort(data1)
6694 data2 = np.sort(data2)
6695 n1 = data1.shape[0]
6696 n2 = data2.shape[0]
6697 if min(n1, n2) == 0:
6698 raise ValueError('Data passed to ks_2samp must not be empty')
6700 data_all = np.concatenate([data1, data2])
6701 # using searchsorted solves equal data problem
6702 cdf1 = np.searchsorted(data1, data_all, side='right') / n1
6703 cdf2 = np.searchsorted(data2, data_all, side='right') / n2
6704 cddiffs = cdf1 - cdf2
6705 minS = -np.min(cddiffs)
6706 maxS = np.max(cddiffs)
6707 alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': max(minS, maxS)}
6708 d = alt2Dvalue[alternative]
6709 g = gcd(n1, n2)
6710 n1g = n1 // g
6711 n2g = n2 // g
6712 prob = -np.inf
6713 original_mode = mode
6714 if mode == 'auto':
6715 mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp'
6716 elif mode == 'exact':
6717 # If lcm(n1, n2) is too big, switch from exact to asymp
6718 if n1g >= np.iinfo(np.int).max / n2g:
6719 mode = 'asymp'
6720 warnings.warn(
6721 "Exact ks_2samp calculation not possible with samples sizes "
6722 "%d and %d. Switching to 'asymp' " % (n1, n2), RuntimeWarning)
6724 if mode == 'exact':
6725 success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
6726 if not success:
6727 mode = 'asymp'
6728 if original_mode == 'exact':
6729 warnings.warn(f"ks_2samp: Exact calculation unsuccessful. "
6730 f"Switching to mode={mode}.", RuntimeWarning)
6732 if mode == 'asymp':
6733 # The product n1*n2 is large. Use Smirnov's asymptotic formula.
6734 if alternative == 'two-sided':
6735 en = n1 * n2 / (n1 + n2)
6736 prob = distributions.kstwo.sf(d, np.round(en))
6737 else:
6738 m, n = max(n1, n2), min(n1, n2)
6739 z = np.sqrt(m*n/(m+n)) * d
6740 # Use Hodges' suggested approximation Eqn 5.3
6741 expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
6742 prob = np.exp(expt)
6744 prob = np.clip(prob, 0, 1)
6745 return KstestResult(d, prob)
6748def _parse_kstest_args(data1, data2, args, N):
6749 # kstest allows many different variations of arguments.
6750 # Pull out the parsing into a separate function
6751 # (xvals, yvals, ) # 2sample
6752 # (xvals, cdf function,..)
6753 # (xvals, name of distribution, ...)
6754 # (name of distribution, name of distribution, ...)
6756 # Returns xvals, yvals, cdf
6757 # where cdf is a cdf function, or None
6758 # and yvals is either an array_like of values, or None
6759 # and xvals is array_like.
6760 rvsfunc, cdf = None, None
6761 if isinstance(data1, str):
6762 rvsfunc = getattr(distributions, data1).rvs
6763 elif callable(data1):
6764 rvsfunc = data1
6766 if isinstance(data2, str):
6767 cdf = getattr(distributions, data2).cdf
6768 data2 = None
6769 elif callable(data2):
6770 cdf = data2
6771 data2 = None
6773 data1 = np.sort(rvsfunc(*args, size=N) if rvsfunc else data1)
6774 return data1, data2, cdf
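# Parsing sketch (my own example) of the accepted call signatures, which this
# helper normalizes for `kstest` below:
#
#   _parse_kstest_args(np.array([0.1, 0.2]), 'norm', (), 20)
#       # -> (sorted data, None, scipy.stats.norm.cdf)    one-sample vs. a name
#   _parse_kstest_args('norm', 'norm', (), 20)
#       # -> (20 sorted norm variates, None, scipy.stats.norm.cdf)
#   _parse_kstest_args(np.array([0.1]), np.array([0.2]), (), 20)
#       # -> (sorted data1, data2, None)                   two-sample form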
6777def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='auto'):
6778 """
6779 Performs the (one sample or two samples) Kolmogorov-Smirnov test for goodness of fit.
6781 The one-sample test performs a test of the distribution F(x) of an observed
6782 random variable against a given distribution G(x). Under the null
6783 hypothesis, the two distributions are identical, F(x)=G(x). The
6784 alternative hypothesis can be either 'two-sided' (default), 'less'
6785 or 'greater'. The KS test is only valid for continuous distributions.
6786 The two-sample test tests whether the two independent samples are drawn
6787 from the same continuous distribution.
6789 Parameters
6790 ----------
6791 rvs : str, array_like, or callable
6792 If an array, it should be a 1-D array of observations of random
6793 variables.
6794 If a callable, it should be a function to generate random variables;
6795 it is required to have a keyword argument `size`.
6796 If a string, it should be the name of a distribution in `scipy.stats`,
6797 which will be used to generate random variables.
6798 cdf : str, array_like or callable
6799 If array_like, it should be a 1-D array of observations of random
6800 variables, and the two-sample test is performed (and rvs must be array_like).
6801 If a callable, that callable is used to calculate the cdf.
6802 If a string, it should be the name of a distribution in `scipy.stats`,
6803 which will be used as the cdf function.
6804 args : tuple, sequence, optional
6805 Distribution parameters, used if `rvs` or `cdf` are strings or callables.
6806 N : int, optional
6807 Sample size if `rvs` is a string or callable. Default is 20.
6808 alternative : {'two-sided', 'less', 'greater'}, optional
6809 Defines the alternative hypothesis.
6810 The following options are available (default is 'two-sided'):
6812 * 'two-sided'
6813 * 'less': one-sided, see explanation in Notes
6814 * 'greater': one-sided, see explanation in Notes
6815 mode : {'auto', 'exact', 'approx', 'asymp'}, optional
6816 Defines the distribution used for calculating the p-value.
6817 The following options are available (default is 'auto'):
6819 * 'auto' : selects one of the other options.
6820 * 'exact' : uses the exact distribution of test statistic.
6821 * 'approx' : approximates the two-sided probability with twice the one-sided probability
6822 * 'asymp': uses asymptotic distribution of test statistic
6824 Returns
6825 -------
6826 statistic : float
6827 KS test statistic, either D, D+ or D-.
6828 pvalue : float
6829 One-tailed or two-tailed p-value.
6831 See Also
6832 --------
6833 ks_2samp
6835 Notes
6836 -----
6837 In the one-sided test, the alternative is that the empirical
6838 cumulative distribution function of the random variable is "less"
6839 or "greater" than the cumulative distribution function G(x) of the
6840 hypothesis, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``.
6842 Examples
6843 --------
6844 >>> from scipy import stats
6846 >>> x = np.linspace(-15, 15, 9)
6847 >>> stats.kstest(x, 'norm')
6848 (0.44435602715924361, 0.038850142705171065)
6850 >>> np.random.seed(987654321) # set random seed to get the same result
6851 >>> stats.kstest(stats.norm.rvs(size=100), stats.norm.cdf)
6852 (0.058352892479417884, 0.8653960860778898)
6854 The above lines are equivalent to:
6856 >>> np.random.seed(987654321)
6857 >>> stats.kstest(stats.norm.rvs, 'norm', N=100)
6858 (0.058352892479417884, 0.8653960860778898)
6860 *Test against one-sided alternative hypothesis*
6862 Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``:
6864 >>> np.random.seed(987654321)
6865 >>> x = stats.norm.rvs(loc=0.2, size=100)
6866 >>> stats.kstest(x, 'norm', alternative='less')
6867 (0.12464329735846891, 0.040989164077641749)
6869 Reject equal distribution against alternative hypothesis: less
6871 >>> stats.kstest(x, 'norm', alternative='greater')
6872 (0.0072115233216311081, 0.98531158590396395)
6874 Don't reject equal distribution against alternative hypothesis: greater
6876 >>> stats.kstest(x, 'norm')
6877 (0.12464329735846891, 0.08197335233541582)
6879 *Testing t distributed random variables against normal distribution*
6881 With 100 degrees of freedom the t distribution looks close to the normal
6882 distribution, and the K-S test does not reject the hypothesis that the
6883 sample came from the normal distribution:
6885 >>> np.random.seed(987654321)
6886 >>> stats.kstest(stats.t.rvs(100, size=100), 'norm')
6887 (0.072018929165471257, 0.6505883498379312)
6889 With 3 degrees of freedom the t distribution looks sufficiently different
6890 from the normal distribution, that we can reject the hypothesis that the
6891 sample came from the normal distribution at the 10% level:
6893 >>> np.random.seed(987654321)
6894 >>> stats.kstest(stats.t.rvs(3, size=100), 'norm')
6895 (0.131016895759829, 0.058826222555312224)
6897 """
6898 # to not break compatibility with existing code
6899 if alternative == 'two_sided':
6900 alternative = 'two-sided'
6901 if alternative not in ['two-sided', 'greater', 'less']:
6902 raise ValueError("Unexpected alternative %s" % alternative)
6903 xvals, yvals, cdf = _parse_kstest_args(rvs, cdf, args, N)
6904 if cdf:
6905 return ks_1samp(xvals, cdf, args=args, alternative=alternative, mode=mode)
6906 return ks_2samp(xvals, yvals, alternative=alternative, mode=mode)
6909def tiecorrect(rankvals):
6910 """
6911 Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests.
6913 Parameters
6914 ----------
6915 rankvals : array_like
6916 A 1-D sequence of ranks. Typically this will be the array
6917 returned by `~scipy.stats.rankdata`.
6919 Returns
6920 -------
6921 factor : float
6922 Correction factor for U or H.
6924 See Also
6925 --------
6926 rankdata : Assign ranks to the data
6927 mannwhitneyu : Mann-Whitney rank test
6928 kruskal : Kruskal-Wallis H test
6930 References
6931 ----------
6932 .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
6933 Sciences. New York: McGraw-Hill.
6935 Examples
6936 --------
6937 >>> from scipy.stats import tiecorrect, rankdata
6938 >>> tiecorrect([1, 2.5, 2.5, 4])
6939 0.9
6940 >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
6941 >>> ranks
6942 array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
6943 >>> tiecorrect(ranks)
6944 0.9833333333333333
6946 """
6947 arr = np.sort(rankvals)
6948 idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0]
6949 cnt = np.diff(idx).astype(np.float64)
6951 size = np.float64(arr.size)
6952 return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size)
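# A hand check of the docstring example (illustrative): the ranks
# [1, 2.5, 2.5, 4] contain one tie group of size 2, so
#     1 - (2**3 - 2) / (4**3 - 4) = 1 - 6/60 = 0.9
# which matches tiecorrect([1, 2.5, 2.5, 4]).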
6955MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
6958def mannwhitneyu(x, y, use_continuity=True, alternative=None):
6959 """
6960 Compute the Mann-Whitney rank test on samples x and y.
6962 Parameters
6963 ----------
6964 x, y : array_like
6965 Array of samples, should be one-dimensional.
6966 use_continuity : bool, optional
6967 Whether a continuity correction (1/2.) should be taken into
6968 account. Default is True.
6969 alternative : {None, 'two-sided', 'less', 'greater'}, optional
6970 Defines the alternative hypothesis.
6971 The following options are available (default is None):
6973 * None: computes p-value half the size of the 'two-sided' p-value and
6974 a different U statistic. The default behavior is not the same as
6975 using 'less' or 'greater'; it only exists for backward compatibility
6976 and is deprecated.
6977 * 'two-sided'
6978 * 'less': one-sided
6979 * 'greater': one-sided
6981 Use of the None option is deprecated.
6983 Returns
6984 -------
6985 statistic : float
6986 The Mann-Whitney U statistic, equal to min(U for x, U for y) if
6987 `alternative` is equal to None (deprecated; exists for backward
6988 compatibility), and U for y otherwise.
6989 pvalue : float
6990 p-value assuming an asymptotic normal distribution. One-sided or
6991 two-sided, depending on the choice of `alternative`.
6993 Notes
6994 -----
6995 Use only when the number of observations in each sample is > 20 and
6996 you have 2 independent samples of ranks. Mann-Whitney U is
6997 significant if the u-obtained is LESS THAN or equal to the critical
6998 value of U.
7000 This test corrects for ties and by default uses a continuity correction.
7002 References
7003 ----------
7004 .. [1] https://en.wikipedia.org/wiki/Mann-Whitney_U_test
7006 .. [2] H.B. Mann and D.R. Whitney, "On a Test of Whether one of Two Random
7007 Variables is Stochastically Larger than the Other," The Annals of
7008 Mathematical Statistics, vol. 18, no. 1, pp. 50-60, 1947.
7010 """
7011 if alternative is None:
7012 warnings.warn("Calling `mannwhitneyu` without specifying "
7013 "`alternative` is deprecated.", DeprecationWarning)
7015 x = np.asarray(x)
7016 y = np.asarray(y)
7017 n1 = len(x)
7018 n2 = len(y)
7019 ranked = rankdata(np.concatenate((x, y)))
7020 rankx = ranked[0:n1] # get the x-ranks
7021 u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx, axis=0) # calc U for x
7022 u2 = n1*n2 - u1 # remainder is U for y
7023 T = tiecorrect(ranked)
7024 if T == 0:
7025 raise ValueError('All numbers are identical in mannwhitneyu')
7026 sd = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)
7028 meanrank = n1*n2/2.0 + 0.5 * use_continuity
7029 if alternative is None or alternative == 'two-sided':
7030 bigu = max(u1, u2)
7031 elif alternative == 'less':
7032 bigu = u1
7033 elif alternative == 'greater':
7034 bigu = u2
7035 else:
7036 raise ValueError("alternative should be None, 'less', 'greater' "
7037 "or 'two-sided'")
7039 z = (bigu - meanrank) / sd
7040 if alternative is None:
7041 # This behavior, equal to half the size of the two-sided
7042 # p-value, is deprecated.
7043 p = distributions.norm.sf(abs(z))
7044 elif alternative == 'two-sided':
7045 p = 2 * distributions.norm.sf(abs(z))
7046 else:
7047 p = distributions.norm.sf(z)
7049 u = u2
7050 # This behavior is deprecated.
7051 if alternative is None:
7052 u = min(u1, u2)
7053 return MannwhitneyuResult(u, p)
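# A minimal usage sketch (sample values are made up; with so few observations
# the normal approximation is only illustrative):
from scipy import stats
x = [1, 4, 2, 5, 3, 6, 7, 9]
y = [8, 10, 11, 12, 13, 14, 15, 16]
u, p = stats.mannwhitneyu(x, y, alternative='two-sided')  # U for y, two-sided p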
7056RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue'))
7059def ranksums(x, y):
7060 """
7061 Compute the Wilcoxon rank-sum statistic for two samples.
7063 The Wilcoxon rank-sum test tests the null hypothesis that two sets
7064 of measurements are drawn from the same distribution. The alternative
7065 hypothesis is that values in one sample are more likely to be
7066 larger than the values in the other sample.
7068 This test should be used to compare two samples from continuous
7069 distributions. It does not handle ties between measurements
7070 in x and y. For tie-handling and an optional continuity correction
7071 see `scipy.stats.mannwhitneyu`.
7073 Parameters
7074 ----------
7075 x,y : array_like
7076 The data from the two samples.
7078 Returns
7079 -------
7080 statistic : float
7081 The test statistic under the large-sample approximation that the
7082 rank sum statistic is normally distributed.
7083 pvalue : float
7084 The two-sided p-value of the test.
7086 References
7087 ----------
7088 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
7090 Examples
7091 --------
7092 We can test the hypothesis that two independent unequal-sized samples are
7093 drawn from the same distribution with computing the Wilcoxon rank-sum
7094 statistic.
7096 >>> from scipy.stats import ranksums
7097 >>> sample1 = np.random.uniform(-1, 1, 200)
7098 >>> sample2 = np.random.uniform(-0.5, 1.5, 300) # a shifted distribution
7099 >>> ranksums(sample1, sample2)
7100 RanksumsResult(statistic=-7.887059, pvalue=3.09390448e-15) # may vary
7102 The p-value of less than ``0.05`` indicates that this test rejects the
7103 hypothesis at the 5% significance level.
7105 """
7106 x, y = map(np.asarray, (x, y))
7107 n1 = len(x)
7108 n2 = len(y)
7109 alldata = np.concatenate((x, y))
7110 ranked = rankdata(alldata)
7111 x = ranked[:n1]
7112 s = np.sum(x, axis=0)
7113 expected = n1 * (n1+n2+1) / 2.0
7114 z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
7115 prob = 2 * distributions.norm.sf(abs(z))
7117 return RanksumsResult(z, prob)
7120KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
7123def kruskal(*args, **kwargs):
7124 """
7125 Compute the Kruskal-Wallis H-test for independent samples.
7127 The Kruskal-Wallis H-test tests the null hypothesis that the population
7128 medians of all of the groups are equal. It is a non-parametric version of
7129 ANOVA. The test works on 2 or more independent samples, which may have
7130 different sizes. Note that rejecting the null hypothesis does not
7131 indicate which of the groups differs. Post hoc comparisons between
7132 groups are required to determine which groups are different.
7134 Parameters
7135 ----------
7136 sample1, sample2, ... : array_like
7137 Two or more arrays with the sample measurements can be given as
7138 arguments.
7139 nan_policy : {'propagate', 'raise', 'omit'}, optional
7140 Defines how to handle when input contains nan.
7141 The following options are available (default is 'propagate'):
7143 * 'propagate': returns nan
7144 * 'raise': throws an error
7145 * 'omit': performs the calculations ignoring nan values
7147 Returns
7148 -------
7149 statistic : float
7150 The Kruskal-Wallis H statistic, corrected for ties.
7151 pvalue : float
7152 The p-value for the test using the assumption that H has a chi
7153 square distribution.
7155 See Also
7156 --------
7157 f_oneway : 1-way ANOVA.
7158 mannwhitneyu : Mann-Whitney rank test on two samples.
7159 friedmanchisquare : Friedman test for repeated measurements.
7161 Notes
7162 -----
7163 Due to the assumption that H has a chi square distribution, the number
7164 of samples in each group must not be too small. A typical rule is
7165 that each sample must have at least 5 measurements.
7167 References
7168 ----------
7169 .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in
7170 One-Criterion Variance Analysis", Journal of the American Statistical
7171 Association, Vol. 47, Issue 260, pp. 583-621, 1952.
7172 .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
7174 Examples
7175 --------
7176 >>> from scipy import stats
7177 >>> x = [1, 3, 5, 7, 9]
7178 >>> y = [2, 4, 6, 8, 10]
7179 >>> stats.kruskal(x, y)
7180 KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)
7182 >>> x = [1, 1, 1]
7183 >>> y = [2, 2, 2]
7184 >>> z = [2, 2]
7185 >>> stats.kruskal(x, y, z)
7186 KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
7188 """
7189 args = list(map(np.asarray, args))
7190 num_groups = len(args)
7191 if num_groups < 2:
7192 raise ValueError("Need at least two groups in stats.kruskal()")
7194 for arg in args:
7195 if arg.size == 0:
7196 return KruskalResult(np.nan, np.nan)
7197 n = np.asarray(list(map(len, args)))
7199 if 'nan_policy' in kwargs.keys():
7200 if kwargs['nan_policy'] not in ('propagate', 'raise', 'omit'):
7201 raise ValueError("nan_policy must be 'propagate', "
7202 "'raise' or'omit'")
7203 else:
7204 nan_policy = kwargs['nan_policy']
7205 else:
7206 nan_policy = 'propagate'
7208 contains_nan = False
7209 for arg in args:
7210 cn = _contains_nan(arg, nan_policy)
7211 if cn[0]:
7212 contains_nan = True
7213 break
7215 if contains_nan and nan_policy == 'omit':
7216 args = [ma.masked_invalid(a) for a in args]
7218 return mstats_basic.kruskal(*args)
7220 if contains_nan and nan_policy == 'propagate':
7221 return KruskalResult(np.nan, np.nan)
7223 alldata = np.concatenate(args)
7224 ranked = rankdata(alldata)
7225 ties = tiecorrect(ranked)
7226 if ties == 0:
7227 raise ValueError('All numbers are identical in kruskal')
7229 # Compute sum^2/n for each group and sum
7230 j = np.insert(np.cumsum(n), 0, 0)
7231 ssbn = 0
7232 for i in range(num_groups):
7233 ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i]
7235 totaln = np.sum(n, dtype=float)
7236 h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
7237 df = num_groups - 1
7238 h /= ties
7240 return KruskalResult(h, distributions.chi2.sf(h, df))
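# A hand check of the first docstring example (x = [1, 3, 5, 7, 9],
# y = [2, 4, 6, 8, 10]; no ties, so the correction factor is 1): the rank
# sums are 25 and 30, N = 10, so
#     H = 12 / (N * (N + 1)) * (25**2 / 5 + 30**2 / 5) - 3 * (N + 1)
#       = 12 / 110 * 305 - 33 = 0.2727...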
7243FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
7244 ('statistic', 'pvalue'))
7247def friedmanchisquare(*args):
7248 """
7249 Compute the Friedman test for repeated measurements.
7251 The Friedman test tests the null hypothesis that repeated measurements of
7252 the same individuals have the same distribution. It is often used
7253 to test for consistency among measurements obtained in different ways.
7254 For example, if two measurement techniques are used on the same set of
7255 individuals, the Friedman test can be used to determine if the two
7256 measurement techniques are consistent.
7258 Parameters
7259 ----------
7260 measurements1, measurements2, measurements3... : array_like
7261 Arrays of measurements. All of the arrays must have the same number
7262 of elements. At least 3 sets of measurements must be given.
7264 Returns
7265 -------
7266 statistic : float
7267 The test statistic, correcting for ties.
7268 pvalue : float
7269 The associated p-value assuming that the test statistic has a chi
7270 squared distribution.
7272 Notes
7273 -----
7274 Due to the assumption that the test statistic has a chi squared
7275 distribution, the p-value is only reliable for n > 10 and more than
7276 6 repeated measurements.
7278 References
7279 ----------
7280 .. [1] https://en.wikipedia.org/wiki/Friedman_test
7282 """
7283 k = len(args)
7284 if k < 3:
7285 raise ValueError('Less than 3 levels. Friedman test not appropriate.')
7287 n = len(args[0])
7288 for i in range(1, k):
7289 if len(args[i]) != n:
7290 raise ValueError('Unequal N in friedmanchisquare. Aborting.')
7292 # Rank data
7293 data = np.vstack(args).T
7294 data = data.astype(float)
7295 for i in range(len(data)):
7296 data[i] = rankdata(data[i])
7298 # Handle ties
7299 ties = 0
7300 for i in range(len(data)):
7301 replist, repnum = find_repeats(array(data[i]))
7302 for t in repnum:
7303 ties += t * (t*t - 1)
7304 c = 1 - ties / (k*(k*k - 1)*n)
7306 ssbn = np.sum(data.sum(axis=0)**2)
7307 chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c
7309 return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1))
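# A minimal usage sketch (measurement values are made up): three repeated
# measurements on the same six individuals, each row of the implicit table
# being ranked before the statistic is formed.
from scipy import stats
m1 = [7.0, 9.9, 8.5, 5.1, 10.3, 8.6]
m2 = [5.3, 5.7, 4.7, 3.5, 7.7, 6.2]
m3 = [4.9, 7.6, 5.5, 2.8, 8.4, 5.1]
stat, p = stats.friedmanchisquare(m1, m2, m3)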
7312BrunnerMunzelResult = namedtuple('BrunnerMunzelResult',
7313 ('statistic', 'pvalue'))
7316def brunnermunzel(x, y, alternative="two-sided", distribution="t",
7317 nan_policy='propagate'):
7318 """
7319 Compute the Brunner-Munzel test on samples x and y.
7321 The Brunner-Munzel test is a nonparametric test of the null hypothesis that
7322 when values are taken one by one from each group, the probabilities of
7323 getting large values in both groups are equal.
7324 Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the
7325 assumption of equal variances in the two groups. Note that this does not assume
7326 that the distributions are the same. This test works on two independent samples,
7327 which may have different sizes.
7329 Parameters
7330 ----------
7331 x, y : array_like
7332 Array of samples, should be one-dimensional.
7333 alternative : {'two-sided', 'less', 'greater'}, optional
7334 Defines the alternative hypothesis.
7335 The following options are available (default is 'two-sided'):
7337 * 'two-sided'
7338 * 'less': one-sided
7339 * 'greater': one-sided
7340 distribution : {'t', 'normal'}, optional
7341 Defines how to get the p-value.
7342 The following options are available (default is 't'):
7344 * 't': get the p-value by t-distribution
7345 * 'normal': get the p-value by standard normal distribution.
7346 nan_policy : {'propagate', 'raise', 'omit'}, optional
7347 Defines how to handle when input contains nan.
7348 The following options are available (default is 'propagate'):
7350 * 'propagate': returns nan
7351 * 'raise': throws an error
7352 * 'omit': performs the calculations ignoring nan values
7354 Returns
7355 -------
7356 statistic : float
7357 The Brunner-Munzel W statistic.
7358 pvalue : float
7359 p-value assuming a t distribution. One-sided or
7360 two-sided, depending on the choice of `alternative` and `distribution`.
7362 See Also
7363 --------
7364 mannwhitneyu : Mann-Whitney rank test on two samples.
7366 Notes
7367 -----
7368 Brunner and Munzel recommended estimating the p-value by the t-distribution
7369 when the size of the data is 50 or less. If the size is smaller than 10, it would
7370 be better to use the permuted Brunner-Munzel test (see [2]_).
7372 References
7373 ----------
7374 .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher
7375 problem: Asymptotic theory and a small-sample approximation".
7376 Biometrical Journal. Vol. 42(2000): 17-25.
7377 .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
7378 non-parametric Behrens-Fisher problem". Computational Statistics and
7379 Data Analysis. Vol. 51(2007): 5192-5204.
7381 Examples
7382 --------
7383 >>> from scipy import stats
7384 >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
7385 >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
7386 >>> w, p_value = stats.brunnermunzel(x1, x2)
7387 >>> w
7388 3.1374674823029505
7389 >>> p_value
7390 0.0057862086661515377
7392 """
7393 x = np.asarray(x)
7394 y = np.asarray(y)
7396 # check both x and y
7397 cnx, npx = _contains_nan(x, nan_policy)
7398 cny, npy = _contains_nan(y, nan_policy)
7399 contains_nan = cnx or cny
7400 if npx == "omit" or npy == "omit":
7401 nan_policy = "omit"
7403 if contains_nan and nan_policy == "propagate":
7404 return BrunnerMunzelResult(np.nan, np.nan)
7405 elif contains_nan and nan_policy == "omit":
7406 x = ma.masked_invalid(x)
7407 y = ma.masked_invalid(y)
7408 return mstats_basic.brunnermunzel(x, y, alternative, distribution)
7410 nx = len(x)
7411 ny = len(y)
7412 if nx == 0 or ny == 0:
7413 return BrunnerMunzelResult(np.nan, np.nan)
7414 rankc = rankdata(np.concatenate((x, y)))
7415 rankcx = rankc[0:nx]
7416 rankcy = rankc[nx:nx+ny]
7417 rankcx_mean = np.mean(rankcx)
7418 rankcy_mean = np.mean(rankcy)
7419 rankx = rankdata(x)
7420 ranky = rankdata(y)
7421 rankx_mean = np.mean(rankx)
7422 ranky_mean = np.mean(ranky)
7424 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
7425 Sx /= nx - 1
7426 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
7427 Sy /= ny - 1
7429 wbfn = nx * ny * (rankcy_mean - rankcx_mean)
7430 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
7432 if distribution == "t":
7433 df_numer = np.power(nx * Sx + ny * Sy, 2.0)
7434 df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
7435 df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
7436 df = df_numer / df_denom
7437 p = distributions.t.cdf(wbfn, df)
7438 elif distribution == "normal":
7439 p = distributions.norm.cdf(wbfn)
7440 else:
7441 raise ValueError(
7442 "distribution should be 't' or 'normal'")
7444 if alternative == "greater":
7445 pass
7446 elif alternative == "less":
7447 p = 1 - p
7448 elif alternative == "two-sided":
7449 p = 2 * np.min([p, 1-p])
7450 else:
7451 raise ValueError(
7452 "alternative should be 'less', 'greater' or 'two-sided'")
7454 return BrunnerMunzelResult(wbfn, p)
7457def combine_pvalues(pvalues, method='fisher', weights=None):
7458 """
7459 Combine p-values from independent tests bearing upon the same hypothesis.
7461 Parameters
7462 ----------
7463 pvalues : array_like, 1-D
7464 Array of p-values assumed to come from independent tests.
7465 method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'}, optional
7466 Name of method to use to combine p-values.
7467 The following methods are available (default is 'fisher'):
7469 * 'fisher': Fisher's method (Fisher's combined probability test), the
7470 sum of the logarithm of the p-values
7471 * 'pearson': Pearson's method (similar to Fisher's but uses sum of the
7472 complement of the p-values inside the logarithms)
7473 * 'tippett': Tippett's method (minimum of p-values)
7474 * 'stouffer': Stouffer's Z-score method
7475 * 'mudholkar_george': the difference of Fisher's and Pearson's methods
7476 divided by 2
7477 weights : array_like, 1-D, optional
7478 Optional array of weights used only for Stouffer's Z-score method.
7480 Returns
7481 -------
7482 statistic: float
7483 The statistic calculated by the specified method.
7484 pval: float
7485 The combined p-value.
7487 Notes
7488 -----
7489 Fisher's method (also known as Fisher's combined probability test) [1]_ uses
7490 a chi-squared statistic to compute a combined p-value. The closely related
7491 Stouffer's Z-score method [2]_ uses Z-scores rather than p-values. The
7492 advantage of Stouffer's method is that it is straightforward to introduce
7493 weights, which can make Stouffer's method more powerful than Fisher's
7494 method when the p-values are from studies of different size [6]_ [7]_.
7495 Pearson's method uses :math:`\log(1-p_i)` inside the sum whereas Fisher's
7496 method uses :math:`\log(p_i)` [4]_. For Fisher's and Pearson's methods, the
7497 sum of the logarithms is multiplied by -2 in the implementation. This
7498 quantity has a chi-square distribution that determines the p-value. The
7499 `mudholkar_george` method is the difference of the Fisher's and Pearson's
7500 test statistics, each of which includes the -2 factor [4]_. However, the
7501 `mudholkar_george` method does not include these -2 factors. The test
7502 statistic of `mudholkar_george` is the sum of logistic random variables and
7503 equation 3.6 in [3]_ is used to approximate the p-value based on Student's
7504 t-distribution.
7506 Fisher's method may be extended to combine p-values from dependent tests
7507 [5]_. Extensions such as Brown's method and Kost's method are not currently
7508 implemented.
7510 .. versionadded:: 0.15.0
7512 References
7513 ----------
7514 .. [1] https://en.wikipedia.org/wiki/Fisher%27s_method
7515 .. [2] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method
7516 .. [3] George, E. O., and G. S. Mudholkar. "On the convolution of logistic
7517 random variables." Metrika 30.1 (1983): 1-13.
7518 .. [4] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of
7519 combining p-values." Biometrika 105.1 (2018): 239-246.
7520 .. [5] Whitlock, M. C. "Combining probability from independent tests: the
7521 weighted Z-method is superior to Fisher's approach." Journal of
7522 Evolutionary Biology 18, no. 5 (2005): 1368-1373.
7523 .. [6] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method
7524 for combining probabilities in meta-analysis." Journal of
7525 Evolutionary Biology 24, no. 8 (2011): 1836-1841.
7526 .. [7] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method
7528 """
7529 pvalues = np.asarray(pvalues)
7530 if pvalues.ndim != 1:
7531 raise ValueError("pvalues is not 1-D")
7533 if method == 'fisher':
7534 statistic = -2 * np.sum(np.log(pvalues))
7535 pval = distributions.chi2.sf(statistic, 2 * len(pvalues))
7536 elif method == 'pearson':
7537 statistic = -2 * np.sum(np.log1p(-pvalues))
7538 pval = distributions.chi2.sf(statistic, 2 * len(pvalues))
7539 elif method == 'mudholkar_george':
7540 statistic = -np.sum(np.log(pvalues)) + np.sum(np.log1p(-pvalues))
7541 nu = 5 * len(pvalues) + 4
7542 approx_factor = np.sqrt(nu / (nu - 2))
7543 pval = distributions.t.sf(statistic * approx_factor, nu)
7544 elif method == 'tippett':
7545 statistic = np.min(pvalues)
7546 pval = distributions.beta.sf(statistic, 1, len(pvalues))
7547 elif method == 'stouffer':
7548 if weights is None:
7549 weights = np.ones_like(pvalues)
7550 elif len(weights) != len(pvalues):
7551 raise ValueError("pvalues and weights must be of the same size.")
7553 weights = np.asarray(weights)
7554 if weights.ndim != 1:
7555 raise ValueError("weights is not 1-D")
7557 Zi = distributions.norm.isf(pvalues)
7558 statistic = np.dot(weights, Zi) / np.linalg.norm(weights)
7559 pval = distributions.norm.sf(statistic)
7561 else:
7562 raise ValueError(
7563 "Invalid method '%s'. Options are 'fisher', 'pearson', \
7564 'mudholkar_george', 'tippett', 'or 'stouffer'", method)
7566 return (statistic, pval)
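# A minimal usage sketch (p-values and weights are illustrative): combining
# three independent p-values with Fisher's method and with Stouffer's
# weighted Z-score method.
from scipy import stats
pvals = [0.01, 0.20, 0.30]
stats.combine_pvalues(pvals, method='fisher')
stats.combine_pvalues(pvals, method='stouffer', weights=[1, 2, 3])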
7569#####################################
7570# STATISTICAL DISTANCES #
7571#####################################
7573def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
7574 r"""
7575 Compute the first Wasserstein distance between two 1D distributions.
7577 This distance is also known as the earth mover's distance, since it can be
7578 seen as the minimum amount of "work" required to transform :math:`u` into
7579 :math:`v`, where "work" is measured as the amount of distribution weight
7580 that must be moved, multiplied by the distance it has to be moved.
7582 .. versionadded:: 1.0.0
7584 Parameters
7585 ----------
7586 u_values, v_values : array_like
7587 Values observed in the (empirical) distribution.
7588 u_weights, v_weights : array_like, optional
7589 Weight for each value. If unspecified, each value is assigned the same
7590 weight.
7591 `u_weights` (resp. `v_weights`) must have the same length as
7592 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
7593 must still be positive and finite so that the weights can be normalized
7594 to sum to 1.
7596 Returns
7597 -------
7598 distance : float
7599 The computed distance between the distributions.
7601 Notes
7602 -----
7603 The first Wasserstein distance between the distributions :math:`u` and
7604 :math:`v` is:
7606 .. math::
7608 l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times
7609 \mathbb{R}} |x-y| \mathrm{d} \pi (x, y)
7611 where :math:`\Gamma (u, v)` is the set of (probability) distributions on
7612 :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and
7613 :math:`v` on the first and second factors respectively.
7615 If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and
7616 :math:`v`, this distance also equals:
7618 .. math::
7620 l_1(u, v) = \int_{-\infty}^{+\infty} |U-V|
7622 See [2]_ for a proof of the equivalence of both definitions.
7624 The input distributions can be empirical, therefore coming from samples
7625 whose values are effectively inputs of the function, or they can be seen as
7626 generalized functions, in which case they are weighted sums of Dirac delta
7627 functions located at the specified values.
7629 References
7630 ----------
7631 .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric
7632 .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related
7633 Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`.
7635 Examples
7636 --------
7637 >>> from scipy.stats import wasserstein_distance
7638 >>> wasserstein_distance([0, 1, 3], [5, 6, 8])
7639 5.0
7640 >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2])
7641 0.25
7642 >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4],
7643 ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5])
7644 4.0781331438047861
7646 """
7647 return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
7650def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
7651 r"""
7652 Compute the energy distance between two 1D distributions.
7654 .. versionadded:: 1.0.0
7656 Parameters
7657 ----------
7658 u_values, v_values : array_like
7659 Values observed in the (empirical) distribution.
7660 u_weights, v_weights : array_like, optional
7661 Weight for each value. If unspecified, each value is assigned the same
7662 weight.
7663 `u_weights` (resp. `v_weights`) must have the same length as
7664 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
7665 must still be positive and finite so that the weights can be normalized
7666 to sum to 1.
7668 Returns
7669 -------
7670 distance : float
7671 The computed distance between the distributions.
7673 Notes
7674 -----
7675 The energy distance between two distributions :math:`u` and :math:`v`, whose
7676 respective CDFs are :math:`U` and :math:`V`, equals:
7678 .. math::
7680 D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| -
7681 \mathbb E|Y - Y'| \right)^{1/2}
7683 where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are
7684 independent random variables whose probability distribution is :math:`u`
7685 (resp. :math:`v`).
7687 As shown in [2]_, for one-dimensional real-valued variables, the energy
7688 distance is linked to the non-distribution-free version of the Cramer-von
7689 Mises distance:
7691 .. math::
7693 D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2
7694 \right)^{1/2}
7696 Note that the common Cramer-von Mises criterion uses the distribution-free
7697 version of the distance. See [2]_ (section 2), for more details about both
7698 versions of the distance.
7700 The input distributions can be empirical, therefore coming from samples
7701 whose values are effectively inputs of the function, or they can be seen as
7702 generalized functions, in which case they are weighted sums of Dirac delta
7703 functions located at the specified values.
7705 References
7706 ----------
7707 .. [1] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance
7708 .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling
7709 Green State University, Department of Mathematics and Statistics,
7710 Technical Report 02-16 (2002).
7711 .. [3] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews:
7712 Computational Statistics, 8(1):27-38 (2015).
7713 .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
7714 Munos "The Cramer Distance as a Solution to Biased Wasserstein
7715 Gradients" (2017). :arXiv:`1705.10743`.
7717 Examples
7718 --------
7719 >>> from scipy.stats import energy_distance
7720 >>> energy_distance([0], [2])
7721 2.0000000000000004
7722 >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2])
7723 1.0000000000000002
7724 >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ],
7725 ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8])
7726 0.88003340976158217
7728 """
7729 return np.sqrt(2) * _cdf_distance(2, u_values, v_values,
7730 u_weights, v_weights)
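# A hand check of the first docstring example (illustrative):
# energy_distance([0], [2]) compares two point masses, so |U - V| = 1 on the
# interval [0, 2], the squared-CDF integral is 2, l_2 = sqrt(2), and
# D = sqrt(2) * l_2 = 2 (up to floating-point error).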
7733def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None):
7734 r"""
7735 Compute, between two one-dimensional distributions :math:`u` and
7736 :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
7737 statistical distance that is defined as:
7739 .. math::
7741 l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
7743 p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
7744 gives the energy distance.
7746 Parameters
7747 ----------
7748 u_values, v_values : array_like
7749 Values observed in the (empirical) distribution.
7750 u_weights, v_weights : array_like, optional
7751 Weight for each value. If unspecified, each value is assigned the same
7752 weight.
7753 `u_weights` (resp. `v_weights`) must have the same length as
7754 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
7755 must still be positive and finite so that the weights can be normalized
7756 to sum to 1.
7758 Returns
7759 -------
7760 distance : float
7761 The computed distance between the distributions.
7763 Notes
7764 -----
7765 The input distributions can be empirical, therefore coming from samples
7766 whose values are effectively inputs of the function, or they can be seen as
7767 generalized functions, in which case they are weighted sums of Dirac delta
7768 functions located at the specified values.
7770 References
7771 ----------
7772 .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
7773 Munos "The Cramer Distance as a Solution to Biased Wasserstein
7774 Gradients" (2017). :arXiv:`1705.10743`.
7776 """
7777 u_values, u_weights = _validate_distribution(u_values, u_weights)
7778 v_values, v_weights = _validate_distribution(v_values, v_weights)
7780 u_sorter = np.argsort(u_values)
7781 v_sorter = np.argsort(v_values)
7783 all_values = np.concatenate((u_values, v_values))
7784 all_values.sort(kind='mergesort')
7786 # Compute the differences between pairs of successive values of u and v.
7787 deltas = np.diff(all_values)
7789 # Get the respective positions of the values of u and v among the values of
7790 # both distributions.
7791 u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right')
7792 v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right')
7794 # Calculate the CDFs of u and v using their weights, if specified.
7795 if u_weights is None:
7796 u_cdf = u_cdf_indices / u_values.size
7797 else:
7798 u_sorted_cumweights = np.concatenate(([0],
7799 np.cumsum(u_weights[u_sorter])))
7800 u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
7802 if v_weights is None:
7803 v_cdf = v_cdf_indices / v_values.size
7804 else:
7805 v_sorted_cumweights = np.concatenate(([0],
7806 np.cumsum(v_weights[v_sorter])))
7807 v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
7809 # Compute the value of the integral based on the CDFs.
7810 # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
7811 # of about 15%.
7812 if p == 1:
7813 return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
7814 if p == 2:
7815 return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas)))
7816 return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p),
7817 deltas)), 1/p)
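# A hand check of the p = 1 case against the wasserstein_distance example
# ([0, 1, 3] vs [5, 6, 8], unweighted): the pooled sorted values are
# [0, 1, 3, 5, 6, 8], deltas = [1, 2, 2, 1, 2], |U - V| on the successive
# gaps is [1/3, 2/3, 1, 2/3, 1/3], and the weighted sum is
#     1/3 + 4/3 + 2 + 2/3 + 2/3 = 5.0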
7820def _validate_distribution(values, weights):
7821 """
7822 Validate the values and weights from a distribution input of `_cdf_distance`
7823 and return them as ndarray objects.
7825 Parameters
7826 ----------
7827 values : array_like
7828 Values observed in the (empirical) distribution.
7829 weights : array_like
7830 Weight for each value.
7832 Returns
7833 -------
7834 values : ndarray
7835 Values as ndarray.
7836 weights : ndarray
7837 Weights as ndarray.
7839 """
7840 # Validate the value array.
7841 values = np.asarray(values, dtype=float)
7842 if len(values) == 0:
7843 raise ValueError("Distribution can't be empty.")
7845 # Validate the weight array, if specified.
7846 if weights is not None:
7847 weights = np.asarray(weights, dtype=float)
7848 if len(weights) != len(values):
7849 raise ValueError('Value and weight array-likes for the same '
7850 'empirical distribution must be of the same size.')
7851 if np.any(weights < 0):
7852 raise ValueError('All weights must be non-negative.')
7853 if not 0 < np.sum(weights) < np.inf:
7854 raise ValueError('Weight array-like sum must be positive and '
7855 'finite. Set as None for an equal distribution of '
7856 'weight.')
7858 return values, weights
7860 return values, None
7863#####################################
7864# SUPPORT FUNCTIONS #
7865#####################################
7867RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
7870def find_repeats(arr):
7871 """
7872 Find repeats and repeat counts.
7874 Parameters
7875 ----------
7876 arr : array_like
7877 Input array. This is cast to float64.
7879 Returns
7880 -------
7881 values : ndarray
7882 The unique values from the (flattened) input that are repeated.
7884 counts : ndarray
7885 Number of times the corresponding 'value' is repeated.
7887 Notes
7888 -----
7889 In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
7890 difference is that `find_repeats` only returns repeated values.
7892 Examples
7893 --------
7894 >>> from scipy import stats
7895 >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
7896 RepeatedResults(values=array([2.]), counts=array([4]))
7898 >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
7899 RepeatedResults(values=array([4., 5.]), counts=array([2, 2]))
7901 """
7902 # Note: always copies.
7903 return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))
7906def _sum_of_squares(a, axis=0):
7907 """
7908 Square each element of the input array, and return the sum(s) of that.
7910 Parameters
7911 ----------
7912 a : array_like
7913 Input array.
7914 axis : int or None, optional
7915 Axis along which to calculate. Default is 0. If None, compute over
7916 the whole array `a`.
7918 Returns
7919 -------
7920 sum_of_squares : ndarray
7921 The sum along the given axis for (a**2).
7923 See Also
7924 --------
7925 _square_of_sums : The square(s) of the sum(s) (the opposite of
7926 `_sum_of_squares`).
7928 """
7929 a, axis = _chk_asarray(a, axis)
7930 return np.sum(a*a, axis)
7933def _square_of_sums(a, axis=0):
7934 """
7935 Sum elements of the input array, and return the square(s) of that sum.
7937 Parameters
7938 ----------
7939 a : array_like
7940 Input array.
7941 axis : int or None, optional
7942 Axis along which to calculate. Default is 0. If None, compute over
7943 the whole array `a`.
7945 Returns
7946 -------
7947 square_of_sums : float or ndarray
7948 The square of the sum over `axis`.
7950 See Also
7951 --------
7952 _sum_of_squares : The sum of squares (the opposite of `_square_of_sums`).
7954 """
7955 a, axis = _chk_asarray(a, axis)
7956 s = np.sum(a, axis)
7957 if not np.isscalar(s):
7958 return s.astype(float) * s
7959 else:
7960 return float(s) * s
7963def rankdata(a, method='average', *, axis=None):
7964 """
7965 Assign ranks to data, dealing with ties appropriately.
7967 By default (``axis=None``), the data array is first flattened, and a flat
7968 array of ranks is returned. Separately reshape the rank array to the
7969 shape of the data array if desired (see Examples).
7971 Ranks begin at 1. The `method` argument controls how ranks are assigned
7972 to equal values. See [1]_ for further discussion of ranking methods.
7974 Parameters
7975 ----------
7976 a : array_like
7977 The array of values to be ranked.
7978 method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional
7979 The method used to assign ranks to tied elements.
7980 The following methods are available (default is 'average'):
7982 * 'average': The average of the ranks that would have been assigned to
7983 all the tied values is assigned to each value.
7984 * 'min': The minimum of the ranks that would have been assigned to all
7985 the tied values is assigned to each value. (This is also
7986 referred to as "competition" ranking.)
7987 * 'max': The maximum of the ranks that would have been assigned to all
7988 the tied values is assigned to each value.
7989 * 'dense': Like 'min', but the rank of the next highest element is
7990 assigned the rank immediately after those assigned to the tied
7991 elements.
7992 * 'ordinal': All values are given a distinct rank, corresponding to
7993 the order that the values occur in `a`.
7994 axis : {None, int}, optional
7995 Axis along which to perform the ranking. If ``None``, the data array
7996 is first flattened.
7998 Returns
7999 -------
8000 ranks : ndarray
8001 An array of size equal to the size of `a`, containing rank
8002 scores.
8004 References
8005 ----------
8006 .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking
8008 Examples
8009 --------
8010 >>> from scipy.stats import rankdata
8011 >>> rankdata([0, 2, 3, 2])
8012 array([ 1. , 2.5, 4. , 2.5])
8013 >>> rankdata([0, 2, 3, 2], method='min')
8014 array([ 1, 2, 4, 2])
8015 >>> rankdata([0, 2, 3, 2], method='max')
8016 array([ 1, 3, 4, 3])
8017 >>> rankdata([0, 2, 3, 2], method='dense')
8018 array([ 1, 2, 3, 2])
8019 >>> rankdata([0, 2, 3, 2], method='ordinal')
8020 array([ 1, 2, 4, 3])
8021 >>> rankdata([[0, 2], [3, 2]]).reshape(2,2)
8022 array([[1. , 2.5],
8023 [4. , 2.5]])
8024 >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1)
8025 array([[1. , 2.5, 2.5],
8026 [2. , 1. , 3. ]])
8027 """
8028 if method not in ('average', 'min', 'max', 'dense', 'ordinal'):
8029 raise ValueError('unknown method "{0}"'.format(method))
8031 if axis is not None:
8032 a = np.asarray(a)
8033 if a.size == 0:
8034 # The return values of `normalize_axis_index` are ignored. The
8035 # call validates `axis`, even though we won't use it.
8036 # use scipy._lib._util._normalize_axis_index when available
8037 np.core.multiarray.normalize_axis_index(axis, a.ndim)
8038 dt = np.float64 if method == 'average' else np.int_
8039 return np.empty(a.shape, dtype=dt)
8040 return np.apply_along_axis(rankdata, axis, a, method)
8042 arr = np.ravel(np.asarray(a))
8043 algo = 'mergesort' if method == 'ordinal' else 'quicksort'
8044 sorter = np.argsort(arr, kind=algo)
8046 inv = np.empty(sorter.size, dtype=np.intp)
8047 inv[sorter] = np.arange(sorter.size, dtype=np.intp)
8049 if method == 'ordinal':
8050 return inv + 1
8052 arr = arr[sorter]
8053 obs = np.r_[True, arr[1:] != arr[:-1]]
8054 dense = obs.cumsum()[inv]
8056 if method == 'dense':
8057 return dense
8059 # cumulative counts of each unique value
8060 count = np.r_[np.nonzero(obs)[0], len(obs)]
8062 if method == 'max':
8063 return count[dense]
8065 if method == 'min':
8066 return count[dense - 1] + 1
8068 # average method
8069 return .5 * (count[dense] + count[dense - 1] + 1)
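# A hand trace of the 'average' branch for [0, 2, 3, 2] (matching the
# docstring): the sorted array is [0, 2, 2, 3], the tie-group boundaries give
# count = [0, 1, 3, 4], the dense ranks in original order are [1, 2, 3, 2],
# and 0.5 * (count[dense] + count[dense - 1] + 1) yields [1., 2.5, 4., 2.5].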