
1# Copyright 2002 Gary Strangman. All rights reserved 

2# Copyright 2002-2016 The SciPy Developers 

3# 

4# The original code from Gary Strangman was heavily adapted for 

5# use in SciPy by Travis Oliphant. The original code came with the 

6# following disclaimer: 

7# 

8# This software is provided "as-is". There are no expressed or implied 

9# warranties of any kind, including, but not limited to, the warranties 

10# of merchantability and fitness for a given application. In no event 

11# shall Gary Strangman be liable for any direct, indirect, incidental, 

12# special, exemplary or consequential damages (including, but not limited 

13# to, loss of use, data or profits, or business interruption) however 

14# caused and on any theory of liability, whether in contract, strict 

15# liability or tort (including negligence or otherwise) arising in any way 

16# out of the use of this software, even if advised of the possibility of 

17# such damage. 

18 

19""" 

20A collection of basic statistical functions for Python. The function 

21names appear below. 

22 

23 Some scalar functions defined here are also available in the scipy.special 

24 package, where they work on arbitrarily sized arrays. 

25 

26Disclaimers: The function list is obviously incomplete and, worse, the 

27functions are not optimized. All functions have been tested (some more 

28so than others), but they are far from bulletproof. Thus, as with any 

29free software, no warranty or guarantee is expressed or implied. :-) A 

30few extra functions that don't appear in the list below can be found by 

31interested treasure-hunters. These functions don't necessarily have 

32both list and array versions but were deemed useful. 

33 

34Central Tendency 

35---------------- 

36.. autosummary:: 

37 :toctree: generated/ 

38 

39 gmean 

40 hmean 

41 mode 

42 

43Moments 

44------- 

45.. autosummary:: 

46 :toctree: generated/ 

47 

48 moment 

49 variation 

50 skew 

51 kurtosis 

52 normaltest 

53 

54Altered Versions 

55---------------- 

56.. autosummary:: 

57 :toctree: generated/ 

58 

59 tmean 

60 tvar 

61 tstd 

62 tsem 

63 describe 

64 

65Frequency Stats 

66--------------- 

67.. autosummary:: 

68 :toctree: generated/ 

69 

70 itemfreq 

71 scoreatpercentile 

72 percentileofscore 

73 cumfreq 

74 relfreq 

75 

76Variability 

77----------- 

78.. autosummary:: 

79 :toctree: generated/ 

80 

81 obrientransform 

82 sem 

83 zmap 

84 zscore 

85 gstd 

86 iqr 

87 median_abs_deviation 

88 

89Trimming Functions 

90------------------ 

91.. autosummary:: 

92 :toctree: generated/ 

93 

94 trimboth 

95 trim1 

96 

97Correlation Functions 

98--------------------- 

99.. autosummary:: 

100 :toctree: generated/ 

101 

102 pearsonr 

103 fisher_exact 

104 spearmanr 

105 pointbiserialr 

106 kendalltau 

107 weightedtau 

108 linregress 

109 theilslopes 

110 multiscale_graphcorr 

111 

112Inferential Stats 

113----------------- 

114.. autosummary:: 

115 :toctree: generated/ 

116 

117 ttest_1samp 

118 ttest_ind 

119 ttest_ind_from_stats 

120 ttest_rel 

121 chisquare 

122 power_divergence 

123 kstest 

124 ks_1samp 

125 ks_2samp 

126 epps_singleton_2samp 

127 mannwhitneyu 

128 ranksums 

129 wilcoxon 

130 kruskal 

131 friedmanchisquare 

132 brunnermunzel 

133 combine_pvalues 

134 

135Statistical Distances 

136--------------------- 

137.. autosummary:: 

138 :toctree: generated/ 

139 

140 wasserstein_distance 

141 energy_distance 

142 

143ANOVA Functions 

144--------------- 

145.. autosummary:: 

146 :toctree: generated/ 

147 

148 f_oneway 

149 

150Support Functions 

151----------------- 

152.. autosummary:: 

153 :toctree: generated/ 

154 

155 rankdata 

156 rvs_ratio_uniforms 

157 

158References 

159---------- 

160.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

161 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

162 York. 2000. 

163 

164""" 

165 

166import warnings 

167import math 

168from math import gcd 

169from collections import namedtuple 

170 

171import numpy as np 

172from numpy import array, asarray, ma 

173 

174from scipy.spatial.distance import cdist 

175from scipy.ndimage import measurements 

176from scipy._lib._util import (_lazywhere, check_random_state, MapWrapper, 

177 rng_integers) 

178import scipy.special as special 

179from scipy import linalg 

180from . import distributions 

181from . import mstats_basic 

182from ._stats_mstats_common import (_find_repeats, linregress, theilslopes, 

183 siegelslopes) 

184from ._stats import (_kendall_dis, _toint64, _weightedrankedtau, 

185 _local_correlations) 

186from ._rvs_sampling import rvs_ratio_uniforms 

187from ._hypotests import epps_singleton_2samp 

188 

189 

190__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', 'tmean', 'tvar', 

191 'tmin', 'tmax', 'tstd', 'tsem', 'moment', 'variation', 

192 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest', 

193 'normaltest', 'jarque_bera', 'itemfreq', 

194 'scoreatpercentile', 'percentileofscore', 

195 'cumfreq', 'relfreq', 'obrientransform', 

196 'sem', 'zmap', 'zscore', 'iqr', 'gstd', 'median_absolute_deviation', 

197 'median_abs_deviation', 

198 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', 

199 'f_oneway', 'F_onewayConstantInputWarning', 

200 'F_onewayBadInputSizesWarning', 

201 'PearsonRConstantInputWarning', 'PearsonRNearConstantInputWarning', 

202 'pearsonr', 'fisher_exact', 'SpearmanRConstantInputWarning', 

203 'spearmanr', 'pointbiserialr', 

204 'kendalltau', 'weightedtau', 'multiscale_graphcorr', 

205 'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp', 

206 'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel', 

207 'kstest', 'ks_1samp', 'ks_2samp', 

208 'chisquare', 'power_divergence', 'mannwhitneyu', 

209 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', 

210 'rankdata', 'rvs_ratio_uniforms', 

211 'combine_pvalues', 'wasserstein_distance', 'energy_distance', 

212 'brunnermunzel', 'epps_singleton_2samp'] 

213 

214 

215def _contains_nan(a, nan_policy='propagate'): 

216 policies = ['propagate', 'raise', 'omit'] 

217 if nan_policy not in policies: 

218 raise ValueError("nan_policy must be one of {%s}" % 

219 ', '.join("'%s'" % s for s in policies)) 

220 try: 

221 # Calling np.sum to avoid creating a huge array into memory 

222 # e.g. np.isnan(a).any() 

223 with np.errstate(invalid='ignore'): 

224 contains_nan = np.isnan(np.sum(a)) 

225 except TypeError: 

226 # This can happen when attempting to sum things which are not 

227 # numbers (e.g. as in the function `mode`). Try an alternative method: 

228 try: 

229 contains_nan = np.nan in set(a.ravel()) 

230 except TypeError: 

231 # Don't know what to do. Fall back to omitting nan values and 

232 # issue a warning. 

233 contains_nan = False 

234 nan_policy = 'omit' 

235 warnings.warn("The input array could not be properly checked for nan " 

236 "values. nan values will be ignored.", RuntimeWarning) 

237 

238 if contains_nan and nan_policy == 'raise': 

239 raise ValueError("The input contains nan values") 

240 

241 return contains_nan, nan_policy 

242 

243 
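# Illustrative sketch (hypothetical input, not a doctest): how the public
# functions below typically use the helper above before choosing a code path.
#
#     x = np.array([1.0, np.nan, 3.0])
#     contains_nan, policy = _contains_nan(x, nan_policy='omit')
#     # contains_nan is True and policy stays 'omit', so callers switch to the
#     # masked-array (mstats) implementation.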

244def _chk_asarray(a, axis): 

245 if axis is None: 

246 a = np.ravel(a) 

247 outaxis = 0 

248 else: 

249 a = np.asarray(a) 

250 outaxis = axis 

251 

252 if a.ndim == 0: 

253 a = np.atleast_1d(a) 

254 

255 return a, outaxis 

256 

257 
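# Illustrative sketch (hypothetical inputs): ``axis=None`` ravels the input and
# the returned working axis becomes 0; scalars are promoted to 1-d arrays.
#
#     _chk_asarray([[1, 2], [3, 4]], axis=None)   # -> (array([1, 2, 3, 4]), 0)
#     _chk_asarray(5, axis=0)                     # -> (array([5]), 0)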

258def _chk2_asarray(a, b, axis): 

259 if axis is None: 

260 a = np.ravel(a) 

261 b = np.ravel(b) 

262 outaxis = 0 

263 else: 

264 a = np.asarray(a) 

265 b = np.asarray(b) 

266 outaxis = axis 

267 

268 if a.ndim == 0: 

269 a = np.atleast_1d(a) 

270 if b.ndim == 0: 

271 b = np.atleast_1d(b) 

272 

273 return a, b, outaxis 

274 

275 

276def _shape_with_dropped_axis(a, axis): 

277 """ 

278 Given an array `a` and an integer `axis`, return the shape 

279 of `a` with the `axis` dimension removed. 

280 

281 Examples 

282 -------- 

283 >>> a = np.zeros((3, 5, 2)) 

284 >>> _shape_with_dropped_axis(a, 1) 

285 (3, 2) 

286 """ 

287 shp = list(a.shape) 

288 try: 

289 del shp[axis] 

290 except IndexError: 

291 raise np.AxisError(axis, a.ndim) from None 

292 return tuple(shp) 

293 

294 

295def _broadcast_shapes(shape1, shape2): 

296 """ 

297 Given two shapes (i.e. tuples of integers), return the shape 

298 that would result from broadcasting two arrays with the given 

299 shapes. 

300 

301 Examples 

302 -------- 

303 >>> _broadcast_shapes((2, 1), (4, 1, 3)) 

304 (4, 2, 3) 

305 """ 

306 d = len(shape1) - len(shape2) 

307 if d <= 0: 

308 shp1 = (1,)*(-d) + shape1 

309 shp2 = shape2 

310 elif d > 0: 

311 shp1 = shape1 

312 shp2 = (1,)*d + shape2 

313 shape = [] 

314 for n1, n2 in zip(shp1, shp2): 

315 if n1 == 1: 

316 n = n2 

317 elif n2 == 1 or n1 == n2: 

318 n = n1 

319 else: 

320 raise ValueError(f'shapes {shape1} and {shape2} could not be ' 

321 'broadcast together') 

322 shape.append(n) 

323 return tuple(shape) 

324 

325 

326def _broadcast_shapes_with_dropped_axis(a, b, axis): 

327 """ 

328 Given two arrays `a` and `b` and an integer `axis`, find the 

329 shape of the broadcast result after dropping `axis` from the 

330 shapes of `a` and `b`. 

331 

332 Examples 

333 -------- 

334 >>> a = np.zeros((5, 2, 1)) 

335 >>> b = np.zeros((1, 9, 3)) 

336 >>> _broadcast_shapes_with_dropped_axis(a, b, 1) 

337 (5, 3) 

338 """ 

339 shp1 = _shape_with_dropped_axis(a, axis) 

340 shp2 = _shape_with_dropped_axis(b, axis) 

341 try: 

342 shp = _broadcast_shapes(shp1, shp2) 

343 except ValueError: 

344 raise ValueError(f'non-axis shapes {shp1} and {shp2} could not be ' 

345 'broadcast together') from None 

346 return shp 

347 

348 

349def gmean(a, axis=0, dtype=None): 

350 """ 

351 Compute the geometric mean along the specified axis. 

352 

353 Return the geometric average of the array elements. 

354 That is: n-th root of (x1 * x2 * ... * xn) 

355 

356 Parameters 

357 ---------- 

358 a : array_like 

359 Input array or object that can be converted to an array. 

360 axis : int or None, optional 

361 Axis along which the geometric mean is computed. Default is 0. 

362 If None, compute over the whole array `a`. 

363 dtype : dtype, optional 

364 Type of the returned array and of the accumulator in which the 

365 elements are summed. If dtype is not specified, it defaults to the 

366 dtype of a, unless a has an integer dtype with a precision less than 

367 that of the default platform integer. In that case, the default 

368 platform integer is used. 

369 

370 Returns 

371 ------- 

372 gmean : ndarray 

373 See `dtype` parameter above. 

374 

375 See Also 

376 -------- 

377 numpy.mean : Arithmetic average 

378 numpy.average : Weighted average 

379 hmean : Harmonic mean 

380 

381 Notes 

382 ----- 

383 The geometric average is computed over a single dimension of the input 

384 array, axis=0 by default, or all values in the array if axis=None. 

385 float64 intermediate and return values are used for integer inputs. 

386 

387 Use masked arrays to ignore any non-finite values (such as NaN or infinity) 

388 in the input or arising during the calculations; masked arrays automatically 

389 mask any non-finite values. 

390 

391 Examples 

392 -------- 

393 >>> from scipy.stats import gmean 

394 >>> gmean([1, 4]) 

395 2.0 

396 >>> gmean([1, 2, 3, 4, 5, 6, 7]) 

397 3.3800151591412964 

398 

399 """ 

400 if not isinstance(a, np.ndarray): 

401 # if not an ndarray object attempt to convert it 

402 log_a = np.log(np.array(a, dtype=dtype)) 

403 elif dtype: 

404 # Must change the default dtype allowing array type 

405 if isinstance(a, np.ma.MaskedArray): 

406 log_a = np.log(np.ma.asarray(a, dtype=dtype)) 

407 else: 

408 log_a = np.log(np.asarray(a, dtype=dtype)) 

409 else: 

410 log_a = np.log(a) 

411 return np.exp(log_a.mean(axis=axis)) 

412 

413 
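# Illustrative sketch (hypothetical values): the identity used above, namely that
# the geometric mean is the exponential of the arithmetic mean of the logs.
#
#     x = np.array([1.0, 4.0])
#     np.exp(np.log(x).mean())   # 2.0, i.e. (1 * 4) ** 0.5, the value gmean([1, 4]) returns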

414def hmean(a, axis=0, dtype=None): 

415 """ 

416 Calculate the harmonic mean along the specified axis. 

417 

418 That is: n / (1/x1 + 1/x2 + ... + 1/xn) 

419 

420 Parameters 

421 ---------- 

422 a : array_like 

423 Input array, masked array or object that can be converted to an array. 

424 axis : int or None, optional 

425 Axis along which the harmonic mean is computed. Default is 0. 

426 If None, compute over the whole array `a`. 

427 dtype : dtype, optional 

428 Type of the returned array and of the accumulator in which the 

429 elements are summed. If `dtype` is not specified, it defaults to the 

430 dtype of `a`, unless `a` has an integer `dtype` with a precision less 

431 than that of the default platform integer. In that case, the default 

432 platform integer is used. 

433 

434 Returns 

435 ------- 

436 hmean : ndarray 

437 See `dtype` parameter above. 

438 

439 See Also 

440 -------- 

441 numpy.mean : Arithmetic average 

442 numpy.average : Weighted average 

443 gmean : Geometric mean 

444 

445 Notes 

446 ----- 

447 The harmonic mean is computed over a single dimension of the input 

448 array, axis=0 by default, or all values in the array if axis=None. 

449 float64 intermediate and return values are used for integer inputs. 

450 

451 Use masked arrays to ignore any non-finite values (such as NaN or infinity) 

452 in the input or arising during the calculations. 

453 

454 Examples 

455 -------- 

456 >>> from scipy.stats import hmean 

457 >>> hmean([1, 4]) 

458 1.6000000000000001 

459 >>> hmean([1, 2, 3, 4, 5, 6, 7]) 

460 2.6997245179063363 

461 

462 """ 

463 if not isinstance(a, np.ndarray): 

464 a = np.array(a, dtype=dtype) 

465 if np.all(a >= 0): 

466 # Harmonic mean only defined if greater than or equal to zero. 

467 if isinstance(a, np.ma.MaskedArray): 

468 size = a.count(axis) 

469 else: 

470 if axis is None: 

471 a = a.ravel() 

472 size = a.shape[0] 

473 else: 

474 size = a.shape[axis] 

475 with np.errstate(divide='ignore'): 

476 return size / np.sum(1.0 / a, axis=axis, dtype=dtype) 

477 else: 

478 raise ValueError("Harmonic mean only defined if all elements are " 

479 "greater than or equal to zero") 

480 

481 
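# Illustrative sketch (hypothetical values): the quantity computed above,
# n divided by the sum of reciprocals.
#
#     x = np.array([1.0, 4.0])
#     x.size / np.sum(1.0 / x)   # 1.6, the value hmean([1, 4]) returns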

482ModeResult = namedtuple('ModeResult', ('mode', 'count')) 

483 

484 

485def mode(a, axis=0, nan_policy='propagate'): 

486 """ 

487 Return an array of the modal (most common) value in the passed array. 

488 

489 If there is more than one such value, only the smallest is returned. 

490 The bin-count for the modal bins is also returned. 

491 

492 Parameters 

493 ---------- 

494 a : array_like 

495 n-dimensional array of which to find mode(s). 

496 axis : int or None, optional 

497 Axis along which to operate. Default is 0. If None, compute over 

498 the whole array `a`. 

499 nan_policy : {'propagate', 'raise', 'omit'}, optional 

500 Defines how to handle when input contains nan. 

501 The following options are available (default is 'propagate'): 

502 

503 * 'propagate': returns nan 

504 * 'raise': throws an error 

505 * 'omit': performs the calculations ignoring nan values 

506 

507 Returns 

508 ------- 

509 mode : ndarray 

510 Array of modal values. 

511 count : ndarray 

512 Array of counts for each mode. 

513 

514 Examples 

515 -------- 

516 >>> a = np.array([[6, 8, 3, 0], 

517 ... [3, 2, 1, 7], 

518 ... [8, 1, 8, 4], 

519 ... [5, 3, 0, 5], 

520 ... [4, 7, 5, 9]]) 

521 >>> from scipy import stats 

522 >>> stats.mode(a) 

523 ModeResult(mode=array([[3, 1, 0, 0]]), count=array([[1, 1, 1, 1]])) 

524 

525 To get mode of whole array, specify ``axis=None``: 

526 

527 >>> stats.mode(a, axis=None) 

528 ModeResult(mode=array([3]), count=array([3])) 

529 

530 """ 

531 a, axis = _chk_asarray(a, axis) 

532 if a.size == 0: 

533 return ModeResult(np.array([]), np.array([])) 

534 

535 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

536 

537 if contains_nan and nan_policy == 'omit': 

538 a = ma.masked_invalid(a) 

539 return mstats_basic.mode(a, axis) 

540 

541 if a.dtype == object and np.nan in set(a.ravel()): 

542 # Fall back to a slower method since np.unique does not work with NaN 

543 scores = set(np.ravel(a)) # get ALL unique values 

544 testshape = list(a.shape) 

545 testshape[axis] = 1 

546 oldmostfreq = np.zeros(testshape, dtype=a.dtype) 

547 oldcounts = np.zeros(testshape, dtype=int) 

548 

549 for score in scores: 

550 template = (a == score) 

551 counts = np.expand_dims(np.sum(template, axis), axis) 

552 mostfrequent = np.where(counts > oldcounts, score, oldmostfreq) 

553 oldcounts = np.maximum(counts, oldcounts) 

554 oldmostfreq = mostfrequent 

555 

556 return ModeResult(mostfrequent, oldcounts) 

557 

558 def _mode1D(a): 

559 vals, cnts = np.unique(a, return_counts=True) 

560 return vals[cnts.argmax()], cnts.max() 

561 

562 # np.apply_along_axis will convert the _mode1D tuples to a numpy array, casting types in the process 

563 # This recreates the results without that issue 

564 # View of a, rotated so the requested axis is last 

565 in_dims = list(range(a.ndim)) 

566 a_view = np.transpose(a, in_dims[:axis] + in_dims[axis+1:] + [axis]) 

567 

568 inds = np.ndindex(a_view.shape[:-1]) 

569 modes = np.empty(a_view.shape[:-1], dtype=a.dtype) 

570 counts = np.zeros(a_view.shape[:-1], dtype=int) 

571 for ind in inds: 

572 modes[ind], counts[ind] = _mode1D(a_view[ind]) 

573 newshape = list(a.shape) 

574 newshape[axis] = 1 

575 return ModeResult(modes.reshape(newshape), counts.reshape(newshape)) 

576 

577 
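# Illustrative sketch (hypothetical values): the core of `_mode1D` above; because
# np.unique returns sorted values, ties resolve to the smallest value.
#
#     vals, cnts = np.unique([2, 2, 3, 1], return_counts=True)
#     vals[cnts.argmax()], cnts.max()   # -> (2, 2)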

578def _mask_to_limits(a, limits, inclusive): 

579 """Mask an array for values outside of given limits. 

580 

581 This is primarily a utility function. 

582 

583 Parameters 

584 ---------- 

585 a : array 

586 limits : (float or None, float or None) 

587 A tuple consisting of the (lower limit, upper limit). Values in the 

588 input array less than the lower limit or greater than the upper limit 

589 will be masked out. None implies no limit. 

590 inclusive : (bool, bool) 

591 A tuple consisting of the (lower flag, upper flag). These flags 

592 determine whether values exactly equal to lower or upper are allowed. 

593 

594 Returns 

595 ------- 

596 A MaskedArray. 

597 

598 Raises 

599 ------ 

600 A ValueError if there are no values within the given limits. 

601 

602 """ 

603 lower_limit, upper_limit = limits 

604 lower_include, upper_include = inclusive 

605 am = ma.MaskedArray(a) 

606 if lower_limit is not None: 

607 if lower_include: 

608 am = ma.masked_less(am, lower_limit) 

609 else: 

610 am = ma.masked_less_equal(am, lower_limit) 

611 

612 if upper_limit is not None: 

613 if upper_include: 

614 am = ma.masked_greater(am, upper_limit) 

615 else: 

616 am = ma.masked_greater_equal(am, upper_limit) 

617 

618 if am.count() == 0: 

619 raise ValueError("No array values within given limits") 

620 

621 return am 

622 

623 
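# Illustrative sketch (hypothetical values): the effect of `limits` and
# `inclusive` in the helper above.
#
#     _mask_to_limits(np.arange(5), (1, 3), (True, True)).compressed()    # array([1, 2, 3])
#     _mask_to_limits(np.arange(5), (1, 3), (False, False)).compressed()  # array([2])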

624def tmean(a, limits=None, inclusive=(True, True), axis=None): 

625 """ 

626 Compute the trimmed mean. 

627 

628 This function finds the arithmetic mean of given values, ignoring values 

629 outside the given `limits`. 

630 

631 Parameters 

632 ---------- 

633 a : array_like 

634 Array of values. 

635 limits : None or (lower limit, upper limit), optional 

636 Values in the input array less than the lower limit or greater than the 

637 upper limit will be ignored. When limits is None (default), then all 

638 values are used. Either of the limit values in the tuple can also be 

639 None representing a half-open interval. 

640 inclusive : (bool, bool), optional 

641 A tuple consisting of the (lower flag, upper flag). These flags 

642 determine whether values exactly equal to the lower or upper limits 

643 are included. The default value is (True, True). 

644 axis : int or None, optional 

645 Axis along which to operate. Default is None. 

646 

647 Returns 

648 ------- 

649 tmean : float 

650 Trimmed mean. 

651 

652 See Also 

653 -------- 

654 trim_mean : Returns mean after trimming a proportion from both tails. 

655 

656 Examples 

657 -------- 

658 >>> from scipy import stats 

659 >>> x = np.arange(20) 

660 >>> stats.tmean(x) 

661 9.5 

662 >>> stats.tmean(x, (3,17)) 

663 10.0 

664 

665 """ 

666 a = asarray(a) 

667 if limits is None: 

668 return np.mean(a, None) 

669 

670 am = _mask_to_limits(a.ravel(), limits, inclusive) 

671 return am.mean(axis=axis) 

672 

673 
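# Illustrative sketch (hypothetical values): with limits, the trimmed mean is the
# plain mean of the values kept by the mask.
#
#     x = np.arange(20)
#     x[(x >= 3) & (x <= 17)].mean()   # 10.0, the value tmean(x, (3, 17)) returns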

674def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

675 """ 

676 Compute the trimmed variance. 

677 

678 This function computes the sample variance of an array of values, 

679 while ignoring values which are outside of given `limits`. 

680 

681 Parameters 

682 ---------- 

683 a : array_like 

684 Array of values. 

685 limits : None or (lower limit, upper limit), optional 

686 Values in the input array less than the lower limit or greater than the 

687 upper limit will be ignored. When limits is None, then all values are 

688 used. Either of the limit values in the tuple can also be None 

689 representing a half-open interval. The default value is None. 

690 inclusive : (bool, bool), optional 

691 A tuple consisting of the (lower flag, upper flag). These flags 

692 determine whether values exactly equal to the lower or upper limits 

693 are included. The default value is (True, True). 

694 axis : int or None, optional 

695 Axis along which to operate. Default is 0. If None, compute over the 

696 whole array `a`. 

697 ddof : int, optional 

698 Delta degrees of freedom. Default is 1. 

699 

700 Returns 

701 ------- 

702 tvar : float 

703 Trimmed variance. 

704 

705 Notes 

706 ----- 

707 `tvar` computes the unbiased sample variance, i.e. it uses a correction 

708 factor ``n / (n - 1)``. 

709 

710 Examples 

711 -------- 

712 >>> from scipy import stats 

713 >>> x = np.arange(20) 

714 >>> stats.tvar(x) 

715 35.0 

716 >>> stats.tvar(x, (3,17)) 

717 20.0 

718 

719 """ 

720 a = asarray(a) 

721 a = a.astype(float) 

722 if limits is None: 

723 return a.var(ddof=ddof, axis=axis) 

724 am = _mask_to_limits(a, limits, inclusive) 

725 amnan = am.filled(fill_value=np.nan) 

726 return np.nanvar(amnan, ddof=ddof, axis=axis) 

727 

728 
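# Illustrative sketch (hypothetical values): the trimmed variance is the ddof=1
# variance of the values inside the limits.
#
#     x = np.arange(20)
#     x[(x >= 3) & (x <= 17)].var(ddof=1)   # 20.0, the value tvar(x, (3, 17)) returns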

729def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'): 

730 """ 

731 Compute the trimmed minimum. 

732 

733 This function finds the minimum value of an array `a` along the 

734 specified axis, but only considering values greater than a specified 

735 lower limit. 

736 

737 Parameters 

738 ---------- 

739 a : array_like 

740 Array of values. 

741 lowerlimit : None or float, optional 

742 Values in the input array less than the given limit will be ignored. 

743 When lowerlimit is None, then all values are used. The default value 

744 is None. 

745 axis : int or None, optional 

746 Axis along which to operate. Default is 0. If None, compute over the 

747 whole array `a`. 

748 inclusive : {True, False}, optional 

749 This flag determines whether values exactly equal to the lower limit 

750 are included. The default value is True. 

751 nan_policy : {'propagate', 'raise', 'omit'}, optional 

752 Defines how to handle when input contains nan. 

753 The following options are available (default is 'propagate'): 

754 

755 * 'propagate': returns nan 

756 * 'raise': throws an error 

757 * 'omit': performs the calculations ignoring nan values 

758 

759 Returns 

760 ------- 

761 tmin : float, int or ndarray 

762 Trimmed minimum. 

763 

764 Examples 

765 -------- 

766 >>> from scipy import stats 

767 >>> x = np.arange(20) 

768 >>> stats.tmin(x) 

769 0 

770 

771 >>> stats.tmin(x, 13) 

772 13 

773 

774 >>> stats.tmin(x, 13, inclusive=False) 

775 14 

776 

777 """ 

778 a, axis = _chk_asarray(a, axis) 

779 am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False)) 

780 

781 contains_nan, nan_policy = _contains_nan(am, nan_policy) 

782 

783 if contains_nan and nan_policy == 'omit': 

784 am = ma.masked_invalid(am) 

785 

786 res = ma.minimum.reduce(am, axis).data 

787 if res.ndim == 0: 

788 return res[()] 

789 return res 

790 

791 

792def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'): 

793 """ 

794 Compute the trimmed maximum. 

795 

796 This function computes the maximum value of an array along a given axis, 

797 while ignoring values larger than a specified upper limit. 

798 

799 Parameters 

800 ---------- 

801 a : array_like 

802 Array of values. 

803 upperlimit : None or float, optional 

804 Values in the input array greater than the given limit will be ignored. 

805 When upperlimit is None, then all values are used. The default value 

806 is None. 

807 axis : int or None, optional 

808 Axis along which to operate. Default is 0. If None, compute over the 

809 whole array `a`. 

810 inclusive : {True, False}, optional 

811 This flag determines whether values exactly equal to the upper limit 

812 are included. The default value is True. 

813 nan_policy : {'propagate', 'raise', 'omit'}, optional 

814 Defines how to handle when input contains nan. 

815 The following options are available (default is 'propagate'): 

816 

817 * 'propagate': returns nan 

818 * 'raise': throws an error 

819 * 'omit': performs the calculations ignoring nan values 

820 

821 Returns 

822 ------- 

823 tmax : float, int or ndarray 

824 Trimmed maximum. 

825 

826 Examples 

827 -------- 

828 >>> from scipy import stats 

829 >>> x = np.arange(20) 

830 >>> stats.tmax(x) 

831 19 

832 

833 >>> stats.tmax(x, 13) 

834 13 

835 

836 >>> stats.tmax(x, 13, inclusive=False) 

837 12 

838 

839 """ 

840 a, axis = _chk_asarray(a, axis) 

841 am = _mask_to_limits(a, (None, upperlimit), (False, inclusive)) 

842 

843 contains_nan, nan_policy = _contains_nan(am, nan_policy) 

844 

845 if contains_nan and nan_policy == 'omit': 

846 am = ma.masked_invalid(am) 

847 

848 res = ma.maximum.reduce(am, axis).data 

849 if res.ndim == 0: 

850 return res[()] 

851 return res 

852 

853 

854def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

855 """ 

856 Compute the trimmed sample standard deviation. 

857 

858 This function finds the sample standard deviation of given values, 

859 ignoring values outside the given `limits`. 

860 

861 Parameters 

862 ---------- 

863 a : array_like 

864 Array of values. 

865 limits : None or (lower limit, upper limit), optional 

866 Values in the input array less than the lower limit or greater than the 

867 upper limit will be ignored. When limits is None, then all values are 

868 used. Either of the limit values in the tuple can also be None 

869 representing a half-open interval. The default value is None. 

870 inclusive : (bool, bool), optional 

871 A tuple consisting of the (lower flag, upper flag). These flags 

872 determine whether values exactly equal to the lower or upper limits 

873 are included. The default value is (True, True). 

874 axis : int or None, optional 

875 Axis along which to operate. Default is 0. If None, compute over the 

876 whole array `a`. 

877 ddof : int, optional 

878 Delta degrees of freedom. Default is 1. 

879 

880 Returns 

881 ------- 

882 tstd : float 

883 Trimmed sample standard deviation. 

884 

885 Notes 

886 ----- 

887 `tstd` computes the unbiased sample standard deviation, i.e. it uses a 

888 correction factor ``n / (n - 1)``. 

889 

890 Examples 

891 -------- 

892 >>> from scipy import stats 

893 >>> x = np.arange(20) 

894 >>> stats.tstd(x) 

895 5.9160797830996161 

896 >>> stats.tstd(x, (3,17)) 

897 4.4721359549995796 

898 

899 """ 

900 return np.sqrt(tvar(a, limits, inclusive, axis, ddof)) 

901 

902 

903def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

904 """ 

905 Compute the trimmed standard error of the mean. 

906 

907 This function finds the standard error of the mean for given 

908 values, ignoring values outside the given `limits`. 

909 

910 Parameters 

911 ---------- 

912 a : array_like 

913 Array of values. 

914 limits : None or (lower limit, upper limit), optional 

915 Values in the input array less than the lower limit or greater than the 

916 upper limit will be ignored. When limits is None, then all values are 

917 used. Either of the limit values in the tuple can also be None 

918 representing a half-open interval. The default value is None. 

919 inclusive : (bool, bool), optional 

920 A tuple consisting of the (lower flag, upper flag). These flags 

921 determine whether values exactly equal to the lower or upper limits 

922 are included. The default value is (True, True). 

923 axis : int or None, optional 

924 Axis along which to operate. Default is 0. If None, compute over the 

925 whole array `a`. 

926 ddof : int, optional 

927 Delta degrees of freedom. Default is 1. 

928 

929 Returns 

930 ------- 

931 tsem : float 

932 Trimmed standard error of the mean. 

933 

934 Notes 

935 ----- 

936 `tsem` uses unbiased sample standard deviation, i.e. it uses a 

937 correction factor ``n / (n - 1)``. 

938 

939 Examples 

940 -------- 

941 >>> from scipy import stats 

942 >>> x = np.arange(20) 

943 >>> stats.tsem(x) 

944 1.3228756555322954 

945 >>> stats.tsem(x, (3,17)) 

946 1.1547005383792515 

947 

948 """ 

949 a = np.asarray(a).ravel() 

950 if limits is None: 

951 return a.std(ddof=ddof) / np.sqrt(a.size) 

952 

953 am = _mask_to_limits(a, limits, inclusive) 

954 sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis)) 

955 return sd / np.sqrt(am.count()) 

956 

957 
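# Illustrative sketch (hypothetical values): without limits, tsem reduces to the
# sample standard deviation over the square root of the sample size.
#
#     x = np.arange(20)
#     x.std(ddof=1) / np.sqrt(x.size)   # ~1.3229, the value tsem(x) returns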

958##################################### 

959# MOMENTS # 

960##################################### 

961 

962def moment(a, moment=1, axis=0, nan_policy='propagate'): 

963 r""" 

964 Calculate the nth moment about the mean for a sample. 

965 

966 A moment is a specific quantitative measure of the shape of a set of 

967 points. It is often used to calculate coefficients of skewness and kurtosis 

968 due to its close relationship with them. 

969 

970 Parameters 

971 ---------- 

972 a : array_like 

973 Input array. 

974 moment : int or array_like of ints, optional 

975 Order of central moment that is returned. Default is 1. 

976 axis : int or None, optional 

977 Axis along which the central moment is computed. Default is 0. 

978 If None, compute over the whole array `a`. 

979 nan_policy : {'propagate', 'raise', 'omit'}, optional 

980 Defines how to handle when input contains nan. 

981 The following options are available (default is 'propagate'): 

982 

983 * 'propagate': returns nan 

984 * 'raise': throws an error 

985 * 'omit': performs the calculations ignoring nan values 

986 

987 Returns 

988 ------- 

989 n-th central moment : ndarray or float 

990 The appropriate moment along the given axis or over all values if axis 

991 is None. The denominator for the moment calculation is the number of 

992 observations, no degrees of freedom correction is done. 

993 

994 See Also 

995 -------- 

996 kurtosis, skew, describe 

997 

998 Notes 

999 ----- 

1000 The k-th central moment of a data sample is: 

1001 

1002 .. math:: 

1003 

1004 m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k 

1005 

1006 Where n is the number of samples and x-bar is the mean. This function uses 

1007 exponentiation by squares [1]_ for efficiency. 

1008 

1009 References 

1010 ---------- 

1011 .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms 

1012 

1013 Examples 

1014 -------- 

1015 >>> from scipy.stats import moment 

1016 >>> moment([1, 2, 3, 4, 5], moment=1) 

1017 0.0 

1018 >>> moment([1, 2, 3, 4, 5], moment=2) 

1019 2.0 

1020 

1021 """ 

1022 a, axis = _chk_asarray(a, axis) 

1023 

1024 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1025 

1026 if contains_nan and nan_policy == 'omit': 

1027 a = ma.masked_invalid(a) 

1028 return mstats_basic.moment(a, moment, axis) 

1029 

1030 if a.size == 0: 

1031 # empty array, return nan(s) with shape matching `moment` 

1032 if np.isscalar(moment): 

1033 return np.nan 

1034 else: 

1035 return np.full(np.asarray(moment).shape, np.nan, dtype=np.float64) 

1036 

1037 # for array_like moment input, return a value for each. 

1038 if not np.isscalar(moment): 

1039 mmnt = [_moment(a, i, axis) for i in moment] 

1040 return np.array(mmnt) 

1041 else: 

1042 return _moment(a, moment, axis) 

1043 

1044 

1045def _moment(a, moment, axis): 

1046 if np.abs(moment - np.round(moment)) > 0: 

1047 raise ValueError("All moment parameters must be integers") 

1048 

1049 if moment == 0: 

1050 # When moment equals 0, the result is 1, by definition. 

1051 shape = list(a.shape) 

1052 del shape[axis] 

1053 if shape: 

1054 # return an actual array of the appropriate shape 

1055 return np.ones(shape, dtype=float) 

1056 else: 

1057 # the input was 1D, so return a scalar instead of a rank-0 array 

1058 return 1.0 

1059 

1060 elif moment == 1: 

1061 # By definition the first moment about the mean is 0. 

1062 shape = list(a.shape) 

1063 del shape[axis] 

1064 if shape: 

1065 # return an actual array of the appropriate shape 

1066 return np.zeros(shape, dtype=float) 

1067 else: 

1068 # the input was 1D, so return a scalar instead of a rank-0 array 

1069 return np.float64(0.0) 

1070 else: 

1071 # Exponentiation by squares: form exponent sequence 

1072 n_list = [moment] 

1073 current_n = moment 

1074 while current_n > 2: 

1075 if current_n % 2: 

1076 current_n = (current_n - 1) / 2 

1077 else: 

1078 current_n /= 2 

1079 n_list.append(current_n) 

1080 

1081 # Starting point for exponentiation by squares 

1082 a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis) 

1083 if n_list[-1] == 1: 

1084 s = a_zero_mean.copy() 

1085 else: 

1086 s = a_zero_mean**2 

1087 

1088 # Perform multiplications 

1089 for n in n_list[-2::-1]: 

1090 s = s**2 

1091 if n % 2: 

1092 s *= a_zero_mean 

1093 return np.mean(s, axis) 

1094 

1095 
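# Illustrative sketch (hypothetical values): a direct computation of the second
# central moment, matching what `_moment(x, 2, 0)` produces via squaring.
#
#     x = np.array([1., 2., 3., 4., 5.])
#     np.mean((x - x.mean())**2)   # 2.0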

1096def variation(a, axis=0, nan_policy='propagate'): 

1097 """ 

1098 Compute the coefficient of variation. 

1099 

1100 The coefficient of variation is the ratio of the biased standard 

1101 deviation to the mean. 

1102 

1103 Parameters 

1104 ---------- 

1105 a : array_like 

1106 Input array. 

1107 axis : int or None, optional 

1108 Axis along which to calculate the coefficient of variation. Default 

1109 is 0. If None, compute over the whole array `a`. 

1110 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1111 Defines how to handle when input contains nan. 

1112 The following options are available (default is 'propagate'): 

1113 

1114 * 'propagate': returns nan 

1115 * 'raise': throws an error 

1116 * 'omit': performs the calculations ignoring nan values 

1117 

1118 Returns 

1119 ------- 

1120 variation : ndarray 

1121 The calculated variation along the requested axis. 

1122 

1123 References 

1124 ---------- 

1125 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

1126 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

1127 York. 2000. 

1128 

1129 Examples 

1130 -------- 

1131 >>> from scipy.stats import variation 

1132 >>> variation([1, 2, 3, 4, 5]) 

1133 0.47140452079103173 

1134 

1135 """ 

1136 a, axis = _chk_asarray(a, axis) 

1137 

1138 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1139 

1140 if contains_nan and nan_policy == 'omit': 

1141 a = ma.masked_invalid(a) 

1142 return mstats_basic.variation(a, axis) 

1143 

1144 return a.std(axis) / a.mean(axis) 

1145 

1146 
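# Illustrative sketch (hypothetical values): the coefficient of variation is the
# biased (ddof=0) standard deviation over the mean, as computed above.
#
#     x = np.array([1., 2., 3., 4., 5.])
#     x.std() / x.mean()   # ~0.4714, the value variation(x) returns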

1147def skew(a, axis=0, bias=True, nan_policy='propagate'): 

1148 r""" 

1149 Compute the sample skewness of a data set. 

1150 

1151 For normally distributed data, the skewness should be about zero. For 

1152 unimodal continuous distributions, a skewness value greater than zero means 

1153 that there is more weight in the right tail of the distribution. The 

1154 function `skewtest` can be used to determine if the skewness value 

1155 is close enough to zero, statistically speaking. 

1156 

1157 Parameters 

1158 ---------- 

1159 a : ndarray 

1160 Input array. 

1161 axis : int or None, optional 

1162 Axis along which skewness is calculated. Default is 0. 

1163 If None, compute over the whole array `a`. 

1164 bias : bool, optional 

1165 If False, then the calculations are corrected for statistical bias. 

1166 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1167 Defines how to handle when input contains nan. 

1168 The following options are available (default is 'propagate'): 

1169 

1170 * 'propagate': returns nan 

1171 * 'raise': throws an error 

1172 * 'omit': performs the calculations ignoring nan values 

1173 

1174 Returns 

1175 ------- 

1176 skewness : ndarray 

1177 The skewness of values along an axis, returning 0 where all values are 

1178 equal. 

1179 

1180 Notes 

1181 ----- 

1182 The sample skewness is computed as the Fisher-Pearson coefficient 

1183 of skewness, i.e. 

1184 

1185 .. math:: 

1186 

1187 g_1=\frac{m_3}{m_2^{3/2}} 

1188 

1189 where 

1190 

1191 .. math:: 

1192 

1193 m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i 

1194 

1195 is the biased sample :math:`i\texttt{th}` central moment, and :math:`\bar{x}` is 

1196 the sample mean. If ``bias`` is False, the calculations are 

1197 corrected for bias and the value computed is the adjusted 

1198 Fisher-Pearson standardized moment coefficient, i.e. 

1199 

1200 .. math:: 

1201 

1202 G_1=\frac{k_3}{k_2^{3/2}}= 

1203 \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}. 

1204 

1205 References 

1206 ---------- 

1207 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

1208 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

1209 York. 2000. 

1210 Section 2.2.24.1 

1211 

1212 Examples 

1213 -------- 

1214 >>> from scipy.stats import skew 

1215 >>> skew([1, 2, 3, 4, 5]) 

1216 0.0 

1217 >>> skew([2, 8, 0, 4, 1, 9, 9, 0]) 

1218 0.2650554122698573 

1219 

1220 """ 

1221 a, axis = _chk_asarray(a, axis) 

1222 n = a.shape[axis] 

1223 

1224 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1225 

1226 if contains_nan and nan_policy == 'omit': 

1227 a = ma.masked_invalid(a) 

1228 return mstats_basic.skew(a, axis, bias) 

1229 

1230 m2 = moment(a, 2, axis) 

1231 m3 = moment(a, 3, axis) 

1232 zero = (m2 == 0) 

1233 vals = _lazywhere(~zero, (m2, m3), 

1234 lambda m2, m3: m3 / m2**1.5, 

1235 0.) 

1236 if not bias: 

1237 can_correct = (n > 2) & (m2 > 0) 

1238 if can_correct.any(): 

1239 m2 = np.extract(can_correct, m2) 

1240 m3 = np.extract(can_correct, m3) 

1241 nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5 

1242 np.place(vals, can_correct, nval) 

1243 

1244 if vals.ndim == 0: 

1245 return vals.item() 

1246 

1247 return vals 

1248 

1249 
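# Illustrative sketch (hypothetical values): the biased Fisher-Pearson coefficient
# g1 = m3 / m2**1.5 computed directly from central moments.
#
#     x = np.array([2., 8., 0., 4., 1., 9., 9., 0.])
#     m2 = np.mean((x - x.mean())**2)
#     m3 = np.mean((x - x.mean())**3)
#     m3 / m2**1.5   # ~0.2651, the value skew(x) returns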

1250def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'): 

1251 """ 

1252 Compute the kurtosis (Fisher or Pearson) of a dataset. 

1253 

1254 Kurtosis is the fourth central moment divided by the square of the 

1255 variance. If Fisher's definition is used, then 3.0 is subtracted from 

1256 the result to give 0.0 for a normal distribution. 

1257 

1258 If bias is False then the kurtosis is calculated using k statistics to 

1259 eliminate bias coming from biased moment estimators 

1260 

1261 Use `kurtosistest` to see if result is close enough to normal. 

1262 

1263 Parameters 

1264 ---------- 

1265 a : array 

1266 Data for which the kurtosis is calculated. 

1267 axis : int or None, optional 

1268 Axis along which the kurtosis is calculated. Default is 0. 

1269 If None, compute over the whole array `a`. 

1270 fisher : bool, optional 

1271 If True, Fisher's definition is used (normal ==> 0.0). If False, 

1272 Pearson's definition is used (normal ==> 3.0). 

1273 bias : bool, optional 

1274 If False, then the calculations are corrected for statistical bias. 

1275 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1276 Defines how to handle when input contains nan. 'propagate' returns nan, 

1277 'raise' throws an error, 'omit' performs the calculations ignoring nan 

1278 values. Default is 'propagate'. 

1279 

1280 Returns 

1281 ------- 

1282 kurtosis : array 

1283 The kurtosis of values along an axis. If all values are equal, 

1284 return -3 for Fisher's definition and 0 for Pearson's definition. 

1285 

1286 References 

1287 ---------- 

1288 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

1289 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

1290 York. 2000. 

1291 

1292 Examples 

1293 -------- 

1294 In Fisher's definition, the kurtosis of the normal distribution is zero. 

1295 In the following example, the kurtosis is close to zero, because it was 

1296 calculated from the dataset, not from the continuous distribution. 

1297 

1298 >>> from scipy.stats import norm, kurtosis 

1299 >>> data = norm.rvs(size=1000, random_state=3) 

1300 >>> kurtosis(data) 

1301 -0.06928694200380558 

1302 

1303 The distribution with a higher kurtosis has a heavier tail. 

1304 The zero valued kurtosis of the normal distribution in Fisher's definition 

1305 can serve as a reference point. 

1306 

1307 >>> import matplotlib.pyplot as plt 

1308 >>> import scipy.stats as stats 

1309 >>> from scipy.stats import kurtosis 

1310 

1311 >>> x = np.linspace(-5, 5, 100) 

1312 >>> ax = plt.subplot() 

1313 >>> distnames = ['laplace', 'norm', 'uniform'] 

1314 

1315 >>> for distname in distnames: 

1316 ... if distname == 'uniform': 

1317 ... dist = getattr(stats, distname)(loc=-2, scale=4) 

1318 ... else: 

1319 ... dist = getattr(stats, distname) 

1320 ... data = dist.rvs(size=1000) 

1321 ... kur = kurtosis(data, fisher=True) 

1322 ... y = dist.pdf(x) 

1323 ... ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3))) 

1324 ... ax.legend() 

1325 

1326 The Laplace distribution has a heavier tail than the normal distribution. 

1327 The uniform distribution (which has negative kurtosis) has the thinnest 

1328 tail. 

1329 

1330 """ 

1331 a, axis = _chk_asarray(a, axis) 

1332 

1333 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1334 

1335 if contains_nan and nan_policy == 'omit': 

1336 a = ma.masked_invalid(a) 

1337 return mstats_basic.kurtosis(a, axis, fisher, bias) 

1338 

1339 n = a.shape[axis] 

1340 m2 = moment(a, 2, axis) 

1341 m4 = moment(a, 4, axis) 

1342 zero = (m2 == 0) 

1343 with np.errstate(all='ignore'): 

1344 vals = np.where(zero, 0, m4 / m2**2.0) 

1345 

1346 if not bias: 

1347 can_correct = (n > 3) & (m2 > 0) 

1348 if can_correct.any(): 

1349 m2 = np.extract(can_correct, m2) 

1350 m4 = np.extract(can_correct, m4) 

1351 nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0) 

1352 np.place(vals, can_correct, nval + 3.0) 

1353 

1354 if vals.ndim == 0: 

1355 vals = vals.item() # array scalar 

1356 

1357 return vals - 3 if fisher else vals 

1358 

1359 
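# Illustrative sketch (hypothetical values): Pearson kurtosis is m4 / m2**2;
# Fisher's definition subtracts 3, as in the return statement above.
#
#     x = np.array([1., 2., 3., 4., 5.])
#     m2 = np.mean((x - x.mean())**2)
#     m4 = np.mean((x - x.mean())**4)
#     m4 / m2**2 - 3   # -1.3, the value kurtosis(x) returns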

1360DescribeResult = namedtuple('DescribeResult', 

1361 ('nobs', 'minmax', 'mean', 'variance', 'skewness', 

1362 'kurtosis')) 

1363 

1364 

1365def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'): 

1366 """ 

1367 Compute several descriptive statistics of the passed array. 

1368 

1369 Parameters 

1370 ---------- 

1371 a : array_like 

1372 Input data. 

1373 axis : int or None, optional 

1374 Axis along which statistics are calculated. Default is 0. 

1375 If None, compute over the whole array `a`. 

1376 ddof : int, optional 

1377 Delta degrees of freedom (only for variance). Default is 1. 

1378 bias : bool, optional 

1379 If False, then the skewness and kurtosis calculations are corrected for 

1380 statistical bias. 

1381 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1382 Defines how to handle when input contains nan. 

1383 The following options are available (default is 'propagate'): 

1384 

1385 * 'propagate': returns nan 

1386 * 'raise': throws an error 

1387 * 'omit': performs the calculations ignoring nan values 

1388 

1389 Returns 

1390 ------- 

1391 nobs : int or ndarray of ints 

1392 Number of observations (length of data along `axis`). 

1393 When 'omit' is chosen as nan_policy, each column is counted separately. 

1394 minmax: tuple of ndarrays or floats 

1395 Minimum and maximum value of data array. 

1396 mean : ndarray or float 

1397 Arithmetic mean of data along axis. 

1398 variance : ndarray or float 

1399 Unbiased variance of the data along axis, denominator is number of 

1400 observations minus one. 

1401 skewness : ndarray or float 

1402 Skewness, based on moment calculations with denominator equal to 

1403 the number of observations, i.e. no degrees of freedom correction. 

1404 kurtosis : ndarray or float 

1405 Kurtosis (Fisher). The kurtosis is normalized so that it is 

1406 zero for the normal distribution. No degrees of freedom are used. 

1407 

1408 See Also 

1409 -------- 

1410 skew, kurtosis 

1411 

1412 Examples 

1413 -------- 

1414 >>> from scipy import stats 

1415 >>> a = np.arange(10) 

1416 >>> stats.describe(a) 

1417 DescribeResult(nobs=10, minmax=(0, 9), mean=4.5, variance=9.166666666666666, 

1418 skewness=0.0, kurtosis=-1.2242424242424244) 

1419 >>> b = [[1, 2], [3, 4]] 

1420 >>> stats.describe(b) 

1421 DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])), 

1422 mean=array([2., 3.]), variance=array([2., 2.]), 

1423 skewness=array([0., 0.]), kurtosis=array([-2., -2.])) 

1424 

1425 """ 

1426 a, axis = _chk_asarray(a, axis) 

1427 

1428 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1429 

1430 if contains_nan and nan_policy == 'omit': 

1431 a = ma.masked_invalid(a) 

1432 return mstats_basic.describe(a, axis, ddof, bias) 

1433 

1434 if a.size == 0: 

1435 raise ValueError("The input must not be empty.") 

1436 n = a.shape[axis] 

1437 mm = (np.min(a, axis=axis), np.max(a, axis=axis)) 

1438 m = np.mean(a, axis=axis) 

1439 v = np.var(a, axis=axis, ddof=ddof) 

1440 sk = skew(a, axis, bias=bias) 

1441 kurt = kurtosis(a, axis, bias=bias) 

1442 

1443 return DescribeResult(n, mm, m, v, sk, kurt) 

1444 
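# Illustrative sketch (hypothetical values): the first four fields of the result
# are straightforward reductions of the input.
#
#     a = np.arange(10)
#     a.size, (a.min(), a.max()), a.mean(), a.var(ddof=1)   # (10, (0, 9), 4.5, 9.1666...)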

1445##################################### 

1446# NORMALITY TESTS # 

1447##################################### 

1448 

1449 

1450SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue')) 

1451 

1452 

1453def skewtest(a, axis=0, nan_policy='propagate'): 

1454 """ 

1455 Test whether the skew is different from the normal distribution. 

1456 

1457 This function tests the null hypothesis that the skewness of 

1458 the population that the sample was drawn from is the same 

1459 as that of a corresponding normal distribution. 

1460 

1461 Parameters 

1462 ---------- 

1463 a : array 

1464 The data to be tested. 

1465 axis : int or None, optional 

1466 Axis along which statistics are calculated. Default is 0. 

1467 If None, compute over the whole array `a`. 

1468 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1469 Defines how to handle when input contains nan. 

1470 The following options are available (default is 'propagate'): 

1471 

1472 * 'propagate': returns nan 

1473 * 'raise': throws an error 

1474 * 'omit': performs the calculations ignoring nan values 

1475 

1476 Returns 

1477 ------- 

1478 statistic : float 

1479 The computed z-score for this test. 

1480 pvalue : float 

1481 Two-sided p-value for the hypothesis test. 

1482 

1483 Notes 

1484 ----- 

1485 The sample size must be at least 8. 

1486 

1487 References 

1488 ---------- 

1489 .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr., 

1490 "A suggestion for using powerful and informative tests of 

1491 normality", American Statistician 44, pp. 316-321, 1990. 

1492 

1493 Examples 

1494 -------- 

1495 >>> from scipy.stats import skewtest 

1496 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8]) 

1497 SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897) 

1498 >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0]) 

1499 SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459) 

1500 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000]) 

1501 SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133) 

1502 >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101]) 

1503 SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634) 

1504 

1505 """ 

1506 a, axis = _chk_asarray(a, axis) 

1507 

1508 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1509 

1510 if contains_nan and nan_policy == 'omit': 

1511 a = ma.masked_invalid(a) 

1512 return mstats_basic.skewtest(a, axis) 

1513 

1514 if axis is None: 

1515 a = np.ravel(a) 

1516 axis = 0 

1517 b2 = skew(a, axis) 

1518 n = a.shape[axis] 

1519 if n < 8: 

1520 raise ValueError( 

1521 "skewtest is not valid with less than 8 samples; %i samples" 

1522 " were given." % int(n)) 

1523 y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) 

1524 beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) / 

1525 ((n-2.0) * (n+5) * (n+7) * (n+9))) 

1526 W2 = -1 + math.sqrt(2 * (beta2 - 1)) 

1527 delta = 1 / math.sqrt(0.5 * math.log(W2)) 

1528 alpha = math.sqrt(2.0 / (W2 - 1)) 

1529 y = np.where(y == 0, 1, y) 

1530 Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1)) 

1531 

1532 return SkewtestResult(Z, 2 * distributions.norm.sf(np.abs(Z))) 

1533 

1534 

1535KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue')) 

1536 

1537 

1538def kurtosistest(a, axis=0, nan_policy='propagate'): 

1539 """ 

1540 Test whether a dataset has normal kurtosis. 

1541 

1542 This function tests the null hypothesis that the kurtosis 

1543 of the population from which the sample was drawn is that 

1544 of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``. 

1545 

1546 Parameters 

1547 ---------- 

1548 a : array 

1549 Array of the sample data. 

1550 axis : int or None, optional 

1551 Axis along which to compute test. Default is 0. If None, 

1552 compute over the whole array `a`. 

1553 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1554 Defines how to handle when input contains nan. 

1555 The following options are available (default is 'propagate'): 

1556 

1557 * 'propagate': returns nan 

1558 * 'raise': throws an error 

1559 * 'omit': performs the calculations ignoring nan values 

1560 

1561 Returns 

1562 ------- 

1563 statistic : float 

1564 The computed z-score for this test. 

1565 pvalue : float 

1566 The two-sided p-value for the hypothesis test. 

1567 

1568 Notes 

1569 ----- 

1570 Valid only for n>20. This function uses the method described in [1]_. 

1571 

1572 References 

1573 ---------- 

1574 .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis 

1575 statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983. 

1576 

1577 Examples 

1578 -------- 

1579 >>> from scipy.stats import kurtosistest 

1580 >>> kurtosistest(list(range(20))) 

1581 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348) 

1582 

1583 >>> np.random.seed(28041990) 

1584 >>> s = np.random.normal(0, 1, 1000) 

1585 >>> kurtosistest(s) 

1586 KurtosistestResult(statistic=1.2317590987707365, pvalue=0.21803908613450895) 

1587 

1588 """ 

1589 a, axis = _chk_asarray(a, axis) 

1590 

1591 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1592 

1593 if contains_nan and nan_policy == 'omit': 

1594 a = ma.masked_invalid(a) 

1595 return mstats_basic.kurtosistest(a, axis) 

1596 

1597 n = a.shape[axis] 

1598 if n < 5: 

1599 raise ValueError( 

1600 "kurtosistest requires at least 5 observations; %i observations" 

1601 " were given." % int(n)) 

1602 if n < 20: 

1603 warnings.warn("kurtosistest only valid for n>=20 ... continuing " 

1604 "anyway, n=%i" % int(n)) 

1605 b2 = kurtosis(a, axis, fisher=False) 

1606 

1607 E = 3.0*(n-1) / (n+1) 

1608 varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) # [1]_ Eq. 1 

1609 x = (b2-E) / np.sqrt(varb2) # [1]_ Eq. 4 

1610 # [1]_ Eq. 2: 

1611 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / 

1612 (n*(n-2)*(n-3))) 

1613 # [1]_ Eq. 3: 

1614 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) 

1615 term1 = 1 - 2/(9.0*A) 

1616 denom = 1 + x*np.sqrt(2/(A-4.0)) 

1617 term2 = np.sign(denom) * np.where(denom == 0.0, np.nan, 

1618 np.power((1-2.0/A)/np.abs(denom), 1/3.0)) 

1619 if np.any(denom == 0): 

1620 msg = "Test statistic not defined in some cases due to division by " \ 

1621 "zero. Return nan in that case..." 

1622 warnings.warn(msg, RuntimeWarning) 

1623 

1624 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) # [1]_ Eq. 5 

1625 if Z.ndim == 0: 

1626 Z = Z[()] 

1627 

1628 # zprob uses upper tail, so Z needs to be positive 

1629 return KurtosistestResult(Z, 2 * distributions.norm.sf(np.abs(Z))) 

1630 

1631 

1632NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue')) 

1633 

1634 

1635def normaltest(a, axis=0, nan_policy='propagate'): 

1636 """ 

1637 Test whether a sample differs from a normal distribution. 

1638 

1639 This function tests the null hypothesis that a sample comes 

1640 from a normal distribution. It is based on D'Agostino and 

1641 Pearson's [1]_, [2]_ test that combines skew and kurtosis to 

1642 produce an omnibus test of normality. 

1643 

1644 Parameters 

1645 ---------- 

1646 a : array_like 

1647 The array containing the sample to be tested. 

1648 axis : int or None, optional 

1649 Axis along which to compute test. Default is 0. If None, 

1650 compute over the whole array `a`. 

1651 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1652 Defines how to handle when input contains nan. 

1653 The following options are available (default is 'propagate'): 

1654 

1655 * 'propagate': returns nan 

1656 * 'raise': throws an error 

1657 * 'omit': performs the calculations ignoring nan values 

1658 

1659 Returns 

1660 ------- 

1661 statistic : float or array 

1662 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and 

1663 ``k`` is the z-score returned by `kurtosistest`. 

1664 pvalue : float or array 

1665 A 2-sided chi squared probability for the hypothesis test. 

1666 

1667 References 

1668 ---------- 

1669 .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for 

1670 moderate and large sample size", Biometrika, 58, 341-348 

1671 

1672 .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from 

1673 normality", Biometrika, 60, 613-622 

1674 

1675 Examples 

1676 -------- 

1677 >>> from scipy import stats 

1678 >>> pts = 1000 

1679 >>> np.random.seed(28041990) 

1680 >>> a = np.random.normal(0, 1, size=pts) 

1681 >>> b = np.random.normal(2, 1, size=pts) 

1682 >>> x = np.concatenate((a, b)) 

1683 >>> k2, p = stats.normaltest(x) 

1684 >>> alpha = 1e-3 

1685 >>> print("p = {:g}".format(p)) 

1686 p = 3.27207e-11 

1687 >>> if p < alpha: # null hypothesis: x comes from a normal distribution 

1688 ... print("The null hypothesis can be rejected") 

1689 ... else: 

1690 ... print("The null hypothesis cannot be rejected") 

1691 The null hypothesis can be rejected 

1692 

1693 """ 

1694 a, axis = _chk_asarray(a, axis) 

1695 

1696 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1697 

1698 if contains_nan and nan_policy == 'omit': 

1699 a = ma.masked_invalid(a) 

1700 return mstats_basic.normaltest(a, axis) 

1701 

1702 s, _ = skewtest(a, axis) 

1703 k, _ = kurtosistest(a, axis) 

1704 k2 = s*s + k*k 

1705 

1706 return NormaltestResult(k2, distributions.chi2.sf(k2, 2)) 

1707 

1708 
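# Illustrative sketch (hypothetical sample `x` with at least 20 observations): the
# omnibus statistic is the sum of the squared z-scores of the two component tests.
#
#     s, _ = skewtest(x)
#     k, _ = kurtosistest(x)
#     s * s + k * k   # equals normaltest(x).statistic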

1709Jarque_beraResult = namedtuple('Jarque_beraResult', ('statistic', 'pvalue')) 

1710 

1711 

1712def jarque_bera(x): 

1713 """ 

1714 Perform the Jarque-Bera goodness of fit test on sample data. 

1715 

1716 The Jarque-Bera test tests whether the sample data has the skewness and 

1717 kurtosis matching a normal distribution. 

1718 

1719 Note that this test only works for a large enough number of data samples 

1720 (>2000) as the test statistic asymptotically has a Chi-squared distribution 

1721 with 2 degrees of freedom. 

1722 

1723 Parameters 

1724 ---------- 

1725 x : array_like 

1726 Observations of a random variable. 

1727 

1728 Returns 

1729 ------- 

1730 jb_value : float 

1731 The test statistic. 

1732 p : float 

1733 The p-value for the hypothesis test. 

1734 

1735 References 

1736 ---------- 

1737 .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, 

1738 homoscedasticity and serial independence of regression residuals", 

1739 Economics Letters, 6, 255-259. 

1740 

1741 Examples 

1742 -------- 

1743 >>> from scipy import stats 

1744 >>> np.random.seed(987654321) 

1745 >>> x = np.random.normal(0, 1, 100000) 

1746 >>> jarque_bera_test = stats.jarque_bera(x) 

1747 >>> jarque_bera_test 

1748 Jarque_beraResult(statistic=4.716570798957913, pvalue=0.0945822550304295) 

1749 >>> jarque_bera_test.statistic 

1750 4.716570798957913 

1751 >>> jarque_bera_test.pvalue 

1752 0.0945822550304295 

1753 

1754 """ 

1755 x = np.asarray(x) 

1756 n = x.size 

1757 if n == 0: 

1758 raise ValueError('At least one observation is required.') 

1759 

1760 mu = x.mean() 

1761 diffx = x - mu 

1762 skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.) 

1763 kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2 

1764 jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4) 

1765 p = 1 - distributions.chi2.cdf(jb_value, 2) 

1766 

1767 return Jarque_beraResult(jb_value, p) 

1768 
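# Illustrative sketch (not part of the original module): the Jarque-Bera statistic
# computed above can be reproduced from the biased sample skewness and the
# (non-excess) kurtosis. The data below are arbitrary.
# >>> from scipy import stats
# >>> rng = np.random.RandomState(0)
# >>> x = rng.normal(size=500)
# >>> S = stats.skew(x)                    # m3 / m2**1.5, bias uncorrected
# >>> K = stats.kurtosis(x, fisher=False)  # m4 / m2**2, not excess kurtosis
# >>> jb = x.size / 6 * (S**2 + (K - 3)**2 / 4)
# >>> bool(np.isclose(jb, stats.jarque_bera(x).statistic))
# True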

1769 

1770##################################### 

1771# FREQUENCY FUNCTIONS # 

1772##################################### 

1773 

1774# deindent to work around numpy/gh-16202 

1775@np.deprecate( 

1776 message="`itemfreq` is deprecated and will be removed in a " 

1777 "future version. Use instead `np.unique(..., return_counts=True)`") 

1778def itemfreq(a): 

1779 """ 

1780Return a 2-D array of item frequencies. 

1781 

1782Parameters 

1783---------- 

1784a : (N,) array_like 

1785 Input array. 

1786 

1787Returns 

1788------- 

1789itemfreq : (K, 2) ndarray 

1790 A 2-D frequency table. Column 1 contains sorted, unique values from 

1791 `a`, column 2 contains their respective counts. 

1792 

1793Examples 

1794-------- 

1795>>> from scipy import stats 

1796>>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4]) 

1797>>> stats.itemfreq(a) 

1798array([[ 0., 2.], 

1799 [ 1., 4.], 

1800 [ 2., 2.], 

1801 [ 4., 1.], 

1802 [ 5., 1.]]) 

1803>>> np.bincount(a) 

1804array([2, 4, 2, 0, 1, 1]) 

1805 

1806>>> stats.itemfreq(a/10.) 

1807array([[ 0. , 2. ], 

1808 [ 0.1, 4. ], 

1809 [ 0.2, 2. ], 

1810 [ 0.4, 1. ], 

1811 [ 0.5, 1. ]]) 

1812""" 

1813 items, inv = np.unique(a, return_inverse=True) 

1814 freq = np.bincount(inv) 

1815 return np.array([items, freq]).T 

1816 

1817 

1818def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', 

1819 axis=None): 

1820 """ 

1821 Calculate the score at a given percentile of the input sequence. 

1822 

1823 For example, the score at `per=50` is the median. If the desired quantile 

1824 lies between two data points, we interpolate between them, according to 

1825 the value of `interpolation`. If the parameter `limit` is provided, it 

1826 should be a tuple (lower, upper) of two values. 

1827 

1828 Parameters 

1829 ---------- 

1830 a : array_like 

1831 A 1-D array of values from which to extract score. 

1832 per : array_like 

1833 Percentile(s) at which to extract score. Values should be in range 

1834 [0,100]. 

1835 limit : tuple, optional 

1836 Tuple of two scalars, the lower and upper limits within which to 

1837 compute the percentile. Values of `a` outside 

1838 this (closed) interval will be ignored. 

1839 interpolation_method : {'fraction', 'lower', 'higher'}, optional 

1840 Specifies the interpolation method to use, 

1841 when the desired quantile lies between two data points `i` and `j` 

1842 The following options are available (default is 'fraction'): 

1843 

1844 * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the 

1845 fractional part of the index surrounded by ``i`` and ``j`` 

1846 * 'lower': ``i`` 

1847 * 'higher': ``j`` 

1848 

1849 axis : int, optional 

1850 Axis along which the percentiles are computed. Default is None. If 

1851 None, compute over the whole array `a`. 

1852 

1853 Returns 

1854 ------- 

1855 score : float or ndarray 

1856 Score at percentile(s). 

1857 

1858 See Also 

1859 -------- 

1860 percentileofscore, numpy.percentile 

1861 

1862 Notes 

1863 ----- 

1864 This function will become obsolete in the future. 

1865 For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality 

1866 that `scoreatpercentile` provides, and it is significantly faster. 

1867 Therefore, users with numpy >= 1.9 are recommended to use 

1868 `numpy.percentile` instead. 

1869 

1870 Examples 

1871 -------- 

1872 >>> from scipy import stats 

1873 >>> a = np.arange(100) 

1874 >>> stats.scoreatpercentile(a, 50) 

1875 49.5 

1876 

1877 """ 

1878 # adapted from NumPy's percentile function. When we require numpy >= 1.8, 

1879 # the implementation of this function can be replaced by np.percentile. 

1880 a = np.asarray(a) 

1881 if a.size == 0: 

1882 # empty array, return nan(s) with shape matching `per` 

1883 if np.isscalar(per): 

1884 return np.nan 

1885 else: 

1886 return np.full(np.asarray(per).shape, np.nan, dtype=np.float64) 

1887 

1888 if limit: 

1889 a = a[(limit[0] <= a) & (a <= limit[1])] 

1890 

1891 sorted_ = np.sort(a, axis=axis) 

1892 if axis is None: 

1893 axis = 0 

1894 

1895 return _compute_qth_percentile(sorted_, per, interpolation_method, axis) 

1896 
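# Illustrative sketch (not part of the original module): how the documented
# `interpolation_method` options behave when the requested percentile falls
# between two data points. The data below are arbitrary.
# >>> from scipy import stats
# >>> a = [1, 2, 3, 4]
# >>> stats.scoreatpercentile(a, 50)                                 # 'fraction'
# 2.5
# >>> stats.scoreatpercentile(a, 50, interpolation_method='lower')
# 2.0
# >>> stats.scoreatpercentile(a, 50, interpolation_method='higher')
# 3.0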

1897 

1898# handle sequence of per's without calling sort multiple times 

1899def _compute_qth_percentile(sorted_, per, interpolation_method, axis): 

1900 if not np.isscalar(per): 

1901 score = [_compute_qth_percentile(sorted_, i, 

1902 interpolation_method, axis) 

1903 for i in per] 

1904 return np.array(score) 

1905 

1906 if not (0 <= per <= 100): 

1907 raise ValueError("percentile must be in the range [0, 100]") 

1908 

1909 indexer = [slice(None)] * sorted_.ndim 

1910 idx = per / 100. * (sorted_.shape[axis] - 1) 

1911 

1912 if int(idx) != idx: 

1913 # round fractional indices according to interpolation method 

1914 if interpolation_method == 'lower': 

1915 idx = int(np.floor(idx)) 

1916 elif interpolation_method == 'higher': 

1917 idx = int(np.ceil(idx)) 

1918 elif interpolation_method == 'fraction': 

1919 pass # keep idx as fraction and interpolate 

1920 else: 

1921 raise ValueError("interpolation_method can only be 'fraction', " 

1922 "'lower' or 'higher'") 

1923 

1924 i = int(idx) 

1925 if i == idx: 

1926 indexer[axis] = slice(i, i + 1) 

1927 weights = array(1) 

1928 sumval = 1.0 

1929 else: 

1930 indexer[axis] = slice(i, i + 2) 

1931 j = i + 1 

1932 weights = array([(j - idx), (idx - i)], float) 

1933 wshape = [1] * sorted_.ndim 

1934 wshape[axis] = 2 

1935 weights.shape = wshape 

1936 sumval = weights.sum() 

1937 

1938 # Use np.add.reduce (== np.sum but a little faster) to coerce data type 

1939 return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval 

1940 

1941 

1942def percentileofscore(a, score, kind='rank'): 

1943 """ 

1944 Compute the percentile rank of a score relative to a list of scores. 

1945 

1946 A `percentileofscore` of, for example, 80% means that 80% of the 

1947 scores in `a` are below the given score. In the case of gaps or 

1948 ties, the exact definition depends on the optional keyword, `kind`. 

1949 

1950 Parameters 

1951 ---------- 

1952 a : array_like 

1953 Array of scores to which `score` is compared. 

1954 score : int or float 

1955 Score that is compared to the elements in `a`. 

1956 kind : {'rank', 'weak', 'strict', 'mean'}, optional 

1957 Specifies the interpretation of the resulting score. 

1958 The following options are available (default is 'rank'): 

1959 

1960 * 'rank': Average percentage ranking of score. In case of multiple 

1961 matches, average the percentage rankings of all matching scores. 

1962 * 'weak': This kind corresponds to the definition of a cumulative 

1963 distribution function. A percentileofscore of 80% means that 80% 

1964 of values are less than or equal to the provided score. 

1965 * 'strict': Similar to "weak", except that only values that are 

1966 strictly less than the given score are counted. 

1967 * 'mean': The average of the "weak" and "strict" scores, often used 

1968 in testing. See https://en.wikipedia.org/wiki/Percentile_rank 

1969 

1970 Returns 

1971 ------- 

1972 pcos : float 

1973 Percentile-position of score (0-100) relative to `a`. 

1974 

1975 See Also 

1976 -------- 

1977 numpy.percentile 

1978 

1979 Examples 

1980 -------- 

1981 Three-quarters of the given values lie below a given score: 

1982 

1983 >>> from scipy import stats 

1984 >>> stats.percentileofscore([1, 2, 3, 4], 3) 

1985 75.0 

1986 

1987 With multiple matches, note how the scores of the two matches, 0.6 

1988 and 0.8 respectively, are averaged: 

1989 

1990 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3) 

1991 70.0 

1992 

1993 Only 2/5 values are strictly less than 3: 

1994 

1995 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') 

1996 40.0 

1997 

1998 But 4/5 values are less than or equal to 3: 

1999 

2000 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') 

2001 80.0 

2002 

2003 The average between the weak and the strict scores is: 

2004 

2005 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') 

2006 60.0 

2007 

2008 """ 

2009 if np.isnan(score): 

2010 return np.nan 

2011 a = np.asarray(a) 

2012 n = len(a) 

2013 if n == 0: 

2014 return 100.0 

2015 

2016 if kind == 'rank': 

2017 left = np.count_nonzero(a < score) 

2018 right = np.count_nonzero(a <= score) 

2019 pct = (right + left + (1 if right > left else 0)) * 50.0/n 

2020 return pct 

2021 elif kind == 'strict': 

2022 return np.count_nonzero(a < score) / n * 100 

2023 elif kind == 'weak': 

2024 return np.count_nonzero(a <= score) / n * 100 

2025 elif kind == 'mean': 

2026 pct = (np.count_nonzero(a < score) + np.count_nonzero(a <= score)) / n * 50 

2027 return pct 

2028 else: 

2029 raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") 

2030 

2031 

2032HistogramResult = namedtuple('HistogramResult', 

2033 ('count', 'lowerlimit', 'binsize', 'extrapoints')) 

2034 

2035 

2036def _histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False): 

2037 """ 

2038 Create a histogram. 

2039 

2040 Separate the range into several bins and return the number of instances 

2041 in each bin. 

2042 

2043 Parameters 

2044 ---------- 

2045 a : array_like 

2046 Array of scores which will be put into bins. 

2047 numbins : int, optional 

2048 The number of bins to use for the histogram. Default is 10. 

2049 defaultlimits : tuple (lower, upper), optional 

2050 The lower and upper values for the range of the histogram. 

2051 If no value is given, a range slightly larger than the range of the 

2052 values in a is used. Specifically ``(a.min() - s, a.max() + s)``, 

2053 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2054 weights : array_like, optional 

2055 The weights for each value in `a`. Default is None, which gives each 

2056 value a weight of 1.0 

2057 printextras : bool, optional 

2058 If True and there are extra points (i.e. points that fall outside 

2059 the bin limits), a warning is raised reporting how many such points 

2060 there are. Default is False. 

2061 

2062 Returns 

2063 ------- 

2064 count : ndarray 

2065 Number of points (or sum of weights) in each bin. 

2066 lowerlimit : float 

2067 Lowest value of histogram, the lower limit of the first bin. 

2068 binsize : float 

2069 The size of the bins (all bins have the same size). 

2070 extrapoints : int 

2071 The number of points outside the range of the histogram. 

2072 

2073 See Also 

2074 -------- 

2075 numpy.histogram 

2076 

2077 Notes 

2078 ----- 

2079 This histogram is based on numpy's histogram but has a larger range by 

2080 default if `defaultlimits` is not set. 

2081 

2082 """ 

2083 a = np.ravel(a) 

2084 if defaultlimits is None: 

2085 if a.size == 0: 

2086 # handle empty arrays. Undetermined range, so use 0-1. 

2087 defaultlimits = (0, 1) 

2088 else: 

2089 # no range given, so use values in `a` 

2090 data_min = a.min() 

2091 data_max = a.max() 

2092 # Have bins extend past min and max values slightly 

2093 s = (data_max - data_min) / (2. * (numbins - 1.)) 

2094 defaultlimits = (data_min - s, data_max + s) 

2095 

2096 # use numpy's histogram method to compute bins 

2097 hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits, 

2098 weights=weights) 

2099 # hist are not always floats, convert to keep with old output 

2100 hist = np.array(hist, dtype=float) 

2101 # fixed width for bins is assumed, as numpy's histogram gives 

2102 # fixed width bins for int values for 'bins' 

2103 binsize = bin_edges[1] - bin_edges[0] 

2104 # calculate number of extra points 

2105 extrapoints = len([v for v in a 

2106 if defaultlimits[0] > v or v > defaultlimits[1]]) 

2107 if extrapoints > 0 and printextras: 

2108 warnings.warn("Points outside given histogram range = %s" 

2109 % extrapoints) 

2110 

2111 return HistogramResult(hist, defaultlimits[0], binsize, extrapoints) 

2112 
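# Illustrative sketch (not part of the original module): the default limits
# described in the docstring above pad the data range by half an (approximate)
# bin width on each side. Checked here through the public `cumfreq` wrapper with
# arbitrary data.
# >>> from scipy import stats
# >>> res = stats.cumfreq([1, 2, 3, 4, 5], numbins=10)
# >>> s = (5 - 1) / (2. * (10 - 1))
# >>> bool(np.isclose(res.lowerlimit, 1 - s))
# True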

2113 

2114CumfreqResult = namedtuple('CumfreqResult', 

2115 ('cumcount', 'lowerlimit', 'binsize', 

2116 'extrapoints')) 

2117 

2118 

2119def cumfreq(a, numbins=10, defaultreallimits=None, weights=None): 

2120 """ 

2121 Return a cumulative frequency histogram, using the histogram function. 

2122 

2123 A cumulative histogram is a mapping that counts the cumulative number of 

2124 observations in all of the bins up to the specified bin. 

2125 

2126 Parameters 

2127 ---------- 

2128 a : array_like 

2129 Input array. 

2130 numbins : int, optional 

2131 The number of bins to use for the histogram. Default is 10. 

2132 defaultreallimits : tuple (lower, upper), optional 

2133 The lower and upper values for the range of the histogram. 

2134 If no value is given, a range slightly larger than the range of the 

2135 values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``, 

2136 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2137 weights : array_like, optional 

2138 The weights for each value in `a`. Default is None, which gives each 

2139 value a weight of 1.0 

2140 

2141 Returns 

2142 ------- 

2143 cumcount : ndarray 

2144 Binned values of cumulative frequency. 

2145 lowerlimit : float 

2146 Lower real limit 

2147 binsize : float 

2148 Width of each bin. 

2149 extrapoints : int 

2150 Extra points. 

2151 

2152 Examples 

2153 -------- 

2154 >>> import matplotlib.pyplot as plt 

2155 >>> from scipy import stats 

2156 >>> x = [1, 4, 2, 1, 3, 1] 

2157 >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) 

2158 >>> res.cumcount 

2159 array([ 1., 2., 3., 3.]) 

2160 >>> res.extrapoints 

2161 3 

2162 

2163 Create a normal distribution with 1000 random values 

2164 

2165 >>> rng = np.random.RandomState(seed=12345) 

2166 >>> samples = stats.norm.rvs(size=1000, random_state=rng) 

2167 

2168 Calculate cumulative frequencies 

2169 

2170 >>> res = stats.cumfreq(samples, numbins=25) 

2171 

2172 Calculate space of values for x 

2173 

2174 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size, 

2175 ... res.cumcount.size) 

2176 

2177 Plot histogram and cumulative histogram 

2178 

2179 >>> fig = plt.figure(figsize=(10, 4)) 

2180 >>> ax1 = fig.add_subplot(1, 2, 1) 

2181 >>> ax2 = fig.add_subplot(1, 2, 2) 

2182 >>> ax1.hist(samples, bins=25) 

2183 >>> ax1.set_title('Histogram') 

2184 >>> ax2.bar(x, res.cumcount, width=res.binsize) 

2185 >>> ax2.set_title('Cumulative histogram') 

2186 >>> ax2.set_xlim([x.min(), x.max()]) 

2187 

2188 >>> plt.show() 

2189 

2190 """ 

2191 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights) 

2192 cumhist = np.cumsum(h * 1, axis=0) 

2193 return CumfreqResult(cumhist, l, b, e) 

2194 

2195 

2196RelfreqResult = namedtuple('RelfreqResult', 

2197 ('frequency', 'lowerlimit', 'binsize', 

2198 'extrapoints')) 

2199 

2200 

2201def relfreq(a, numbins=10, defaultreallimits=None, weights=None): 

2202 """ 

2203 Return a relative frequency histogram, using the histogram function. 

2204 

2205 A relative frequency histogram is a mapping of the number of 

2206 observations in each of the bins relative to the total number of observations. 

2207 

2208 Parameters 

2209 ---------- 

2210 a : array_like 

2211 Input array. 

2212 numbins : int, optional 

2213 The number of bins to use for the histogram. Default is 10. 

2214 defaultreallimits : tuple (lower, upper), optional 

2215 The lower and upper values for the range of the histogram. 

2216 If no value is given, a range slightly larger than the range of the 

2217 values in a is used. Specifically ``(a.min() - s, a.max() + s)``, 

2218 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2219 weights : array_like, optional 

2220 The weights for each value in `a`. Default is None, which gives each 

2221 value a weight of 1.0 

2222 

2223 Returns 

2224 ------- 

2225 frequency : ndarray 

2226 Binned values of relative frequency. 

2227 lowerlimit : float 

2228 Lower real limit. 

2229 binsize : float 

2230 Width of each bin. 

2231 extrapoints : int 

2232 Extra points. 

2233 

2234 Examples 

2235 -------- 

2236 >>> import matplotlib.pyplot as plt 

2237 >>> from scipy import stats 

2238 >>> a = np.array([2, 4, 1, 2, 3, 2]) 

2239 >>> res = stats.relfreq(a, numbins=4) 

2240 >>> res.frequency 

2241 array([ 0.16666667, 0.5 , 0.16666667, 0.16666667]) 

2242 >>> np.sum(res.frequency) # relative frequencies should add up to 1 

2243 1.0 

2244 

2245 Create a normal distribution with 1000 random values 

2246 

2247 >>> rng = np.random.RandomState(seed=12345) 

2248 >>> samples = stats.norm.rvs(size=1000, random_state=rng) 

2249 

2250 Calculate relative frequencies 

2251 

2252 >>> res = stats.relfreq(samples, numbins=25) 

2253 

2254 Calculate space of values for x 

2255 

2256 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size, 

2257 ... res.frequency.size) 

2258 

2259 Plot relative frequency histogram 

2260 

2261 >>> fig = plt.figure(figsize=(5, 4)) 

2262 >>> ax = fig.add_subplot(1, 1, 1) 

2263 >>> ax.bar(x, res.frequency, width=res.binsize) 

2264 >>> ax.set_title('Relative frequency histogram') 

2265 >>> ax.set_xlim([x.min(), x.max()]) 

2266 

2267 >>> plt.show() 

2268 

2269 """ 

2270 a = np.asanyarray(a) 

2271 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights) 

2272 h = h / a.shape[0] 

2273 

2274 return RelfreqResult(h, l, b, e) 

2275 

2276 

2277##################################### 

2278# VARIABILITY FUNCTIONS # 

2279##################################### 

2280 

2281def obrientransform(*args): 

2282 """ 

2283 Compute the O'Brien transform on input data (any number of arrays). 

2284 

2285 Used to test for homogeneity of variance prior to running one-way stats. 

2286 Each array in ``*args`` is one level of a factor. 

2287 If `f_oneway` is run on the transformed data and found significant, 

2288 the variances are unequal. From Maxwell and Delaney [1]_, p.112. 

2289 

2290 Parameters 

2291 ---------- 

2292 args : tuple of array_like 

2293 Any number of arrays. 

2294 

2295 Returns 

2296 ------- 

2297 obrientransform : ndarray 

2298 Transformed data for use in an ANOVA. The first dimension 

2299 of the result corresponds to the sequence of transformed 

2300 arrays. If the arrays given are all 1-D of the same length, 

2301 the return value is a 2-D array; otherwise it is a 1-D array 

2302 of type object, with each element being an ndarray. 

2303 

2304 References 

2305 ---------- 

2306 .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and 

2307 Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990. 

2308 

2309 Examples 

2310 -------- 

2311 We'll test the following data sets for differences in their variance. 

2312 

2313 >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10] 

2314 >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15] 

2315 

2316 Apply the O'Brien transform to the data. 

2317 

2318 >>> from scipy.stats import obrientransform 

2319 >>> tx, ty = obrientransform(x, y) 

2320 

2321 Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the 

2322 transformed data. 

2323 

2324 >>> from scipy.stats import f_oneway 

2325 >>> F, p = f_oneway(tx, ty) 

2326 >>> p 

2327 0.1314139477040335 

2328 

2329 If we require that ``p < 0.05`` for significance, we cannot conclude 

2330 that the variances are different. 

2331 

2332 """ 

2333 TINY = np.sqrt(np.finfo(float).eps) 

2334 

2335 # `arrays` will hold the transformed arguments. 

2336 arrays = [] 

2337 sLast = None 

2338 

2339 for arg in args: 

2340 a = np.asarray(arg) 

2341 n = len(a) 

2342 mu = np.mean(a) 

2343 sq = (a - mu)**2 

2344 sumsq = sq.sum() 

2345 

2346 # The O'Brien transform. 

2347 t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2)) 

2348 

2349 # Check that the mean of the transformed data is equal to the 

2350 # original variance. 

2351 var = sumsq / (n - 1) 

2352 if abs(var - np.mean(t)) > TINY: 

2353 raise ValueError('Lack of convergence in obrientransform.') 

2354 

2355 arrays.append(t) 

2356 sLast = a.shape 

2357 

2358 if sLast: 

2359 for arr in arrays[:-1]: 

2360 if sLast != arr.shape: 

2361 return np.array(arrays, dtype=object) 

2362 return np.array(arrays) 

2363 
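# Illustrative sketch (not part of the original module): the convergence check in
# the loop above requires the transformed values to average to the sample
# variance, which can be verified directly. The data are arbitrary.
# >>> from scipy.stats import obrientransform
# >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
# >>> tx = obrientransform(x)[0]
# >>> bool(np.isclose(np.mean(tx), np.var(x, ddof=1)))
# True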

2364 

2365def sem(a, axis=0, ddof=1, nan_policy='propagate'): 

2366 """ 

2367 Compute standard error of the mean. 

2368 

2369 Calculate the standard error of the mean (or standard error of 

2370 measurement) of the values in the input array. 

2371 

2372 Parameters 

2373 ---------- 

2374 a : array_like 

2375 An array containing the values for which the standard error is 

2376 returned. 

2377 axis : int or None, optional 

2378 Axis along which to operate. Default is 0. If None, compute over 

2379 the whole array `a`. 

2380 ddof : int, optional 

2381 Delta degrees-of-freedom. How many degrees of freedom to adjust 

2382 for bias in limited samples relative to the population estimate 

2383 of variance. Defaults to 1. 

2384 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2385 Defines how to handle when input contains nan. 

2386 The following options are available (default is 'propagate'): 

2387 

2388 * 'propagate': returns nan 

2389 * 'raise': throws an error 

2390 * 'omit': performs the calculations ignoring nan values 

2391 

2392 Returns 

2393 ------- 

2394 s : ndarray or float 

2395 The standard error of the mean in the sample(s), along the input axis. 

2396 

2397 Notes 

2398 ----- 

2399 The default value for `ddof` is different from the default (0) used by other 

2400 ddof-containing routines, such as np.std and np.nanstd. 

2401 

2402 Examples 

2403 -------- 

2404 Find standard error along the first axis: 

2405 

2406 >>> from scipy import stats 

2407 >>> a = np.arange(20).reshape(5,4) 

2408 >>> stats.sem(a) 

2409 array([ 2.8284, 2.8284, 2.8284, 2.8284]) 

2410 

2411 Find standard error across the whole array, using n degrees of freedom: 

2412 

2413 >>> stats.sem(a, axis=None, ddof=0) 

2414 1.2893796958227628 

2415 

2416 """ 

2417 a, axis = _chk_asarray(a, axis) 

2418 

2419 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

2420 

2421 if contains_nan and nan_policy == 'omit': 

2422 a = ma.masked_invalid(a) 

2423 return mstats_basic.sem(a, axis, ddof) 

2424 

2425 n = a.shape[axis] 

2426 s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n) 

2427 return s 

2428 
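# Illustrative sketch (not part of the original module): `sem` is the
# ddof-corrected standard deviation divided by sqrt(n) along the chosen axis,
# as the last two lines of the function show. The data are arbitrary.
# >>> from scipy import stats
# >>> a = np.arange(20).reshape(5, 4)
# >>> bool(np.allclose(stats.sem(a), np.std(a, axis=0, ddof=1) / np.sqrt(5)))
# True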

2429 

2430def zscore(a, axis=0, ddof=0, nan_policy='propagate'): 

2431 """ 

2432 Compute the z score. 

2433 

2434 Compute the z score of each value in the sample, relative to the 

2435 sample mean and standard deviation. 

2436 

2437 Parameters 

2438 ---------- 

2439 a : array_like 

2440 An array like object containing the sample data. 

2441 axis : int or None, optional 

2442 Axis along which to operate. Default is 0. If None, compute over 

2443 the whole array `a`. 

2444 ddof : int, optional 

2445 Degrees of freedom correction in the calculation of the 

2446 standard deviation. Default is 0. 

2447 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2448 Defines how to handle when input contains nan. 'propagate' returns nan, 

2449 'raise' throws an error, 'omit' performs the calculations ignoring nan 

2450 values. Default is 'propagate'. 

2451 

2452 Returns 

2453 ------- 

2454 zscore : array_like 

2455 The z-scores, standardized by mean and standard deviation of 

2456 input array `a`. 

2457 

2458 Notes 

2459 ----- 

2460 This function preserves ndarray subclasses, and works also with 

2461 matrices and masked arrays (it uses `asanyarray` instead of 

2462 `asarray` for parameters). 

2463 

2464 Examples 

2465 -------- 

2466 >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 

2467 ... 0.1954, 0.6307, 0.6599, 0.1065, 0.0508]) 

2468 >>> from scipy import stats 

2469 >>> stats.zscore(a) 

2470 array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, 

2471 0.6748, -1.1488, -1.3324]) 

2472 

2473 Computing along a specified axis, using n-1 degrees of freedom 

2474 (``ddof=1``) to calculate the standard deviation: 

2475 

2476 >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], 

2477 ... [ 0.7149, 0.0775, 0.6072, 0.9656], 

2478 ... [ 0.6341, 0.1403, 0.9759, 0.4064], 

2479 ... [ 0.5918, 0.6948, 0.904 , 0.3721], 

2480 ... [ 0.0921, 0.2481, 0.1188, 0.1366]]) 

2481 >>> stats.zscore(b, axis=1, ddof=1) 

2482 array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], 

2483 [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], 

2484 [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], 

2485 [-0.22095197, 0.24468594, 1.19042819, -1.21416216], 

2486 [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) 

2487 

2488 """ 

2489 a = np.asanyarray(a) 

2490 

2491 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

2492 

2493 if contains_nan and nan_policy == 'omit': 

2494 mns = np.nanmean(a=a, axis=axis, keepdims=True) 

2495 sstd = np.nanstd(a=a, axis=axis, ddof=ddof, keepdims=True) 

2496 else: 

2497 mns = a.mean(axis=axis, keepdims=True) 

2498 sstd = a.std(axis=axis, ddof=ddof, keepdims=True) 

2499 

2500 return (a - mns) / sstd 

2501 
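# Illustrative sketch (not part of the original module): `zscore` standardizes a
# sample against its own mean and standard deviation, so it agrees with `zmap`
# when the sample is compared against itself. The data are arbitrary.
# >>> from scipy import stats
# >>> a = np.array([0.7, 0.1, 0.4, 0.8])
# >>> bool(np.allclose(stats.zscore(a), stats.zmap(a, a)))
# True
# >>> bool(np.allclose(stats.zscore(a), (a - a.mean()) / a.std()))
# True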

2502 

2503def zmap(scores, compare, axis=0, ddof=0): 

2504 """ 

2505 Calculate the relative z-scores. 

2506 

2507 Return an array of z-scores, i.e., scores that are standardized to 

2508 zero mean and unit variance, where mean and variance are calculated 

2509 from the comparison array. 

2510 

2511 Parameters 

2512 ---------- 

2513 scores : array_like 

2514 The input for which z-scores are calculated. 

2515 compare : array_like 

2516 The input from which the mean and standard deviation of the 

2517 normalization are taken; assumed to have the same dimension as 

2518 `scores`. 

2519 axis : int or None, optional 

2520 Axis over which mean and variance of `compare` are calculated. 

2521 Default is 0. If None, compute over the whole array `scores`. 

2522 ddof : int, optional 

2523 Degrees of freedom correction in the calculation of the 

2524 standard deviation. Default is 0. 

2525 

2526 Returns 

2527 ------- 

2528 zscore : array_like 

2529 Z-scores, in the same shape as `scores`. 

2530 

2531 Notes 

2532 ----- 

2533 This function preserves ndarray subclasses, and works also with 

2534 matrices and masked arrays (it uses `asanyarray` instead of 

2535 `asarray` for parameters). 

2536 

2537 Examples 

2538 -------- 

2539 >>> from scipy.stats import zmap 

2540 >>> a = [0.5, 2.0, 2.5, 3] 

2541 >>> b = [0, 1, 2, 3, 4] 

2542 >>> zmap(a, b) 

2543 array([-1.06066017, 0. , 0.35355339, 0.70710678]) 

2544 

2545 """ 

2546 scores, compare = map(np.asanyarray, [scores, compare]) 

2547 mns = compare.mean(axis=axis, keepdims=True) 

2548 sstd = compare.std(axis=axis, ddof=ddof, keepdims=True) 

2549 return (scores - mns) / sstd 

2550 

2551 

2552def gstd(a, axis=0, ddof=1): 

2553 """ 

2554 Calculate the geometric standard deviation of an array. 

2555 

2556 The geometric standard deviation describes the spread of a set of numbers 

2557 where the geometric mean is preferred. It is a multiplicative factor, and 

2558 so a dimensionless quantity. 

2559 

2560 It is defined as the exponent of the standard deviation of ``log(a)``. 

2561 Mathematically the population geometric standard deviation can be 

2562 evaluated as:: 

2563 

2564 gstd = exp(std(log(a))) 

2565 

2566 .. versionadded:: 1.3.0 

2567 

2568 Parameters 

2569 ---------- 

2570 a : array_like 

2571 An array like object containing the sample data. 

2572 axis : int, tuple or None, optional 

2573 Axis along which to operate. Default is 0. If None, compute over 

2574 the whole array `a`. 

2575 ddof : int, optional 

2576 Degree of freedom correction in the calculation of the 

2577 geometric standard deviation. Default is 1. 

2578 

2579 Returns 

2580 ------- 

2581 ndarray or float 

2582 An array of the geometric standard deviation. If `axis` is None or `a` 

2583 is a 1d array a float is returned. 

2584 

2585 Notes 

2586 ----- 

2587 As the calculation requires the use of logarithms, the geometric standard 

2588 deviation only supports strictly positive values. Any non-positive or 

2589 infinite values will raise a `ValueError`. 

2590 The geometric standard deviation is sometimes confused with the exponent of 

2591 the standard deviation, ``exp(std(a))``. Instead the geometric standard 

2592 deviation is ``exp(std(log(a)))``. 

2593 The default value for `ddof` is different from the default value (0) used 

2594 by other ddof-containing functions, such as ``np.std`` and ``np.nanstd``. 

2595 

2596 Examples 

2597 -------- 

2598 Find the geometric standard deviation of a log-normally distributed sample. 

2599 Note that the standard deviation of the underlying distribution is one on a 

2600 log scale, so the geometric standard deviation is approximately ``exp(1)``. 

2601 

2602 >>> from scipy.stats import gstd 

2603 >>> np.random.seed(123) 

2604 >>> sample = np.random.lognormal(mean=0, sigma=1, size=1000) 

2605 >>> gstd(sample) 

2606 2.7217860664589946 

2607 

2608 Compute the geometric standard deviation of a multidimensional array and 

2609 of a given axis. 

2610 

2611 >>> a = np.arange(1, 25).reshape(2, 3, 4) 

2612 >>> gstd(a, axis=None) 

2613 2.2944076136018947 

2614 >>> gstd(a, axis=2) 

2615 array([[1.82424757, 1.22436866, 1.13183117], 

2616 [1.09348306, 1.07244798, 1.05914985]]) 

2617 >>> gstd(a, axis=(1,2)) 

2618 array([2.12939215, 1.22120169]) 

2619 

2620 The geometric standard deviation further handles masked arrays. 

2621 

2622 >>> a = np.arange(1, 25).reshape(2, 3, 4) 

2623 >>> ma = np.ma.masked_where(a > 16, a) 

2624 >>> ma 

2625 masked_array( 

2626 data=[[[1, 2, 3, 4], 

2627 [5, 6, 7, 8], 

2628 [9, 10, 11, 12]], 

2629 [[13, 14, 15, 16], 

2630 [--, --, --, --], 

2631 [--, --, --, --]]], 

2632 mask=[[[False, False, False, False], 

2633 [False, False, False, False], 

2634 [False, False, False, False]], 

2635 [[False, False, False, False], 

2636 [ True, True, True, True], 

2637 [ True, True, True, True]]], 

2638 fill_value=999999) 

2639 >>> gstd(ma, axis=2) 

2640 masked_array( 

2641 data=[[1.8242475707663655, 1.2243686572447428, 1.1318311657788478], 

2642 [1.0934830582350938, --, --]], 

2643 mask=[[False, False, False], 

2644 [False, True, True]], 

2645 fill_value=999999) 

2646 

2647 """ 

2648 a = np.asanyarray(a) 

2649 log = ma.log if isinstance(a, ma.MaskedArray) else np.log 

2650 

2651 try: 

2652 with warnings.catch_warnings(): 

2653 warnings.simplefilter("error", RuntimeWarning) 

2654 return np.exp(np.std(log(a), axis=axis, ddof=ddof)) 

2655 except RuntimeWarning as w: 

2656 if np.isinf(a).any(): 

2657 raise ValueError( 

2658 'Infinite value encountered. The geometric standard deviation ' 

2659 'is defined for strictly positive values only.') 

2660 a_nan = np.isnan(a) 

2661 a_nan_any = a_nan.any() 

2662 # exclude NaN's from negativity check, but 

2663 # avoid expensive masking for arrays with no NaN 

2664 if ((a_nan_any and np.less_equal(np.nanmin(a), 0)) or 

2665 (not a_nan_any and np.less_equal(a, 0).any())): 

2666 raise ValueError( 

2667 'Non positive value encountered. The geometric standard ' 

2668 'deviation is defined for strictly positive values only.') 

2669 elif 'Degrees of freedom <= 0 for slice' == str(w): 

2670 raise ValueError(w) 

2671 else: 

2672 # Remaining warnings don't need to be exceptions. 

2673 return np.exp(np.std(log(a, where=~a_nan), axis=axis, ddof=ddof)) 

2674 except TypeError: 

2675 raise ValueError( 

2676 'Invalid array input. The inputs could not be ' 

2677 'safely coerced to any supported types') 

2678 
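# Illustrative sketch (not part of the original module): as defined in the
# docstring above, `gstd` is the exponential of the (ddof=1) standard deviation
# of the log of the data. The data are arbitrary.
# >>> from scipy.stats import gstd
# >>> a = np.array([1., 2., 4., 8.])
# >>> bool(np.isclose(gstd(a), np.exp(np.std(np.log(a), ddof=1))))
# True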

2679 

2680# Private dictionary initialized only once at module level 

2681# See https://en.wikipedia.org/wiki/Robust_measures_of_scale 

2682_scale_conversions = {'raw': 1.0, 

2683 'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)} 

2684 

2685 

2686def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate', 

2687 interpolation='linear', keepdims=False): 

2688 r""" 

2689 Compute the interquartile range of the data along the specified axis. 

2690 

2691 The interquartile range (IQR) is the difference between the 75th and 

2692 25th percentile of the data. It is a measure of the dispersion 

2693 similar to standard deviation or variance, but is much more robust 

2694 against outliers [2]_. 

2695 

2696 The ``rng`` parameter allows this function to compute other 

2697 percentile ranges than the actual IQR. For example, setting 

2698 ``rng=(0, 100)`` is equivalent to `numpy.ptp`. 

2699 

2700 The IQR of an empty array is `np.nan`. 

2701 

2702 .. versionadded:: 0.18.0 

2703 

2704 Parameters 

2705 ---------- 

2706 x : array_like 

2707 Input array or object that can be converted to an array. 

2708 axis : int or sequence of int, optional 

2709 Axis along which the range is computed. The default is to 

2710 compute the IQR for the entire array. 

2711 rng : Two-element sequence containing floats in range of [0,100], optional 

2712 Percentiles over which to compute the range. Each must be 

2713 between 0 and 100, inclusive. The default is the true IQR: 

2714 `(25, 75)`. The order of the elements is not important. 

2715 scale : scalar or str, optional 

2716 The numerical value of scale will be divided out of the final 

2717 result. The following string values are recognized: 

2718 

2719 * 'raw' : No scaling, just return the raw IQR. 

2720 **Deprecated!** Use `scale=1` instead. 

2721 * 'normal' : Scale by 

2722 :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`. 

2723 

2724 The default is 1.0. The use of scale='raw' is deprecated. 

2725 Array-like scale is also allowed, as long 

2726 as it broadcasts correctly to the output such that 

2727 ``out / scale`` is a valid operation. The output dimensions 

2728 depend on the input array, `x`, the `axis` argument, and the 

2729 `keepdims` flag. 

2730 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2731 Defines how to handle when input contains nan. 

2732 The following options are available (default is 'propagate'): 

2733 

2734 * 'propagate': returns nan 

2735 * 'raise': throws an error 

2736 * 'omit': performs the calculations ignoring nan values 

2737 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional 

2738 Specifies the interpolation method to use when the percentile 

2739 boundaries lie between two data points `i` and `j`. 

2740 The following options are available (default is 'linear'): 

2741 

2742 * 'linear': `i + (j - i) * fraction`, where `fraction` is the 

2743 fractional part of the index surrounded by `i` and `j`. 

2744 * 'lower': `i`. 

2745 * 'higher': `j`. 

2746 * 'nearest': `i` or `j` whichever is nearest. 

2747 * 'midpoint': `(i + j) / 2`. 

2748 

2749 keepdims : bool, optional 

2750 If this is set to `True`, the reduced axes are left in the 

2751 result as dimensions with size one. With this option, the result 

2752 will broadcast correctly against the original array `x`. 

2753 

2754 Returns 

2755 ------- 

2756 iqr : scalar or ndarray 

2757 If ``axis=None``, a scalar is returned. If the input contains 

2758 integers or floats of smaller precision than ``np.float64``, then the 

2759 output data-type is ``np.float64``. Otherwise, the output data-type is 

2760 the same as that of the input. 

2761 

2762 See Also 

2763 -------- 

2764 numpy.std, numpy.var 

2765 

2766 Notes 

2767 ----- 

2768 This function is heavily dependent on the version of `numpy` that is 

2769 installed. Versions greater than 1.11.0b3 are highly recommended, as they 

2770 include a number of enhancements and fixes to `numpy.percentile` and 

2771 `numpy.nanpercentile` that affect the operation of this function. The 

2772 following modifications apply: 

2773 

2774 Below 1.10.0 : `nan_policy` is poorly defined. 

2775 The default behavior of `numpy.percentile` is used for 'propagate'. This 

2776 is a hybrid of 'omit' and 'propagate' that mostly yields a skewed 

2777 version of 'omit' since NaNs are sorted to the end of the data. A 

2778 warning is raised if there are NaNs in the data. 

2779 Below 1.9.0: `numpy.nanpercentile` does not exist. 

2780 This means that `numpy.percentile` is used regardless of `nan_policy` 

2781 and a warning is issued. See previous item for a description of the 

2782 behavior. 

2783 Below 1.9.0: `keepdims` and `interpolation` are not supported. 

2784 The keywords get ignored with a warning if supplied with non-default 

2785 values. However, multiple axes are still supported. 

2786 

2787 References 

2788 ---------- 

2789 .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range 

2790 .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale 

2791 .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile 

2792 

2793 Examples 

2794 -------- 

2795 >>> from scipy.stats import iqr 

2796 >>> x = np.array([[10, 7, 4], [3, 2, 1]]) 

2797 >>> x 

2798 array([[10, 7, 4], 

2799 [ 3, 2, 1]]) 

2800 >>> iqr(x) 

2801 4.0 

2802 >>> iqr(x, axis=0) 

2803 array([ 3.5, 2.5, 1.5]) 

2804 >>> iqr(x, axis=1) 

2805 array([ 3., 1.]) 

2806 >>> iqr(x, axis=1, keepdims=True) 

2807 array([[ 3.], 

2808 [ 1.]]) 

2809 

2810 """ 

2811 x = asarray(x) 

2812 

2813 # This check prevents percentile from raising an error later. Also, it is 

2814 # consistent with `np.var` and `np.std`. 

2815 if not x.size: 

2816 return np.nan 

2817 

2818 # An error may be raised here, so fail-fast, before doing lengthy 

2819 # computations, even though `scale` is not used until later 

2820 if isinstance(scale, str): 

2821 scale_key = scale.lower() 

2822 if scale_key not in _scale_conversions: 

2823 raise ValueError("{0} not a valid scale for `iqr`".format(scale)) 

2824 if scale_key == 'raw': 

2825 warnings.warn( 

2826 "use of scale='raw' is deprecated, use scale=1.0 instead", 

2827 np.VisibleDeprecationWarning 

2828 ) 

2829 scale = _scale_conversions[scale_key] 

2830 

2831 # Select the percentile function to use based on nans and policy 

2832 contains_nan, nan_policy = _contains_nan(x, nan_policy) 

2833 

2834 if contains_nan and nan_policy == 'omit': 

2835 percentile_func = np.nanpercentile 

2836 else: 

2837 percentile_func = np.percentile 

2838 

2839 if len(rng) != 2: 

2840 raise TypeError("quantile range must be two element sequence") 

2841 

2842 if np.isnan(rng).any(): 

2843 raise ValueError("range must not contain NaNs") 

2844 

2845 rng = sorted(rng) 

2846 pct = percentile_func(x, rng, axis=axis, interpolation=interpolation, 

2847 keepdims=keepdims) 

2848 out = np.subtract(pct[1], pct[0]) 

2849 

2850 if scale != 1.0: 

2851 out /= scale 

2852 

2853 return out 

2854 
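# Illustrative sketch (not part of the original module): with scale='normal' the
# raw IQR is divided by 2*sqrt(2)*erfinv(1/2) ~= 1.349, the constant stored in
# `_scale_conversions` above. The data are arbitrary.
# >>> from scipy import stats, special
# >>> x = np.array([[10., 7., 4.], [3., 2., 1.]])
# >>> c = 2 * np.sqrt(2) * special.erfinv(0.5)
# >>> bool(np.isclose(stats.iqr(x, scale='normal'), stats.iqr(x) / c))
# True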

2855 

2856def _mad_1d(x, center, nan_policy): 

2857 # Median absolute deviation for 1-d array x. 

2858 # This is a helper function for `median_abs_deviation`; it assumes its 

2859 # arguments have been validated already. In particular, x must be a 

2860 # 1-d numpy array, center must be callable, and if nan_policy is not 

2861 # 'propagate', it is assumed to be 'omit', because 'raise' is handled 

2862 # in `median_abs_deviation`. 

2863 # No warning is generated if x is empty or all nan. 

2864 isnan = np.isnan(x) 

2865 if isnan.any(): 

2866 if nan_policy == 'propagate': 

2867 return np.nan 

2868 x = x[~isnan] 

2869 if x.size == 0: 

2870 # MAD of an empty array is nan. 

2871 return np.nan 

2872 # Edge cases have been handled, so do the basic MAD calculation. 

2873 med = center(x) 

2874 mad = np.median(np.abs(x - med)) 

2875 return mad 

2876 

2877 

2878def median_abs_deviation(x, axis=0, center=np.median, scale=1.0, 

2879 nan_policy='propagate'): 

2880 r""" 

2881 Compute the median absolute deviation of the data along the given axis. 

2882 

2883 The median absolute deviation (MAD, [1]_) computes the median over the 

2884 absolute deviations from the median. It is a measure of dispersion 

2885 similar to the standard deviation but more robust to outliers [2]_. 

2886 

2887 The MAD of an empty array is ``np.nan``. 

2888 

2889 .. versionadded:: 1.5.0 

2890 

2891 Parameters 

2892 ---------- 

2893 x : array_like 

2894 Input array or object that can be converted to an array. 

2895 axis : int or None, optional 

2896 Axis along which the range is computed. Default is 0. If None, compute 

2897 the MAD over the entire array. 

2898 center : callable, optional 

2899 A function that will return the central value. The default is to use 

2900 np.median. Any user defined function used will need to have the 

2901 function signature ``func(arr, axis)``. 

2902 scale : scalar or str, optional 

2903 The numerical value of scale will be divided out of the final 

2904 result. The default is 1.0. The string "normal" is also accepted, 

2905 and results in `scale` being the inverse of the standard normal 

2906 quantile function at 0.75, which is approximately 0.67449. 

2907 Array-like scale is also allowed, as long as it broadcasts correctly 

2908 to the output such that ``out / scale`` is a valid operation. The 

2909 output dimensions depend on the input array, `x`, and the `axis` 

2910 argument. 

2911 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2912 Defines how to handle when input contains nan. 

2913 The following options are available (default is 'propagate'): 

2914 

2915 * 'propagate': returns nan 

2916 * 'raise': throws an error 

2917 * 'omit': performs the calculations ignoring nan values 

2918 

2919 Returns 

2920 ------- 

2921 mad : scalar or ndarray 

2922 If ``axis=None``, a scalar is returned. If the input contains 

2923 integers or floats of smaller precision than ``np.float64``, then the 

2924 output data-type is ``np.float64``. Otherwise, the output data-type is 

2925 the same as that of the input. 

2926 

2927 See Also 

2928 -------- 

2929 numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean, 

2930 scipy.stats.tstd, scipy.stats.tvar 

2931 

2932 Notes 

2933 ----- 

2934 The `center` argument only affects the calculation of the central value 

2935 around which the MAD is calculated. That is, passing in ``center=np.mean`` 

2936 will calculate the MAD around the mean - it will not calculate the *mean* 

2937 absolute deviation. 

2938 

2939 The input array may contain `inf`, but if `center` returns `inf`, the 

2940 corresponding MAD for that data will be `nan`. 

2941 

2942 References 

2943 ---------- 

2944 .. [1] "Median absolute deviation", 

2945 https://en.wikipedia.org/wiki/Median_absolute_deviation 

2946 .. [2] "Robust measures of scale", 

2947 https://en.wikipedia.org/wiki/Robust_measures_of_scale 

2948 

2949 Examples 

2950 -------- 

2951 When comparing the behavior of `median_abs_deviation` with ``np.std``, 

2952 the latter is strongly affected when a single value of the array is changed 

2953 to an outlier, while the MAD hardly changes: 

2954 

2955 >>> from scipy import stats 

2956 >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456) 

2957 >>> x.std() 

2958 0.9973906394005013 

2959 >>> stats.median_abs_deviation(x) 

2960 0.82832610097857 

2961 >>> x[0] = 345.6 

2962 >>> x.std() 

2963 34.42304872314415 

2964 >>> stats.median_abs_deviation(x) 

2965 0.8323442311590675 

2966 

2967 Axis handling example: 

2968 

2969 >>> x = np.array([[10, 7, 4], [3, 2, 1]]) 

2970 >>> x 

2971 array([[10, 7, 4], 

2972 [ 3, 2, 1]]) 

2973 >>> stats.median_abs_deviation(x) 

2974 array([3.5, 2.5, 1.5]) 

2975 >>> stats.median_abs_deviation(x, axis=None) 

2976 2.0 

2977 

2978 Scale normal example: 

2979 

2980 >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456) 

2981 >>> stats.median_abs_deviation(x) 

2982 1.3487398527041636 

2983 >>> stats.median_abs_deviation(x, scale='normal') 

2984 1.9996446978061115 

2985 

2986 """ 

2987 if not callable(center): 

2988 raise TypeError("The argument 'center' must be callable. The given " 

2989 f"value {repr(center)} is not callable.") 

2990 

2991 # An error may be raised here, so fail-fast, before doing lengthy 

2992 # computations, even though `scale` is not used until later 

2993 if isinstance(scale, str): 

2994 if scale.lower() == 'normal': 

2995 scale = 0.6744897501960817 # special.ndtri(0.75) 

2996 else: 

2997 raise ValueError(f"{scale} is not a valid scale value.") 

2998 

2999 x = asarray(x) 

3000 

3001 # Consistent with `np.var` and `np.std`. 

3002 if not x.size: 

3003 if axis is None: 

3004 return np.nan 

3005 nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis) 

3006 if nan_shape == (): 

3007 # Return nan, not array(nan) 

3008 return np.nan 

3009 return np.full(nan_shape, np.nan) 

3010 

3011 contains_nan, nan_policy = _contains_nan(x, nan_policy) 

3012 

3013 if contains_nan: 

3014 if axis is None: 

3015 mad = _mad_1d(x.ravel(), center, nan_policy) 

3016 else: 

3017 mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy) 

3018 else: 

3019 if axis is None: 

3020 med = center(x, axis=None) 

3021 mad = np.median(np.abs(x - med)) 

3022 else: 

3023 # Wrap the call to center() in expand_dims() so it acts like 

3024 # keepdims=True was used. 

3025 med = np.expand_dims(center(x, axis=axis), axis) 

3026 mad = np.median(np.abs(x - med), axis=axis) 

3027 

3028 return mad / scale 

3029 
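# Illustrative sketch (not part of the original module): with the default
# scale=1.0, the value returned above is simply the median of the absolute
# deviations from the median. The data are arbitrary.
# >>> from scipy import stats
# >>> x = np.array([1., 2., 3., 5., 8.])
# >>> mad = np.median(np.abs(x - np.median(x)))
# >>> bool(np.isclose(stats.median_abs_deviation(x), mad))
# True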

3030 

3031# Keep the top newline so that the message does not show up on the stats page 

3032_median_absolute_deviation_deprec_msg = """ 

3033To preserve the existing default behavior, use 

3034`scipy.stats.median_abs_deviation(..., scale=1/1.4826)`. 

3035The value 1.4826 is not numerically precise for scaling 

3036with a normal distribution. For a numerically precise value, use 

3037`scipy.stats.median_abs_deviation(..., scale='normal')`. 

3038""" 

3039 

3040 

3041# Due to numpy/gh-16349 we need to unindent the entire docstring 

3042@np.deprecate(old_name='median_absolute_deviation', 

3043 new_name='median_abs_deviation', 

3044 message=_median_absolute_deviation_deprec_msg) 

3045def median_absolute_deviation(x, axis=0, center=np.median, scale=1.4826, 

3046 nan_policy='propagate'): 

3047 r""" 

3048Compute the median absolute deviation of the data along the given axis. 

3049 

3050The median absolute deviation (MAD, [1]_) computes the median over the 

3051absolute deviations from the median. It is a measure of dispersion 

3052similar to the standard deviation but more robust to outliers [2]_. 

3053 

3054The MAD of an empty array is ``np.nan``. 

3055 

3056.. versionadded:: 1.3.0 

3057 

3058Parameters 

3059---------- 

3060x : array_like 

3061 Input array or object that can be converted to an array. 

3062axis : int or None, optional 

3063 Axis along which the range is computed. Default is 0. If None, compute 

3064 the MAD over the entire array. 

3065center : callable, optional 

3066 A function that will return the central value. The default is to use 

3067 np.median. Any user defined function used will need to have the function 

3068 signature ``func(arr, axis)``. 

3069scale : float, optional 

3070 The scaling factor applied to the MAD. The default scale (1.4826) 

3071 ensures consistency with the standard deviation for normally distributed 

3072 data. 

3073nan_policy : {'propagate', 'raise', 'omit'}, optional 

3074 Defines how to handle when input contains nan. 

3075 The following options are available (default is 'propagate'): 

3076 

3077 * 'propagate': returns nan 

3078 * 'raise': throws an error 

3079 * 'omit': performs the calculations ignoring nan values 

3080 

3081Returns 

3082------- 

3083mad : scalar or ndarray 

3084 If ``axis=None``, a scalar is returned. If the input contains 

3085 integers or floats of smaller precision than ``np.float64``, then the 

3086 output data-type is ``np.float64``. Otherwise, the output data-type is 

3087 the same as that of the input. 

3088 

3089See Also 

3090-------- 

3091numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean, 

3092scipy.stats.tstd, scipy.stats.tvar 

3093 

3094Notes 

3095----- 

3096The `center` argument only affects the calculation of the central value 

3097around which the MAD is calculated. That is, passing in ``center=np.mean`` 

3098will calculate the MAD around the mean - it will not calculate the *mean* 

3099absolute deviation. 

3100 

3101References 

3102---------- 

3103.. [1] "Median absolute deviation", 

3104 https://en.wikipedia.org/wiki/Median_absolute_deviation 

3105.. [2] "Robust measures of scale", 

3106 https://en.wikipedia.org/wiki/Robust_measures_of_scale 

3107 

3108Examples 

3109-------- 

3110When comparing the behavior of `median_absolute_deviation` with ``np.std``, 

3111the latter is strongly affected when a single value of the array is changed 

3112to an outlier, while the MAD hardly changes: 

3113 

3114>>> from scipy import stats 

3115>>> x = stats.norm.rvs(size=100, scale=1, random_state=123456) 

3116>>> x.std() 

31170.9973906394005013 

3118>>> stats.median_absolute_deviation(x) 

31191.2280762773108278 

3120>>> x[0] = 345.6 

3121>>> x.std() 

312234.42304872314415 

3123>>> stats.median_absolute_deviation(x) 

31241.2340335571164334 

3125 

3126Axis handling example: 

3127 

3128>>> x = np.array([[10, 7, 4], [3, 2, 1]]) 

3129>>> x 

3130array([[10, 7, 4], 

3131 [ 3, 2, 1]]) 

3132>>> stats.median_absolute_deviation(x) 

3133array([5.1891, 3.7065, 2.2239]) 

3134>>> stats.median_absolute_deviation(x, axis=None) 

31352.9652 

3136""" 

3137 if isinstance(scale, str): 

3138 if scale.lower() == 'raw': 

3139 warnings.warn( 

3140 "use of scale='raw' is deprecated, use scale=1.0 instead", 

3141 np.VisibleDeprecationWarning 

3142 ) 

3143 scale = 1.0 

3144 

3145 if not isinstance(scale, str): 

3146 scale = 1 / scale 

3147 

3148 return median_abs_deviation(x, axis=axis, center=center, scale=scale, 

3149 nan_policy=nan_policy) 

3150 
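# Illustrative sketch (not part of the original module): as the deprecation
# message explains, the old default scale corresponds to multiplying the raw MAD
# by 1.4826, i.e. `median_abs_deviation(..., scale=1/1.4826)`. The data are
# arbitrary.
# >>> from scipy import stats
# >>> x = np.array([1., 2., 3., 5., 8.])
# >>> old = stats.median_absolute_deviation(x)   # emits a DeprecationWarning
# >>> new = stats.median_abs_deviation(x)
# >>> bool(np.isclose(old, 1.4826 * new))
# True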

3151##################################### 

3152# TRIMMING FUNCTIONS # 

3153##################################### 

3154 

3155 

3156SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper')) 

3157 

3158 

3159def sigmaclip(a, low=4., high=4.): 

3160 """ 

3161 Perform iterative sigma-clipping of array elements. 

3162 

3163 Starting from the full sample, all elements outside the critical range are 

3164 removed, i.e. all elements of the input array `c` that satisfy either of 

3165 the following conditions:: 

3166 

3167 c < mean(c) - std(c)*low 

3168 c > mean(c) + std(c)*high 

3169 

3170 The iteration continues with the updated sample until no 

3171 elements are outside the (updated) range. 

3172 

3173 Parameters 

3174 ---------- 

3175 a : array_like 

3176 Data array, will be raveled if not 1-D. 

3177 low : float, optional 

3178 Lower bound factor of sigma clipping. Default is 4. 

3179 high : float, optional 

3180 Upper bound factor of sigma clipping. Default is 4. 

3181 

3182 Returns 

3183 ------- 

3184 clipped : ndarray 

3185 Input array with clipped elements removed. 

3186 lower : float 

3187 Lower threshold value used for clipping. 

3188 upper : float 

3189 Upper threshold value used for clipping. 

3190 

3191 Examples 

3192 -------- 

3193 >>> from scipy.stats import sigmaclip 

3194 >>> a = np.concatenate((np.linspace(9.5, 10.5, 31), 

3195 ... np.linspace(0, 20, 5))) 

3196 >>> fact = 1.5 

3197 >>> c, low, upp = sigmaclip(a, fact, fact) 

3198 >>> c 

3199 array([ 9.96666667, 10. , 10.03333333, 10. ]) 

3200 >>> c.var(), c.std() 

3201 (0.00055555555555555165, 0.023570226039551501) 

3202 >>> low, c.mean() - fact*c.std(), c.min() 

3203 (9.9646446609406727, 9.9646446609406727, 9.9666666666666668) 

3204 >>> upp, c.mean() + fact*c.std(), c.max() 

3205 (10.035355339059327, 10.035355339059327, 10.033333333333333) 

3206 

3207 >>> a = np.concatenate((np.linspace(9.5, 10.5, 11), 

3208 ... np.linspace(-100, -50, 3))) 

3209 >>> c, low, upp = sigmaclip(a, 1.8, 1.8) 

3210 >>> (c == np.linspace(9.5, 10.5, 11)).all() 

3211 True 

3212 

3213 """ 

3214 c = np.asarray(a).ravel() 

3215 delta = 1 

3216 while delta: 

3217 c_std = c.std() 

3218 c_mean = c.mean() 

3219 size = c.size 

3220 critlower = c_mean - c_std * low 

3221 critupper = c_mean + c_std * high 

3222 c = c[(c >= critlower) & (c <= critupper)] 

3223 delta = size - c.size 

3224 

3225 return SigmaclipResult(c, critlower, critupper) 

3226 
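# Illustrative sketch (not part of the original module): the loop above stops
# only when a pass removes nothing, so every returned element lies within the
# returned thresholds. The data are arbitrary.
# >>> from scipy.stats import sigmaclip
# >>> a = np.concatenate((np.linspace(9.5, 10.5, 31), np.linspace(0, 20, 5)))
# >>> c, low, upp = sigmaclip(a, 1.5, 1.5)
# >>> bool(((c >= low) & (c <= upp)).all())
# True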

3227 

3228def trimboth(a, proportiontocut, axis=0): 

3229 """ 

3230 Slice off a proportion of items from both ends of an array. 

3231 

3232 Slice off the passed proportion of items from both ends of the passed 

3233 array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and** 

3234 rightmost 10% of scores). The trimmed values are the lowest and 

3235 highest ones. 

3236 Slice off less if proportion results in a non-integer slice index (i.e. 

3237 conservatively slices off `proportiontocut`). 

3238 

3239 Parameters 

3240 ---------- 

3241 a : array_like 

3242 Data to trim. 

3243 proportiontocut : float 

3244 Proportion (in range 0-1) of total data set to trim of each end. 

3245 axis : int or None, optional 

3246 Axis along which to trim data. Default is 0. If None, compute over 

3247 the whole array `a`. 

3248 

3249 Returns 

3250 ------- 

3251 out : ndarray 

3252 Trimmed version of array `a`. The order of the trimmed content 

3253 is undefined. 

3254 

3255 See Also 

3256 -------- 

3257 trim_mean 

3258 

3259 Examples 

3260 -------- 

3261 >>> from scipy import stats 

3262 >>> a = np.arange(20) 

3263 >>> b = stats.trimboth(a, 0.1) 

3264 >>> b.shape 

3265 (16,) 

3266 

3267 """ 

3268 a = np.asarray(a) 

3269 

3270 if a.size == 0: 

3271 return a 

3272 

3273 if axis is None: 

3274 a = a.ravel() 

3275 axis = 0 

3276 

3277 nobs = a.shape[axis] 

3278 lowercut = int(proportiontocut * nobs) 

3279 uppercut = nobs - lowercut 

3280 if (lowercut >= uppercut): 

3281 raise ValueError("Proportion too big.") 

3282 

3283 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3284 

3285 sl = [slice(None)] * atmp.ndim 

3286 sl[axis] = slice(lowercut, uppercut) 

3287 return atmp[tuple(sl)] 

3288 

3289 

3290def trim1(a, proportiontocut, tail='right', axis=0): 

3291 """ 

3292 Slice off a proportion from ONE end of the passed array distribution. 

3293 

3294 If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' 

3295 10% of scores. The lowest or highest values are trimmed (depending on 

3296 the tail). 

3297 Slice off less if proportion results in a non-integer slice index 

3298 (i.e. conservatively slices off `proportiontocut`). 

3299 

3300 Parameters 

3301 ---------- 

3302 a : array_like 

3303 Input array. 

3304 proportiontocut : float 

3305 Fraction to cut off of 'left' or 'right' of distribution. 

3306 tail : {'left', 'right'}, optional 

3307 Defaults to 'right'. 

3308 axis : int or None, optional 

3309 Axis along which to trim data. Default is 0. If None, compute over 

3310 the whole array `a`. 

3311 

3312 Returns 

3313 ------- 

3314 trim1 : ndarray 

3315 Trimmed version of array `a`. The order of the trimmed content is 

3316 undefined. 

3317 

3318 """ 

3319 a = np.asarray(a) 

3320 if axis is None: 

3321 a = a.ravel() 

3322 axis = 0 

3323 

3324 nobs = a.shape[axis] 

3325 

3326 # avoid possible corner case 

3327 if proportiontocut >= 1: 

3328 return [] 

3329 

3330 if tail.lower() == 'right': 

3331 lowercut = 0 

3332 uppercut = nobs - int(proportiontocut * nobs) 

3333 

3334 elif tail.lower() == 'left': 

3335 lowercut = int(proportiontocut * nobs) 

3336 uppercut = nobs 

3337 

3338 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3339 

# Slice along the requested axis (not necessarily axis 0), as in `trimboth`.
sl = [slice(None)] * atmp.ndim
sl[axis] = slice(lowercut, uppercut)
return atmp[tuple(sl)]

3341 

3342 

3343def trim_mean(a, proportiontocut, axis=0): 

3344 """ 

3345 Return mean of array after trimming distribution from both tails. 

3346 

3347 If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of 

3348 scores. The input is sorted before slicing. Slices off less if proportion 

3349 results in a non-integer slice index (i.e., conservatively slices off 

3350 `proportiontocut` ). 

3351 

3352 Parameters 

3353 ---------- 

3354 a : array_like 

3355 Input array. 

3356 proportiontocut : float 

3357 Fraction to cut off from both tails of the distribution.

3358 axis : int or None, optional 

3359 Axis along which the trimmed means are computed. Default is 0. 

3360 If None, compute over the whole array `a`. 

3361 

3362 Returns 

3363 ------- 

3364 trim_mean : ndarray 

3365 Mean of trimmed array. 

3366 

3367 See Also 

3368 -------- 

3369 trimboth 

3370 tmean : Compute the trimmed mean ignoring values outside given `limits`. 

3371 

3372 Examples 

3373 -------- 

3374 >>> from scipy import stats 

3375 >>> x = np.arange(20) 

3376 >>> stats.trim_mean(x, 0.1) 

3377 9.5 

3378 >>> x2 = x.reshape(5, 4) 

3379 >>> x2 

3380 array([[ 0, 1, 2, 3], 

3381 [ 4, 5, 6, 7], 

3382 [ 8, 9, 10, 11], 

3383 [12, 13, 14, 15], 

3384 [16, 17, 18, 19]]) 

3385 >>> stats.trim_mean(x2, 0.25) 

3386 array([ 8., 9., 10., 11.]) 

3387 >>> stats.trim_mean(x2, 0.25, axis=1) 

3388 array([ 1.5, 5.5, 9.5, 13.5, 17.5]) 

3389 

3390 """ 

3391 a = np.asarray(a) 

3392 

3393 if a.size == 0: 

3394 return np.nan 

3395 

3396 if axis is None: 

3397 a = a.ravel() 

3398 axis = 0 

3399 

3400 nobs = a.shape[axis] 

3401 lowercut = int(proportiontocut * nobs) 

3402 uppercut = nobs - lowercut 

3403 if (lowercut > uppercut): 

3404 raise ValueError("Proportion too big.") 

3405 

3406 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3407 

3408 sl = [slice(None)] * atmp.ndim 

3409 sl[axis] = slice(lowercut, uppercut) 

3410 return np.mean(atmp[tuple(sl)], axis=axis) 
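
# A hedged cross-check (not part of the original source): along the default
# axis, trim_mean is simply the mean of the values kept by `trimboth`.
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> x = np.arange(20)
# >>> bool(np.isclose(stats.trim_mean(x, 0.1), stats.trimboth(x, 0.1).mean()))
# True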

3411 

3412 

3413F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue')) 

3414 

3415 

3416class F_onewayConstantInputWarning(RuntimeWarning): 

3417 """ 

3418 Warning generated by `f_oneway` when an input is constant, e.g. 

3419 each of the samples provided is a constant array. 

3420 """ 

3421 

3422 def __init__(self, msg=None): 

3423 if msg is None: 

3424 msg = ("Each of the input arrays is constant; "

3425 "the F statistic is not defined or infinite") 

3426 self.args = (msg,) 

3427 

3428 

3429class F_onewayBadInputSizesWarning(RuntimeWarning): 

3430 """ 

3431 Warning generated by `f_oneway` when an input has length 0, 

3432 or if all the inputs have length 1. 

3433 """ 

3434 pass 

3435 

3436 

3437def _create_f_oneway_nan_result(shape, axis): 

3438 """ 

3439 This is a helper function for f_oneway for creating the return values 

3440 in certain degenerate conditions. It creates return values that are 

3441 all nan with the appropriate shape for the given `shape` and `axis`. 

3442 """ 

3443 axis = np.core.multiarray.normalize_axis_index(axis, len(shape)) 

3444 shp = shape[:axis] + shape[axis+1:] 

3445 if shp == (): 

3446 f = np.nan 

3447 prob = np.nan 

3448 else: 

3449 f = np.full(shp, fill_value=np.nan) 

3450 prob = f.copy() 

3451 return F_onewayResult(f, prob) 

3452 

3453 

3454def _first(arr, axis): 

3455 """ 

3456 Return arr[..., 0:1, ...] where 0:1 is in the `axis` position. 

3457 """ 

3458 # When the oldest version of numpy supported by scipy is at 

3459 # least 1.15.0, this function can be replaced by np.take_along_axis 

3460 # (with appropriately configured arguments). 

3461 axis = np.core.multiarray.normalize_axis_index(axis, arr.ndim) 

3462 return arr[tuple(slice(None) if k != axis else slice(0, 1) 

3463 for k in range(arr.ndim))] 
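
# A brief illustration (not part of the original source) of what `_first`
# returns: the length-1 leading slice along `axis`, with the dimension kept.
#
# >>> import numpy as np
# >>> _first(np.array([[1, 2, 3], [4, 5, 6]]), 1)
# array([[1],
#        [4]])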

3464 

3465 

3466def f_oneway(*args, axis=0): 

3467 """ 

3468 Perform one-way ANOVA. 

3469 

3470 The one-way ANOVA tests the null hypothesis that two or more groups have 

3471 the same population mean. The test is applied to samples from two or 

3472 more groups, possibly with differing sizes. 

3473 

3474 Parameters 

3475 ---------- 

3476 sample1, sample2, ... : array_like 

3477 The sample measurements for each group. There must be at least 

3478 two arguments. If the arrays are multidimensional, then all the 

3479 dimensions of the array must be the same except for `axis`. 

3480 axis : int, optional 

3481 Axis of the input arrays along which the test is applied. 

3482 Default is 0. 

3483 

3484 Returns 

3485 ------- 

3486 statistic : float 

3487 The computed F statistic of the test. 

3488 pvalue : float 

3489 The associated p-value from the F distribution. 

3490 

3491 Warns 

3492 ----- 

3493 F_onewayConstantInputWarning 

3494 Raised if each of the input arrays is a constant array.

3495 In this case the F statistic is either infinite or isn't defined, 

3496 so ``np.inf`` or ``np.nan`` is returned. 

3497 

3498 F_onewayBadInputSizesWarning 

3499 Raised if the length of any input array is 0, or if all the input 

3500 arrays have length 1. ``np.nan`` is returned for the F statistic 

3501 and the p-value in these cases. 

3502 

3503 Notes 

3504 ----- 

3505 The ANOVA test has important assumptions that must be satisfied in order 

3506 for the associated p-value to be valid. 

3507 

3508 1. The samples are independent. 

3509 2. Each sample is from a normally distributed population. 

3510 3. The population standard deviations of the groups are all equal. This 

3511 property is known as homoscedasticity. 

3512 

3513 If these assumptions are not true for a given set of data, it may still 

3514 be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) 

3515 although with some loss of power. 

3516 

3517 The length of each group must be at least one, and there must be at 

3518 least one group with length greater than one. If these conditions 

3519 are not satisfied, a warning is generated and (``np.nan``, ``np.nan``) 

3520 is returned. 

3521 

3522 If each group contains constant values, and there exist at least two 

3523 groups with different values, the function generates a warning and 

3524 returns (``np.inf``, 0). 

3525 

3526 If all values in all groups are the same, the function generates a warning

3527 and returns (``np.nan``, ``np.nan``). 

3528 

3529 The algorithm is from Heiman [2]_, pp.394-7. 

3530 

3531 References 

3532 ---------- 

3533 .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics", 

3534 Chapter 14, 2014, http://vassarstats.net/textbook/ 

3535 

3536 .. [2] G.W. Heiman, "Understanding research methods and statistics: An 

3537 integrated introduction for psychology", Houghton, Mifflin and 

3538 Company, 2001. 

3539 

3540 .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA. 

3541 http://www.biostathandbook.com/onewayanova.html 

3542 

3543 Examples 

3544 -------- 

3545 >>> from scipy.stats import f_oneway 

3546 

3547 Here are some data [3]_ on a shell measurement (the length of the anterior 

3548 adductor muscle scar, standardized by dividing by length) in the mussel 

3549 Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon; 

3550 Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a 

3551 much larger data set used in McDonald et al. (1991). 

3552 

3553 >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735, 

3554 ... 0.0659, 0.0923, 0.0836] 

3555 >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 

3556 ... 0.0725] 

3557 >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105] 

3558 >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 

3559 ... 0.0689] 

3560 >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045] 

3561 >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne) 

3562 F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544) 

3563 

3564 `f_oneway` accepts multidimensional input arrays. When the inputs 

3565 are multidimensional and `axis` is not given, the test is performed 

3566 along the first axis of the input arrays. For the following data, the 

3567 test is performed three times, once for each column. 

3568 

3569 >>> a = np.array([[9.87, 9.03, 6.81], 

3570 ... [7.18, 8.35, 7.00], 

3571 ... [8.39, 7.58, 7.68], 

3572 ... [7.45, 6.33, 9.35], 

3573 ... [6.41, 7.10, 9.33], 

3574 ... [8.00, 8.24, 8.44]]) 

3575 >>> b = np.array([[6.35, 7.30, 7.16], 

3576 ... [6.65, 6.68, 7.63], 

3577 ... [5.72, 7.73, 6.72], 

3578 ... [7.01, 9.19, 7.41], 

3579 ... [7.75, 7.87, 8.30], 

3580 ... [6.90, 7.97, 6.97]]) 

3581 >>> c = np.array([[3.31, 8.77, 1.01], 

3582 ... [8.25, 3.24, 3.62], 

3583 ... [6.32, 8.81, 5.19], 

3584 ... [7.48, 8.83, 8.91], 

3585 ... [8.59, 6.01, 6.07], 

3586 ... [3.07, 9.72, 7.48]]) 

3587 >>> F, p = f_oneway(a, b, c) 

3588 >>> F 

3589 array([1.75676344, 0.03701228, 3.76439349]) 

3590 >>> p 

3591 array([0.20630784, 0.96375203, 0.04733157]) 

3592 

3593 """ 

3594 if len(args) < 2: 

3595 raise TypeError(f'at least two inputs are required; got {len(args)}.') 

3596 

3597 args = [np.asarray(arg, dtype=float) for arg in args] 

3598 

3599 # ANOVA on N groups, each in its own array 

3600 num_groups = len(args) 

3601 

3602 # We haven't explicitly validated axis, but if it is bad, this call of 

3603 # np.concatenate will raise np.AxisError. The call will raise ValueError 

3604 # if the dimensions of all the arrays, except the axis dimension, are not 

3605 # the same. 

3606 alldata = np.concatenate(args, axis=axis) 

3607 bign = alldata.shape[axis] 

3608 

3609 # Check this after forming alldata, so shape errors are detected 

3610 # and reported before checking for 0 length inputs. 

3611 if any(arg.shape[axis] == 0 for arg in args): 

3612 warnings.warn(F_onewayBadInputSizesWarning('at least one input ' 

3613 'has length 0')) 

3614 return _create_f_oneway_nan_result(alldata.shape, axis) 

3615 

3616 # Must have at least one group with length greater than 1. 

3617 if all(arg.shape[axis] == 1 for arg in args): 

3618 msg = ('all input arrays have length 1. f_oneway requires that at ' 

3619 'least one input has length greater than 1.') 

3620 warnings.warn(F_onewayBadInputSizesWarning(msg)) 

3621 return _create_f_oneway_nan_result(alldata.shape, axis) 

3622 

3623 # Check if the values within each group are constant, and if the common 

3624 # value in at least one group is different from that in another group. 

3625 # Based on https://github.com/scipy/scipy/issues/11669 

3626 

3627 # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ..., 

3628 # then is_const is a boolean array with shape (num_groups, ...). 

3629 It is True if the groups along the axis slice are each constant.

3630 # In the typical case where each input array is 1-d, is_const is a 

3631 # 1-d array with length num_groups. 

3632 is_const = np.concatenate([(_first(a, axis) == a).all(axis=axis, 

3633 keepdims=True) 

3634 for a in args], axis=axis) 

3635 

3636 # all_const is a boolean array with shape (...) (see previous comment). 

3637 # It is True if the values within each group along the axis slice are 

3638 # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]). 

3639 all_const = is_const.all(axis=axis) 

3640 if all_const.any(): 

3641 warnings.warn(F_onewayConstantInputWarning()) 

3642 

3643 # all_same_const is True if all the values in the groups along the axis=0 

3644 # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]). 

3645 all_same_const = (_first(alldata, axis) == alldata).all(axis=axis) 

3646 

3647 # Determine the mean of the data, and subtract that from all inputs to a 

3648 # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant 

3649 # to a shift in location, and centering all data around zero vastly 

3650 # improves numerical stability. 

3651 offset = alldata.mean(axis=axis, keepdims=True) 

3652 alldata -= offset 

3653 

3654 normalized_ss = _square_of_sums(alldata, axis=axis) / bign 

3655 

3656 sstot = _sum_of_squares(alldata, axis=axis) - normalized_ss 

3657 

3658 ssbn = 0 

3659 for a in args: 

3660 ssbn += _square_of_sums(a - offset, axis=axis) / a.shape[axis] 

3661 

3662 # Naming: variables ending in bn/b are for "between treatments", wn/w are 

3663 # for "within treatments" 

3664 ssbn -= normalized_ss 

3665 sswn = sstot - ssbn 

3666 dfbn = num_groups - 1 

3667 dfwn = bign - num_groups 

3668 msb = ssbn / dfbn 

3669 msw = sswn / dfwn 

3670 with np.errstate(divide='ignore', invalid='ignore'): 

3671 f = msb / msw 

3672 

3673 prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf 

3674 

3675 # Fix any f values that should be inf or nan because the corresponding 

3676 # inputs were constant. 

3677 if np.isscalar(f): 

3678 if all_same_const: 

3679 f = np.nan 

3680 prob = np.nan 

3681 elif all_const: 

3682 f = np.inf 

3683 prob = 0.0 

3684 else: 

3685 f[all_const] = np.inf 

3686 prob[all_const] = 0.0 

3687 f[all_same_const] = np.nan 

3688 prob[all_same_const] = np.nan 

3689 

3690 return F_onewayResult(f, prob) 
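
# A minimal reference sketch (not part of the original source; the helper name
# `_f_oneway_naive` is hypothetical): the textbook one-way ANOVA decomposition
# that the vectorized code above implements, written plainly for 1-d groups,
# with F = (SS_between / (k - 1)) / (SS_within / (N - k)).  It assumes numpy
# is available as `np`, as elsewhere in this module.
def _f_oneway_naive(*groups):
    groups = [np.asarray(g, dtype=float) for g in groups]
    alldata = np.concatenate(groups)
    grand_mean = alldata.mean()
    # Between-group sum of squares: squared distance of each group mean from
    # the grand mean, weighted by group size.
    ss_between = sum(g.size * (g.mean() - grand_mean) ** 2 for g in groups)
    # Within-group sum of squares: spread of each group around its own mean.
    ss_within = sum(((g - g.mean()) ** 2).sum() for g in groups)
    dfb = len(groups) - 1
    dfw = alldata.size - len(groups)
    return (ss_between / dfb) / (ss_within / dfw)
# For 1-d inputs such as the mussel data in the docstring example, this agrees
# with `f_oneway(...).statistic` up to floating-point error.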

3691 

3692 

3693class PearsonRConstantInputWarning(RuntimeWarning): 

3694 """Warning generated by `pearsonr` when an input is constant.""" 

3695 

3696 def __init__(self, msg=None): 

3697 if msg is None: 

3698 msg = ("An input array is constant; the correlation coefficient "

3699 "is not defined.") 

3700 self.args = (msg,) 

3701 

3702 

3703class PearsonRNearConstantInputWarning(RuntimeWarning): 

3704 """Warning generated by `pearsonr` when an input is nearly constant.""" 

3705 

3706 def __init__(self, msg=None): 

3707 if msg is None: 

3708 msg = ("An input array is nearly constant; the computed " 

3709 "correlation coefficent may be inaccurate.") 

3710 self.args = (msg,) 

3711 

3712 

3713def pearsonr(x, y): 

3714 r""" 

3715 Pearson correlation coefficient and p-value for testing non-correlation. 

3716 

3717 The Pearson correlation coefficient [1]_ measures the linear relationship 

3718 between two datasets. The calculation of the p-value relies on the 

3719 assumption that each dataset is normally distributed. (See Kowalski [3]_ 

3720 for a discussion of the effects of non-normality of the input on the 

3721 distribution of the correlation coefficient.) Like other correlation 

3722 coefficients, this one varies between -1 and +1 with 0 implying no 

3723 correlation. Correlations of -1 or +1 imply an exact linear relationship. 

3724 Positive correlations imply that as x increases, so does y. Negative 

3725 correlations imply that as x increases, y decreases. 

3726 

3727 The p-value roughly indicates the probability of an uncorrelated system 

3728 producing datasets that have a Pearson correlation at least as extreme 

3729 as the one computed from these datasets. 

3730 

3731 Parameters 

3732 ---------- 

3733 x : (N,) array_like 

3734 Input array. 

3735 y : (N,) array_like 

3736 Input array. 

3737 

3738 Returns 

3739 ------- 

3740 r : float 

3741 Pearson's correlation coefficient. 

3742 p-value : float 

3743 Two-tailed p-value. 

3744 

3745 Warns 

3746 ----- 

3747 PearsonRConstantInputWarning 

3748 Raised if an input is a constant array. The correlation coefficient 

3749 is not defined in this case, so ``np.nan`` is returned. 

3750 

3751 PearsonRNearConstantInputWarning 

3752 Raised if an input is "nearly" constant. The array ``x`` is considered 

3753 nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``. 

3754 Numerical errors in the calculation ``x - mean(x)`` in this case might 

3755 result in an inaccurate calculation of r. 

3756 

3757 See Also 

3758 -------- 

3759 spearmanr : Spearman rank-order correlation coefficient. 

3760 kendalltau : Kendall's tau, a correlation measure for ordinal data. 

3761 

3762 Notes 

3763 ----- 

3764 The correlation coefficient is calculated as follows: 

3765 

3766 .. math:: 

3767 

3768 r = \frac{\sum (x - m_x) (y - m_y)} 

3769 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}} 

3770 

3771 where :math:`m_x` is the mean of the vector :math:`x` and :math:`m_y` is 

3772 the mean of the vector :math:`y`. 

3773 

3774 Under the assumption that x and y are drawn from independent normal 

3775 distributions (so the population correlation coefficient is 0), the 

3776 probability density function of the sample correlation coefficient r 

3777 is ([1]_, [2]_):: 

3778 

3779 (1 - r**2)**(n/2 - 2) 

3780 f(r) = --------------------- 

3781 B(1/2, n/2 - 1) 

3782 

3783 where n is the number of samples, and B is the beta function. This 

3784 is sometimes referred to as the exact distribution of r. This is 

3785 the distribution that is used in `pearsonr` to compute the p-value. 

3786 The distribution is a beta distribution on the interval [-1, 1], 

3787 with equal shape parameters a = b = n/2 - 1. In terms of SciPy's 

3788 implementation of the beta distribution, the distribution of r is:: 

3789 

3790 dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2) 

3791 

3792 The p-value returned by `pearsonr` is a two-sided p-value. For a 

3793 given sample with correlation coefficient r, the p-value is 

3794 the probability that abs(r') of a random sample x' and y' drawn from 

3795 the population with zero correlation would be greater than or equal 

3796 to abs(r). In terms of the object ``dist`` shown above, the p-value 

3797 for a given r and length n can be computed as:: 

3798 

3799 p = 2*dist.cdf(-abs(r)) 

3800 

3801 When n is 2, the above continuous distribution is not well-defined. 

3802 One can interpret the limit of the beta distribution as the shape 

3803 parameters a and b approach a = b = 0 as a discrete distribution with 

3804 equal probability masses at r = 1 and r = -1. More directly, one 

3805 can observe that, given the data x = [x1, x2] and y = [y1, y2], and 

3806 assuming x1 != x2 and y1 != y2, the only possible values for r are 1 

3807 and -1. Because abs(r') for any sample x' and y' with length 2 will 

3808 be 1, the two-sided p-value for a sample of length 2 is always 1. 

3809 

3810 References 

3811 ---------- 

3812 .. [1] "Pearson correlation coefficient", Wikipedia, 

3813 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient 

3814 .. [2] Student, "Probable error of a correlation coefficient", 

3815 Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310. 

3816 .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution 

3817 of the Sample Product-Moment Correlation Coefficient" 

3818 Journal of the Royal Statistical Society. Series C (Applied 

3819 Statistics), Vol. 21, No. 1 (1972), pp. 1-12. 

3820 

3821 Examples 

3822 -------- 

3823 >>> from scipy import stats 

3824 >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) 

3825 >>> b = np.arange(7) 

3826 >>> stats.pearsonr(a, b) 

3827 (0.8660254037844386, 0.011724811003954649) 

3828 

3829 >>> stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4]) 

3830 (-0.7426106572325057, 0.1505558088534455) 

3831 

3832 """ 

3833 n = len(x) 

3834 if n != len(y): 

3835 raise ValueError('x and y must have the same length.') 

3836 

3837 if n < 2: 

3838 raise ValueError('x and y must have length at least 2.') 

3839 

3840 x = np.asarray(x) 

3841 y = np.asarray(y) 

3842 

3843 # If an input is constant, the correlation coefficient is not defined. 

3844 if (x == x[0]).all() or (y == y[0]).all(): 

3845 warnings.warn(PearsonRConstantInputWarning()) 

3846 return np.nan, np.nan 

3847 

3848 # dtype is the data type for the calculations. This expression ensures 

3849 # that the data type is at least 64 bit floating point. It might have 

3850 # more precision if the input is, for example, np.longdouble. 

3851 dtype = type(1.0 + x[0] + y[0]) 

3852 

3853 if n == 2: 

3854 return dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0])), 1.0 

3855 

3856 xmean = x.mean(dtype=dtype) 

3857 ymean = y.mean(dtype=dtype) 

3858 

3859 # By using `astype(dtype)`, we ensure that the intermediate calculations 

3860 # use at least 64 bit floating point. 

3861 xm = x.astype(dtype) - xmean 

3862 ym = y.astype(dtype) - ymean 

3863 

3864 # Unlike np.linalg.norm or the expression sqrt((xm*xm).sum()), 

3865 # scipy.linalg.norm(xm) does not overflow if xm is, for example, 

3866 # [-5e210, 5e210, 3e200, -3e200] 

3867 normxm = linalg.norm(xm) 

3868 normym = linalg.norm(ym) 

3869 

3870 threshold = 1e-13 

3871 if normxm < threshold*abs(xmean) or normym < threshold*abs(ymean): 

3872 # If all the values in x (likewise y) are very close to the mean, 

3873 # the loss of precision that occurs in the subtraction xm = x - xmean 

3874 # might result in large errors in r. 

3875 warnings.warn(PearsonRNearConstantInputWarning()) 

3876 

3877 r = np.dot(xm/normxm, ym/normym) 

3878 

3879 # Presumably, if abs(r) > 1, then it is only some small artifact of 

3880 # floating point arithmetic. 

3881 r = max(min(r, 1.0), -1.0) 

3882 

3883 # As explained in the docstring, the p-value can be computed as 

3884 # p = 2*dist.cdf(-abs(r)) 

3885 # where dist is the beta distribution on [-1, 1] with shape parameters 

3886 # a = b = n/2 - 1. `special.btdtr` is the CDF for the beta distribution 

3887 # on [0, 1]. To use it, we make the transformation x = (r + 1)/2; the 

3888 # shape parameters do not change. Then -abs(r) used in `cdf(-abs(r))` 

3889 # becomes x = (-abs(r) + 1)/2 = 0.5*(1 - abs(r)). (r is cast to float64 

3890 # to avoid a TypeError raised by btdtr when r is higher precision.) 

3891 ab = n/2 - 1 

3892 prob = 2*special.btdtr(ab, ab, 0.5*(1 - abs(np.float64(r)))) 

3893 

3894 return r, prob 
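
# A hedged cross-check (not part of the original source): the p-value returned
# above matches the beta-distribution formula given in the docstring,
# p = 2*dist.cdf(-abs(r)) with dist = beta(n/2 - 1, n/2 - 1, loc=-1, scale=2).
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> x = np.array([0, 0, 0, 1, 1, 1, 1], dtype=float)
# >>> y = np.arange(7, dtype=float)
# >>> r, p = stats.pearsonr(x, y)
# >>> dist = stats.beta(len(x)/2 - 1, len(x)/2 - 1, loc=-1, scale=2)
# >>> bool(np.isclose(p, 2*dist.cdf(-abs(r))))
# True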

3895 

3896 

3897def fisher_exact(table, alternative='two-sided'): 

3898 """ 

3899 Perform a Fisher exact test on a 2x2 contingency table. 

3900 

3901 Parameters 

3902 ---------- 

3903 table : array_like of ints 

3904 A 2x2 contingency table. Elements should be non-negative integers. 

3905 alternative : {'two-sided', 'less', 'greater'}, optional 

3906 Defines the alternative hypothesis. 

3907 The following options are available (default is 'two-sided'): 

3908 

3909 * 'two-sided' 

3910 * 'less': one-sided 

3911 * 'greater': one-sided 

3912 

3913 Returns 

3914 ------- 

3915 oddsratio : float 

3916 This is the prior odds ratio, not a posterior estimate.

3917 p_value : float 

3918 P-value, the probability of obtaining a distribution at least as 

3919 extreme as the one that was actually observed, assuming that the 

3920 null hypothesis is true. 

3921 

3922 See Also 

3923 -------- 

3924 chi2_contingency : Chi-square test of independence of variables in a 

3925 contingency table. 

3926 

3927 Notes 

3928 ----- 

3929 The calculated odds ratio is different from the one R uses. This scipy 

3930 implementation returns the (more common) "unconditional Maximum 

3931 Likelihood Estimate", while R uses the "conditional Maximum Likelihood 

3932 Estimate". 

3933 

3934 For tables with large numbers, the (inexact) chi-square test implemented 

3935 in the function `chi2_contingency` can also be used. 

3936 

3937 Examples 

3938 -------- 

3939 Say we spend a few days counting whales and sharks in the Atlantic and 

3940 Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the 

3941 Indian ocean 2 whales and 5 sharks. Then our contingency table is:: 

3942 

3943 Atlantic Indian 

3944 whales 8 2 

3945 sharks 1 5 

3946 

3947 We use this table to find the p-value: 

3948 

3949 >>> import scipy.stats as stats 

3950 >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]]) 

3951 >>> pvalue 

3952 0.0349... 

3953 

3954 The probability that we would observe this or an even more imbalanced ratio 

3955 by chance is about 3.5%. A commonly used significance level is 5%--if we 

3956 adopt that, we can therefore conclude that our observed imbalance is 

3957 statistically significant; whales prefer the Atlantic while sharks prefer 

3958 the Indian ocean. 

3959 

3960 """ 

3961 hypergeom = distributions.hypergeom 

3962 c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm 

3963 if not c.shape == (2, 2): 

3964 raise ValueError("The input `table` must be of shape (2, 2).") 

3965 

3966 if np.any(c < 0): 

3967 raise ValueError("All values in `table` must be nonnegative.") 

3968 

3969 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): 

3970 # If both values in a row or column are zero, the p-value is 1 and 

3971 # the odds ratio is NaN. 

3972 return np.nan, 1.0 

3973 

3974 if c[1, 0] > 0 and c[0, 1] > 0: 

3975 oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1]) 

3976 else: 

3977 oddsratio = np.inf 

3978 

3979 n1 = c[0, 0] + c[0, 1] 

3980 n2 = c[1, 0] + c[1, 1] 

3981 n = c[0, 0] + c[1, 0] 

3982 

3983 def binary_search(n, n1, n2, side): 

3984 """Binary search for where to begin halves in two-sided test.""" 

3985 if side == "upper": 

3986 minval = mode 

3987 maxval = n 

3988 else: 

3989 minval = 0 

3990 maxval = mode 

3991 guess = -1 

3992 while maxval - minval > 1: 

3993 if maxval == minval + 1 and guess == minval: 

3994 guess = maxval 

3995 else: 

3996 guess = (maxval + minval) // 2 

3997 pguess = hypergeom.pmf(guess, n1 + n2, n1, n) 

3998 if side == "upper": 

3999 ng = guess - 1 

4000 else: 

4001 ng = guess + 1 

4002 if pguess <= pexact < hypergeom.pmf(ng, n1 + n2, n1, n): 

4003 break 

4004 elif pguess < pexact: 

4005 maxval = guess 

4006 else: 

4007 minval = guess 

4008 if guess == -1: 

4009 guess = minval 

4010 if side == "upper": 

4011 while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: 

4012 guess -= 1 

4013 while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: 

4014 guess += 1 

4015 else: 

4016 while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: 

4017 guess += 1 

4018 while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: 

4019 guess -= 1 

4020 return guess 

4021 

4022 if alternative == 'less': 

4023 pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) 

4024 elif alternative == 'greater': 

4025 # Same formula as the 'less' case, but with the second column. 

4026 pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1]) 

4027 elif alternative == 'two-sided': 

4028 mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2)) 

4029 pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n) 

4030 pmode = hypergeom.pmf(mode, n1 + n2, n1, n) 

4031 

4032 epsilon = 1 - 1e-4 

4033 if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= 1 - epsilon: 

4034 return oddsratio, 1. 

4035 

4036 elif c[0, 0] < mode: 

4037 plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) 

4038 if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon: 

4039 return oddsratio, plower 

4040 

4041 guess = binary_search(n, n1, n2, "upper") 

4042 pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n) 

4043 else: 

4044 pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n) 

4045 if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon: 

4046 return oddsratio, pupper 

4047 

4048 guess = binary_search(n, n1, n2, "lower") 

4049 pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n) 

4050 else: 

4051 msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}" 

4052 raise ValueError(msg) 

4053 

4054 pvalue = min(pvalue, 1.0) 

4055 

4056 return oddsratio, pvalue 
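
# A hedged illustration (not part of the original source): the odds ratio
# returned above is the plain cross-product ratio (a*d)/(b*c) of the 2x2
# table, which is why it can differ from R's conditional MLE.
#
# >>> from scipy import stats
# >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]])
# >>> oddsratio
# 20.0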

4057 

4058 

4059class SpearmanRConstantInputWarning(RuntimeWarning): 

4060 """Warning generated by `spearmanr` when an input is constant.""" 

4061 

4062 def __init__(self, msg=None): 

4063 if msg is None: 

4064 msg = ("An input array is constant; the correlation coefficient "

4065 "is not defined.") 

4066 self.args = (msg,) 

4067 

4068 

4069SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue')) 

4070 

4071 

4072def spearmanr(a, b=None, axis=0, nan_policy='propagate'): 

4073 """ 

4074 Calculate a Spearman correlation coefficient with associated p-value. 

4075 

4076 The Spearman rank-order correlation coefficient is a nonparametric measure 

4077 of the monotonicity of the relationship between two datasets. Unlike the 

4078 Pearson correlation, the Spearman correlation does not assume that both 

4079 datasets are normally distributed. Like other correlation coefficients, 

4080 this one varies between -1 and +1 with 0 implying no correlation. 

4081 Correlations of -1 or +1 imply an exact monotonic relationship. Positive 

4082 correlations imply that as x increases, so does y. Negative correlations 

4083 imply that as x increases, y decreases. 

4084 

4085 The p-value roughly indicates the probability of an uncorrelated system 

4086 producing datasets that have a Spearman correlation at least as extreme 

4087 as the one computed from these datasets. The p-values are not entirely 

4088 reliable but are probably reasonable for datasets larger than 500 or so. 

4089 

4090 Parameters 

4091 ---------- 

4092 a, b : 1D or 2D array_like, b is optional 

4093 One or two 1-D or 2-D arrays containing multiple variables and 

4094 observations. When these are 1-D, each represents a vector of 

4095 observations of a single variable. For the behavior in the 2-D case, 

4096 see under ``axis``, below. 

4097 Both arrays need to have the same length in the ``axis`` dimension. 

4098 axis : int or None, optional 

4099 If axis=0 (default), then each column represents a variable, with 

4100 observations in the rows. If axis=1, the relationship is transposed: 

4101 each row represents a variable, while the columns contain observations. 

4102 If axis=None, then both arrays will be raveled. 

4103 nan_policy : {'propagate', 'raise', 'omit'}, optional 

4104 Defines how to handle when input contains nan. 

4105 The following options are available (default is 'propagate'): 

4106 

4107 * 'propagate': returns nan 

4108 * 'raise': throws an error 

4109 * 'omit': performs the calculations ignoring nan values 

4110 

4111 Returns 

4112 ------- 

4113 correlation : float or ndarray (2-D square) 

4114 Spearman correlation matrix or correlation coefficient (if only 2 

4115 variables are given as parameters). The correlation matrix is square with

4116 length equal to total number of variables (columns or rows) in ``a`` 

4117 and ``b`` combined. 

4118 pvalue : float 

4119 The two-sided p-value for a hypothesis test whose null hypothesis is 

4120 that two sets of data are uncorrelated. It has the same dimension as rho.

4121 

4122 References 

4123 ---------- 

4124 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

4125 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

4126 York. 2000. 

4127 Section 14.7 

4128 

4129 Examples 

4130 -------- 

4131 >>> from scipy import stats 

4132 >>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7]) 

4133 (0.82078268166812329, 0.088587005313543798) 

4134 >>> np.random.seed(1234321) 

4135 >>> x2n = np.random.randn(100, 2) 

4136 >>> y2n = np.random.randn(100, 2) 

4137 >>> stats.spearmanr(x2n) 

4138 (0.059969996999699973, 0.55338590803773591) 

4139 >>> stats.spearmanr(x2n[:,0], x2n[:,1]) 

4140 (0.059969996999699973, 0.55338590803773591) 

4141 >>> rho, pval = stats.spearmanr(x2n, y2n) 

4142 >>> rho 

4143 array([[ 1. , 0.05997 , 0.18569457, 0.06258626], 

4144 [ 0.05997 , 1. , 0.110003 , 0.02534653], 

4145 [ 0.18569457, 0.110003 , 1. , 0.03488749], 

4146 [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) 

4147 >>> pval 

4148 array([[ 0. , 0.55338591, 0.06435364, 0.53617935], 

4149 [ 0.55338591, 0. , 0.27592895, 0.80234077], 

4150 [ 0.06435364, 0.27592895, 0. , 0.73039992], 

4151 [ 0.53617935, 0.80234077, 0.73039992, 0. ]]) 

4152 >>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1) 

4153 >>> rho 

4154 array([[ 1. , 0.05997 , 0.18569457, 0.06258626], 

4155 [ 0.05997 , 1. , 0.110003 , 0.02534653], 

4156 [ 0.18569457, 0.110003 , 1. , 0.03488749], 

4157 [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) 

4158 >>> stats.spearmanr(x2n, y2n, axis=None) 

4159 (0.10816770419260482, 0.1273562188027364) 

4160 >>> stats.spearmanr(x2n.ravel(), y2n.ravel()) 

4161 (0.10816770419260482, 0.1273562188027364) 

4162 

4163 >>> xint = np.random.randint(10, size=(100, 2)) 

4164 >>> stats.spearmanr(xint) 

4165 (0.052760927029710199, 0.60213045837062351) 

4166 

4167 """ 

4168 if axis is not None and axis > 1: 

4169 raise ValueError("spearmanr only handles 1-D or 2-D arrays, supplied axis argument {}, please use only values 0, 1 or None for axis".format(axis)) 

4170 

4171 a, axisout = _chk_asarray(a, axis) 

4172 if a.ndim > 2: 

4173 raise ValueError("spearmanr only handles 1-D or 2-D arrays") 

4174 

4175 if b is None: 

4176 if a.ndim < 2: 

4177 raise ValueError("`spearmanr` needs at least 2 variables to compare") 

4178 else: 

4179 # Concatenate a and b, so that we now only have to handle the case 

4180 # of a 2-D `a`. 

4181 b, _ = _chk_asarray(b, axis) 

4182 if axisout == 0: 

4183 a = np.column_stack((a, b)) 

4184 else: 

4185 a = np.row_stack((a, b)) 

4186 

4187 n_vars = a.shape[1 - axisout] 

4188 n_obs = a.shape[axisout] 

4189 if n_obs <= 1: 

4190 # Handle empty arrays or single observations. 

4191 return SpearmanrResult(np.nan, np.nan) 

4192 

4193 if axisout == 0: 

4194 if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all(): 

4195 # If an input is constant, the correlation coefficient is not defined. 

4196 warnings.warn(SpearmanRConstantInputWarning()) 

4197 return SpearmanrResult(np.nan, np.nan) 

4198 else: # axisout == 1, since `a` is at most 2-D here

4199 if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all(): 

4200 # If an input is constant, the correlation coefficient is not defined. 

4201 warnings.warn(SpearmanRConstantInputWarning()) 

4202 return SpearmanrResult(np.nan, np.nan) 

4203 

4204 a_contains_nan, nan_policy = _contains_nan(a, nan_policy) 

4205 variable_has_nan = np.zeros(n_vars, dtype=bool) 

4206 if a_contains_nan: 

4207 if nan_policy == 'omit': 

4208 return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy) 

4209 elif nan_policy == 'propagate': 

4210 if a.ndim == 1 or n_vars <= 2: 

4211 return SpearmanrResult(np.nan, np.nan) 

4212 else: 

4213 # Keep track of variables with NaNs, set the outputs to NaN 

4214 # only for those variables 

4215 variable_has_nan = np.isnan(a).any(axis=axisout)

4216 

4217 a_ranked = np.apply_along_axis(rankdata, axisout, a) 

4218 rs = np.corrcoef(a_ranked, rowvar=axisout) 

4219 dof = n_obs - 2 # degrees of freedom 

4220 

4221 # rs can have elements equal to 1, so avoid zero division warnings 

4222 with np.errstate(divide='ignore'): 

4223 # clip the small negative values possibly caused by rounding 

4224 # errors before taking the square root 

4225 t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0)) 

4226 

4227 prob = 2 * distributions.t.sf(np.abs(t), dof) 

4228 

4229 # For backwards compatibility, return scalars when comparing 2 columns 

4230 if rs.shape == (2, 2): 

4231 return SpearmanrResult(rs[1, 0], prob[1, 0]) 

4232 else: 

4233 rs[variable_has_nan, :] = np.nan 

4234 rs[:, variable_has_nan] = np.nan 

4235 return SpearmanrResult(rs, prob) 
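
# A hedged cross-check (not part of the original source): for 1-d inputs the
# Spearman statistic computed above equals the Pearson correlation of the
# rank-transformed data.
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> x = [1, 2, 3, 4, 5]
# >>> y = [5, 6, 7, 8, 7]
# >>> rho, _ = stats.spearmanr(x, y)
# >>> r, _ = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))
# >>> bool(np.isclose(rho, r))
# True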

4236 

4237 

4238PointbiserialrResult = namedtuple('PointbiserialrResult', 

4239 ('correlation', 'pvalue')) 

4240 

4241 

4242def pointbiserialr(x, y): 

4243 r""" 

4244 Calculate a point biserial correlation coefficient and its p-value. 

4245 

4246 The point biserial correlation is used to measure the relationship 

4247 between a binary variable, x, and a continuous variable, y. Like other 

4248 correlation coefficients, this one varies between -1 and +1 with 0 

4249 implying no correlation. Correlations of -1 or +1 imply a determinative 

4250 relationship. 

4251 

4252 This function uses a shortcut formula but produces the same result as 

4253 `pearsonr`. 

4254 

4255 Parameters 

4256 ---------- 

4257 x : array_like of bools 

4258 Input array. 

4259 y : array_like 

4260 Input array. 

4261 

4262 Returns 

4263 ------- 

4264 correlation : float 

4265 R value. 

4266 pvalue : float 

4267 Two-sided p-value. 

4268 

4269 Notes 

4270 ----- 

4271 `pointbiserialr` uses a t-test with ``n-1`` degrees of freedom. 

4272 It is equivalent to `pearsonr`.

4273 

4274 The value of the point-biserial correlation can be calculated from: 

4275 

4276 .. math:: 

4277 

4278 r_{pb} = \frac{\overline{Y_{1}} - 

4279 \overline{Y_{0}}}{s_{y}}\sqrt{\frac{N_{0} N_{1}}{N (N - 1)}}

4280 

4281 Where :math:`Y_{0}` and :math:`Y_{1}` are means of the metric 

4282 observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}` 

4283 are number of observations coded 0 and 1 respectively; :math:`N` is the 

4284 total number of observations and :math:`s_{y}` is the standard 

4285 deviation of all the metric observations. 

4286 

4287 A value of :math:`r_{pb}` that is significantly different from zero is 

4288 completely equivalent to a significant difference in means between the two 

4289 groups. Thus, an independent groups t Test with :math:`N-2` degrees of 

4290 freedom may be used to test whether :math:`r_{pb}` is nonzero. The 

4291 relation between the t-statistic for comparing two independent groups and 

4292 :math:`r_{pb}` is given by: 

4293 

4294 .. math:: 

4295 

4296 t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}} 

4297 

4298 References 

4299 ---------- 

4300 .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math. 

4301 Statist., Vol. 20, no.1, pp. 125-126, 1949. 

4302 

4303 .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous 

4304 Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25, 

4305 no. 3, pp. 603-607, 1954.

4306 

4307 .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef: 

4308 Statistics Reference Online (eds N. Balakrishnan, et al.), 2014. 

4309 https://doi.org/10.1002/9781118445112.stat06227 

4310 

4311 Examples 

4312 -------- 

4313 >>> from scipy import stats 

4314 >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) 

4315 >>> b = np.arange(7) 

4316 >>> stats.pointbiserialr(a, b) 

4317 (0.8660254037844386, 0.011724811003954652) 

4318 >>> stats.pearsonr(a, b) 

4319 (0.86602540378443871, 0.011724811003954626) 

4320 >>> np.corrcoef(a, b) 

4321 array([[ 1. , 0.8660254], 

4322 [ 0.8660254, 1. ]]) 

4323 

4324 """ 

4325 rpb, prob = pearsonr(x, y) 

4326 return PointbiserialrResult(rpb, prob) 
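
# A hedged worked check (not part of the original source) of the formula in
# the docstring, using the docstring example and the sample (ddof=1) standard
# deviation for s_y.
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
# >>> b = np.arange(7)
# >>> r_pb = (b[a == 1].mean() - b[a == 0].mean()) / b.std(ddof=1) * np.sqrt(
# ...     (a == 0).sum() * (a == 1).sum() / (len(b) * (len(b) - 1)))
# >>> bool(np.isclose(r_pb, stats.pointbiserialr(a, b)[0]))
# True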

4327 

4328 

4329KendalltauResult = namedtuple('KendalltauResult', ('correlation', 'pvalue')) 

4330 

4331 

4332def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto'): 

4333 """ 

4334 Calculate Kendall's tau, a correlation measure for ordinal data. 

4335 

4336 Kendall's tau is a measure of the correspondence between two rankings. 

4337 Values close to 1 indicate strong agreement, values close to -1 indicate 

4338 strong disagreement. This is the 1945 "tau-b" version of Kendall's 

4339 tau [2]_, which can account for ties and which reduces to the 1938 "tau-a" 

4340 version [1]_ in absence of ties. 

4341 

4342 Parameters 

4343 ---------- 

4344 x, y : array_like 

4345 Arrays of rankings, of the same shape. If arrays are not 1-D, they will 

4346 be flattened to 1-D. 

4347 initial_lexsort : bool, optional 

4348 Unused (deprecated). 

4349 nan_policy : {'propagate', 'raise', 'omit'}, optional 

4350 Defines how to handle when input contains nan. 

4351 The following options are available (default is 'propagate'): 

4352 

4353 * 'propagate': returns nan 

4354 * 'raise': throws an error 

4355 * 'omit': performs the calculations ignoring nan values 

4356 method : {'auto', 'asymptotic', 'exact'}, optional 

4357 Defines which method is used to calculate the p-value [5]_. 

4358 The following options are available (default is 'auto'): 

4359 

4360 * 'auto': selects the appropriate method based on a trade-off between 

4361 speed and accuracy 

4362 * 'asymptotic': uses a normal approximation valid for large samples 

4363 * 'exact': computes the exact p-value, but can only be used if no ties 

4364 are present 

4365 

4366 Returns 

4367 ------- 

4368 correlation : float 

4369 The tau statistic. 

4370 pvalue : float 

4371 The two-sided p-value for a hypothesis test whose null hypothesis is 

4372 an absence of association, tau = 0. 

4373 

4374 See Also 

4375 -------- 

4376 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

4377 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y). 

4378 weightedtau : Computes a weighted version of Kendall's tau. 

4379 

4380 Notes 

4381 ----- 

4382 The definition of Kendall's tau that is used is [2]_:: 

4383 

4384 tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) 

4385 

4386 where P is the number of concordant pairs, Q the number of discordant 

4387 pairs, T the number of ties only in `x`, and U the number of ties only in 

4388 `y`. If a tie occurs for the same pair in both `x` and `y`, it is not 

4389 added to either T or U. 

4390 

4391 References 

4392 ---------- 

4393 .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika 

4394 Vol. 30, No. 1/2, pp. 81-93, 1938. 

4395 .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems", 

4396 Biometrika Vol. 33, No. 3, pp. 239-251. 1945. 

4397 .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John 

4398 Wiley & Sons, 1967. 

4399 .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency 

4400 tables", Software: Practice and Experience, Vol. 24, No. 3, 

4401 pp. 327-336, 1994. 

4402 .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), 

4403 Charles Griffin & Co., 1970. 

4404 

4405 Examples 

4406 -------- 

4407 >>> from scipy import stats 

4408 >>> x1 = [12, 2, 1, 12, 2] 

4409 >>> x2 = [1, 4, 7, 1, 0] 

4410 >>> tau, p_value = stats.kendalltau(x1, x2) 

4411 >>> tau 

4412 -0.47140452079103173 

4413 >>> p_value 

4414 0.2827454599327748 

4415 

4416 """ 

4417 x = np.asarray(x).ravel() 

4418 y = np.asarray(y).ravel() 

4419 

4420 if x.size != y.size: 

4421 raise ValueError("All inputs to `kendalltau` must be of the same size, " 

4422 "found x-size %s and y-size %s" % (x.size, y.size)) 

4423 elif not x.size or not y.size: 

4424 return KendalltauResult(np.nan, np.nan) # Return NaN if arrays are empty 

4425 

4426 # check both x and y 

4427 cnx, npx = _contains_nan(x, nan_policy) 

4428 cny, npy = _contains_nan(y, nan_policy) 

4429 contains_nan = cnx or cny 

4430 if npx == 'omit' or npy == 'omit': 

4431 nan_policy = 'omit' 

4432 

4433 if contains_nan and nan_policy == 'propagate': 

4434 return KendalltauResult(np.nan, np.nan) 

4435 

4436 elif contains_nan and nan_policy == 'omit': 

4437 x = ma.masked_invalid(x) 

4438 y = ma.masked_invalid(y) 

4439 return mstats_basic.kendalltau(x, y, method=method) 

4440 

4441 if initial_lexsort is not None: # deprecate to drop! 

4442 warnings.warn('"initial_lexsort" is gone!') 

4443 

4444 def count_rank_tie(ranks): 

4445 cnt = np.bincount(ranks).astype('int64', copy=False) 

4446 cnt = cnt[cnt > 1] 

4447 return ((cnt * (cnt - 1) // 2).sum(), 

4448 (cnt * (cnt - 1.) * (cnt - 2)).sum(), 

4449 (cnt * (cnt - 1.) * (2*cnt + 5)).sum()) 

4450 

4451 size = x.size 

4452 perm = np.argsort(y) # sort on y and convert y to dense ranks 

4453 x, y = x[perm], y[perm] 

4454 y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp) 

4455 

4456 # stable sort on x and convert x to dense ranks 

4457 perm = np.argsort(x, kind='mergesort') 

4458 x, y = x[perm], y[perm] 

4459 x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp) 

4460 

4461 dis = _kendall_dis(x, y) # discordant pairs 

4462 

4463 obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True] 

4464 cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False) 

4465 

4466 ntie = (cnt * (cnt - 1) // 2).sum() # joint ties 

4467 xtie, x0, x1 = count_rank_tie(x) # ties in x, stats 

4468 ytie, y0, y1 = count_rank_tie(y) # ties in y, stats 

4469 

4470 tot = (size * (size - 1)) // 2 

4471 

4472 if xtie == tot or ytie == tot: 

4473 return KendalltauResult(np.nan, np.nan) 

4474 

4475 # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie 

4476 # = con + dis + xtie + ytie - ntie 

4477 con_minus_dis = tot - xtie - ytie + ntie - 2 * dis 

4478 tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie) 

4479 # Limit range to fix computational errors 

4480 tau = min(1., max(-1., tau)) 

4481 

4482 if method == 'exact' and (xtie != 0 or ytie != 0): 

4483 raise ValueError("Ties found, exact method cannot be used.") 

4484 

4485 if method == 'auto': 

4486 if (xtie == 0 and ytie == 0) and (size <= 33 or min(dis, tot-dis) <= 1): 

4487 method = 'exact' 

4488 else: 

4489 method = 'asymptotic' 

4490 

4491 if xtie == 0 and ytie == 0 and method == 'exact': 

4492 # Exact p-value, see p. 68 of Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), Charles Griffin & Co., 1970. 

4493 c = min(dis, tot-dis) 

4494 if size <= 0: 

4495 raise ValueError 

4496 elif c < 0 or 2*c > size*(size-1): 

4497 raise ValueError 

4498 elif size == 1: 

4499 pvalue = 1.0 

4500 elif size == 2: 

4501 pvalue = 1.0 

4502 elif c == 0: 

4503 pvalue = 2.0/math.factorial(size) if size < 171 else 0.0 

4504 elif c == 1: 

4505 pvalue = 2.0/math.factorial(size-1) if (size-1) < 171 else 0.0 

4506 elif 2*c == tot: 

4507 pvalue = 1.0 

4508 else: 

4509 new = [0.0]*(c+1) 

4510 new[0] = 1.0 

4511 new[1] = 1.0 

4512 for j in range(3,size+1): 

4513 old = new[:] 

4514 for k in range(1,min(j,c+1)): 

4515 new[k] += new[k-1] 

4516 for k in range(j,c+1): 

4517 new[k] += new[k-1] - old[k-j] 

4518 

4519 pvalue = 2.0*sum(new)/math.factorial(size) if size < 171 else 0.0 

4520 

4521 elif method == 'asymptotic': 

4522 # con_minus_dis is approx normally distributed with this variance [3]_ 

4523 var = (size * (size - 1) * (2.*size + 5) - x1 - y1) / 18. + ( 

4524 2. * xtie * ytie) / (size * (size - 1)) + x0 * y0 / (9. * 

4525 size * (size - 1) * (size - 2)) 

4526 pvalue = special.erfc(np.abs(con_minus_dis) / np.sqrt(var) / np.sqrt(2)) 

4527 else: 

4528 raise ValueError("Unknown method "+str(method)+" specified, please use auto, exact or asymptotic.") 

4529 

4530 return KendalltauResult(tau, pvalue) 
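
# A brute-force reference sketch (not part of the original source; the helper
# name `_kendall_tau_b_bruteforce` is hypothetical): a direct O(n**2)
# implementation of the tau-b formula quoted in the docstring,
# tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)), useful only for
# sanity-checking the fast mergesort-based path above on small inputs.
def _kendall_tau_b_bruteforce(x, y):
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    P = Q = T = U = 0
    n = x.size
    for i in range(n):
        for j in range(i + 1, n):
            dx = x[i] - x[j]
            dy = y[i] - y[j]
            if dx == 0 and dy == 0:
                continue        # tied in both x and y: counted in neither T nor U
            elif dx == 0:
                T += 1          # tie only in x
            elif dy == 0:
                U += 1          # tie only in y
            elif dx * dy > 0:
                P += 1          # concordant pair
            else:
                Q += 1          # discordant pair
    return (P - Q) / np.sqrt((P + Q + T) * (P + Q + U))
# For the docstring example x1 = [12, 2, 1, 12, 2], x2 = [1, 4, 7, 1, 0] this
# gives P=2, Q=6, T=1, U=0 and tau = -4/sqrt(72) ~= -0.4714, matching the
# value shown above.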

4531 

4532 

4533WeightedTauResult = namedtuple('WeightedTauResult', ('correlation', 'pvalue')) 

4534 

4535 

4536def weightedtau(x, y, rank=True, weigher=None, additive=True): 

4537 r""" 

4538 Compute a weighted version of Kendall's :math:`\tau`. 

4539 

4540 The weighted :math:`\tau` is a weighted version of Kendall's 

4541 :math:`\tau` in which exchanges of high weight are more influential than 

4542 exchanges of low weight. The default parameters compute the additive 

4543 hyperbolic version of the index, :math:`\tau_\mathrm h`, which has 

4544 been shown to provide the best balance between important and 

4545 unimportant elements [1]_. 

4546 

4547 The weighting is defined by means of a rank array, which assigns a 

4548 nonnegative rank to each element, and a weigher function, which 

4549 assigns a weight based on the rank to each element. The weight of an

4550 exchange is then the sum or the product of the weights of the ranks of 

4551 the exchanged elements. The default parameters compute 

4552 :math:`\tau_\mathrm h`: an exchange between elements with rank 

4553 :math:`r` and :math:`s` (starting from zero) has weight 

4554 :math:`1/(r+1) + 1/(s+1)`. 

4555 

4556 Specifying a rank array is meaningful only if you have in mind an 

4557 external criterion of importance. If, as usually happens, you do

4558 not have in mind a specific rank, the weighted :math:`\tau` is 

4559 defined by averaging the values obtained using the decreasing 

4560 lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the 

4561 behavior with default parameters. 

4562 

4563 Note that if you are computing the weighted :math:`\tau` on arrays of 

4564 ranks, rather than of scores (i.e., a larger value implies a lower 

4565 rank) you must negate the ranks, so that elements of higher rank are 

4566 associated with a larger value. 

4567 

4568 Parameters 

4569 ---------- 

4570 x, y : array_like 

4571 Arrays of scores, of the same shape. If arrays are not 1-D, they will 

4572 be flattened to 1-D. 

4573 rank : array_like of ints or bool, optional 

4574 A nonnegative rank assigned to each element. If it is None, the 

4575 decreasing lexicographical rank by (`x`, `y`) will be used: elements of 

4576 higher rank will be those with larger `x`-values, using `y`-values to 

4577 break ties (in particular, swapping `x` and `y` will give a different 

4578 result). If it is False, the element indices will be used 

4579 directly as ranks. The default is True, in which case this 

4580 function returns the average of the values obtained using the 

4581 decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`). 

4582 weigher : callable, optional 

4583 The weigher function. Must map nonnegative integers (zero 

4584 representing the most important element) to a nonnegative weight. 

4585 The default, None, provides hyperbolic weighting, that is,

4586 rank :math:`r` is mapped to weight :math:`1/(r+1)`. 

4587 additive : bool, optional 

4588 If True, the weight of an exchange is computed by adding the 

4589 weights of the ranks of the exchanged elements; otherwise, the weights 

4590 are multiplied. The default is True. 

4591 

4592 Returns 

4593 ------- 

4594 correlation : float 

4595 The weighted :math:`\tau` correlation index. 

4596 pvalue : float 

4597 Presently ``np.nan``, as the null distribution of the statistic is

4598 unknown (even in the additive hyperbolic case).

4599 

4600 See Also 

4601 -------- 

4602 kendalltau : Calculates Kendall's tau. 

4603 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

4604 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y). 

4605 

4606 Notes 

4607 ----- 

4608 This function uses an :math:`O(n \log n)`, mergesort-based algorithm 

4609 [1]_ that is a weighted extension of Knight's algorithm for Kendall's 

4610 :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_ 

4611 between rankings without ties (i.e., permutations) by setting 

4612 `additive` and `rank` to False, as the definition given in [1]_ is a 

4613 generalization of Shieh's. 

4614 

4615 NaNs are considered the smallest possible score. 

4616 

4617 .. versionadded:: 0.19.0 

4618 

4619 References 

4620 ---------- 

4621 .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with 

4622 ties", Proceedings of the 24th international conference on World 

4623 Wide Web, pp. 1166-1176, ACM, 2015. 

4624 .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with 

4625 Ungrouped Data", Journal of the American Statistical Association, 

4626 Vol. 61, No. 314, Part 1, pp. 436-439, 1966. 

4627 .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics & 

4628 Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998. 

4629 

4630 Examples 

4631 -------- 

4632 >>> from scipy import stats 

4633 >>> x = [12, 2, 1, 12, 2] 

4634 >>> y = [1, 4, 7, 1, 0] 

4635 >>> tau, p_value = stats.weightedtau(x, y) 

4636 >>> tau 

4637 -0.56694968153682723 

4638 >>> p_value 

4639 nan 

4640 >>> tau, p_value = stats.weightedtau(x, y, additive=False) 

4641 >>> tau 

4642 -0.62205716951801038 

4643 

4644 NaNs are considered the smallest possible score: 

4645 

4646 >>> x = [12, 2, 1, 12, 2] 

4647 >>> y = [1, 4, 7, 1, np.nan] 

4648 >>> tau, _ = stats.weightedtau(x, y) 

4649 >>> tau 

4650 -0.56694968153682723 

4651 

4652 This is exactly Kendall's tau: 

4653 

4654 >>> x = [12, 2, 1, 12, 2] 

4655 >>> y = [1, 4, 7, 1, 0] 

4656 >>> tau, _ = stats.weightedtau(x, y, weigher=lambda x: 1) 

4657 >>> tau 

4658 -0.47140452079103173 

4659 

4660 >>> x = [12, 2, 1, 12, 2] 

4661 >>> y = [1, 4, 7, 1, 0] 

4662 >>> stats.weightedtau(x, y, rank=None) 

4663 WeightedTauResult(correlation=-0.4157652301037516, pvalue=nan) 

4664 >>> stats.weightedtau(y, x, rank=None) 

4665 WeightedTauResult(correlation=-0.7181341329699028, pvalue=nan) 

4666 

4667 """ 

4668 x = np.asarray(x).ravel() 

4669 y = np.asarray(y).ravel() 

4670 

4671 if x.size != y.size: 

4672 raise ValueError("All inputs to `weightedtau` must be of the same size, " 

4673 "found x-size %s and y-size %s" % (x.size, y.size)) 

4674 if not x.size: 

4675 return WeightedTauResult(np.nan, np.nan) # Return NaN if arrays are empty 

4676 

4677 # If there are NaNs we apply _toint64() 

4678 if np.isnan(np.sum(x)): 

4679 x = _toint64(x) 

4680 if np.isnan(np.sum(y)):

4681 y = _toint64(y) 

4682 

4683 # Reduce unsupported types to int64 ranks

4684 if x.dtype != y.dtype: 

4685 if x.dtype != np.int64: 

4686 x = _toint64(x) 

4687 if y.dtype != np.int64: 

4688 y = _toint64(y) 

4689 else: 

4690 if x.dtype not in (np.int32, np.int64, np.float32, np.float64): 

4691 x = _toint64(x) 

4692 y = _toint64(y) 

4693 

4694 if rank is True: 

4695 return WeightedTauResult(( 

4696 _weightedrankedtau(x, y, None, weigher, additive) + 

4697 _weightedrankedtau(y, x, None, weigher, additive) 

4698 ) / 2, np.nan) 

4699 

4700 if rank is False: 

4701 rank = np.arange(x.size, dtype=np.intp) 

4702 elif rank is not None: 

4703 rank = np.asarray(rank).ravel() 

4704 if rank.size != x.size: 

4705 raise ValueError("All inputs to `weightedtau` must be of the same size, " 

4706 "found x-size %s and rank-size %s" % (x.size, rank.size)) 

4707 

4708 return WeightedTauResult(_weightedrankedtau(x, y, rank, weigher, additive), np.nan) 
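
# A hedged illustration (not part of the original source): passing the
# hyperbolic weigher explicitly reproduces the default documented above
# (rank r is mapped to weight 1/(r + 1)).
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> x = [12, 2, 1, 12, 2]
# >>> y = [1, 4, 7, 1, 0]
# >>> t_default, _ = stats.weightedtau(x, y)
# >>> t_explicit, _ = stats.weightedtau(x, y, weigher=lambda r: 1 / (r + 1))
# >>> bool(np.isclose(t_default, t_explicit))
# True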

4709 

4710 

4711# FROM MGCPY: https://github.com/neurodata/mgcpy 

4712 

4713class _ParallelP(object): 

4714 """ 

4715 Helper function to calculate parallel p-value. 

4716 """ 

4717 def __init__(self, x, y, compute_distance, random_states): 

4718 self.x = x 

4719 self.y = y 

4720 self.compute_distance = compute_distance 

4721 self.random_states = random_states 

4722 

4723 def __call__(self, index): 

4724 permx = self.random_states[index].permutation(self.x) 

4725 permy = self.random_states[index].permutation(self.y) 

4726 

4727 # calculate permuted stats, store in null distribution 

4728 perm_stat = _mgc_stat(permx, permy, self.compute_distance)[0] 

4729 

4730 return perm_stat 

4731 

4732 

4733def _perm_test(x, y, stat, compute_distance, reps=1000, workers=-1, 

4734 random_state=None): 

4735 r""" 

4736 Helper function that calculates the p-value. See below for uses. 

4737 

4738 Parameters 

4739 ---------- 

4740 x, y : ndarray 

4741 `x` and `y` have shapes `(n, p)` and `(n, q)`. 

4742 stat : float 

4743 The sample test statistic. 

4744 compute_distance : callable 

4745 A function that computes the distance or similarity among the samples 

4746 within each data matrix. Set to `None` if `x` and `y` are already 

4747 distance matrices.

4748 reps : int, optional 

4749 The number of replications used to estimate the null when using the 

4750 permutation test. The default is 1000 replications. 

4751 workers : int or map-like callable, optional 

4752 If `workers` is an int the population is subdivided into `workers` 

4753 sections and evaluated in parallel (uses 

4754 `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores 

4755 available to the Process. Alternatively supply a map-like callable, 

4756 such as `multiprocessing.Pool.map` for evaluating the population in 

4757 parallel. This evaluation is carried out as `workers(func, iterable)`. 

4758 Requires that `func` be pickleable. 

4759 random_state : int or np.random.RandomState instance, optional 

4760 If already a RandomState instance, use it. 

4761 If seed is an int, return a new RandomState instance seeded with seed. 

4762 If None, use np.random.RandomState. Default is None. 

4763 

4764 Returns 

4765 ------- 

4766 pvalue : float 

4767 The sample test p-value. 

4768 null_dist : list 

4769 The approximated null distribution. 

4770 """ 

4771 # generate seeds for each rep (change to new parallel random number 

4772 # capabilities in numpy >= 1.17+) 

4773 random_state = check_random_state(random_state) 

4774 random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32, 

4775 size=4, dtype=np.uint32)) for _ in range(reps)] 

4776 

4777 # parallelizes with specified workers over number of reps and set seeds 

4778 mapwrapper = MapWrapper(workers) 

4779 parallelp = _ParallelP(x=x, y=y, compute_distance=compute_distance, 

4780 random_states=random_states) 

4781 null_dist = np.array(list(mapwrapper(parallelp, range(reps)))) 

4782 

4783 # calculate the p-value from the permutation null distribution 

4784 pvalue = (null_dist >= stat).sum() / reps 

4785 

4786 # correct for a p-value of 0. With a finite number of random permutations, 

4787 # an estimated p-value of exactly 0 is not meaningful, so report 1/reps 

4788 if pvalue == 0: 

4789 pvalue = 1 / reps 

4790 

4791 return pvalue, null_dist 

4792 
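# A condensed sketch of the permutation p-value logic in _perm_test above,
# assuming a precomputed observed statistic and a (hypothetical) null
# distribution of permuted statistics.
import numpy as np

observed_stat = 0.8
null_dist_example = np.array([0.10, 0.90, 0.30, 0.20, 0.05])  # stand-in for null_dist
n_reps = len(null_dist_example)
# proportion of permuted statistics at least as extreme as the observed one,
# never reported as exactly 0
p_example = max((null_dist_example >= observed_stat).sum() / n_reps, 1 / n_reps)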

4793 

4794def _euclidean_dist(x): 

4795 return cdist(x, x) 

4796 

4797 

4798MGCResult = namedtuple('MGCResult', ('stat', 'pvalue', 'mgc_dict')) 

4799 

4800 

4801def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000, 

4802 workers=1, is_twosamp=False, random_state=None): 

4803 r""" 

4804 Computes the Multiscale Graph Correlation (MGC) test statistic. 

4805 

4806 Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for 

4807 one property (e.g. cloud density), and the :math:`l`-nearest neighbors for 

4808 the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is 

4809 called the "scale". A priori, however, it is not known which scales will be 

4810 most informative. So, MGC computes all distance pairs, and then efficiently 

4811 computes the distance correlations for all scales. The local correlations 

4812 illustrate which scales are relatively informative about the relationship. 

4813 The key, therefore, to successfully discover and decipher relationships 

4814 between disparate data modalities is to adaptively determine which scales 

4815 are the most informative, and the geometric implication for the most 

4816 informative scales. Doing so not only provides an estimate of whether the 

4817 modalities are related, but also provides insight into how the 

4818 determination was made. This is especially important in high-dimensional 

4819 data, where simple visualizations do not reveal relationships to the 

4820 unaided human eye. Characterizations of this implementation in particular 

4821 have been derived from and benchmarked in [2]_. 

4822 

4823 Parameters 

4824 ---------- 

4825 x, y : ndarray 

4826 If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is 

4827 the number of samples and `p` and `q` are the number of dimensions, 

4828 then the MGC independence test will be run. Alternatively, ``x`` and 

4829 ``y`` can have shapes ``(n, n)`` if they are distance or similarity 

4830 matrices, and ``compute_distance`` must be set to ``None``. If ``x`` 

4831 and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired 

4832 two-sample MGC test will be run. 

4833 compute_distance : callable, optional 

4834 A function that computes the distance or similarity among the samples 

4835 within each data matrix. Set to ``None`` if ``x`` and ``y`` are 

4836 already distance matrices. The default uses the euclidean norm metric. 

4837 If you are calling a custom function, either create the distance 

4838 matrix before-hand or create a function of the form 

4839 ``compute_distance(x)`` where `x` is the data matrix for which 

4840 pairwise distances are calculated. 

4841 reps : int, optional 

4842 The number of replications used to estimate the null when using the 

4843 permutation test. The default is ``1000``. 

4844 workers : int or map-like callable, optional 

4845 If ``workers`` is an int the population is subdivided into ``workers`` 

4846 sections and evaluated in parallel (uses ``multiprocessing.Pool 

4847 <multiprocessing>``). Supply ``-1`` to use all cores available to the 

4848 Process. Alternatively supply a map-like callable, such as 

4849 ``multiprocessing.Pool.map`` for evaluating the p-value in parallel. 

4850 This evaluation is carried out as ``workers(func, iterable)``. 

4851 Requires that `func` be pickleable. The default is ``1``. 

4852 is_twosamp : bool, optional 

4853 If `True`, a two sample test will be run. If ``x`` and ``y`` have 

4854 shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and 

4855 set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes 

4856 ``(n, p)`` and a two sample test is desired. The default is ``False``. 

4857 random_state : int or np.random.RandomState instance, optional 

4858 If already a RandomState instance, use it. 

4859 If seed is an int, return a new RandomState instance seeded with seed. 

4860 If None, use np.random.RandomState. Default is None. 

4861 

4862 Returns 

4863 ------- 

4864 stat : float 

4865 The sample MGC test statistic within `[-1, 1]`. 

4866 pvalue : float 

4867 The p-value obtained via permutation. 

4868 mgc_dict : dict 

4869 Contains additional useful returns containing the following 

4870 keys: 

4871 

4872 - mgc_map : ndarray 

4873 A 2D representation of the latent geometry of the relationship. 

4875 - opt_scale : (int, int) 

4876 The estimated optimal scale as a `(x, y)` pair. 

4877 - null_dist : list 

4878 The null distribution derived from the permuted matrices 

4879 

4880 See Also 

4881 -------- 

4882 pearsonr : Pearson correlation coefficient and p-value for testing 

4883 non-correlation. 

4884 kendalltau : Calculates Kendall's tau. 

4885 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

4886 

4887 Notes 

4888 ----- 

4889 A description of the process of MGC and applications on neuroscience data 

4890 can be found in [1]_. It is performed using the following steps: 

4891 

4892 #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and 

4893 modified to be mean zero columnwise. This results in two 

4894 :math:`n \times n` distance matrices :math:`A` and :math:`B` (the 

4895 centering and unbiased modification) [3]_. 

4896 

4897 #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`, 

4898 

4899 * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs 

4900 are calculated for each property. Here, :math:`G_k (i, j)` indicates 

4901 the :math:`k`-smallest values of the :math:`i`-th row of :math:`A` 

4902 and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of 

4903 the :math:`i`-th row of :math:`B`. 

4904 

4905 * Let :math:`\circ` denote the entry-wise matrix product; then local 

4906 correlations are summed and normalized using the following statistic: 

4907 

4908 .. math:: 

4909 

4910 c^{kl} = \frac{\sum_{ij} A G_k B H_l} 

4911 {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}} 

4912 

4913 #. The MGC test statistic is the smoothed optimal local correlation of 

4914 :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)` 

4915 (which essentially sets all isolated large correlations to 0 and keeps 

4916 connected large correlations the same as before, see [3]_). MGC is, 

4917 

4918 .. math:: 

4919 

4920 MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right) 

4921 \right) 

4922 

4923 The test statistic returns a value between :math:`(-1, 1)` since it is 

4924 normalized. 

4925 

4926 The p-value returned is calculated using a permutation test. This process 

4927 is completed by first randomly permuting :math:`y` to estimate the null 

4928 distribution and then calculating the probability of observing a test 

4929 statistic, under the null, at least as extreme as the observed test 

4930 statistic. 

4931 

4932 MGC requires at least 5 samples to run with reliable results. It can also 

4933 handle high-dimensional data sets. 

4934 

4935 In addition, by manipulating the input data matrices, the two-sample 

4936 testing problem can be reduced to the independence testing problem [4]_. 

4937 Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and 

4938 :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as 

4939 follows: 

4940 

4941 .. math:: 

4942 

4943 X = [U | V] \in \mathcal{R}^{p \times (n + m)} 

4944 

4945 Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)} 

4946 

4947 Then, the MGC statistic can be calculated as normal. This methodology can 

4948 be extended to similar tests such as distance correlation [4]_. 

4949 

4950 .. versionadded:: 1.4.0 

4951 

4952 References 

4953 ---------- 

4954 .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E., 

4955 Maggioni, M., & Shen, C. (2019). Discovering and deciphering 

4956 relationships across disparate data modalities. ELife. 

4957 .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A., 

4958 Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019). 

4959 mgcpy: A Comprehensive High Dimensional Independence Testing Python 

4960 Package. ArXiv:1907.02088 [Cs, Stat]. 

4961 .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance 

4962 correlation to multiscale graph correlation. Journal of the American 

4963 Statistical Association. 

4964 .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of 

4965 Distance and Kernel Methods for Hypothesis Testing. ArXiv:1806.05514 

4966 [Cs, Stat]. 

4967 

4968 Examples 

4969 -------- 

4970 >>> from scipy.stats import multiscale_graphcorr 

4971 >>> x = np.arange(100) 

4972 >>> y = x 

4973 >>> stat, pvalue, _ = multiscale_graphcorr(x, y, workers=-1) 

4974 >>> '%.1f, %.3f' % (stat, pvalue) 

4975 '1.0, 0.001' 

4976 

4977 Alternatively, 

4978 

4979 >>> x = np.arange(100) 

4980 >>> y = x 

4981 >>> mgc = multiscale_graphcorr(x, y) 

4982 >>> '%.1f, %.3f' % (mgc.stat, mgc.pvalue) 

4983 '1.0, 0.001' 

4984 

4985 To run an unpaired two-sample test, 

4986 

4987 >>> x = np.arange(100) 

4988 >>> y = np.arange(79) 

4989 >>> mgc = multiscale_graphcorr(x, y, random_state=1) 

4990 >>> '%.3f, %.2f' % (mgc.stat, mgc.pvalue) 

4991 '0.033, 0.02' 

4992 

4993 or, if shape of the inputs are the same, 

4994 

4995 >>> x = np.arange(100) 

4996 >>> y = x 

4997 >>> mgc = multiscale_graphcorr(x, y, is_twosamp=True) 

4998 >>> '%.3f, %.1f' % (mgc.stat, mgc.pvalue) 

4999 '-0.008, 1.0' 

5000 """ 

5001 if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray): 

5002 raise ValueError("x and y must be ndarrays") 

5003 

5004 # convert arrays of type (n,) to (n, 1) 

5005 if x.ndim == 1: 

5006 x = x[:, np.newaxis] 

5007 elif x.ndim != 2: 

5008 raise ValueError("Expected a 2-D array `x`, found shape " 

5009 "{}".format(x.shape)) 

5010 if y.ndim == 1: 

5011 y = y[:, np.newaxis] 

5012 elif y.ndim != 2: 

5013 raise ValueError("Expected a 2-D array `y`, found shape " 

5014 "{}".format(y.shape)) 

5015 

5016 nx, px = x.shape 

5017 ny, py = y.shape 

5018 

5019 # check for NaNs 

5020 _contains_nan(x, nan_policy='raise') 

5021 _contains_nan(y, nan_policy='raise') 

5022 

5023 # check for positive or negative infinity and raise error 

5024 if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0: 

5025 raise ValueError("Inputs contain infinities") 

5026 

5027 if nx != ny: 

5028 if px == py: 

5029 # reshape x and y for two sample testing 

5030 is_twosamp = True 

5031 else: 

5032 raise ValueError("Shape mismatch, x and y must have shape [n, p] " 

5033 "and [n, q] or have shape [n, p] and [m, p].") 

5034 

5035 if nx < 5 or ny < 5: 

5036 raise ValueError("MGC requires at least 5 samples to give reasonable " 

5037 "results.") 

5038 

5039 # convert x and y to float 

5040 x = x.astype(np.float64) 

5041 y = y.astype(np.float64) 

5042 

5043 # check if compute_distance is a callable 

5044 if not callable(compute_distance) and compute_distance is not None: 

5045 raise ValueError("Compute_distance must be a function.") 

5046 

5047 # check that the number of reps is an integer greater than 0 (a warning is 

5048 # raised if it is under 1000) 

5049 if not isinstance(reps, int) or reps < 0: 

5050 raise ValueError("Number of reps must be an integer greater than 0.") 

5051 elif reps < 1000: 

5052 msg = ("The number of replications is low (under 1000), and p-value " 

5053 "calculations may be unreliable. Use the p-value result, with " 

5054 "caution!") 

5055 warnings.warn(msg, RuntimeWarning) 

5056 

5057 if is_twosamp: 

5058 x, y = _two_sample_transform(x, y) 

5059 

5060 # calculate MGC stat 

5061 stat, stat_dict = _mgc_stat(x, y, compute_distance) 

5062 stat_mgc_map = stat_dict["stat_mgc_map"] 

5063 opt_scale = stat_dict["opt_scale"] 

5064 

5065 # calculate permutation MGC p-value 

5066 pvalue, null_dist = _perm_test(x, y, stat, compute_distance, reps=reps, 

5067 workers=workers, random_state=random_state) 

5068 

5069 # save all stats (other than stat/p-value) in dictionary 

5070 mgc_dict = {"mgc_map": stat_mgc_map, 

5071 "opt_scale": opt_scale, 

5072 "null_dist": null_dist} 

5073 

5074 return MGCResult(stat, pvalue, mgc_dict) 

5075 

5076 

5077def _mgc_stat(x, y, compute_distance): 

5078 r""" 

5079 Helper function that calculates the MGC stat. See above for use. 

5080 

5081 Parameters 

5082 ---------- 

5083 x, y : ndarray 

5084 `x` and `y` have shapes `(n, p)` and `(n, q)` or `(n, n)` and `(n, n)` 

5085 if distance matrices. 

5086 compute_distance : callable 

5087 A function that computes the distance or similarity among the samples 

5088 within each data matrix. Set to `None` if `x` and `y` are already 

5089 distance matrices. 

5090 

5091 Returns 

5092 ------- 

5093 stat : float 

5094 The sample MGC test statistic within `[-1, 1]`. 

5095 stat_dict : dict 

5096 Contains additional useful returns containing the following 

5097 keys: 

5098 - stat_mgc_map : ndarray 

5099 MGC-map of the statistics. 

5100 - opt_scale : (float, float) 

5101 The estimated optimal scale as a `(x, y)` pair. 

5102 """ 

5103 # set distx and disty to x and y when compute_distance = None 

5104 distx = x 

5105 disty = y 

5106 

5107 if compute_distance is not None: 

5108 # compute distance matrices for x and y 

5109 distx = compute_distance(x) 

5110 disty = compute_distance(y) 

5111 

5112 # calculate MGC map and optimal scale 

5113 stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc') 

5114 

5115 n, m = stat_mgc_map.shape 

5116 if m == 1 or n == 1: 

5117 # the global scale is the statistic calculated at maximal nearest 

5118 # neighbors. There is not enough local scale to search over, so 

5119 # default to global scale 

5120 stat = stat_mgc_map[n - 1][m - 1] 

5121 opt_scale = m * n 

5122 else: 

5123 samp_size = len(distx) - 1 

5124 

5125 # threshold to find connected region of significant local correlations 

5126 sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size) 

5127 

5128 # maximum within the significant region 

5129 stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map) 

5130 

5131 stat_dict = {"stat_mgc_map": stat_mgc_map, 

5132 "opt_scale": opt_scale} 

5133 

5134 return stat, stat_dict 

5135 

5136 

5137def _threshold_mgc_map(stat_mgc_map, samp_size): 

5138 r""" 

5139 Finds a connected region of significance in the MGC-map by thresholding. 

5140 

5141 Parameters 

5142 ---------- 

5143 stat_mgc_map : ndarray 

5144 All local correlations within `[-1,1]`. 

5145 samp_size : int 

5146 The sample size of original data. 

5147 

5148 Returns 

5149 ------- 

5150 sig_connect : ndarray 

5151 A binary matrix with 1's indicating the significant region. 

5152 """ 

5153 m, n = stat_mgc_map.shape 

5154 

5155 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05 

5156 # with varying levels of performance. Threshold is based on a beta 

5157 # approximation. 

5158 per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant 

5159 threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation 

5160 threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1 

5161 

5162 # the global scale is the statistic calculated at maximal nearest 

5163 # neighbors. The threshold is the maximum over the global and local scales 

5164 threshold = max(threshold, stat_mgc_map[m - 1][n - 1]) 

5165 

5166 # find the largest connected component of significant correlations 

5167 sig_connect = stat_mgc_map > threshold 

5168 if np.sum(sig_connect) > 0: 

5169 sig_connect, _ = measurements.label(sig_connect) 

5170 _, label_counts = np.unique(sig_connect, return_counts=True) 

5171 

5172 # skip the first element in label_counts, as it is count(zeros) 

5173 max_label = np.argmax(label_counts[1:]) + 1 

5174 sig_connect = sig_connect == max_label 

5175 else: 

5176 sig_connect = np.array([[False]]) 

5177 

5178 return sig_connect 

5179 
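# A worked sketch of the beta-approximation threshold used in
# _threshold_mgc_map above, for a hypothetical sample size; it simply repeats
# the formula already applied in that function.
from scipy.stats import distributions as example_distributions

example_samp_size = 20
example_per_sig = 1 - (0.02 / example_samp_size)                # significance percentile
example_shape = example_samp_size * (example_samp_size - 3) / 4 - 1 / 2
example_threshold = (example_distributions.beta.ppf(example_per_sig,
                                                    example_shape,
                                                    example_shape) * 2 - 1)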

5180 

5181def _smooth_mgc_map(sig_connect, stat_mgc_map): 

5182 """ 

5183 Finds the smoothed maximal within the significant region R. 

5184 

5185 If area of R is too small it returns the last local correlation. Otherwise, 

5186 returns the maximum within significant_connected_region. 

5187 

5188 Parameters 

5189 ---------- 

5190 sig_connect: ndarray 

5191 A binary matrix with 1's indicating the significant region. 

5192 stat_mgc_map: ndarray 

5193 All local correlations within `[-1, 1]`. 

5194 

5195 Returns 

5196 ------- 

5197 stat : float 

5198 The sample MGC statistic within `[-1, 1]`. 

5199 opt_scale: (float, float) 

5200 The estimated optimal scale as an `(x, y)` pair. 

5201 """ 

5202 

5203 m, n = stat_mgc_map.shape 

5204 

5205 # the global scale is the statistic calculated at maximal nearest 

5206 # neighbors. By default, statistic and optimal scale are global. 

5207 stat = stat_mgc_map[m - 1][n - 1] 

5208 opt_scale = [m, n] 

5209 

5210 if np.linalg.norm(sig_connect) != 0: 

5211 # proceed only when the connected region's area is sufficiently large 

5212 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05 

5213 # with varying levels of performance 

5214 if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n): 

5215 max_corr = max(stat_mgc_map[sig_connect]) 

5216 

5217 # find all scales within significant_connected_region that maximize 

5218 # the local correlation 

5219 max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect) 

5220 

5221 if max_corr >= stat: 

5222 stat = max_corr 

5223 

5224 k, l = max_corr_index 

5225 one_d_indices = k * n + l # 2D to 1D indexing 

5226 k = np.max(one_d_indices) // n 

5227 l = np.max(one_d_indices) % n 

5228 opt_scale = [k+1, l+1] # adding 1s to match R indexing 

5229 

5230 return stat, opt_scale 

5231 

5232 

5233def _two_sample_transform(u, v): 

5234 """ 

5235 Helper function that concatenates `u` and `v` for the two-sample MGC stat. See 

5236 above for use. 

5237 

5238 Parameters 

5239 ---------- 

5240 u, v : ndarray 

5241 `u` and `v` have shapes `(n, p)` and `(m, p)`. 

5242 

5243 Returns 

5244 ------- 

5245 x : ndarray 

5246 Concatenation of `u` and `v` along `axis=0`. `x` thus has shape 

5247 `(n + m, p)`. 

5248 y : ndarray 

5249 Label vector for `x` where 0 refers to samples that come from `u` and 

5250 1 refers to samples that come from `v`. `y` thus has shape `(n + m, 1)`. 

5251 """ 

5252 nx = u.shape[0] 

5253 ny = v.shape[0] 

5254 x = np.concatenate([u, v], axis=0) 

5255 y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1) 

5256 return x, y 

5257 
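# A small sketch of the reduction performed by _two_sample_transform above:
# the two groups are stacked along axis 0 and labelled 0/1, turning the
# two-sample problem into an independence test. Shapes here are illustrative.
import numpy as np

group1 = np.zeros((5, 3))   # hypothetical group of 5 samples with 3 features
group2 = np.ones((7, 3))    # hypothetical group of 7 samples with 3 features
stacked = np.concatenate([group1, group2], axis=0)                         # shape (12, 3)
labels = np.concatenate([np.zeros(5), np.ones(7)], axis=0).reshape(-1, 1)  # shape (12, 1)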

5258 

5259##################################### 

5260# INFERENTIAL STATISTICS # 

5261##################################### 

5262 

5263Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue')) 

5264 

5265 

5266def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'): 

5267 """ 

5268 Calculate the T-test for the mean of ONE group of scores. 

5269 

5270 This is a two-sided test for the null hypothesis that the expected value 

5271 (mean) of a sample of independent observations `a` is equal to the given 

5272 population mean, `popmean`. 

5273 

5274 Parameters 

5275 ---------- 

5276 a : array_like 

5277 Sample observation. 

5278 popmean : float or array_like 

5279 Expected value in null hypothesis. If array_like, then it must have the 

5280 same shape as `a` excluding the axis dimension. 

5281 axis : int or None, optional 

5282 Axis along which to compute test. If None, compute over the whole 

5283 array `a`. 

5284 nan_policy : {'propagate', 'raise', 'omit'}, optional 

5285 Defines how to handle when input contains nan. 

5286 The following options are available (default is 'propagate'): 

5287 

5288 * 'propagate': returns nan 

5289 * 'raise': throws an error 

5290 * 'omit': performs the calculations ignoring nan values 

5291 

5292 Returns 

5293 ------- 

5294 statistic : float or array 

5295 t-statistic. 

5296 pvalue : float or array 

5297 Two-sided p-value. 

5298 

5299 Examples 

5300 -------- 

5301 >>> from scipy import stats 

5302 

5303 >>> np.random.seed(7654567) # fix seed to get the same result 

5304 >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2)) 

5305 

5306 Test if mean of random sample is equal to true mean, and different mean. 

5307 We reject the null hypothesis in the second case and don't reject it in 

5308 the first case. 

5309 

5310 >>> stats.ttest_1samp(rvs,5.0) 

5311 (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674])) 

5312 >>> stats.ttest_1samp(rvs,0.0) 

5313 (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999])) 

5314 

5315 Examples using axis and non-scalar dimension for population mean. 

5316 

5317 >>> stats.ttest_1samp(rvs,[5.0,0.0]) 

5318 (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) 

5319 >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1) 

5320 (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) 

5321 >>> stats.ttest_1samp(rvs,[[5.0],[0.0]]) 

5322 (array([[-0.68014479, -0.04323899], 

5323 [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01], 

5324 [ 7.89094663e-03, 1.49986458e-04]])) 

5325 

5326 """ 

5327 a, axis = _chk_asarray(a, axis) 

5328 

5329 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

5330 

5331 if contains_nan and nan_policy == 'omit': 

5332 a = ma.masked_invalid(a) 

5333 return mstats_basic.ttest_1samp(a, popmean, axis) 

5334 

5335 n = a.shape[axis] 

5336 df = n - 1 

5337 

5338 d = np.mean(a, axis) - popmean 

5339 v = np.var(a, axis, ddof=1) 

5340 denom = np.sqrt(v / n) 

5341 

5342 with np.errstate(divide='ignore', invalid='ignore'): 

5343 t = np.divide(d, denom) 

5344 t, prob = _ttest_finish(df, t) 

5345 

5346 return Ttest_1sampResult(t, prob) 

5347 
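# A hand computation mirroring the body of ttest_1samp above for a small
# hypothetical sample; stats.ttest_1samp on the same data should agree.
import numpy as np
from scipy.stats import distributions as example_distributions

example_sample = np.array([2.1, 2.5, 1.9, 2.8, 2.4])
example_popmean = 2.0
example_n = example_sample.size
example_t = ((example_sample.mean() - example_popmean)
             / np.sqrt(example_sample.var(ddof=1) / example_n))
example_p = 2 * example_distributions.t.sf(abs(example_t), example_n - 1)  # two-sided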

5348 

5349def _ttest_finish(df, t): 

5350 """Common code between all 3 t-test functions.""" 

5351 prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail 

5352 if t.ndim == 0: 

5353 t = t[()] 

5354 

5355 return t, prob 

5356 

5357 

5358def _ttest_ind_from_stats(mean1, mean2, denom, df): 

5359 

5360 d = mean1 - mean2 

5361 with np.errstate(divide='ignore', invalid='ignore'): 

5362 t = np.divide(d, denom) 

5363 t, prob = _ttest_finish(df, t) 

5364 

5365 return (t, prob) 

5366 

5367 

5368def _unequal_var_ttest_denom(v1, n1, v2, n2): 

5369 vn1 = v1 / n1 

5370 vn2 = v2 / n2 

5371 with np.errstate(divide='ignore', invalid='ignore'): 

5372 df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1)) 

5373 

5374 # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). 

5375 # Hence it doesn't matter what df is as long as it's not NaN. 

5376 df = np.where(np.isnan(df), 1, df) 

5377 denom = np.sqrt(vn1 + vn2) 

5378 return df, denom 

5379 

5380 

5381def _equal_var_ttest_denom(v1, n1, v2, n2): 

5382 df = n1 + n2 - 2.0 

5383 svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df 

5384 denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) 

5385 return df, denom 

5386 
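# A sketch contrasting the two denominators defined above, using the summary
# statistics from the ttest_ind_from_stats example further below (variance and
# sample size of each group).
import numpy as np

v1_example, n1_example = 87.5, 13
v2_example, n2_example = 39.0, 11

# pooled (equal-variance) form
df_pooled = n1_example + n2_example - 2.0
pooled_var = ((n1_example - 1) * v1_example + (n2_example - 1) * v2_example) / df_pooled
denom_pooled = np.sqrt(pooled_var * (1.0 / n1_example + 1.0 / n2_example))

# Welch form: degrees of freedom from the Welch-Satterthwaite equation
vn1, vn2 = v1_example / n1_example, v2_example / n2_example
df_welch = (vn1 + vn2) ** 2 / (vn1 ** 2 / (n1_example - 1) + vn2 ** 2 / (n2_example - 1))
denom_welch = np.sqrt(vn1 + vn2)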

5387 

5388Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue')) 

5389 

5390 

5391def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, 

5392 equal_var=True): 

5393 r""" 

5394 T-test for means of two independent samples from descriptive statistics. 

5395 

5396 This is a two-sided test for the null hypothesis that two independent 

5397 samples have identical average (expected) values. 

5398 

5399 Parameters 

5400 ---------- 

5401 mean1 : array_like 

5402 The mean(s) of sample 1. 

5403 std1 : array_like 

5404 The standard deviation(s) of sample 1. 

5405 nobs1 : array_like 

5406 The number(s) of observations of sample 1. 

5407 mean2 : array_like 

5408 The mean(s) of sample 2. 

5409 std2 : array_like 

5410 The standard deviations(s) of sample 2. 

5411 nobs2 : array_like 

5412 The number(s) of observations of sample 2. 

5413 equal_var : bool, optional 

5414 If True (default), perform a standard independent 2 sample test 

5415 that assumes equal population variances [1]_. 

5416 If False, perform Welch's t-test, which does not assume equal 

5417 population variance [2]_. 

5418 

5419 Returns 

5420 ------- 

5421 statistic : float or array 

5422 The calculated t-statistics. 

5423 pvalue : float or array 

5424 The two-tailed p-value. 

5425 

5426 See Also 

5427 -------- 

5428 scipy.stats.ttest_ind 

5429 

5430 Notes 

5431 ----- 

5432 .. versionadded:: 0.16.0 

5433 

5434 References 

5435 ---------- 

5436 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test 

5437 

5438 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test 

5439 

5440 Examples 

5441 -------- 

5442 Suppose we have the summary data for two samples, as follows:: 

5443 

5444 Sample Sample 

5445 Size Mean Variance 

5446 Sample 1 13 15.0 87.5 

5447 Sample 2 11 12.0 39.0 

5448 

5449 Apply the t-test to this data (with the assumption that the population 

5450 variances are equal): 

5451 

5452 >>> from scipy.stats import ttest_ind_from_stats 

5453 >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13, 

5454 ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11) 

5455 Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487) 

5456 

5457 For comparison, here is the data from which those summary statistics 

5458 were taken. With this data, we can compute the same result using 

5459 `scipy.stats.ttest_ind`: 

5460 

5461 >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26]) 

5462 >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21]) 

5463 >>> from scipy.stats import ttest_ind 

5464 >>> ttest_ind(a, b) 

5465 Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486) 

5466 

5467 Suppose we instead have binary data and would like to apply a t-test to 

5468 compare the proportion of 1s in two independent groups:: 

5469 

5470 Number of Sample Sample 

5471 Size ones Mean Variance 

5472 Sample 1 150 30 0.2 0.16 

5473 Sample 2 200 45 0.225 0.174375 

5474 

5475 The sample mean :math:`\hat{p}` is the proportion of ones in the sample 

5476 and the variance for a binary observation is estimated by 

5477 :math:`\hat{p}(1-\hat{p})`. 

5478 

5479 >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.16), nobs1=150, 

5480 ... mean2=0.225, std2=np.sqrt(0.17437), nobs2=200) 

5481 Ttest_indResult(statistic=-0.564327545549774, pvalue=0.5728947691244874) 

5482 

5483 For comparison, we could compute the t statistic and p-value using 

5484 arrays of 0s and 1s and `scipy.stat.ttest_ind`, as above. 

5485 

5486 >>> group1 = np.array([1]*30 + [0]*(150-30)) 

5487 >>> group2 = np.array([1]*45 + [0]*(200-45)) 

5488 >>> ttest_ind(group1, group2) 

5489 Ttest_indResult(statistic=-0.5627179589855622, pvalue=0.573989277115258) 

5490 

5491 """ 

5492 if equal_var: 

5493 df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) 

5494 else: 

5495 df, denom = _unequal_var_ttest_denom(std1**2, nobs1, 

5496 std2**2, nobs2) 

5497 

5498 res = _ttest_ind_from_stats(mean1, mean2, denom, df) 

5499 return Ttest_indResult(*res) 

5500 

5501 

5502def _ttest_nans(a, b, axis, namedtuple_type): 

5503 """ 

5504 Generate an array of `nan`, with shape determined by `a`, `b` and `axis`. 

5505 

5506 This function is used by ttest_ind and ttest_rel to create the return 

5507 value when one of the inputs has size 0. 

5508 

5509 The shapes of the arrays are determined by dropping `axis` from the 

5510 shapes of `a` and `b` and broadcasting what is left. 

5511 

5512 The return value is a named tuple of the type given in `namedtuple_type`. 

5513 

5514 Examples 

5515 -------- 

5516 >>> a = np.zeros((9, 2)) 

5517 >>> b = np.zeros((5, 1)) 

5518 >>> _ttest_nans(a, b, 0, Ttest_indResult) 

5519 Ttest_indResult(statistic=array([nan, nan]), pvalue=array([nan, nan])) 

5520 

5521 >>> a = np.zeros((3, 0, 9)) 

5522 >>> b = np.zeros((1, 10)) 

5523 >>> stat, p = _ttest_nans(a, b, -1, Ttest_indResult) 

5524 >>> stat 

5525 array([], shape=(3, 0), dtype=float64) 

5526 >>> p 

5527 array([], shape=(3, 0), dtype=float64) 

5528 

5529 >>> a = np.zeros(10) 

5530 >>> b = np.zeros(7) 

5531 >>> _ttest_nans(a, b, 0, Ttest_indResult) 

5532 Ttest_indResult(statistic=nan, pvalue=nan) 

5533 """ 

5534 shp = _broadcast_shapes_with_dropped_axis(a, b, axis) 

5535 if len(shp) == 0: 

5536 t = np.nan 

5537 p = np.nan 

5538 else: 

5539 t = np.full(shp, fill_value=np.nan) 

5540 p = t.copy() 

5541 return namedtuple_type(t, p) 

5542 

5543 

5544def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate'): 

5545 """ 

5546 Calculate the T-test for the means of *two independent* samples of scores. 

5547 

5548 This is a two-sided test for the null hypothesis that 2 independent samples 

5549 have identical average (expected) values. This test assumes that the 

5550 populations have identical variances by default. 

5551 

5552 Parameters 

5553 ---------- 

5554 a, b : array_like 

5555 The arrays must have the same shape, except in the dimension 

5556 corresponding to `axis` (the first, by default). 

5557 axis : int or None, optional 

5558 Axis along which to compute test. If None, compute over the whole 

5559 arrays, `a`, and `b`. 

5560 equal_var : bool, optional 

5561 If True (default), perform a standard independent 2 sample test 

5562 that assumes equal population variances [1]_. 

5563 If False, perform Welch's t-test, which does not assume equal 

5564 population variance [2]_. 

5565 

5566 .. versionadded:: 0.11.0 

5567 nan_policy : {'propagate', 'raise', 'omit'}, optional 

5568 Defines how to handle when input contains nan. 

5569 The following options are available (default is 'propagate'): 

5570 

5571 * 'propagate': returns nan 

5572 * 'raise': throws an error 

5573 * 'omit': performs the calculations ignoring nan values 

5574 

5575 Returns 

5576 ------- 

5577 statistic : float or array 

5578 The calculated t-statistic. 

5579 pvalue : float or array 

5580 The two-tailed p-value. 

5581 

5582 Notes 

5583 ----- 

5584 We can use this test, if we observe two independent samples from 

5585 the same or different population, e.g. exam scores of boys and 

5586 girls or of two ethnic groups. The test measures whether the 

5587 average (expected) value differs significantly across samples. If 

5588 we observe a large p-value, for example larger than 0.05 or 0.1, 

5589 then we cannot reject the null hypothesis of identical average scores. 

5590 If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, 

5591 then we reject the null hypothesis of equal averages. 

5592 

5593 References 

5594 ---------- 

5595 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test 

5596 

5597 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test 

5598 

5599 Examples 

5600 -------- 

5601 >>> from scipy import stats 

5602 >>> np.random.seed(12345678) 

5603 

5604 Test with sample with identical means: 

5605 

5606 >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) 

5607 >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500) 

5608 >>> stats.ttest_ind(rvs1,rvs2) 

5609 (0.26833823296239279, 0.78849443369564776) 

5610 >>> stats.ttest_ind(rvs1,rvs2, equal_var = False) 

5611 (0.26833823296239279, 0.78849452749500748) 

5612 

5613 `ttest_ind` underestimates p for unequal variances: 

5614 

5615 >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500) 

5616 >>> stats.ttest_ind(rvs1, rvs3) 

5617 (-0.46580283298287162, 0.64145827413436174) 

5618 >>> stats.ttest_ind(rvs1, rvs3, equal_var = False) 

5619 (-0.46580283298287162, 0.64149646246569292) 

5620 

5621 When n1 != n2, the equal variance t-statistic is no longer equal to the 

5622 unequal variance t-statistic: 

5623 

5624 >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100) 

5625 >>> stats.ttest_ind(rvs1, rvs4) 

5626 (-0.99882539442782481, 0.3182832709103896) 

5627 >>> stats.ttest_ind(rvs1, rvs4, equal_var = False) 

5628 (-0.69712570584654099, 0.48716927725402048) 

5629 

5630 T-test with different means, variance, and n: 

5631 

5632 >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100) 

5633 >>> stats.ttest_ind(rvs1, rvs5) 

5634 (-1.4679669854490653, 0.14263895620529152) 

5635 >>> stats.ttest_ind(rvs1, rvs5, equal_var = False) 

5636 (-0.94365973617132992, 0.34744170334794122) 

5637 

5638 """ 

5639 a, b, axis = _chk2_asarray(a, b, axis) 

5640 

5641 # check both a and b 

5642 cna, npa = _contains_nan(a, nan_policy) 

5643 cnb, npb = _contains_nan(b, nan_policy) 

5644 contains_nan = cna or cnb 

5645 if npa == 'omit' or npb == 'omit': 

5646 nan_policy = 'omit' 

5647 

5648 if contains_nan and nan_policy == 'omit': 

5649 a = ma.masked_invalid(a) 

5650 b = ma.masked_invalid(b) 

5651 return mstats_basic.ttest_ind(a, b, axis, equal_var) 

5652 

5653 if a.size == 0 or b.size == 0: 

5654 return _ttest_nans(a, b, axis, Ttest_indResult) 

5655 

5656 v1 = np.var(a, axis, ddof=1) 

5657 v2 = np.var(b, axis, ddof=1) 

5658 n1 = a.shape[axis] 

5659 n2 = b.shape[axis] 

5660 

5661 if equal_var: 

5662 df, denom = _equal_var_ttest_denom(v1, n1, v2, n2) 

5663 else: 

5664 df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2) 

5665 

5666 res = _ttest_ind_from_stats(np.mean(a, axis), np.mean(b, axis), denom, df) 

5667 

5668 return Ttest_indResult(*res) 

5669 

5670 

5671def _get_len(a, axis, msg): 

5672 try: 

5673 n = a.shape[axis] 

5674 except IndexError: 

5675 raise np.AxisError(axis, a.ndim, msg) from None 

5676 return n 

5677 

5678 

5679Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue')) 

5680 

5681 

5682def ttest_rel(a, b, axis=0, nan_policy='propagate'): 

5683 """ 

5684 Calculate the t-test on TWO RELATED samples of scores, a and b. 

5685 

5686 This is a two-sided test for the null hypothesis that 2 related or 

5687 repeated samples have identical average (expected) values. 

5688 

5689 Parameters 

5690 ---------- 

5691 a, b : array_like 

5692 The arrays must have the same shape. 

5693 axis : int or None, optional 

5694 Axis along which to compute test. If None, compute over the whole 

5695 arrays, `a`, and `b`. 

5696 nan_policy : {'propagate', 'raise', 'omit'}, optional 

5697 Defines how to handle when input contains nan. 

5698 The following options are available (default is 'propagate'): 

5699 

5700 * 'propagate': returns nan 

5701 * 'raise': throws an error 

5702 * 'omit': performs the calculations ignoring nan values 

5703 

5704 Returns 

5705 ------- 

5706 statistic : float or array 

5707 t-statistic. 

5708 pvalue : float or array 

5709 Two-sided p-value. 

5710 

5711 Notes 

5712 ----- 

5713 Examples for use are scores of the same set of students in 

5714 different exams, or repeated sampling from the same units. The 

5715 test measures whether the average score differs significantly 

5716 across samples (e.g. exams). If we observe a large p-value, for 

5717 example greater than 0.05 or 0.1, then we cannot reject the null 

5718 hypothesis of identical average scores. If the p-value is smaller 

5719 than the threshold, e.g. 1%, 5% or 10%, then we reject the null 

5720 hypothesis of equal averages. Small p-values are associated with 

5721 large t-statistics. 

5722 

5723 References 

5724 ---------- 

5725 https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples 

5726 

5727 Examples 

5728 -------- 

5729 >>> from scipy import stats 

5730 >>> np.random.seed(12345678) # fix random seed to get same numbers 

5731 

5732 >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) 

5733 >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) + 

5734 ... stats.norm.rvs(scale=0.2,size=500)) 

5735 >>> stats.ttest_rel(rvs1,rvs2) 

5736 (0.24101764965300962, 0.80964043445811562) 

5737 >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) + 

5738 ... stats.norm.rvs(scale=0.2,size=500)) 

5739 >>> stats.ttest_rel(rvs1,rvs3) 

5740 (-3.9995108708727933, 7.3082402191726459e-005) 

5741 

5742 """ 

5743 a, b, axis = _chk2_asarray(a, b, axis) 

5744 

5745 cna, npa = _contains_nan(a, nan_policy) 

5746 cnb, npb = _contains_nan(b, nan_policy) 

5747 contains_nan = cna or cnb 

5748 if npa == 'omit' or npb == 'omit': 

5749 nan_policy = 'omit' 

5750 

5751 if contains_nan and nan_policy == 'omit': 

5752 a = ma.masked_invalid(a) 

5753 b = ma.masked_invalid(b) 

5754 m = ma.mask_or(ma.getmask(a), ma.getmask(b)) 

5755 aa = ma.array(a, mask=m, copy=True) 

5756 bb = ma.array(b, mask=m, copy=True) 

5757 return mstats_basic.ttest_rel(aa, bb, axis) 

5758 

5759 na = _get_len(a, axis, "first argument") 

5760 nb = _get_len(b, axis, "second argument") 

5761 if na != nb: 

5762 raise ValueError('unequal length arrays') 

5763 

5764 if na == 0: 

5765 return _ttest_nans(a, b, axis, Ttest_relResult) 

5766 

5767 n = a.shape[axis] 

5768 df = n - 1 

5769 

5770 d = (a - b).astype(np.float64) 

5771 v = np.var(d, axis, ddof=1) 

5772 dm = np.mean(d, axis) 

5773 denom = np.sqrt(v / n) 

5774 

5775 with np.errstate(divide='ignore', invalid='ignore'): 

5776 t = np.divide(dm, denom) 

5777 t, prob = _ttest_finish(df, t) 

5778 

5779 return Ttest_relResult(t, prob) 

5780 
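# As the body of ttest_rel above shows, the paired test is equivalent to a
# one-sample t-test on the differences; a quick check on hypothetical data:
import numpy as np
from scipy import stats

before = np.array([10.1, 9.8, 11.2, 10.5, 9.9])
after = np.array([10.4, 10.0, 11.0, 10.9, 10.2])
paired_result = stats.ttest_rel(before, after)
diff_result = stats.ttest_1samp(before - after, 0.0)
# paired_result and diff_result have the same statistic and p-value.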

5781 

5782# Map from names to lambda_ values used in power_divergence(). 

5783_power_div_lambda_names = { 

5784 "pearson": 1, 

5785 "log-likelihood": 0, 

5786 "freeman-tukey": -0.5, 

5787 "mod-log-likelihood": -1, 

5788 "neyman": -2, 

5789 "cressie-read": 2/3, 

5790} 

5791 

5792 

5793def _count(a, axis=None): 

5794 """ 

5795 Count the number of non-masked elements of an array. 

5796 

5797 This function behaves like np.ma.count(), but is much faster 

5798 for ndarrays. 

5799 """ 

5800 if hasattr(a, 'count'): 

5801 num = a.count(axis=axis) 

5802 if isinstance(num, np.ndarray) and num.ndim == 0: 

5803 # In some cases, the `count` method returns a scalar array (e.g. 

5804 # np.array(3)), but we want a plain integer. 

5805 num = int(num) 

5806 else: 

5807 if axis is None: 

5808 num = a.size 

5809 else: 

5810 num = a.shape[axis] 

5811 return num 

5812 

5813 

5814Power_divergenceResult = namedtuple('Power_divergenceResult', 

5815 ('statistic', 'pvalue')) 

5816 

5817 

5818def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): 

5819 """ 

5820 Cressie-Read power divergence statistic and goodness of fit test. 

5821 

5822 This function tests the null hypothesis that the categorical data 

5823 has the given frequencies, using the Cressie-Read power divergence 

5824 statistic. 

5825 

5826 Parameters 

5827 ---------- 

5828 f_obs : array_like 

5829 Observed frequencies in each category. 

5830 f_exp : array_like, optional 

5831 Expected frequencies in each category. By default the categories are 

5832 assumed to be equally likely. 

5833 ddof : int, optional 

5834 "Delta degrees of freedom": adjustment to the degrees of freedom 

5835 for the p-value. The p-value is computed using a chi-squared 

5836 distribution with ``k - 1 - ddof`` degrees of freedom, where `k` 

5837 is the number of observed frequencies. The default value of `ddof` 

5838 is 0. 

5839 axis : int or None, optional 

5840 The axis of the broadcast result of `f_obs` and `f_exp` along which to 

5841 apply the test. If axis is None, all values in `f_obs` are treated 

5842 as a single data set. Default is 0. 

5843 lambda_ : float or str, optional 

5844 The power in the Cressie-Read power divergence statistic. The default 

5845 is 1. For convenience, `lambda_` may be assigned one of the following 

5846 strings, in which case the corresponding numerical value is used:: 

5847 

5848 String Value Description 

5849 "pearson" 1 Pearson's chi-squared statistic. 

5850 In this case, the function is 

5851 equivalent to `stats.chisquare`. 

5852 "log-likelihood" 0 Log-likelihood ratio. Also known as 

5853 the G-test [3]_. 

5854 "freeman-tukey" -1/2 Freeman-Tukey statistic. 

5855 "mod-log-likelihood" -1 Modified log-likelihood ratio. 

5856 "neyman" -2 Neyman's statistic. 

5857 "cressie-read" 2/3 The power recommended in [5]_. 

5858 

5859 Returns 

5860 ------- 

5861 statistic : float or ndarray 

5862 The Cressie-Read power divergence test statistic. The value is 

5863 a float if `axis` is None or if `f_obs` and `f_exp` are 1-D. 

5864 pvalue : float or ndarray 

5865 The p-value of the test. The value is a float if `ddof` and the 

5866 return value `stat` are scalars. 

5867 

5868 See Also 

5869 -------- 

5870 chisquare 

5871 

5872 Notes 

5873 ----- 

5874 This test is invalid when the observed or expected frequencies in each 

5875 category are too small. A typical rule is that all of the observed 

5876 and expected frequencies should be at least 5. 

5877 

5878 When `lambda_` is less than zero, the formula for the statistic involves 

5879 dividing by `f_obs`, so a warning or error may be generated if any value 

5880 in `f_obs` is 0. 

5881 

5882 Similarly, a warning or error may be generated if any value in `f_exp` is 

5883 zero when `lambda_` >= 0. 

5884 

5885 The default degrees of freedom, k-1, are for the case when no parameters 

5886 of the distribution are estimated. If p parameters are estimated by 

5887 efficient maximum likelihood then the correct degrees of freedom are 

5888 k-1-p. If the parameters are estimated in a different way, then the 

5889 dof can be between k-1-p and k-1. However, it is also possible that 

5890 the asymptotic distribution is not a chisquare, in which case this 

5891 test is not appropriate. 

5892 

5893 This function handles masked arrays. If an element of `f_obs` or `f_exp` 

5894 is masked, then data at that position is ignored, and does not count 

5895 towards the size of the data set. 

5896 

5897 .. versionadded:: 0.13.0 

5898 

5899 References 

5900 ---------- 

5901 .. [1] Lowry, Richard. "Concepts and Applications of Inferential 

5902 Statistics". Chapter 8. 

5903 https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html 

5904 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test 

5905 .. [3] "G-test", https://en.wikipedia.org/wiki/G-test 

5906 .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and 

5907 practice of statistics in biological research", New York: Freeman 

5908 (1981) 

5909 .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit 

5910 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), 

5911 pp. 440-464. 

5912 

5913 Examples 

5914 -------- 

5915 (See `chisquare` for more examples.) 

5916 

5917 When just `f_obs` is given, it is assumed that the expected frequencies 

5918 are uniform and given by the mean of the observed frequencies. Here we 

5919 perform a G-test (i.e. use the log-likelihood ratio statistic): 

5920 

5921 >>> from scipy.stats import power_divergence 

5922 >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood') 

5923 (2.006573162632538, 0.84823476779463769) 

5924 

5925 The expected frequencies can be given with the `f_exp` argument: 

5926 

5927 >>> power_divergence([16, 18, 16, 14, 12, 12], 

5928 ... f_exp=[16, 16, 16, 16, 16, 8], 

5929 ... lambda_='log-likelihood') 

5930 (3.3281031458963746, 0.6495419288047497) 

5931 

5932 When `f_obs` is 2-D, by default the test is applied to each column. 

5933 

5934 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T 

5935 >>> obs.shape 

5936 (6, 2) 

5937 >>> power_divergence(obs, lambda_="log-likelihood") 

5938 (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) 

5939 

5940 By setting ``axis=None``, the test is applied to all data in the array, 

5941 which is equivalent to applying the test to the flattened array. 

5942 

5943 >>> power_divergence(obs, axis=None) 

5944 (23.31034482758621, 0.015975692534127565) 

5945 >>> power_divergence(obs.ravel()) 

5946 (23.31034482758621, 0.015975692534127565) 

5947 

5948 `ddof` is the change to make to the default degrees of freedom. 

5949 

5950 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) 

5951 (2.0, 0.73575888234288467) 

5952 

5953 The calculation of the p-values is done by broadcasting the 

5954 test statistic with `ddof`. 

5955 

5956 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) 

5957 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) 

5958 

5959 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has 

5960 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting 

5961 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared 

5962 statistics, we must use ``axis=1``: 

5963 

5964 >>> power_divergence([16, 18, 16, 14, 12, 12], 

5965 ... f_exp=[[16, 16, 16, 16, 16, 8], 

5966 ... [8, 20, 20, 16, 12, 12]], 

5967 ... axis=1) 

5968 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) 

5969 

5970 """ 

5971 # Convert the input argument `lambda_` to a numerical value. 

5972 if isinstance(lambda_, str): 

5973 if lambda_ not in _power_div_lambda_names: 

5974 names = repr(list(_power_div_lambda_names.keys()))[1:-1] 

5975 raise ValueError("invalid string for lambda_: {0!r}. Valid strings " 

5976 "are {1}".format(lambda_, names)) 

5977 lambda_ = _power_div_lambda_names[lambda_] 

5978 elif lambda_ is None: 

5979 lambda_ = 1 

5980 

5981 f_obs = np.asanyarray(f_obs) 

5982 

5983 if f_exp is not None: 

5984 f_exp = np.asanyarray(f_exp) 

5985 else: 

5986 # Ignore 'invalid' errors so the edge case of a data set with length 0 

5987 # is handled without spurious warnings. 

5988 with np.errstate(invalid='ignore'): 

5989 f_exp = f_obs.mean(axis=axis, keepdims=True) 

5990 

5991 # `terms` is the array of terms that are summed along `axis` to create 

5992 # the test statistic. We use some specialized code for a few special 

5993 # cases of lambda_. 

5994 if lambda_ == 1: 

5995 # Pearson's chi-squared statistic 

5996 terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp 

5997 elif lambda_ == 0: 

5998 # Log-likelihood ratio (i.e. G-test) 

5999 terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) 

6000 elif lambda_ == -1: 

6001 # Modified log-likelihood ratio 

6002 terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) 

6003 else: 

6004 # General Cressie-Read power divergence. 

6005 terms = f_obs * ((f_obs / f_exp)**lambda_ - 1) 

6006 terms /= 0.5 * lambda_ * (lambda_ + 1) 

6007 

6008 stat = terms.sum(axis=axis) 

6009 

6010 num_obs = _count(terms, axis=axis) 

6011 ddof = asarray(ddof) 

6012 p = distributions.chi2.sf(stat, num_obs - 1 - ddof) 

6013 

6014 return Power_divergenceResult(stat, p) 

6015 

6016 

6017def chisquare(f_obs, f_exp=None, ddof=0, axis=0): 

6018 """ 

6019 Calculate a one-way chi-square test. 

6020 

6021 The chi-square test tests the null hypothesis that the categorical data 

6022 has the given frequencies. 

6023 

6024 Parameters 

6025 ---------- 

6026 f_obs : array_like 

6027 Observed frequencies in each category. 

6028 f_exp : array_like, optional 

6029 Expected frequencies in each category. By default the categories are 

6030 assumed to be equally likely. 

6031 ddof : int, optional 

6032 "Delta degrees of freedom": adjustment to the degrees of freedom 

6033 for the p-value. The p-value is computed using a chi-squared 

6034 distribution with ``k - 1 - ddof`` degrees of freedom, where `k` 

6035 is the number of observed frequencies. The default value of `ddof` 

6036 is 0. 

6037 axis : int or None, optional 

6038 The axis of the broadcast result of `f_obs` and `f_exp` along which to 

6039 apply the test. If axis is None, all values in `f_obs` are treated 

6040 as a single data set. Default is 0. 

6041 

6042 Returns 

6043 ------- 

6044 chisq : float or ndarray 

6045 The chi-squared test statistic. The value is a float if `axis` is 

6046 None or `f_obs` and `f_exp` are 1-D. 

6047 p : float or ndarray 

6048 The p-value of the test. The value is a float if `ddof` and the 

6049 return value `chisq` are scalars. 

6050 

6051 See Also 

6052 -------- 

6053 scipy.stats.power_divergence 

6054 

6055 Notes 

6056 ----- 

6057 This test is invalid when the observed or expected frequencies in each 

6058 category are too small. A typical rule is that all of the observed 

6059 and expected frequencies should be at least 5. 

6060 

6061 The default degrees of freedom, k-1, are for the case when no parameters 

6062 of the distribution are estimated. If p parameters are estimated by 

6063 efficient maximum likelihood then the correct degrees of freedom are 

6064 k-1-p. If the parameters are estimated in a different way, then the 

6065 dof can be between k-1-p and k-1. However, it is also possible that 

6066 the asymptotic distribution is not chi-square, in which case this test 

6067 is not appropriate. 

6068 

6069 References 

6070 ---------- 

6071 .. [1] Lowry, Richard. "Concepts and Applications of Inferential 

6072 Statistics". Chapter 8. 

6073 https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html 

6074 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test 

6075 

6076 Examples 

6077 -------- 

6078 When just `f_obs` is given, it is assumed that the expected frequencies 

6079 are uniform and given by the mean of the observed frequencies. 

6080 

6081 >>> from scipy.stats import chisquare 

6082 >>> chisquare([16, 18, 16, 14, 12, 12]) 

6083 (2.0, 0.84914503608460956) 

6084 

6085 With `f_exp` the expected frequencies can be given. 

6086 

6087 >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) 

6088 (3.5, 0.62338762774958223) 

6089 

6090 When `f_obs` is 2-D, by default the test is applied to each column. 

6091 

6092 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T 

6093 >>> obs.shape 

6094 (6, 2) 

6095 >>> chisquare(obs) 

6096 (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) 

6097 

6098 By setting ``axis=None``, the test is applied to all data in the array, 

6099 which is equivalent to applying the test to the flattened array. 

6100 

6101 >>> chisquare(obs, axis=None) 

6102 (23.31034482758621, 0.015975692534127565) 

6103 >>> chisquare(obs.ravel()) 

6104 (23.31034482758621, 0.015975692534127565) 

6105 

6106 `ddof` is the change to make to the default degrees of freedom. 

6107 

6108 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) 

6109 (2.0, 0.73575888234288467) 

6110 

6111 The calculation of the p-values is done by broadcasting the 

6112 chi-squared statistic with `ddof`. 

6113 

6114 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) 

6115 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) 

6116 

6117 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has 

6118 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting 

6119 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared 

6120 statistics, we use ``axis=1``: 

6121 

6122 >>> chisquare([16, 18, 16, 14, 12, 12], 

6123 ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], 

6124 ... axis=1) 

6125 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) 

6126 

6127 """ 

6128 return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, 

6129 lambda_="pearson") 

6130 
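# A quick check of the lambda_ handling above: with lambda_="pearson" (1) the
# Cressie-Read statistic reduces to Pearson's chi-squared, so power_divergence
# and chisquare agree on the same counts (taken from the docstring examples).
from scipy.stats import chisquare, power_divergence

example_counts = [16, 18, 16, 14, 12, 12]
stat_pd, p_pd = power_divergence(example_counts, lambda_="pearson")
stat_chi, p_chi = chisquare(example_counts)
# stat_pd == stat_chi == 2.0 for these counts, as in the examples above.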

6131 

6132KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue')) 

6133 

6134 

6135def _compute_dplus(cdfvals): 

6136 """Computes D+ as used in the Kolmogorov-Smirnov test. 

6137 

6138 Parameters 

6139 ---------- 

6140 cdfvals: array_like 

6141 Sorted array of CDF values between 0 and 1 

6142 

6143 Returns 

6144 ------- 

6145 Maximum distance of the CDF values below Uniform(0, 1) 

6146 """ 

6147 n = len(cdfvals) 

6148 return (np.arange(1.0, n + 1) / n - cdfvals).max() 

6149 

6150 

6151def _compute_dminus(cdfvals): 

6152 """Computes D- as used in the Kolmogorov-Smirnov test. 

6153 

6154 Parameters 

6155 ---------- 

6156 cdfvals: array_like 

6157 Sorted array of CDF values between 0 and 1 

6158 

6159 Returns 

6160 ------- 

6161 Maximum distance of the CDF values above Uniform(0, 1) 

6162 """ 

6163 n = len(cdfvals) 

6164 return (cdfvals - np.arange(0.0, n)/n).max() 

6165 
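# A small sketch of the two one-sided Kolmogorov-Smirnov distances defined
# above, using a hypothetical sorted array of CDF values.
import numpy as np

cdfvals_example = np.array([0.05, 0.30, 0.55, 0.70, 0.95])
n_example = len(cdfvals_example)
dplus_example = (np.arange(1.0, n_example + 1) / n_example - cdfvals_example).max()
dminus_example = (cdfvals_example - np.arange(0.0, n_example) / n_example).max()
d_example = max(dplus_example, dminus_example)   # two-sided KS statistic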

6166 

6167def ks_1samp(x, cdf, args=(), alternative='two-sided', mode='auto'): 

6168 """ 

6169 Performs the Kolmogorov-Smirnov test for goodness of fit. 

6170 

6171 This performs a test of the distribution F(x) of an observed 

6172 random variable against a given distribution G(x). Under the null 

6173 hypothesis, the two distributions are identical, F(x)=G(x). The 

6174 alternative hypothesis can be either 'two-sided' (default), 'less' 

6175 or 'greater'. The KS test is only valid for continuous distributions. 

6176 

6177 Parameters 

6178 ---------- 

6179 x : array_like 

6180 a 1-D array of observations of iid random variables. 

6181 cdf : callable 

6182 callable used to calculate the cdf. 

6183 args : tuple, sequence, optional 

6184 Distribution parameters, used with `cdf`. 

6185 alternative : {'two-sided', 'less', 'greater'}, optional 

6186 Defines the alternative hypothesis. 

6187 The following options are available (default is 'two-sided'): 

6188 

6189 * 'two-sided' 

6190 * 'less': one-sided, see explanation in Notes 

6191 * 'greater': one-sided, see explanation in Notes 

6192 mode : {'auto', 'exact', 'approx', 'asymp'}, optional 

6193 Defines the distribution used for calculating the p-value. 

6194 The following options are available (default is 'auto'): 

6195 

6196 * 'auto' : selects one of the other options. 

6197 * 'exact' : uses the exact distribution of test statistic. 

6198 * 'approx' : approximates the two-sided probability with twice the one-sided probability 

6199 * 'asymp': uses asymptotic distribution of test statistic 

6200 

6201 Returns 

6202 ------- 

6203 statistic : float 

6204 KS test statistic, either D, D+ or D- (depending on the value of 'alternative') 

6205 pvalue : float 

6206 One-tailed or two-tailed p-value. 

6207 

6208 See Also 

6209 -------- 

6210 ks_2samp, kstest 

6211 

6212 Notes 

6213 ----- 

6214 In the one-sided test, the alternative is that the empirical 

6215 cumulative distribution function of the random variable is "less" 

6216 or "greater" than the cumulative distribution function G(x) of the 

6217 hypothesis, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. 

6218 

6219 Examples 

6220 -------- 

6221 >>> from scipy import stats 

6222 

6223 >>> x = np.linspace(-15, 15, 9) 

6224 >>> stats.ks_1samp(x, stats.norm.cdf) 

6225 (0.44435602715924361, 0.038850142705171065) 

6226 

6227 >>> np.random.seed(987654321) # set random seed to get the same result 

6228 >>> stats.ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf) 

6229 (0.058352892479417884, 0.8653960860778898) 

6230 

6231 *Test against one-sided alternative hypothesis* 

6232 

6233 Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``: 

6234 

6235 >>> np.random.seed(987654321) 

6236 >>> x = stats.norm.rvs(loc=0.2, size=100) 

6237 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less') 

6238 (0.12464329735846891, 0.040989164077641749) 

6239 

6240 Reject equal distribution against alternative hypothesis: less 

6241 

6242 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='greater') 

6243 (0.0072115233216311081, 0.98531158590396395) 

6244 

6245 Don't reject equal distribution against alternative hypothesis: greater 

6246 

6247 >>> stats.ks_1samp(x, stats.norm.cdf) 

6248 (0.12464329735846891, 0.08197335233541582) 

6249 

6250 Don't reject equal distribution against alternative hypothesis: two-sided 

6251 

6252 *Testing t distributed random variables against normal distribution* 

6253 

6254 With 100 degrees of freedom the t distribution looks close to the normal 

6255 distribution, and the K-S test does not reject the hypothesis that the 

6256 sample came from the normal distribution: 

6257 

6258 >>> np.random.seed(987654321) 

6259 >>> stats.ks_1samp(stats.t.rvs(100,size=100), stats.norm.cdf) 

6260 (0.072018929165471257, 0.6505883498379312) 

6261 

6262 With 3 degrees of freedom the t distribution looks sufficiently different 

6263 from the normal distribution, that we can reject the hypothesis that the 

6264 sample came from the normal distribution at the 10% level: 

6265 

6266 >>> np.random.seed(987654321) 

6267 >>> stats.ks_1samp(stats.t.rvs(3,size=100), stats.norm.cdf) 

6268 (0.131016895759829, 0.058826222555312224) 

6269 
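Distribution parameters can be passed through `args`. For instance, a
minimal sketch of testing `x` against a normal distribution with a
hypothetical mean of 0.2 (result not shown):

>>> res = stats.ks_1samp(x, stats.norm.cdf, args=(0.2,))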

6270 """ 

6271 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

6272 alternative.lower()[0], alternative) 

6273 if alternative not in ['two-sided', 'greater', 'less']: 

6274 raise ValueError("Unexpected alternative %s" % alternative) 

6275 if np.ma.is_masked(x): 

6276 x = x.compressed() 

6277 

6278 N = len(x) 

6279 x = np.sort(x) 

6280 cdfvals = cdf(x, *args) 

6281 

6282 if alternative == 'greater': 

6283 Dplus = _compute_dplus(cdfvals) 

6284 return KstestResult(Dplus, distributions.ksone.sf(Dplus, N)) 

6285 

6286 if alternative == 'less': 

6287 Dminus = _compute_dminus(cdfvals) 

6288 return KstestResult(Dminus, distributions.ksone.sf(Dminus, N)) 

6289 

6290 # alternative == 'two-sided': 

6291 Dplus = _compute_dplus(cdfvals) 

6292 Dminus = _compute_dminus(cdfvals) 

6293 D = np.max([Dplus, Dminus]) 

6294 if mode == 'auto': # Always select exact 

6295 mode = 'exact' 

6296 if mode == 'exact': 

6297 prob = distributions.kstwo.sf(D, N) 

6298 elif mode == 'asymp': 

6299 prob = distributions.kstwobign.sf(D * np.sqrt(N)) 

6300 else: 

6301 # mode == 'approx' 

6302 prob = 2 * distributions.ksone.sf(D, N) 

6303 prob = np.clip(prob, 0, 1) 

6304 return KstestResult(D, prob) 

6305 

6306 

6307Ks_2sampResult = KstestResult 

6308 

6309 

6310def _compute_prob_inside_method(m, n, g, h): 

6311 """ 

6312 Count the proportion of paths that stay strictly inside two diagonal lines. 

6313 

6314 Parameters 

6315 ---------- 

6316 m : integer 

6317 m > 0 

6318 n : integer 

6319 n > 0 

6320 g : integer 

6321 g is greatest common divisor of m and n 

6322 h : integer 

6323 0 <= h <= lcm(m,n) 

6324 

6325 Returns 

6326 ------- 

6327 p : float 

6328 The proportion of paths that stay inside the two lines. 

6329 

6330 

6331 Count the integer lattice paths from (0, 0) to (m, n) which satisfy 

6332 |x/m - y/n| < h / lcm(m, n). 

6333 The paths make steps of size +1 in either positive x or positive y directions. 

6334 

6335 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. 

6336 Hodges, J.L. Jr., 

6337 "The Significance Probability of the Smirnov Two-Sample Test," 

6338 Arkiv för Matematik, 3, No. 43 (1958), 469-86.

6339 
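A minimal sanity check, assuming this module's namespace: with
m = n = g = h = 2 the condition becomes |x - y| < 2, which 4 of the
6 lattice paths from (0, 0) to (2, 2) satisfy throughout:

>>> bool(np.isclose(_compute_prob_inside_method(2, 2, 2, 2), 4/6))
True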

6340 """ 

6341 # Probability is symmetrical in m, n. Computation below uses m >= n. 

6342 if m < n: 

6343 m, n = n, m 

6344 mg = m // g 

6345 ng = n // g 

6346 

6347 # Count the integer lattice paths from (0, 0) to (m, n) which satisfy 

6348 # |nx/g - my/g| < h. 

6349 # Compute matrix A such that: 

6350 # A(x, 0) = A(0, y) = 1 

6351 # A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that 

6352 # A(x, y) = 0 if |x/m - y/n|>= h 

6353 # Probability is A(m, n)/binom(m+n, n) 

6354 # Optimizations exist for m==n, m==n*p. 

6355 # Only need to preserve a single column of A, and only a sliding window of it. 

6356 # minj keeps track of the slide. 

6357 minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1) 

6358 curlen = maxj - minj 

6359 # Make a vector long enough to hold maximum window needed. 

6360 lenA = min(2 * maxj + 2, n + 1) 

6361 # This is an integer calculation, but the entries are essentially 

6362 # binomial coefficients, hence grow quickly. 

6363 # Scaling after each column is computed avoids dividing by a 

6364 # large binomial coefficient at the end, but is not sufficient to avoid

6365 # the large dynamic range which appears during the calculation.

6366 # Instead we rescale based on the magnitude of the rightmost term in

6367 # the column and keep track of an exponent separately and apply

6368 # it at the end of the calculation. Similarly when multiplying by

6369 # the binomial coefficient.

6370 dtype = np.float64 

6371 A = np.zeros(lenA, dtype=dtype) 

6372 # Initialize the first column 

6373 A[minj:maxj] = 1 

6374 expnt = 0 

6375 for i in range(1, m + 1): 

6376 # Generate the next column. 

6377 # First calculate the sliding window 

6378 lastminj, lastlen = minj, curlen 

6379 minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0) 

6380 minj = min(minj, n) 

6381 maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1) 

6382 if maxj <= minj: 

6383 return 0 

6384 # Now fill in the values 

6385 A[0:maxj - minj] = np.cumsum(A[minj - lastminj:maxj - lastminj]) 

6386 curlen = maxj - minj 

6387 if lastlen > curlen: 

6388 # Set some carried-over elements to 0 

6389 A[maxj - minj:maxj - minj + (lastlen - curlen)] = 0 

6390 # Rescale if the right most value is over 2**900 

6391 val = A[maxj - minj - 1] 

6392 _, valexpt = math.frexp(val) 

6393 if valexpt > 900: 

6394 # Scaling to bring down to about 2**800 appears 

6395 # sufficient for sizes under 10000. 

6396 valexpt -= 800 

6397 A = np.ldexp(A, -valexpt) 

6398 expnt += valexpt 

6399 

6400 val = A[maxj - minj - 1] 

6401 # Now divide by the binomial (m+n)!/m!/n! 

6402 for i in range(1, n + 1): 

6403 val = (val * i) / (m + i) 

6404 _, valexpt = math.frexp(val) 

6405 if valexpt < -128: 

6406 val = np.ldexp(val, -valexpt) 

6407 expnt += valexpt 

6408 # Finally scale if needed. 

6409 return np.ldexp(val, expnt) 

6410 

6411 

6412def _compute_prob_outside_square(n, h): 

6413 """ 

6414 Compute the proportion of paths that pass outside the two diagonal lines. 

6415 

6416 Parameters 

6417 ---------- 

6418 n : integer 

6419 n > 0 

6420 h : integer 

6421 0 <= h <= n 

6422 

6423 Returns 

6424 ------- 

6425 p : float 

6426 The proportion of paths that pass outside the lines x-y = +/-h. 

6427 
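A minimal sanity check, assuming this module's namespace: with n = h = 2
only the two paths through (2, 0) or (0, 2) pass outside, so the
proportion is 2/binom(4, 2) = 1/3:

>>> bool(np.isclose(_compute_prob_outside_square(2, 2), 1/3))
True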

6428 """ 

6429 # Compute Pr(D_{n,n} >= h/n) 

6430 # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... ) / binom(2n, n)

6431 # This formulation exhibits subtractive cancellation. 

6432 # Instead divide each term by binom(2n, n), then factor common terms 

6433 # and use a Horner-like algorithm 

6434 # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...))))) 

6435 

6436 P = 0.0 

6437 k = int(np.floor(n / h)) 

6438 while k >= 0: 

6439 p1 = 1.0 

6440 # Each of the Ai terms has numerator and denominator with h simple terms. 

6441 for j in range(h): 

6442 p1 = (n - k * h - j) * p1 / (n + k * h + j + 1) 

6443 P = p1 * (1.0 - P) 

6444 k -= 1 

6445 return 2 * P 

6446 

6447 

6448def _count_paths_outside_method(m, n, g, h): 

6449 """ 

6450 Count the number of paths that pass outside the specified diagonal. 

6451 

6452 Parameters 

6453 ---------- 

6454 m : integer 

6455 m > 0 

6456 n : integer 

6457 n > 0 

6458 g : integer 

6459 g is greatest common divisor of m and n 

6460 h : integer 

6461 0 <= h <= lcm(m,n) 

6462 

6463 Returns 

6464 ------- 

6465 p : float 

6466 The number of paths that pass below the specified diagonal.

6467 The calculation may overflow; check for a finite answer.

6468 

6469 Raises

6470 ------

6471 FloatingPointError: Raised if the intermediate computation goes outside 

6472 the range of a float. 

6473 

6474 Notes 

6475 ----- 

6476 Count the integer lattice paths from (0, 0) to (m, n), which at some 

6477 point (x, y) along the path, satisfy: 

6478 m*y <= n*x - h*g 

6479 The paths make steps of size +1 in either positive x or positive y directions. 

6480 

6481 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. 

6482 Hodges, J.L. Jr., 

6483 "The Significance Probability of the Smirnov Two-Sample Test," 

6484 Arkiv för Matematik, 3, No. 43 (1958), 469-86.

6485 

6486 """ 

6487 # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n) 

6488 # B(x, y) = #{paths from (0,0) to (x,y) without previously crossing the boundary} 

6489 # = binom(x, y) - #{paths which already reached the boundary} 

6490 # Multiply by the number of path extensions going from (x, y) to (m, n) 

6491 # Sum. 

6492 

6493 # Probability is symmetrical in m, n. Computation below assumes m >= n. 

6494 if m < n: 

6495 m, n = n, m 

6496 mg = m // g 

6497 ng = n // g 

6498 

6499 # Not every x needs to be considered. 

6500 # xj holds the list of x values to be checked. 

6501 # Wherever n*x/m + ng*h crosses an integer 

6502 lxj = n + (mg-h)//mg 

6503 xj = [(h + mg * j + ng-1)//ng for j in range(lxj)] 

6504 # B is an array just holding a few values of B(x,y), the ones needed. 

6505 # B[j] == B(x_j, j) 

6506 if lxj == 0: 

6507 return np.round(special.binom(m + n, n)) 

6508 B = np.zeros(lxj) 

6509 B[0] = 1 

6510 # Compute the B(x, y) terms 

6511 # The binomial coefficient is an integer, but special.binom() may return a float. 

6512 # Round it to the nearest integer. 

6513 for j in range(1, lxj): 

6514 Bj = np.round(special.binom(xj[j] + j, j)) 

6515 if not np.isfinite(Bj): 

6516 raise FloatingPointError() 

6517 for i in range(j): 

6518 bin = np.round(special.binom(xj[j] - xj[i] + j - i, j-i)) 

6519 Bj -= bin * B[i] 

6520 B[j] = Bj 

6521 if not np.isfinite(Bj): 

6522 raise FloatingPointError() 

6523 # Compute the number of path extensions... 

6524 num_paths = 0 

6525 for j in range(lxj): 

6526 bin = np.round(special.binom((m-xj[j]) + (n - j), n-j)) 

6527 term = B[j] * bin 

6528 if not np.isfinite(term): 

6529 raise FloatingPointError() 

6530 num_paths += term 

6531 return np.round(num_paths) 

6532 

6533 

6534def _attempt_exact_2kssamp(n1, n2, g, d, alternative): 

6535 """Attempts to compute the exact 2sample probability. 

6536 

6537 n1, n2 are the sample sizes 

6538 g is the gcd(n1, n2) 

6539 d is the computed max difference in ECDFs 

6540 

6541 Returns (success, d, probability) 

6542 """ 

6543 lcm = (n1 // g) * n2 

6544 h = int(np.round(d * lcm)) 

6545 d = h * 1.0 / lcm 

6546 if h == 0: 

6547 return True, d, 1.0 

6548 saw_fp_error, prob = False, np.nan 

6549 try: 

6550 if alternative == 'two-sided': 

6551 if n1 == n2: 

6552 prob = _compute_prob_outside_square(n1, h) 

6553 else: 

6554 prob = 1 - _compute_prob_inside_method(n1, n2, g, h) 

6555 else: 

6556 if n1 == n2: 

6557 # prob = binom(2n, n-h) / binom(2n, n) 

6558 # Evaluating in that form incurs roundoff errors 

6559 # from special.binom. Instead calculate directly 

6560 jrange = np.arange(h) 

6561 prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0)) 

6562 else: 

6563 num_paths = _count_paths_outside_method(n1, n2, g, h) 

6564 bin = special.binom(n1 + n2, n1) 

6565 if not np.isfinite(bin) or not np.isfinite(num_paths) or num_paths > bin: 

6566 saw_fp_error = True 

6567 else: 

6568 prob = num_paths / bin 

6569 

6570 except FloatingPointError: 

6571 saw_fp_error = True 

6572 

6573 if saw_fp_error: 

6574 return False, d, np.nan 

6575 if not (0 <= prob <= 1): 

6576 return False, d, prob 

6577 return True, d, prob 

6578 

6579 

6580def ks_2samp(data1, data2, alternative='two-sided', mode='auto'): 

6581 """ 

6582 Compute the Kolmogorov-Smirnov statistic on 2 samples. 

6583 

6584 This is a two-sided test for the null hypothesis that 2 independent samples 

6585 are drawn from the same continuous distribution. The alternative hypothesis 

6586 can be either 'two-sided' (default), 'less' or 'greater'. 

6587 

6588 Parameters 

6589 ---------- 

6590 data1, data2 : array_like, 1-Dimensional 

6591 Two arrays of sample observations assumed to be drawn from a continuous 

6592 distribution, sample sizes can be different. 

6593 alternative : {'two-sided', 'less', 'greater'}, optional 

6594 Defines the alternative hypothesis. 

6595 The following options are available (default is 'two-sided'): 

6596 

6597 * 'two-sided' 

6598 * 'less': one-sided, see explanation in Notes 

6599 * 'greater': one-sided, see explanation in Notes 

6600 mode : {'auto', 'exact', 'asymp'}, optional 

6601 Defines the method used for calculating the p-value. 

6602 The following options are available (default is 'auto'): 

6603 

6604 * 'auto' : use 'exact' for small size arrays, 'asymp' for large 

6605 * 'exact' : use exact distribution of test statistic 

6606 * 'asymp' : use asymptotic distribution of test statistic 

6607 

6608 Returns 

6609 ------- 

6610 statistic : float 

6611 KS statistic. 

6612 pvalue : float 

6613 Two-tailed p-value. 

6614 

6615 See Also 

6616 -------- 

6617 kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp 

6618 

6619 Notes 

6620 ----- 

6621 This tests whether 2 samples are drawn from the same distribution. Note 

6622 that, like in the case of the one-sample KS test, the distribution is 

6623 assumed to be continuous. 

6624 

6625 In the one-sided test, the alternative is that the empirical 

6626 cumulative distribution function F(x) of the data1 variable is "less" 

6627 or "greater" than the empirical cumulative distribution function G(x) 

6628 of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. 

6629 

6630 If the KS statistic is small or the p-value is high, then we cannot 

6631 reject the hypothesis that the distributions of the two samples 

6632 are the same. 

6633 

6634 If the mode is 'auto', the computation is exact if the sample sizes are 

6635 less than 10000. For larger sizes, the computation uses the 

6636 Kolmogorov-Smirnov distributions to compute an approximate value. 

6637 

6638 The 'two-sided' 'exact' computation computes the complementary probability 

6639 and then subtracts from 1. As such, the minimum probability it can return 

6640 is about 1e-16. While the algorithm itself is exact, numerical 

6641 errors may accumulate for large sample sizes. It is most suited to 

6642 situations in which one of the sample sizes is only a few thousand. 

6643 

6644 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_. 

6645 

6646 References 

6647 ---------- 

6648 .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov 

6649 Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. 

6650 

6651 

6652 Examples 

6653 -------- 

6654 >>> from scipy import stats 

6655 >>> np.random.seed(12345678)  # fix random seed to get the same result

6656 >>> n1 = 200 # size of first sample 

6657 >>> n2 = 300 # size of second sample 

6658 

6659 For a different distribution, we can reject the null hypothesis since the 

6660 pvalue is below 1%: 

6661 

6662 >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) 

6663 >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) 

6664 >>> stats.ks_2samp(rvs1, rvs2) 

6665 (0.20833333333333334, 5.129279597781977e-05) 

6666 

6667 For a slightly different distribution, we cannot reject the null hypothesis 

6668 at a 10% or lower alpha since the p-value, about 0.147, is higher than 10%:

6669 

6670 >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) 

6671 >>> stats.ks_2samp(rvs1, rvs3) 

6672 (0.10333333333333333, 0.14691437867433876) 

6673 

6674 For an identical distribution, we cannot reject the null hypothesis since 

6675 the p-value is high, 41%: 

6676 

6677 >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) 

6678 >>> stats.ks_2samp(rvs1, rvs4) 

6679 (0.07999999999999996, 0.41126949729859719) 

6680 
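The p-value method can also be selected explicitly through `mode`; a
minimal sketch using the asymptotic approximation on the samples above
(result not shown):

>>> res = stats.ks_2samp(rvs1, rvs4, mode='asymp')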

6681 """ 

6682 if mode not in ['auto', 'exact', 'asymp']: 

6683 raise ValueError(f'Invalid value for mode: {mode}') 

6684 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

6685 alternative.lower()[0], alternative) 

6686 if alternative not in ['two-sided', 'less', 'greater']: 

6687 raise ValueError(f'Invalid value for alternative: {alternative}') 

6688 MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N 

6689 if np.ma.is_masked(data1): 

6690 data1 = data1.compressed() 

6691 if np.ma.is_masked(data2): 

6692 data2 = data2.compressed() 

6693 data1 = np.sort(data1) 

6694 data2 = np.sort(data2) 

6695 n1 = data1.shape[0] 

6696 n2 = data2.shape[0] 

6697 if min(n1, n2) == 0: 

6698 raise ValueError('Data passed to ks_2samp must not be empty') 

6699 

6700 data_all = np.concatenate([data1, data2]) 

6701 # using searchsorted solves equal data problem 

6702 cdf1 = np.searchsorted(data1, data_all, side='right') / n1 

6703 cdf2 = np.searchsorted(data2, data_all, side='right') / n2 

6704 cddiffs = cdf1 - cdf2 

6705 minS = -np.min(cddiffs) 

6706 maxS = np.max(cddiffs) 

6707 alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': max(minS, maxS)} 

6708 d = alt2Dvalue[alternative] 

6709 g = gcd(n1, n2) 

6710 n1g = n1 // g 

6711 n2g = n2 // g 

6712 prob = -np.inf 

6713 original_mode = mode 

6714 if mode == 'auto': 

6715 mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp' 

6716 elif mode == 'exact': 

6717 # If lcm(n1, n2) is too big, switch from exact to asymp 

6718 if n1g >= np.iinfo(int).max / n2g:

6719 mode = 'asymp' 

6720 warnings.warn( 

6721 "Exact ks_2samp calculation not possible with samples sizes " 

6722 "%d and %d. Switching to 'asymp' " % (n1, n2), RuntimeWarning) 

6723 

6724 if mode == 'exact': 

6725 success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative) 

6726 if not success: 

6727 mode = 'asymp' 

6728 if original_mode == 'exact': 

6729 warnings.warn(f"ks_2samp: Exact calculation unsuccessful. " 

6730 f"Switching to mode={mode}.", RuntimeWarning) 

6731 

6732 if mode == 'asymp': 

6733 # The product n1*n2 is large. Use Smirnov's asymptotic formula.

6734 if alternative == 'two-sided': 

6735 en = n1 * n2 / (n1 + n2) 

6736 prob = distributions.kstwo.sf(d, np.round(en)) 

6737 else: 

6738 m, n = max(n1, n2), min(n1, n2) 

6739 z = np.sqrt(m*n/(m+n)) * d 

6740 # Use Hodges' suggested approximation Eqn 5.3 

6741 expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0 

6742 prob = np.exp(expt) 

6743 

6744 prob = np.clip(prob, 0, 1) 

6745 return KstestResult(d, prob) 

6746 

6747 

6748def _parse_kstest_args(data1, data2, args, N): 

6749 # kstest allows many different variations of arguments. 

6750 # Pull out the parsing into a separate function 

6751 # (xvals, yvals, ) # 2sample 

6752 # (xvals, cdf function,..) 

6753 # (xvals, name of distribution, ...) 

6754 # (name of distribution, name of distribution, ...) 

6755 

6756 # Returns xvals, yvals, cdf 

6757 # where cdf is a cdf function, or None 

6758 # and yvals is either an array_like of values, or None 

6759 # and xvals is array_like. 

6760 rvsfunc, cdf = None, None 

6761 if isinstance(data1, str): 

6762 rvsfunc = getattr(distributions, data1).rvs 

6763 elif callable(data1): 

6764 rvsfunc = data1 

6765 

6766 if isinstance(data2, str): 

6767 cdf = getattr(distributions, data2).cdf 

6768 data2 = None 

6769 elif callable(data2): 

6770 cdf = data2 

6771 data2 = None 

6772 

6773 data1 = np.sort(rvsfunc(*args, size=N) if rvsfunc else data1) 

6774 return data1, data2, cdf 

6775 

6776 

6777def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='auto'): 

6778 """ 

6779 Performs the (one-sample or two-sample) Kolmogorov-Smirnov test for goodness of fit.

6780 

6781 The one-sample test performs a test of the distribution F(x) of an observed 

6782 random variable against a given distribution G(x). Under the null 

6783 hypothesis, the two distributions are identical, F(x)=G(x). The 

6784 alternative hypothesis can be either 'two-sided' (default), 'less' 

6785 or 'greater'. The KS test is only valid for continuous distributions. 

6786 The two-sample test tests whether the two independent samples are drawn 

6787 from the same continuous distribution. 

6788 

6789 Parameters 

6790 ---------- 

6791 rvs : str, array_like, or callable 

6792 If an array, it should be a 1-D array of observations of random 

6793 variables. 

6794 If a callable, it should be a function to generate random variables; 

6795 it is required to have a keyword argument `size`. 

6796 If a string, it should be the name of a distribution in `scipy.stats`, 

6797 which will be used to generate random variables. 

6798 cdf : str, array_like or callable 

6799 If array_like, it should be a 1-D array of observations of random 

6800 variables, and the two-sample test is performed (and rvs must be array_like) 

6801 If a callable, that callable is used to calculate the cdf. 

6802 If a string, it should be the name of a distribution in `scipy.stats`, 

6803 which will be used as the cdf function. 

6804 args : tuple, sequence, optional 

6805 Distribution parameters, used if `rvs` or `cdf` are strings or callables. 

6806 N : int, optional 

6807 Sample size if `rvs` is string or callable. Default is 20. 

6808 alternative : {'two-sided', 'less', 'greater'}, optional 

6809 Defines the alternative hypothesis. 

6810 The following options are available (default is 'two-sided'): 

6811 

6812 * 'two-sided' 

6813 * 'less': one-sided, see explanation in Notes 

6814 * 'greater': one-sided, see explanation in Notes 

6815 mode : {'auto', 'exact', 'approx', 'asymp'}, optional 

6816 Defines the distribution used for calculating the p-value. 

6817 The following options are available (default is 'auto'): 

6818 

6819 * 'auto' : selects one of the other options. 

6820 * 'exact' : uses the exact distribution of test statistic. 

6821 * 'approx' : approximates the two-sided probability with twice the one-sided probability 

6822 * 'asymp': uses asymptotic distribution of test statistic 

6823 

6824 Returns 

6825 ------- 

6826 statistic : float 

6827 KS test statistic, either D, D+ or D-. 

6828 pvalue : float 

6829 One-tailed or two-tailed p-value. 

6830 

6831 See Also 

6832 -------- 

6833 ks_2samp 

6834 

6835 Notes 

6836 ----- 

6837 In the one-sided test, the alternative is that the empirical 

6838 cumulative distribution function of the random variable is "less" 

6839 or "greater" than the cumulative distribution function G(x) of the 

6840 hypothesis, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. 

6841 

6842 Examples 

6843 -------- 

6844 >>> from scipy import stats 

6845 

6846 >>> x = np.linspace(-15, 15, 9) 

6847 >>> stats.kstest(x, 'norm') 

6848 (0.44435602715924361, 0.038850142705171065) 

6849 

6850 >>> np.random.seed(987654321) # set random seed to get the same result 

6851 >>> stats.kstest(stats.norm.rvs(size=100), stats.norm.cdf) 

6852 (0.058352892479417884, 0.8653960860778898) 

6853 

6854 The above lines are equivalent to: 

6855 

6856 >>> np.random.seed(987654321) 

6857 >>> stats.kstest(stats.norm.rvs, 'norm', N=100) 

6858 (0.058352892479417884, 0.8653960860778898) 

6859 

6860 *Test against one-sided alternative hypothesis* 

6861 

6862 Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``: 

6863 

6864 >>> np.random.seed(987654321) 

6865 >>> x = stats.norm.rvs(loc=0.2, size=100) 

6866 >>> stats.kstest(x, 'norm', alternative='less') 

6867 (0.12464329735846891, 0.040989164077641749) 

6868 

6869 Reject equal distribution against alternative hypothesis: less 

6870 

6871 >>> stats.kstest(x, 'norm', alternative='greater') 

6872 (0.0072115233216311081, 0.98531158590396395) 

6873 

6874 Don't reject equal distribution against alternative hypothesis: greater 

6875 

6876 >>> stats.kstest(x, 'norm') 

6877 (0.12464329735846891, 0.08197335233541582) 

6878 

6879 *Testing t distributed random variables against normal distribution* 

6880 

6881 With 100 degrees of freedom the t distribution looks close to the normal 

6882 distribution, and the K-S test does not reject the hypothesis that the 

6883 sample came from the normal distribution: 

6884 

6885 >>> np.random.seed(987654321) 

6886 >>> stats.kstest(stats.t.rvs(100, size=100), 'norm') 

6887 (0.072018929165471257, 0.6505883498379312) 

6888 

6889 With 3 degrees of freedom the t distribution looks sufficiently different 

6890 from the normal distribution, that we can reject the hypothesis that the 

6891 sample came from the normal distribution at the 10% level: 

6892 

6893 >>> np.random.seed(987654321) 

6894 >>> stats.kstest(stats.t.rvs(3, size=100), 'norm') 

6895 (0.131016895759829, 0.058826222555312224) 

6896 
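*Two-sample test via kstest*

As described above, passing a second sample as `cdf` performs the
two-sample test. A minimal sketch against a hypothetical second sample
(result not shown):

>>> np.random.seed(987654321)
>>> sample2 = stats.norm.rvs(loc=0.5, size=100)
>>> res = stats.kstest(x, sample2)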

6897 """ 

6898 # to not break compatibility with existing code 

6899 if alternative == 'two_sided': 

6900 alternative = 'two-sided' 

6901 if alternative not in ['two-sided', 'greater', 'less']: 

6902 raise ValueError("Unexpected alternative %s" % alternative) 

6903 xvals, yvals, cdf = _parse_kstest_args(rvs, cdf, args, N) 

6904 if cdf: 

6905 return ks_1samp(xvals, cdf, args=args, alternative=alternative, mode=mode) 

6906 return ks_2samp(xvals, yvals, alternative=alternative, mode=mode) 

6907 

6908 

6909def tiecorrect(rankvals): 

6910 """ 

6911 Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests. 

6912 

6913 Parameters 

6914 ---------- 

6915 rankvals : array_like 

6916 A 1-D sequence of ranks. Typically this will be the array 

6917 returned by `~scipy.stats.rankdata`. 

6918 

6919 Returns 

6920 ------- 

6921 factor : float 

6922 Correction factor for U or H. 

6923 

6924 See Also 

6925 -------- 

6926 rankdata : Assign ranks to the data 

6927 mannwhitneyu : Mann-Whitney rank test 

6928 kruskal : Kruskal-Wallis H test 

6929 

6930 References 

6931 ---------- 

6932 .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral 

6933 Sciences. New York: McGraw-Hill. 

6934 

6935 Examples 

6936 -------- 

6937 >>> from scipy.stats import tiecorrect, rankdata 

6938 >>> tiecorrect([1, 2.5, 2.5, 4]) 

6939 0.9 

6940 >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4]) 

6941 >>> ranks 

6942 array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5]) 

6943 >>> tiecorrect(ranks) 

6944 0.9833333333333333 

6945 
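The first value can be checked by hand; with a single tie of size 2
among 4 ranks, the correction implemented here is
``1 - (2**3 - 2) / (4**3 - 4)``:

>>> 1 - (2**3 - 2) / (4**3 - 4)
0.9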

6946 """ 

6947 arr = np.sort(rankvals) 

6948 idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0] 

6949 cnt = np.diff(idx).astype(np.float64) 

6950 

6951 size = np.float64(arr.size) 

6952 return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size) 

6953 

6954 

6955MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue')) 

6956 

6957 

6958def mannwhitneyu(x, y, use_continuity=True, alternative=None): 

6959 """ 

6960 Compute the Mann-Whitney rank test on samples x and y. 

6961 

6962 Parameters 

6963 ---------- 

6964 x, y : array_like 

6965 Array of samples, should be one-dimensional. 

6966 use_continuity : bool, optional 

6967 Whether a continuity correction (1/2.) should be taken into 

6968 account. Default is True. 

6969 alternative : {None, 'two-sided', 'less', 'greater'}, optional 

6970 Defines the alternative hypothesis. 

6971 The following options are available (default is None): 

6972 

6973 * None: computes p-value half the size of the 'two-sided' p-value and 

6974 a different U statistic. The default behavior is not the same as 

6975 using 'less' or 'greater'; it only exists for backward compatibility 

6976 and is deprecated. 

6977 * 'two-sided' 

6978 * 'less': one-sided 

6979 * 'greater': one-sided 

6980 

6981 Use of the None option is deprecated. 

6982 

6983 Returns 

6984 ------- 

6985 statistic : float 

6986 The Mann-Whitney U statistic, equal to min(U for x, U for y) if 

6987 `alternative` is equal to None (deprecated; exists for backward 

6988 compatibility), and U for y otherwise. 

6989 pvalue : float 

6990 p-value assuming an asymptotic normal distribution. One-sided or 

6991 two-sided, depending on the choice of `alternative`. 

6992 

6993 Notes 

6994 ----- 

6995 Use only when the number of observations in each sample is > 20 and

6996 you have 2 independent samples of ranks. Mann-Whitney U is

6997 significant if the U obtained is LESS THAN or equal to the critical

6998 value of U.

6999 

7000 This test corrects for ties and by default uses a continuity correction. 

7001 

7002 References 

7003 ---------- 

7004 .. [1] https://en.wikipedia.org/wiki/Mann-Whitney_U_test 

7005 

7006 .. [2] H.B. Mann and D.R. Whitney, "On a Test of Whether one of Two Random 

7007 Variables is Stochastically Larger than the Other," The Annals of 

7008 Mathematical Statistics, vol. 18, no. 1, pp. 50-60, 1947. 

7009 
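Examples
--------
A minimal usage sketch with illustrative values (results not shown):

>>> from scipy import stats
>>> x = [2.1, 3.5, 1.8, 4.2, 2.9]
>>> y = [5.0, 6.3, 4.8, 7.1, 5.9]
>>> u, p = stats.mannwhitneyu(x, y, alternative='two-sided')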

7010 """ 

7011 if alternative is None: 

7012 warnings.warn("Calling `mannwhitneyu` without specifying " 

7013 "`alternative` is deprecated.", DeprecationWarning) 

7014 

7015 x = np.asarray(x) 

7016 y = np.asarray(y) 

7017 n1 = len(x) 

7018 n2 = len(y) 

7019 ranked = rankdata(np.concatenate((x, y))) 

7020 rankx = ranked[0:n1] # get the x-ranks 

7021 u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx, axis=0) # calc U for x 

7022 u2 = n1*n2 - u1 # remainder is U for y 

7023 T = tiecorrect(ranked) 

7024 if T == 0: 

7025 raise ValueError('All numbers are identical in mannwhitneyu') 

7026 sd = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0) 

7027 

7028 meanrank = n1*n2/2.0 + 0.5 * use_continuity 

7029 if alternative is None or alternative == 'two-sided': 

7030 bigu = max(u1, u2) 

7031 elif alternative == 'less': 

7032 bigu = u1 

7033 elif alternative == 'greater': 

7034 bigu = u2 

7035 else: 

7036 raise ValueError("alternative should be None, 'less', 'greater' " 

7037 "or 'two-sided'") 

7038 

7039 z = (bigu - meanrank) / sd 

7040 if alternative is None: 

7041 # This behavior, equal to half the size of the two-sided 

7042 # p-value, is deprecated. 

7043 p = distributions.norm.sf(abs(z)) 

7044 elif alternative == 'two-sided': 

7045 p = 2 * distributions.norm.sf(abs(z)) 

7046 else: 

7047 p = distributions.norm.sf(z) 

7048 

7049 u = u2 

7050 # This behavior is deprecated. 

7051 if alternative is None: 

7052 u = min(u1, u2) 

7053 return MannwhitneyuResult(u, p) 

7054 

7055 

7056RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue')) 

7057 

7058 

7059def ranksums(x, y): 

7060 """ 

7061 Compute the Wilcoxon rank-sum statistic for two samples. 

7062 

7063 The Wilcoxon rank-sum test tests the null hypothesis that two sets 

7064 of measurements are drawn from the same distribution. The alternative 

7065 hypothesis is that values in one sample are more likely to be 

7066 larger than the values in the other sample. 

7067 

7068 This test should be used to compare two samples from continuous 

7069 distributions. It does not handle ties between measurements 

7070 in x and y. For tie-handling and an optional continuity correction 

7071 see `scipy.stats.mannwhitneyu`. 

7072 

7073 Parameters 

7074 ---------- 

7075 x,y : array_like 

7076 The data from the two samples. 

7077 

7078 Returns 

7079 ------- 

7080 statistic : float 

7081 The test statistic under the large-sample approximation that the 

7082 rank sum statistic is normally distributed. 

7083 pvalue : float 

7084 The two-sided p-value of the test. 

7085 

7086 References 

7087 ---------- 

7088 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test 

7089 

7090 Examples 

7091 -------- 

7092 We can test the hypothesis that two independent unequal-sized samples are 

7093 drawn from the same distribution by computing the Wilcoxon rank-sum

7094 statistic. 

7095 

7096 >>> from scipy.stats import ranksums 

7097 >>> sample1 = np.random.uniform(-1, 1, 200) 

7098 >>> sample2 = np.random.uniform(-0.5, 1.5, 300) # a shifted distribution 

7099 >>> ranksums(sample1, sample2) 

7100 RanksumsResult(statistic=-7.887059, pvalue=3.09390448e-15) # may vary 

7101 

7102 The p-value of less than ``0.05`` indicates that this test rejects the 

7103 hypothesis at the 5% significance level. 

7104 

7105 """ 

7106 x, y = map(np.asarray, (x, y)) 

7107 n1 = len(x) 

7108 n2 = len(y) 

7109 alldata = np.concatenate((x, y)) 

7110 ranked = rankdata(alldata) 

7111 x = ranked[:n1] 

7112 s = np.sum(x, axis=0) 

7113 expected = n1 * (n1+n2+1) / 2.0 

7114 z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0) 

7115 prob = 2 * distributions.norm.sf(abs(z)) 

7116 

7117 return RanksumsResult(z, prob) 

7118 

7119 

7120KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue')) 

7121 

7122 

7123def kruskal(*args, **kwargs): 

7124 """ 

7125 Compute the Kruskal-Wallis H-test for independent samples. 

7126 

7127 The Kruskal-Wallis H-test tests the null hypothesis that the population 

7128 medians of all of the groups are equal. It is a non-parametric version of

7129 ANOVA. The test works on 2 or more independent samples, which may have 

7130 different sizes. Note that rejecting the null hypothesis does not 

7131 indicate which of the groups differs. Post hoc comparisons between 

7132 groups are required to determine which groups are different. 

7133 

7134 Parameters 

7135 ---------- 

7136 sample1, sample2, ... : array_like 

7137 Two or more arrays with the sample measurements can be given as 

7138 arguments. 

7139 nan_policy : {'propagate', 'raise', 'omit'}, optional 

7140 Defines how to handle when input contains nan. 

7141 The following options are available (default is 'propagate'): 

7142 

7143 * 'propagate': returns nan 

7144 * 'raise': throws an error 

7145 * 'omit': performs the calculations ignoring nan values 

7146 

7147 Returns 

7148 ------- 

7149 statistic : float 

7150 The Kruskal-Wallis H statistic, corrected for ties. 

7151 pvalue : float 

7152 The p-value for the test using the assumption that H has a chi 

7153 square distribution. 

7154 

7155 See Also 

7156 -------- 

7157 f_oneway : 1-way ANOVA. 

7158 mannwhitneyu : Mann-Whitney rank test on two samples. 

7159 friedmanchisquare : Friedman test for repeated measurements. 

7160 

7161 Notes 

7162 ----- 

7163 Due to the assumption that H has a chi square distribution, the number 

7164 of samples in each group must not be too small. A typical rule is 

7165 that each sample must have at least 5 measurements. 

7166 

7167 References 

7168 ---------- 

7169 .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in 

7170 One-Criterion Variance Analysis", Journal of the American Statistical 

7171 Association, Vol. 47, Issue 260, pp. 583-621, 1952. 

7172 .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance 

7173 

7174 Examples 

7175 -------- 

7176 >>> from scipy import stats 

7177 >>> x = [1, 3, 5, 7, 9] 

7178 >>> y = [2, 4, 6, 8, 10] 

7179 >>> stats.kruskal(x, y) 

7180 KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895) 

7181 

7182 >>> x = [1, 1, 1] 

7183 >>> y = [2, 2, 2] 

7184 >>> z = [2, 2] 

7185 >>> stats.kruskal(x, y, z) 

7186 KruskalResult(statistic=7.0, pvalue=0.0301973834223185) 

7187 
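Missing values can be ignored with ``nan_policy='omit'`` (a minimal
sketch; result not shown):

>>> x = [1, 3, 5, 7, 9, np.nan]
>>> y = [2, 4, 6, 8, 10]
>>> res = stats.kruskal(x, y, nan_policy='omit')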

7188 """ 

7189 args = list(map(np.asarray, args)) 

7190 num_groups = len(args) 

7191 if num_groups < 2: 

7192 raise ValueError("Need at least two groups in stats.kruskal()") 

7193 

7194 for arg in args: 

7195 if arg.size == 0: 

7196 return KruskalResult(np.nan, np.nan) 

7197 n = np.asarray(list(map(len, args))) 

7198 

7199 if 'nan_policy' in kwargs.keys(): 

7200 if kwargs['nan_policy'] not in ('propagate', 'raise', 'omit'): 

7201 raise ValueError("nan_policy must be 'propagate', " 

7202 "'raise' or 'omit'")

7203 else: 

7204 nan_policy = kwargs['nan_policy'] 

7205 else: 

7206 nan_policy = 'propagate' 

7207 

7208 contains_nan = False 

7209 for arg in args: 

7210 cn = _contains_nan(arg, nan_policy) 

7211 if cn[0]: 

7212 contains_nan = True 

7213 break 

7214 

7215 if contains_nan and nan_policy == 'omit':

7216 # Mask invalid entries so the masked-array implementation ignores them.

7217 args = [ma.masked_invalid(a) for a in args]

7218 return mstats_basic.kruskal(*args)

7219 

7220 if contains_nan and nan_policy == 'propagate': 

7221 return KruskalResult(np.nan, np.nan) 

7222 

7223 alldata = np.concatenate(args) 

7224 ranked = rankdata(alldata) 

7225 ties = tiecorrect(ranked) 

7226 if ties == 0: 

7227 raise ValueError('All numbers are identical in kruskal') 

7228 

7229 # Compute sum^2/n for each group and sum 

7230 j = np.insert(np.cumsum(n), 0, 0) 

7231 ssbn = 0 

7232 for i in range(num_groups): 

7233 ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i] 

7234 

7235 totaln = np.sum(n, dtype=float) 

7236 h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) 

7237 df = num_groups - 1 

7238 h /= ties 

7239 

7240 return KruskalResult(h, distributions.chi2.sf(h, df)) 

7241 

7242 

7243FriedmanchisquareResult = namedtuple('FriedmanchisquareResult', 

7244 ('statistic', 'pvalue')) 

7245 

7246 

7247def friedmanchisquare(*args): 

7248 """ 

7249 Compute the Friedman test for repeated measurements. 

7250 

7251 The Friedman test tests the null hypothesis that repeated measurements of 

7252 the same individuals have the same distribution. It is often used 

7253 to test for consistency among measurements obtained in different ways. 

7254 For example, if two measurement techniques are used on the same set of 

7255 individuals, the Friedman test can be used to determine if the two 

7256 measurement techniques are consistent. 

7257 

7258 Parameters 

7259 ---------- 

7260 measurements1, measurements2, measurements3... : array_like 

7261 Arrays of measurements. All of the arrays must have the same number 

7262 of elements. At least 3 sets of measurements must be given. 

7263 

7264 Returns 

7265 ------- 

7266 statistic : float 

7267 The test statistic, correcting for ties. 

7268 pvalue : float 

7269 The associated p-value assuming that the test statistic has a chi 

7270 squared distribution. 

7271 

7272 Notes 

7273 ----- 

7274 Due to the assumption that the test statistic has a chi squared 

7275 distribution, the p-value is only reliable for n > 10 and more than 

7276 6 repeated measurements. 

7277 

7278 References 

7279 ---------- 

7280 .. [1] https://en.wikipedia.org/wiki/Friedman_test 

7281 
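Examples
--------
A minimal usage sketch with three hypothetical measurements on six
individuals (results not shown):

>>> from scipy import stats
>>> m1 = [7.0, 9.9, 8.5, 5.1, 10.3, 8.7]
>>> m2 = [5.3, 5.7, 4.7, 3.5, 7.7, 6.0]
>>> m3 = [4.9, 7.6, 5.5, 2.8, 8.4, 6.1]
>>> stat, p = stats.friedmanchisquare(m1, m2, m3)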

7282 """ 

7283 k = len(args) 

7284 if k < 3: 

7285 raise ValueError('Less than 3 levels. Friedman test not appropriate.') 

7286 

7287 n = len(args[0]) 

7288 for i in range(1, k): 

7289 if len(args[i]) != n: 

7290 raise ValueError('Unequal N in friedmanchisquare. Aborting.') 

7291 

7292 # Rank data 

7293 data = np.vstack(args).T 

7294 data = data.astype(float) 

7295 for i in range(len(data)): 

7296 data[i] = rankdata(data[i]) 

7297 

7298 # Handle ties 

7299 ties = 0 

7300 for i in range(len(data)): 

7301 replist, repnum = find_repeats(array(data[i])) 

7302 for t in repnum: 

7303 ties += t * (t*t - 1) 

7304 c = 1 - ties / (k*(k*k - 1)*n) 

7305 

7306 ssbn = np.sum(data.sum(axis=0)**2) 

7307 chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c 

7308 

7309 return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1)) 

7310 

7311 

7312BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', 

7313 ('statistic', 'pvalue')) 

7314 

7315 

7316def brunnermunzel(x, y, alternative="two-sided", distribution="t", 

7317 nan_policy='propagate'): 

7318 """ 

7319 Compute the Brunner-Munzel test on samples x and y. 

7320 

7321 The Brunner-Munzel test is a nonparametric test of the null hypothesis that 

7322 when values are taken one by one from each group, the probabilities of 

7323 getting large values in both groups are equal. 

7324 Unlike the Wilcoxon-Mann-Whitney U test, this does not require the

7325 assumption of equal variances in the two groups. Note that this does not

7326 assume the distributions are the same. This test works on two independent

7327 samples, which may have different sizes.

7328 

7329 Parameters 

7330 ---------- 

7331 x, y : array_like 

7332 Array of samples, should be one-dimensional. 

7333 alternative : {'two-sided', 'less', 'greater'}, optional 

7334 Defines the alternative hypothesis. 

7335 The following options are available (default is 'two-sided'): 

7336 

7337 * 'two-sided' 

7338 * 'less': one-sided 

7339 * 'greater': one-sided 

7340 distribution : {'t', 'normal'}, optional 

7341 Defines how to get the p-value. 

7342 The following options are available (default is 't'): 

7343 

7344 * 't': get the p-value by t-distribution 

7345 * 'normal': get the p-value by standard normal distribution. 

7346 nan_policy : {'propagate', 'raise', 'omit'}, optional 

7347 Defines how to handle when input contains nan. 

7348 The following options are available (default is 'propagate'): 

7349 

7350 * 'propagate': returns nan 

7351 * 'raise': throws an error 

7352 * 'omit': performs the calculations ignoring nan values 

7353 

7354 Returns 

7355 ------- 

7356 statistic : float 

7357 The Brunner-Munzel W statistic.

7358 pvalue : float 

7359 p-value assuming a t distribution. One-sided or

7360 two-sided, depending on the choice of `alternative` and `distribution`. 

7361 

7362 See Also 

7363 -------- 

7364 mannwhitneyu : Mann-Whitney rank test on two samples. 

7365 

7366 Notes 

7367 ----- 

7368 Brunner and Munzel recommend estimating the p-value by the t-distribution

7369 when the sample size is 50 or less. If the size is lower than 10, it is

7370 better to use the permuted Brunner-Munzel test (see [2]_).

7371 

7372 References 

7373 ---------- 

7374 .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher

7375 problem: Asymptotic theory and a small-sample approximation". 

7376 Biometrical Journal. Vol. 42(2000): 17-25. 

7377 .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the 

7378 non-parametric Behrens-Fisher problem". Computational Statistics and 

7379 Data Analysis. Vol. 51(2007): 5192-5204. 

7380 

7381 Examples 

7382 -------- 

7383 >>> from scipy import stats 

7384 >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1] 

7385 >>> x2 = [3,3,4,3,1,2,3,1,1,5,4] 

7386 >>> w, p_value = stats.brunnermunzel(x1, x2) 

7387 >>> w 

7388 3.1374674823029505 

7389 >>> p_value 

7390 0.0057862086661515377 

7391 

7392 """ 

7393 x = np.asarray(x) 

7394 y = np.asarray(y) 

7395 

7396 # check both x and y 

7397 cnx, npx = _contains_nan(x, nan_policy) 

7398 cny, npy = _contains_nan(y, nan_policy) 

7399 contains_nan = cnx or cny 

7400 if npx == "omit" or npy == "omit": 

7401 nan_policy = "omit" 

7402 

7403 if contains_nan and nan_policy == "propagate": 

7404 return BrunnerMunzelResult(np.nan, np.nan) 

7405 elif contains_nan and nan_policy == "omit": 

7406 x = ma.masked_invalid(x) 

7407 y = ma.masked_invalid(y) 

7408 return mstats_basic.brunnermunzel(x, y, alternative, distribution) 

7409 

7410 nx = len(x) 

7411 ny = len(y) 

7412 if nx == 0 or ny == 0: 

7413 return BrunnerMunzelResult(np.nan, np.nan) 

7414 rankc = rankdata(np.concatenate((x, y))) 

7415 rankcx = rankc[0:nx] 

7416 rankcy = rankc[nx:nx+ny] 

7417 rankcx_mean = np.mean(rankcx) 

7418 rankcy_mean = np.mean(rankcy) 

7419 rankx = rankdata(x) 

7420 ranky = rankdata(y) 

7421 rankx_mean = np.mean(rankx) 

7422 ranky_mean = np.mean(ranky) 

7423 

7424 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) 

7425 Sx /= nx - 1 

7426 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) 

7427 Sy /= ny - 1 

7428 

7429 wbfn = nx * ny * (rankcy_mean - rankcx_mean) 

7430 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) 

7431 

7432 if distribution == "t": 

7433 df_numer = np.power(nx * Sx + ny * Sy, 2.0) 

7434 df_denom = np.power(nx * Sx, 2.0) / (nx - 1) 

7435 df_denom += np.power(ny * Sy, 2.0) / (ny - 1) 

7436 df = df_numer / df_denom 

7437 p = distributions.t.cdf(wbfn, df) 

7438 elif distribution == "normal": 

7439 p = distributions.norm.cdf(wbfn) 

7440 else: 

7441 raise ValueError( 

7442 "distribution should be 't' or 'normal'") 

7443 

7444 if alternative == "greater": 

7445 pass 

7446 elif alternative == "less": 

7447 p = 1 - p 

7448 elif alternative == "two-sided": 

7449 p = 2 * np.min([p, 1-p]) 

7450 else: 

7451 raise ValueError( 

7452 "alternative should be 'less', 'greater' or 'two-sided'") 

7453 

7454 return BrunnerMunzelResult(wbfn, p) 

7455 

7456 

7457def combine_pvalues(pvalues, method='fisher', weights=None): 

7458 """ 

7459 Combine p-values from independent tests bearing upon the same hypothesis. 

7460 

7461 Parameters 

7462 ---------- 

7463 pvalues : array_like, 1-D 

7464 Array of p-values assumed to come from independent tests. 

7465 method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'}, optional 

7466 Name of method to use to combine p-values. 

7467 The following methods are available (default is 'fisher'): 

7468 

7469 * 'fisher': Fisher's method (Fisher's combined probability test), the 

7470 sum of the logarithm of the p-values 

7471 * 'pearson': Pearson's method (similar to Fisher's but uses sum of the 

7472 complement of the p-values inside the logarithms) 

7473 * 'tippett': Tippett's method (minimum of p-values) 

7474 * 'stouffer': Stouffer's Z-score method 

7475 * 'mudholkar_george': the difference of Fisher's and Pearson's methods 

7476 divided by 2 

7477 weights : array_like, 1-D, optional 

7478 Optional array of weights used only for Stouffer's Z-score method. 

7479 

7480 Returns 

7481 ------- 

7482 statistic: float 

7483 The statistic calculated by the specified method. 

7484 pval: float 

7485 The combined p-value. 

7486 

7487 Notes 

7488 ----- 

7489 Fisher's method (also known as Fisher's combined probability test) [1]_ uses 

7490 a chi-squared statistic to compute a combined p-value. The closely related 

7491 Stouffer's Z-score method [2]_ uses Z-scores rather than p-values. The 

7492 advantage of Stouffer's method is that it is straightforward to introduce 

7493 weights, which can make Stouffer's method more powerful than Fisher's 

7494 method when the p-values are from studies of different size [6]_ [7]_. 

7495 Pearson's method uses :math:`log(1-p_i)` inside the sum whereas Fisher's

7496 method uses :math:`log(p_i)` [4]_. For Fisher's and Pearson's methods, the

7497 sum of the logarithms is multiplied by -2 in the implementation. This

7498 quantity has a chi-square distribution that determines the p-value. The

7499 `mudholkar_george` method is the difference of Fisher's and Pearson's

7500 test statistics, each of which includes the -2 factor [4]_. However, the

7501 `mudholkar_george` method does not include these -2 factors. The test

7502 statistic of `mudholkar_george` is the sum of logistic random variables and

7503 equation 3.6 in [3]_ is used to approximate the p-value based on Student's

7504 t-distribution.

7505 

7506 Fisher's method may be extended to combine p-values from dependent tests 

7507 [5]_. Extensions such as Brown's method and Kost's method are not currently 

7508 implemented. 

7509 

7510 .. versionadded:: 0.15.0 

7511 

7512 References 

7513 ---------- 

7514 .. [1] https://en.wikipedia.org/wiki/Fisher%27s_method 

7515 .. [2] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method 

7516 .. [3] George, E. O., and G. S. Mudholkar. "On the convolution of logistic 

7517 random variables." Metrika 30.1 (1983): 1-13. 

7518 .. [4] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of 

7519 combining p-values." Biometrika 105.1 (2018): 239-246. 

7520 .. [5] Whitlock, M. C. "Combining probability from independent tests: the 

7521 weighted Z-method is superior to Fisher's approach." Journal of 

7522 Evolutionary Biology 18, no. 5 (2005): 1368-1373. 

7523 .. [6] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method 

7524 for combining probabilities in meta-analysis." Journal of 

7525 Evolutionary Biology 24, no. 8 (2011): 1836-1841. 

7526 .. [7] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method 

7527 
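Examples
--------
A minimal sketch with Fisher's method; as noted above, the statistic is
-2 times the sum of the log p-values:

>>> from scipy import stats
>>> pvalues = [0.01, 0.2, 0.3]
>>> stat, p = stats.combine_pvalues(pvalues, method='fisher')
>>> bool(np.isclose(stat, -2 * np.sum(np.log(pvalues))))
True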

7528 """ 

7529 pvalues = np.asarray(pvalues) 

7530 if pvalues.ndim != 1: 

7531 raise ValueError("pvalues is not 1-D") 

7532 

7533 if method == 'fisher': 

7534 statistic = -2 * np.sum(np.log(pvalues)) 

7535 pval = distributions.chi2.sf(statistic, 2 * len(pvalues)) 

7536 elif method == 'pearson': 

7537 statistic = -2 * np.sum(np.log1p(-pvalues)) 

7538 pval = distributions.chi2.sf(statistic, 2 * len(pvalues)) 

7539 elif method == 'mudholkar_george': 

7540 statistic = -np.sum(np.log(pvalues)) + np.sum(np.log1p(-pvalues)) 

7541 nu = 5 * len(pvalues) + 4 

7542 approx_factor = np.sqrt(nu / (nu - 2)) 

7543 pval = distributions.t.sf(statistic * approx_factor, nu) 

7544 elif method == 'tippett': 

7545 statistic = np.min(pvalues) 

7546 pval = distributions.beta.sf(statistic, 1, len(pvalues)) 

7547 elif method == 'stouffer': 

7548 if weights is None: 

7549 weights = np.ones_like(pvalues) 

7550 elif len(weights) != len(pvalues): 

7551 raise ValueError("pvalues and weights must be of the same size.") 

7552 

7553 weights = np.asarray(weights) 

7554 if weights.ndim != 1: 

7555 raise ValueError("weights is not 1-D") 

7556 

7557 Zi = distributions.norm.isf(pvalues) 

7558 statistic = np.dot(weights, Zi) / np.linalg.norm(weights) 

7559 pval = distributions.norm.sf(statistic) 

7560 

7561 else: 

7562 raise ValueError( 

7563 "Invalid method '%s'. Options are 'fisher', 'pearson', "

7564 "'mudholkar_george', 'tippett', or 'stouffer'." % method)

7565 

7566 return (statistic, pval) 

7567 

7568 

7569##################################### 

7570# STATISTICAL DISTANCES # 

7571##################################### 

7572 

7573def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None): 

7574 r""" 

7575 Compute the first Wasserstein distance between two 1D distributions. 

7576 

7577 This distance is also known as the earth mover's distance, since it can be 

7578 seen as the minimum amount of "work" required to transform :math:`u` into 

7579 :math:`v`, where "work" is measured as the amount of distribution weight 

7580 that must be moved, multiplied by the distance it has to be moved. 

7581 

7582 .. versionadded:: 1.0.0 

7583 

7584 Parameters 

7585 ---------- 

7586 u_values, v_values : array_like 

7587 Values observed in the (empirical) distribution. 

7588 u_weights, v_weights : array_like, optional 

7589 Weight for each value. If unspecified, each value is assigned the same 

7590 weight. 

7591 `u_weights` (resp. `v_weights`) must have the same length as 

7592 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

7593 must still be positive and finite so that the weights can be normalized 

7594 to sum to 1. 

7595 

7596 Returns 

7597 ------- 

7598 distance : float 

7599 The computed distance between the distributions. 

7600 

7601 Notes 

7602 ----- 

7603 The first Wasserstein distance between the distributions :math:`u` and 

7604 :math:`v` is: 

7605 

7606 .. math:: 

7607 

7608 l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times 

7609 \mathbb{R}} |x-y| \mathrm{d} \pi (x, y) 

7610 

7611 where :math:`\Gamma (u, v)` is the set of (probability) distributions on 

7612 :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and 

7613 :math:`v` on the first and second factors respectively. 

7614 

7615 If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and 

7616 :math:`v`, this distance also equals:

7617 

7618 .. math:: 

7619 

7620 l_1(u, v) = \int_{-\infty}^{+\infty} |U-V| 

7621 

7622 See [2]_ for a proof of the equivalence of both definitions. 

7623 

7624 The input distributions can be empirical, therefore coming from samples 

7625 whose values are effectively inputs of the function, or they can be seen as 

7626 generalized functions, in which case they are weighted sums of Dirac delta 

7627 functions located at the specified values. 

7628 

7629 References 

7630 ---------- 

7631 .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric 

7632 .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related 

7633 Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`. 

7634 

7635 Examples 

7636 -------- 

7637 >>> from scipy.stats import wasserstein_distance 

7638 >>> wasserstein_distance([0, 1, 3], [5, 6, 8]) 

7639 5.0 

7640 >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2]) 

7641 0.25 

7642 >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4], 

7643 ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5]) 

7644 4.0781331438047861 

7645 

7646 """ 

7647 return _cdf_distance(1, u_values, v_values, u_weights, v_weights) 

7648 

7649 

7650def energy_distance(u_values, v_values, u_weights=None, v_weights=None): 

7651 r""" 

7652 Compute the energy distance between two 1D distributions. 

7653 

7654 .. versionadded:: 1.0.0 

7655 

7656 Parameters 

7657 ---------- 

7658 u_values, v_values : array_like 

7659 Values observed in the (empirical) distribution. 

7660 u_weights, v_weights : array_like, optional 

7661 Weight for each value. If unspecified, each value is assigned the same 

7662 weight. 

7663 `u_weights` (resp. `v_weights`) must have the same length as 

7664 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

7665 must still be positive and finite so that the weights can be normalized 

7666 to sum to 1. 

7667 

7668 Returns 

7669 ------- 

7670 distance : float 

7671 The computed distance between the distributions. 

7672 

7673 Notes 

7674 ----- 

7675 The energy distance between two distributions :math:`u` and :math:`v`, whose 

7676 respective CDFs are :math:`U` and :math:`V`, equals:

7677 

7678 .. math:: 

7679 

7680 D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| - 

7681 \mathbb E|Y - Y'| \right)^{1/2} 

7682 

7683 where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are 

7684 independent random variables whose probability distribution is :math:`u` 

7685 (resp. :math:`v`). 

7686 

7687 As shown in [2]_, for one-dimensional real-valued variables, the energy 

7688 distance is linked to the non-distribution-free version of the Cramer-von 

7689 Mises distance: 

7690 

7691 .. math:: 

7692 

7693 D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2 

7694 \right)^{1/2} 

7695 

7696 Note that the common Cramer-von Mises criterion uses the distribution-free 

7697 version of the distance. See [2]_ (section 2), for more details about both 

7698 versions of the distance. 

7699 

7700 The input distributions can be empirical, therefore coming from samples 

7701 whose values are effectively inputs of the function, or they can be seen as 

7702 generalized functions, in which case they are weighted sums of Dirac delta 

7703 functions located at the specified values. 

7704 

7705 References 

7706 ---------- 

7707 .. [1] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance 

7708 .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling 

7709 Green State University, Department of Mathematics and Statistics, 

7710 Technical Report 02-16 (2002). 

7711 .. [3] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews: 

7712 Computational Statistics, 8(1):27-38 (2015). 

7713 .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, 

7714 Munos "The Cramer Distance as a Solution to Biased Wasserstein 

7715 Gradients" (2017). :arXiv:`1705.10743`. 

7716 

7717 Examples 

7718 -------- 

7719 >>> from scipy.stats import energy_distance 

7720 >>> energy_distance([0], [2]) 

7721 2.0000000000000004 

7722 >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2]) 

7723 1.0000000000000002 

7724 >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ], 

7725 ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8]) 

7726 0.88003340976158217 

7727 

7728 """ 

7729 return np.sqrt(2) * _cdf_distance(2, u_values, v_values, 

7730 u_weights, v_weights) 

7731 

7732 

7733def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None): 

7734 r""" 

7735 Compute, between two one-dimensional distributions :math:`u` and 

7736 :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the 

7737 statistical distance that is defined as: 

7738 

7739 .. math:: 

7740 

7741 l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} 

7742 

7743 p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2 

7744 gives the energy distance (up to the factor of sqrt(2) applied in `energy_distance`).

7745 

7746 Parameters 

7747 ---------- 

7748 u_values, v_values : array_like 

7749 Values observed in the (empirical) distribution. 

7750 u_weights, v_weights : array_like, optional 

7751 Weight for each value. If unspecified, each value is assigned the same 

7752 weight. 

7753 `u_weights` (resp. `v_weights`) must have the same length as 

7754 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

7755 must still be positive and finite so that the weights can be normalized 

7756 to sum to 1. 

7757 

7758 Returns 

7759 ------- 

7760 distance : float 

7761 The computed distance between the distributions. 

7762 

7763 Notes 

7764 ----- 

7765 The input distributions can be empirical, i.e. coming from samples 

7766 whose values are passed directly to the function, or they can be seen as 

7767 generalized functions, in which case they are weighted sums of Dirac delta 

7768 functions located at the specified values. 

7769 

7770 References 

7771 ---------- 

7772 .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, 

7773 Munos "The Cramer Distance as a Solution to Biased Wasserstein 

7774 Gradients" (2017). :arXiv:`1705.10743`. 

7775 
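 Examples
 --------
 Illustrative only: this is a private helper, so the calls below are a
 minimal sketch of its behavior (printed precision may vary slightly).

 >>> _cdf_distance(1, [0, 1], [0, 2])
 0.5
 >>> _cdf_distance(2, [0], [2])
 1.4142135623730951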

7776 """ 

7777 u_values, u_weights = _validate_distribution(u_values, u_weights) 

7778 v_values, v_weights = _validate_distribution(v_values, v_weights) 

7779 

7780 u_sorter = np.argsort(u_values) 

7781 v_sorter = np.argsort(v_values) 

7782 

7783 all_values = np.concatenate((u_values, v_values)) 

7784 all_values.sort(kind='mergesort') 

7785 

7786 # Compute the differences between pairs of successive values of u and v. 

7787 deltas = np.diff(all_values) 

7788 

7789 # Get the respective positions of the values of u and v among the values of 

7790 # both distributions. 
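 # ('right' side: for each interval boundary, count how many u (resp. v)
 # observations are less than or equal to that boundary value.)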

7791 u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right') 

7792 v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right') 

7793 

7794 # Calculate the CDFs of u and v using their weights, if specified. 
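 # (With weights, the empirical CDF at each boundary is the cumulative weight
 # of the observations at or below it, divided by the total weight.)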

7795 if u_weights is None: 

7796 u_cdf = u_cdf_indices / u_values.size 

7797 else: 

7798 u_sorted_cumweights = np.concatenate(([0], 

7799 np.cumsum(u_weights[u_sorter]))) 

7800 u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1] 

7801 

7802 if v_weights is None: 

7803 v_cdf = v_cdf_indices / v_values.size 

7804 else: 

7805 v_sorted_cumweights = np.concatenate(([0], 

7806 np.cumsum(v_weights[v_sorter]))) 

7807 v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1] 

7808 

7809 # Compute the value of the integral based on the CDFs. 

7810 # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead 

7811 # of about 15%. 

7812 if p == 1: 

7813 return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas)) 

7814 if p == 2: 

7815 return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas))) 

7816 return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), 

7817 deltas)), 1/p) 

7818 

7819 

7820def _validate_distribution(values, weights): 

7821 """ 

7822 Validate the values and weights from a distribution input of `_cdf_distance` 

7823 and return them as ndarray objects. 

7824 

7825 Parameters 

7826 ---------- 

7827 values : array_like 

7828 Values observed in the (empirical) distribution. 

7829 weights : array_like 

7830 Weight for each value. 

7831 

7832 Returns 

7833 ------- 

7834 values : ndarray 

7835 Values as ndarray. 

7836 weights : ndarray 

7837 Weights as ndarray. 

7838 
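 Examples
 --------
 Illustrative only (private helper): values, and weights if given, are cast
 to float ndarrays after validation, while ``None`` weights stay ``None``.

 >>> _validate_distribution([1, 2, 3], None)
 (array([1., 2., 3.]), None)
 >>> _validate_distribution([1, 2], [0.25, 0.75])
 (array([1., 2.]), array([0.25, 0.75]))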

7839 """ 

7840 # Validate the value array. 

7841 values = np.asarray(values, dtype=float) 

7842 if len(values) == 0: 

7843 raise ValueError("Distribution can't be empty.") 

7844 

7845 # Validate the weight array, if specified. 

7846 if weights is not None: 

7847 weights = np.asarray(weights, dtype=float) 

7848 if len(weights) != len(values): 

7849 raise ValueError('Value and weight array-likes for the same ' 

7850 'empirical distribution must be of the same size.') 

7851 if np.any(weights < 0): 

7852 raise ValueError('All weights must be non-negative.') 

7853 if not 0 < np.sum(weights) < np.inf: 

7854 raise ValueError('Weight array-like sum must be positive and ' 

7855 'finite. Set as None for an equal distribution of ' 

7856 'weight.') 

7857 

7858 return values, weights 

7859 

7860 return values, None 

7861 

7862 

7863##################################### 

7864# SUPPORT FUNCTIONS # 

7865##################################### 

7866 

7867RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts')) 

7868 

7869 

7870def find_repeats(arr): 

7871 """ 

7872 Find repeats and repeat counts. 

7873 

7874 Parameters 

7875 ---------- 

7876 arr : array_like 

7877 Input array. This is cast to float64. 

7878 

7879 Returns 

7880 ------- 

7881 values : ndarray 

7882 The unique values from the (flattened) input that are repeated. 

7883 

7884 counts : ndarray 

7885 Number of times the corresponding 'value' is repeated. 

7886 

7887 Notes 

7888 ----- 

7889 In numpy >= 1.9 `numpy.unique` provides similar functionality. The main 

7890 difference is that `find_repeats` only returns repeated values. 

7891 

7892 Examples 

7893 -------- 

7894 >>> from scipy import stats 

7895 >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) 

7896 RepeatedResults(values=array([2.]), counts=array([4])) 

7897 

7898 >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) 

7899 RepeatedResults(values=array([4., 5.]), counts=array([2, 2])) 

7900 
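 For comparison, `numpy.unique` returns every unique value together with its
 count, not only the repeated ones (illustrative):

 >>> import numpy as np
 >>> np.unique([2, 1, 2, 3, 2, 2, 5], return_counts=True)
 (array([1, 2, 3, 5]), array([1, 4, 1, 1]))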

7901 """ 

7902 # Note: always copies. 

7903 return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64))) 

7904 

7905 

7906def _sum_of_squares(a, axis=0): 

7907 """ 

7908 Square each element of the input array, and return the sum(s) of that. 

7909 

7910 Parameters 

7911 ---------- 

7912 a : array_like 

7913 Input array. 

7914 axis : int or None, optional 

7915 Axis along which to calculate. Default is 0. If None, compute over 

7916 the whole array `a`. 

7917 

7918 Returns 

7919 ------- 

7920 sum_of_squares : ndarray 

7921 The sum along the given axis of ``a**2``. 

7922 

7923 See Also 

7924 -------- 

7925 _square_of_sums : The square(s) of the sum(s) (the opposite of 

7926 `_sum_of_squares`). 

7927 
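 Examples
 --------
 Illustrative only (private helper):

 >>> _sum_of_squares([1, 2, 3])
 14
 >>> _sum_of_squares([[1, 2], [3, 4]], axis=None)
 30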

7928 """ 

7929 a, axis = _chk_asarray(a, axis) 

7930 return np.sum(a*a, axis) 

7931 

7932 

7933def _square_of_sums(a, axis=0): 

7934 """ 

7935 Sum elements of the input array, and return the square(s) of that sum. 

7936 

7937 Parameters 

7938 ---------- 

7939 a : array_like 

7940 Input array. 

7941 axis : int or None, optional 

7942 Axis along which to calculate. Default is 0. If None, compute over 

7943 the whole array `a`. 

7944 

7945 Returns 

7946 ------- 

7947 square_of_sums : float or ndarray 

7948 The square of the sum over `axis`. 

7949 

7950 See Also 

7951 -------- 

7952 _sum_of_squares : The sum of squares (the opposite of `_square_of_sums`). 

7953 
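 Examples
 --------
 Illustrative only (private helper):

 >>> _square_of_sums([1, 2, 3])
 36.0
 >>> _square_of_sums([[1, 2], [3, 4]])
 array([16., 36.])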

7954 """ 

7955 a, axis = _chk_asarray(a, axis) 

7956 s = np.sum(a, axis) 

7957 if not np.isscalar(s): 

7958 return s.astype(float) * s 

7959 else: 

7960 return float(s) * s 

7961 

7962 

7963def rankdata(a, method='average', *, axis=None): 

7964 """ 

7965 Assign ranks to data, dealing with ties appropriately. 

7966 

7967 By default (``axis=None``), the data array is first flattened, and a flat 

7968 array of ranks is returned. Separately reshape the rank array to the 

7969 shape of the data array if desired (see Examples). 

7970 

7971 Ranks begin at 1. The `method` argument controls how ranks are assigned 

7972 to equal values. See [1]_ for further discussion of ranking methods. 

7973 

7974 Parameters 

7975 ---------- 

7976 a : array_like 

7977 The array of values to be ranked. 

7978 method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional 

7979 The method used to assign ranks to tied elements. 

7980 The following methods are available (default is 'average'): 

7981 

7982 * 'average': The average of the ranks that would have been assigned to 

7983 all the tied values is assigned to each value. 

7984 * 'min': The minimum of the ranks that would have been assigned to all 

7985 the tied values is assigned to each value. (This is also 

7986 referred to as "competition" ranking.) 

7987 * 'max': The maximum of the ranks that would have been assigned to all 

7988 the tied values is assigned to each value. 

7989 * 'dense': Like 'min', but the rank of the next highest element is 

7990 assigned the rank immediately after those assigned to the tied 

7991 elements. 

7992 * 'ordinal': All values are given a distinct rank, corresponding to 

7993 the order that the values occur in `a`. 

7994 axis : {None, int}, optional 

7995 Axis along which to perform the ranking. If ``None``, the data array 

7996 is first flattened. 

7997 

7998 Returns 

7999 ------- 

8000 ranks : ndarray 

8001 An array of size equal to the size of `a`, containing rank 

8002 scores. 

8003 

8004 References 

8005 ---------- 

8006 .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking 

8007 

8008 Examples 

8009 -------- 

8010 >>> from scipy.stats import rankdata 

8011 >>> rankdata([0, 2, 3, 2]) 

8012 array([ 1. , 2.5, 4. , 2.5]) 

8013 >>> rankdata([0, 2, 3, 2], method='min') 

8014 array([ 1, 2, 4, 2]) 

8015 >>> rankdata([0, 2, 3, 2], method='max') 

8016 array([ 1, 3, 4, 3]) 

8017 >>> rankdata([0, 2, 3, 2], method='dense') 

8018 array([ 1, 2, 3, 2]) 

8019 >>> rankdata([0, 2, 3, 2], method='ordinal') 

8020 array([ 1, 2, 4, 3]) 

8021 >>> rankdata([[0, 2], [3, 2]]).reshape(2,2) 

8022 array([[1. , 2.5], 

8023 [4. , 2.5]]) 

8024 >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1) 

8025 array([[1. , 2.5, 2.5], 

8026 [2. , 1. , 3. ]]) 

8027 """ 

8028 if method not in ('average', 'min', 'max', 'dense', 'ordinal'): 

8029 raise ValueError('unknown method "{0}"'.format(method)) 

8030 

8031 if axis is not None: 

8032 a = np.asarray(a) 

8033 if a.size == 0: 

8034 # The return values of `normalize_axis_index` are ignored. The 

8035 # call validates `axis`, even though we won't use it. 

8036 # use scipy._lib._util._normalize_axis_index when available 

8037 np.core.multiarray.normalize_axis_index(axis, a.ndim) 

8038 dt = np.float64 if method == 'average' else np.int_ 

8039 return np.empty(a.shape, dtype=dt) 

8040 return np.apply_along_axis(rankdata, axis, a, method) 

8041 

8042 arr = np.ravel(np.asarray(a)) 

8043 algo = 'mergesort' if method == 'ordinal' else 'quicksort' 

8044 sorter = np.argsort(arr, kind=algo) 

8045 

8046 inv = np.empty(sorter.size, dtype=np.intp) 

8047 inv[sorter] = np.arange(sorter.size, dtype=np.intp) 

8048 

8049 if method == 'ordinal': 

8050 return inv + 1 

8051 
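 # After sorting, `obs` flags the first occurrence of each distinct value;
 # its cumulative sum, mapped back through `inv`, gives dense ranks.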

8052 arr = arr[sorter] 

8053 obs = np.r_[True, arr[1:] != arr[:-1]] 

8054 dense = obs.cumsum()[inv] 

8055 

8056 if method == 'dense': 

8057 return dense 

8058 

8059 # cumulative counts of each unique value 

8060 count = np.r_[np.nonzero(obs)[0], len(obs)] 

8061 

8062 if method == 'max': 

8063 return count[dense] 

8064 

8065 if method == 'min': 

8066 return count[dense - 1] + 1 

8067 

8068 # average method 

8069 return .5 * (count[dense] + count[dense - 1] + 1)