
1import math 

2import warnings 

3from collections import namedtuple 

4 

5import numpy as np 

6from numpy import (isscalar, r_, log, around, unique, asarray, 

7 zeros, arange, sort, amin, amax, any, atleast_1d, 

8 sqrt, ceil, floor, array, compress, 

9 pi, exp, ravel, count_nonzero, sin, cos, arctan2, hypot) 

10 

11from scipy import optimize 

12from scipy import special 

13from . import statlib 

14from . import stats 

15from .stats import find_repeats, _contains_nan 

16from .contingency import chi2_contingency 

17from . import distributions 

18from ._distn_infrastructure import rv_generic 

19from ._hypotests import _get_wilcoxon_distr 

20 

21 

22__all__ = ['mvsdist', 

23 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot', 

24 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', 

25 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', 

26 'fligner', 'mood', 'wilcoxon', 'median_test', 

27 'circmean', 'circvar', 'circstd', 'anderson_ksamp', 

28 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax', 

29 'yeojohnson_normplot' 

30 ] 

31 

32 

33Mean = namedtuple('Mean', ('statistic', 'minmax')) 

34Variance = namedtuple('Variance', ('statistic', 'minmax')) 

35Std_dev = namedtuple('Std_dev', ('statistic', 'minmax')) 

36 

37 

38def bayes_mvs(data, alpha=0.90): 

39 r""" 

40 Bayesian confidence intervals for the mean, variance, and standard deviation.

41 

42 Parameters 

43 ---------- 

44 data : array_like 

45 Input data; if multi-dimensional, it is flattened to 1-D by `bayes_mvs`.

46 Requires 2 or more data points. 

47 alpha : float, optional 

48 Probability that the returned confidence interval contains 

49 the true parameter. 

50 

51 Returns 

52 ------- 

53 mean_cntr, var_cntr, std_cntr : tuple 

54 The three results are for the mean, variance and standard deviation, 

55 respectively. Each result is a tuple of the form:: 

56 

57 (center, (lower, upper)) 

58 

59 with `center` the mean of the conditional pdf of the value given the 

60 data, and `(lower, upper)` a confidence interval, centered on the 

61 median, containing the estimate to a probability ``alpha``. 

62 

63 See Also 

64 -------- 

65 mvsdist 

66 

67 Notes 

68 ----- 

69 Each tuple of mean, variance, and standard deviation estimates represents

70 (center, (lower, upper)), with center the mean of the conditional pdf

71 of the value given the data, and (lower, upper) a confidence interval

72 centered on the median, containing the estimate to a probability

73 ``alpha``.

74 

75 Converts data to 1-D and assumes all data has the same mean and variance. 

76 Uses Jeffreys prior for variance and std.

77 

78 Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``

79 

80 References 

81 ---------- 

82 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and 

83 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278, 

84 2006. 

85 

86 Examples 

87 -------- 

88 First a basic example to demonstrate the outputs: 

89 

90 >>> from scipy import stats 

91 >>> data = [6, 9, 12, 7, 8, 8, 13] 

92 >>> mean, var, std = stats.bayes_mvs(data) 

93 >>> mean 

94 Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467)) 

95 >>> var 

96 Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...)) 

97 >>> std 

98 Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631)) 

99 

100 Now we generate some normally distributed random data, and get estimates of 

101 mean and standard deviation with 95% confidence intervals for those 

102 estimates: 

103 

104 >>> n_samples = 100000 

105 >>> data = stats.norm.rvs(size=n_samples) 

106 >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95) 

107 

108 >>> import matplotlib.pyplot as plt 

109 >>> fig = plt.figure() 

110 >>> ax = fig.add_subplot(111) 

111 >>> ax.hist(data, bins=100, density=True, label='Histogram of data') 

112 >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean') 

113 >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r', 

114 ... alpha=0.2, label=r'Estimated mean (95% limits)') 

115 >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale') 

116 >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2, 

117 ... label=r'Estimated scale (95% limits)') 

118 

119 >>> ax.legend(fontsize=10) 

120 >>> ax.set_xlim([-4, 4]) 

121 >>> ax.set_ylim([0, 0.5]) 

122 >>> plt.show() 

123 

124 """ 

125 m, v, s = mvsdist(data) 

126 if alpha >= 1 or alpha <= 0: 

127 raise ValueError("0 < alpha < 1 is required, but alpha=%s was given." 

128 % alpha) 

129 

130 m_res = Mean(m.mean(), m.interval(alpha)) 

131 v_res = Variance(v.mean(), v.interval(alpha)) 

132 s_res = Std_dev(s.mean(), s.interval(alpha)) 

133 

134 return m_res, v_res, s_res 

135 

136 

137def mvsdist(data): 

138 """ 

139 'Frozen' distributions for mean, variance, and standard deviation of data. 

140 

141 Parameters 

142 ---------- 

143 data : array_like 

144 Input array. Converted to 1-D using ravel. 

145 Requires 2 or more data-points. 

146 

147 Returns 

148 ------- 

149 mdist : "frozen" distribution object 

150 Distribution object representing the mean of the data. 

151 vdist : "frozen" distribution object 

152 Distribution object representing the variance of the data. 

153 sdist : "frozen" distribution object 

154 Distribution object representing the standard deviation of the data. 

155 

156 See Also 

157 -------- 

158 bayes_mvs 

159 

160 Notes 

161 ----- 

162The return values from ``bayes_mvs(data)`` are equivalent to

163 ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. 

164 

165 In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)`` 

166 on the three distribution objects returned from this function will give 

167 the same results that are returned from `bayes_mvs`. 

168 

169 References 

170 ---------- 

171 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and 

172 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278, 

173 2006. 

174 

175 Examples 

176 -------- 

177 >>> from scipy import stats 

178 >>> data = [6, 9, 12, 7, 8, 8, 13] 

179 >>> mean, var, std = stats.mvsdist(data) 

180 

181 We now have frozen distribution objects "mean", "var" and "std" that we can 

182 examine: 

183 

184 >>> mean.mean() 

185 9.0 

186 >>> mean.interval(0.95) 

187 (6.6120585482655692, 11.387941451734431) 

188 >>> mean.std() 

189 1.1952286093343936 

190 

191 """ 

192 x = ravel(data) 

193 n = len(x) 

194 if n < 2: 

195 raise ValueError("Need at least 2 data-points.") 

196 xbar = x.mean() 

197 C = x.var() 

198 if n > 1000: # gaussian approximations for large n 

199 mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n)) 

200 sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n))) 

201 vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C) 

202 else: 

203 nm1 = n - 1 

204 fac = n * C / 2. 

205 val = nm1 / 2. 

206 mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1)) 

207 sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac)) 

208 vdist = distributions.invgamma(val, scale=fac) 

209 return mdist, vdist, sdist 

210 

211 
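# --- Illustrative sketch (editor's addition, not part of the original module).
# The Notes of `bayes_mvs` and `mvsdist` state that the two functions are
# equivalent up to calling ``.mean()`` and ``.interval(alpha)`` on the frozen
# distributions.  A minimal check of that claim; the helper name
# `_demo_bayes_mvs_equivalence` is hypothetical.

def _demo_bayes_mvs_equivalence():
    data = [6, 9, 12, 7, 8, 8, 13]
    via_bayes = bayes_mvs(data, alpha=0.9)
    via_mvsdist = tuple((d.mean(), d.interval(0.9)) for d in mvsdist(data))
    for (stat_a, ci_a), (stat_b, ci_b) in zip(via_bayes, via_mvsdist):
        # Same center and the same (lower, upper) interval from both paths.
        assert np.isclose(stat_a, stat_b)
        assert np.allclose(ci_a, ci_b)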

212def kstat(data, n=2): 

213 r""" 

214 Return the nth k-statistic (1<=n<=4 so far). 

215 

216 The nth k-statistic k_n is the unique symmetric unbiased estimator of the 

217 nth cumulant kappa_n. 

218 

219 Parameters 

220 ---------- 

221 data : array_like 

222 Input array. Note that n-D input gets flattened. 

223 n : int, {1, 2, 3, 4}, optional 

224 Default is equal to 2. 

225 

226 Returns 

227 ------- 

228 kstat : float 

229 The nth k-statistic. 

230 

231 See Also 

232 -------- 

233 kstatvar: Returns an unbiased estimator of the variance of the k-statistic. 

234 moment: Returns the n-th central moment about the mean for a sample. 

235 

236 Notes 

237 ----- 

238 For a sample size n, the first few k-statistics are given by: 

239 

240 .. math:: 

241 

242 k_{1} = \mu 

243 k_{2} = \frac{n}{n-1} m_{2} 

244 k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3} 

245 k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)} 

246 

247 where :math:`\mu` is the sample mean, :math:`m_2` is the sample 

248 variance, and :math:`m_i` is the i-th sample central moment. 

249 

250 References 

251 ---------- 

252 http://mathworld.wolfram.com/k-Statistic.html 

253 

254 http://mathworld.wolfram.com/Cumulant.html 

255 

256 Examples 

257 -------- 

258 >>> from scipy import stats 

259 >>> rndm = np.random.RandomState(1234) 

260 

261 As sample size increases, n-th moment and n-th k-statistic converge to the 

262 same number (although they aren't identical). In the case of the normal 

263 distribution, they converge to zero. 

264 

265 >>> for n in [2, 3, 4, 5, 6, 7]: 

266 ... x = rndm.normal(size=10**n) 

267 ... m, k = stats.moment(x, 3), stats.kstat(x, 3) 

268 ... print("%.3g %.3g %.3g" % (m, k, m-k)) 

269 -0.631 -0.651 0.0194 

270 0.0282 0.0283 -8.49e-05 

271 -0.0454 -0.0454 1.36e-05 

272 7.53e-05 7.53e-05 -2.26e-09 

273 0.00166 0.00166 -4.99e-09 

274 -2.88e-06 -2.88e-06 8.63e-13 

275 """ 

276 if n > 4 or n < 1: 

277 raise ValueError("k-statistics only supported for 1<=n<=4") 

278 n = int(n) 

279 S = np.zeros(n + 1, np.float64) 

280 data = ravel(data) 

281 N = data.size 

282 

283 # raise ValueError on empty input 

284 if N == 0: 

285 raise ValueError("Data input must not be empty") 

286 

287 # on nan input, return nan without warning 

288 if np.isnan(np.sum(data)): 

289 return np.nan 

290 

291 for k in range(1, n + 1): 

292 S[k] = np.sum(data**k, axis=0) 

293 if n == 1: 

294 return S[1] * 1.0/N 

295 elif n == 2: 

296 return (N*S[2] - S[1]**2.0) / (N*(N - 1.0)) 

297 elif n == 3: 

298 return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0)) 

299 elif n == 4: 

300 return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - 

301 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / 

302 (N*(N-1.0)*(N-2.0)*(N-3.0))) 

303 else: 

304 raise ValueError("Should not be here.") 

305 

306 
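# --- Illustrative sketch (editor's addition, not part of the original module).
# The Notes of `kstat` give k_1 as the sample mean and k_2 = n/(n-1) * m_2,
# i.e. the unbiased sample variance.  A quick numerical check of those two
# identities; the helper name is hypothetical.

def _demo_kstat_low_order():
    rng = np.random.RandomState(0)
    x = rng.normal(size=50)
    # k_1 is the sample mean.
    assert np.isclose(kstat(x, n=1), np.mean(x))
    # k_2 is the unbiased sample variance (ddof=1).
    assert np.isclose(kstat(x, n=2), np.var(x, ddof=1))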

307def kstatvar(data, n=2): 

308 r""" 

309 Return an unbiased estimator of the variance of the k-statistic. 

310 

311 See `kstat` for more details of the k-statistic. 

312 

313 Parameters 

314 ---------- 

315 data : array_like 

316 Input array. Note that n-D input gets flattened. 

317 n : int, {1, 2}, optional 

318 Default is equal to 2. 

319 

320 Returns 

321 ------- 

322 kstatvar : float 

323 The nth k-statistic variance. 

324 

325 See Also 

326 -------- 

327 kstat: Returns the n-th k-statistic. 

328 moment: Returns the n-th central moment about the mean for a sample. 

329 

330 Notes 

331 ----- 

332 The variances of the first few k-statistics are given by: 

333 

334 .. math:: 

335 

336 var(k_{1}) = \frac{\kappa_2}{n}

337 var(k_{2}) = \frac{\kappa_4}{n} + \frac{2\kappa^2_{2}}{n - 1}

338 var(k_{3}) = \frac{\kappa_6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +

339 \frac{9 \kappa^2_{3}}{n - 1} +

340 \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}

341 var(k_{4}) = \frac{\kappa_8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +

342 \frac{48 \kappa_{3} \kappa_5}{n - 1} +

343 \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +

344 \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +

345 \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}

346 """ 

347 data = ravel(data) 

348 N = len(data) 

349 if n == 1: 

350 return kstat(data, n=2) * 1.0/N 

351 elif n == 2: 

352 k2 = kstat(data, n=2) 

353 k4 = kstat(data, n=4) 

354 return (2*N*k2**2 + (N-1)*k4) / (N*(N+1)) 

355 else: 

356 raise ValueError("Only n=1 or n=2 supported.") 

357 

358 
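# --- Illustrative sketch (editor's addition, not part of the original module).
# For n=1 the function returns k_2 / N, the usual unbiased estimate of
# var(sample mean), i.e. the squared standard error of the mean.  Sketch
# checking that correspondence; the helper name is hypothetical.

def _demo_kstatvar_sem():
    rng = np.random.RandomState(1)
    x = rng.normal(size=200)
    sem_squared = np.var(x, ddof=1) / len(x)
    assert np.isclose(kstatvar(x, n=1), sem_squared)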

359def _calc_uniform_order_statistic_medians(n): 

360 """ 

361 Approximations of uniform order statistic medians. 

362 

363 Parameters 

364 ---------- 

365 n : int 

366 Sample size. 

367 

368 Returns 

369 ------- 

370 v : 1d float array 

371 Approximations of the order statistic medians. 

372 

373 References 

374 ---------- 

375 .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient 

376 Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

377 

378 Examples 

379 -------- 

380 Order statistics of the uniform distribution on the unit interval 

381 are marginally distributed according to beta distributions. 

382 The expectations of these order statistics are evenly spaced across

383 the interval, but the distributions are skewed in a way that 

384 pushes the medians slightly towards the endpoints of the unit interval: 

385 

386 >>> n = 4 

387 >>> k = np.arange(1, n+1) 

388 >>> from scipy.stats import beta 

389 >>> a = k 

390 >>> b = n-k+1 

391 >>> beta.mean(a, b) 

392 array([ 0.2, 0.4, 0.6, 0.8]) 

393 >>> beta.median(a, b) 

394 array([ 0.15910358, 0.38572757, 0.61427243, 0.84089642]) 

395 

396 The Filliben approximation uses the exact medians of the smallest 

397 and greatest order statistics, and the remaining medians are approximated 

398 by points spread evenly across a sub-interval of the unit interval: 

399 

400 >>> from scipy.stats.morestats import _calc_uniform_order_statistic_medians

401 >>> _calc_uniform_order_statistic_medians(n) 

402 array([ 0.15910358, 0.38545246, 0.61454754, 0.84089642]) 

403 

404 This plot shows the skewed distributions of the order statistics 

405 of a sample of size four from a uniform distribution on the unit interval: 

406 

407 >>> import matplotlib.pyplot as plt 

408 >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True) 

409 >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)] 

410 >>> plt.figure() 

411 >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3]) 

412 

413 """ 

414 v = np.zeros(n, dtype=np.float64) 

415 v[-1] = 0.5**(1.0 / n) 

416 v[0] = 1 - v[-1] 

417 i = np.arange(2, n) 

418 v[1:-1] = (i - 0.3175) / (n + 0.365) 

419 return v 

420 

421 
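# --- Illustrative sketch (editor's addition, not part of the original module).
# The endpoints of the Filliben approximation are the exact medians of the
# extreme order statistics: the maximum of n uniforms has CDF m**n, so its
# median is 0.5**(1/n), and by symmetry the minimum has median 1 - 0.5**(1/n).
# Sketch comparing against the corresponding beta medians; the helper name is
# hypothetical.

def _demo_filliben_endpoints():
    from scipy.stats import beta
    n = 7
    v = _calc_uniform_order_statistic_medians(n)
    assert np.isclose(v[-1], beta.median(n, 1))  # largest order statistic
    assert np.isclose(v[0], beta.median(1, n))   # smallest order statistic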

422def _parse_dist_kw(dist, enforce_subclass=True): 

423 """Parse `dist` keyword. 

424 

425 Parameters 

426 ---------- 

427 dist : str or stats.distributions instance. 

428 Several functions take `dist` as a keyword, hence this utility 

429 function. 

430 enforce_subclass : bool, optional 

431 If True (default), `dist` needs to be a 

432 `_distn_infrastructure.rv_generic` instance. 

433 It can sometimes be useful to set this keyword to False, if a function 

434 wants to accept objects that just look somewhat like such an instance 

435 (for example, they have a ``ppf`` method). 

436 

437 """ 

438 if isinstance(dist, rv_generic): 

439 pass 

440 elif isinstance(dist, str): 

441 try: 

442 dist = getattr(distributions, dist) 

443 except AttributeError: 

444 raise ValueError("%s is not a valid distribution name" % dist) 

445 elif enforce_subclass: 

446 msg = ("`dist` should be a stats.distributions instance or a string " 

447 "with the name of such a distribution.") 

448 raise ValueError(msg) 

449 

450 return dist 

451 

452 
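# --- Illustrative sketch (editor's addition, not part of the original module).
# `_parse_dist_kw` resolves a distribution name to the corresponding
# distribution object, and with ``enforce_subclass=False`` it passes any
# duck-typed object (e.g. one that only has ``ppf``) through unchanged.
# The helper name below is hypothetical.

def _demo_parse_dist_kw():
    # A name resolves to the distribution instance in `scipy.stats`.
    assert _parse_dist_kw('norm') is distributions.norm

    class _HasPpf:
        def ppf(self, q):
            return q

    obj = _HasPpf()
    # Duck-typed objects are accepted when subclass checking is relaxed.
    assert _parse_dist_kw(obj, enforce_subclass=False) is obj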

453def _add_axis_labels_title(plot, xlabel, ylabel, title): 

454 """Helper function to add axes labels and a title to stats plots""" 

455 try: 

456 if hasattr(plot, 'set_title'): 

457 # Matplotlib Axes instance or something that looks like it 

458 plot.set_title(title) 

459 plot.set_xlabel(xlabel) 

460 plot.set_ylabel(ylabel) 

461 else: 

462 # matplotlib.pyplot module 

463 plot.title(title) 

464 plot.xlabel(xlabel) 

465 plot.ylabel(ylabel) 

466 except Exception: 

467 # Not an MPL object or something that looks (enough) like it. 

468 # Don't crash on adding labels or title 

469 pass 

470 

471 

472def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False): 

473 """ 

474 Calculate quantiles for a probability plot, and optionally show the plot. 

475 

476 Generates a probability plot of sample data against the quantiles of a 

477 specified theoretical distribution (the normal distribution by default). 

478 `probplot` optionally calculates a best-fit line for the data and plots the 

479 results using Matplotlib or a given plot function. 

480 

481 Parameters 

482 ---------- 

483 x : array_like 

484 Sample/response data from which `probplot` creates the plot. 

485 sparams : tuple, optional 

486 Distribution-specific shape parameters (shape parameters plus location 

487 and scale). 

488 dist : str or stats.distributions instance, optional 

489 Distribution or distribution function name. The default is 'norm' for a 

490 normal probability plot. Objects that look enough like a 

491 stats.distributions instance (i.e. they have a ``ppf`` method) are also 

492 accepted. 

493 fit : bool, optional 

494 Fit a least-squares regression (best-fit) line to the sample data if 

495 True (default). 

496 plot : object, optional 

497 If given, plots the quantiles and least squares fit. 

498 `plot` is an object that has to have methods "plot" and "text". 

499 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

500 or a custom object with the same methods. 

501 Default is None, which means that no plot is created.

rvalue : bool, optional
If `plot` is provided and `rvalue` is True, the coefficient of
determination is also displayed on the plot. Default is False.

502 

503 Returns 

504 ------- 

505 (osm, osr) : tuple of ndarrays 

506 Tuple of theoretical quantiles (osm, or order statistic medians) and 

507 ordered responses (osr). `osr` is simply sorted input `x`. 

508 For details on how `osm` is calculated see the Notes section. 

509 (slope, intercept, r) : tuple of floats, optional 

510 Tuple containing the result of the least-squares fit, if that is 

511 performed by `probplot`. `r` is the square root of the coefficient of 

512 determination. If ``fit=False`` and ``plot=None``, this tuple is not 

513 returned. 

514 

515 Notes 

516 ----- 

517 Even if `plot` is given, the figure is not shown or saved by `probplot`; 

518 ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after 

519 calling `probplot`. 

520 

521 `probplot` generates a probability plot, which should not be confused with 

522 a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this 

523 type, see ``statsmodels.api.ProbPlot``. 

524 

525 The formula used for the theoretical quantiles (horizontal axis of the 

526 probability plot) is Filliben's estimate:: 

527 

528 quantiles = dist.ppf(val), for 

529 

530 0.5**(1/n), for i = n 

531 val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 

532 1 - 0.5**(1/n), for i = 1 

533 

534 where ``i`` indicates the i-th ordered value and ``n`` is the total number 

535 of values. 

536 

537 Examples 

538 -------- 

539 >>> from scipy import stats 

540 >>> import matplotlib.pyplot as plt 

541 >>> nsample = 100 

542 >>> np.random.seed(7654321) 

543 

544 A t distribution with small degrees of freedom: 

545 

546 >>> ax1 = plt.subplot(221) 

547 >>> x = stats.t.rvs(3, size=nsample) 

548 >>> res = stats.probplot(x, plot=plt) 

549 

550 A t distribution with larger degrees of freedom: 

551 

552 >>> ax2 = plt.subplot(222) 

553 >>> x = stats.t.rvs(25, size=nsample) 

554 >>> res = stats.probplot(x, plot=plt) 

555 

556 A mixture of two normal distributions with broadcasting: 

557 

558 >>> ax3 = plt.subplot(223) 

559 >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], 

560 ... size=(nsample//2,2)).ravel() 

561 >>> res = stats.probplot(x, plot=plt) 

562 

563 A standard normal distribution: 

564 

565 >>> ax4 = plt.subplot(224) 

566 >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample) 

567 >>> res = stats.probplot(x, plot=plt) 

568 

569 Produce a new figure with a loggamma distribution, using the ``dist`` and 

570 ``sparams`` keywords: 

571 

572 >>> fig = plt.figure() 

573 >>> ax = fig.add_subplot(111) 

574 >>> x = stats.loggamma.rvs(c=2.5, size=500) 

575 >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) 

576 >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") 

577 

578 Show the results with Matplotlib: 

579 

580 >>> plt.show() 

581 

582 """ 

583 x = np.asarray(x) 

584 _perform_fit = fit or (plot is not None) 

585 if x.size == 0: 

586 if _perform_fit: 

587 return (x, x), (np.nan, np.nan, 0.0) 

588 else: 

589 return x, x 

590 

591 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

592 dist = _parse_dist_kw(dist, enforce_subclass=False) 

593 if sparams is None: 

594 sparams = () 

595 if isscalar(sparams): 

596 sparams = (sparams,) 

597 if not isinstance(sparams, tuple): 

598 sparams = tuple(sparams) 

599 

600 osm = dist.ppf(osm_uniform, *sparams) 

601 osr = sort(x) 

602 if _perform_fit: 

603 # perform a linear least squares fit. 

604 slope, intercept, r, prob, sterrest = stats.linregress(osm, osr) 

605 

606 if plot is not None: 

607 plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-') 

608 _add_axis_labels_title(plot, xlabel='Theoretical quantiles', 

609 ylabel='Ordered Values', 

610 title='Probability Plot') 

611 

612 # Add R^2 value to the plot as text 

613 if rvalue: 

614 xmin = amin(osm) 

615 xmax = amax(osm) 

616 ymin = amin(x) 

617 ymax = amax(x) 

618 posx = xmin + 0.70 * (xmax - xmin) 

619 posy = ymin + 0.01 * (ymax - ymin) 

620 plot.text(posx, posy, "$R^2=%1.4f$" % r**2) 

621 

622 if fit: 

623 return (osm, osr), (slope, intercept, r) 

624 else: 

625 return osm, osr 

626 

627 
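# --- Illustrative sketch (editor's addition, not part of the original module).
# The Notes of `probplot` describe the theoretical quantiles as Filliben's
# estimate: the chosen distribution's ppf evaluated at the uniform order
# statistic medians.  The first return value can be reproduced directly from
# the helper above; the demo name is hypothetical.

def _demo_probplot_quantiles():
    rng = np.random.RandomState(2)
    x = rng.normal(size=25)
    (osm, osr), _ = probplot(x, dist='norm')
    expected = distributions.norm.ppf(
        _calc_uniform_order_statistic_medians(len(x)))
    assert np.allclose(osm, expected)    # theoretical quantiles
    assert np.allclose(osr, np.sort(x))  # ordered responses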

628def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'): 

629 """ 

630 Calculate the shape parameter that maximizes the PPCC. 

631 

632 The probability plot correlation coefficient (PPCC) plot can be used to 

633 determine the optimal shape parameter for a one-parameter family of 

634 distributions. ppcc_max returns the shape parameter that would maximize the 

635 probability plot correlation coefficient for the given data to a 

636 one-parameter family of distributions. 

637 

638 Parameters 

639 ---------- 

640 x : array_like 

641 Input array. 

642 brack : tuple, optional 

643 Triple (a, b, c) where (a < b < c). If the bracket consists of two numbers (a, c),

644 then they are assumed to be a starting interval for a downhill bracket 

645 search (see `scipy.optimize.brent`). 

646 dist : str or stats.distributions instance, optional 

647 Distribution or distribution function name. Objects that look enough 

648 like a stats.distributions instance (i.e. they have a ``ppf`` method) 

649 are also accepted. The default is ``'tukeylambda'``. 

650 

651 Returns 

652 ------- 

653 shape_value : float 

654 The shape parameter at which the probability plot correlation 

655 coefficient reaches its max value. 

656 

657 See Also 

658 -------- 

659 ppcc_plot, probplot, boxcox 

660 

661 Notes 

662 ----- 

663 The `brack` keyword serves as a starting point, which is useful in corner

664 cases. One can use a plot to obtain a rough visual estimate of the location

665 of the maximum and start the search near it.

666 

667 References 

668 ---------- 

669 .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test for 

670 Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

671 

672 .. [2] https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm 

673 

674 Examples 

675 -------- 

676 First we generate some random data from a Tukey-Lambda distribution, 

677 with shape parameter -0.7: 

678 

679 >>> from scipy import stats 

680 >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000, 

681 ... random_state=1234567) + 1e4 

682 

683 Now we explore this data with a PPCC plot. A red line will be drawn where

684 we expect the PPCC value to be maximal (at the shape parameter -0.7 used

685 above):

687 

688 >>> import matplotlib.pyplot as plt 

689 >>> fig = plt.figure(figsize=(8, 6)) 

690 >>> ax = fig.add_subplot(111) 

691 >>> res = stats.ppcc_plot(x, -5, 5, plot=ax) 

692 

693 We calculate the value where the shape should reach its maximum and a red 

694 line is drawn there. The line should coincide with the highest point in the 

695 ppcc_plot. 

696 

697 >>> max = stats.ppcc_max(x) 

698 >>> ax.vlines(max, 0, 1, colors='r', label='Expected shape value') 

699 

700 >>> plt.show() 

701 

702 """ 

703 dist = _parse_dist_kw(dist) 

704 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

705 osr = sort(x) 

706 

707 # this function computes the x-axis values of the probability plot 

708 # and computes a linear regression (including the correlation) 

709 # and returns 1-r so that a minimization function maximizes the 

710 # correlation 

711 def tempfunc(shape, mi, yvals, func): 

712 xvals = func(mi, shape) 

713 r, prob = stats.pearsonr(xvals, yvals) 

714 return 1 - r 

715 

716 return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf)) 

717 

718 

719def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80): 

720 """ 

721 Calculate and optionally plot probability plot correlation coefficient. 

722 

723 The probability plot correlation coefficient (PPCC) plot can be used to 

724 determine the optimal shape parameter for a one-parameter family of 

725 distributions. It cannot be used for distributions without shape parameters 

726 (like the normal distribution) or with multiple shape parameters. 

727 

728 By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A 

729 Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed 

730 distributions via an approximately normal one, and is therefore particularly 

731 useful in practice. 

732 

733 Parameters 

734 ---------- 

735 x : array_like 

736 Input array. 

737 a, b : scalar 

738 Lower and upper bounds of the shape parameter to use. 

739 dist : str or stats.distributions instance, optional 

740 Distribution or distribution function name. Objects that look enough 

741 like a stats.distributions instance (i.e. they have a ``ppf`` method) 

742 are also accepted. The default is ``'tukeylambda'``. 

743 plot : object, optional 

744 If given, plots PPCC against the shape parameter. 

745 `plot` is an object that has to have methods "plot" and "text". 

746 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

747 or a custom object with the same methods. 

748 Default is None, which means that no plot is created. 

749 N : int, optional 

750 Number of points on the horizontal axis (equally distributed from 

751 `a` to `b`). 

752 

753 Returns 

754 ------- 

755 svals : ndarray 

756 The shape values for which `ppcc` was calculated. 

757 ppcc : ndarray 

758 The calculated probability plot correlation coefficient values. 

759 

760 See Also 

761 -------- 

762 ppcc_max, probplot, boxcox_normplot, tukeylambda 

763 

764 References 

765 ---------- 

766 J.J. Filliben, "The Probability Plot Correlation Coefficient Test for 

767 Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

768 

769 Examples 

770 -------- 

771 First we generate some random data from a Tukey-Lambda distribution, 

772 with shape parameter -0.7: 

773 

774 >>> from scipy import stats 

775 >>> import matplotlib.pyplot as plt 

776 >>> np.random.seed(1234567) 

777 >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000) + 1e4 

778 

779 Now we explore this data with a PPCC plot as well as the related 

780 probability plot and Box-Cox normplot. A red line is drawn where we 

781 expect the PPCC value to be maximal (at the shape parameter -0.7 used 

782 above): 

783 

784 >>> fig = plt.figure(figsize=(12, 4)) 

785 >>> ax1 = fig.add_subplot(131) 

786 >>> ax2 = fig.add_subplot(132) 

787 >>> ax3 = fig.add_subplot(133) 

788 >>> res = stats.probplot(x, plot=ax1) 

789 >>> res = stats.boxcox_normplot(x, -5, 5, plot=ax2) 

790 >>> res = stats.ppcc_plot(x, -5, 5, plot=ax3) 

791 >>> ax3.vlines(-0.7, 0, 1, colors='r', label='Expected shape value') 

792 >>> plt.show() 

793 

794 """ 

795 if b <= a: 

796 raise ValueError("`b` has to be larger than `a`.") 

797 

798 svals = np.linspace(a, b, num=N) 

799 ppcc = np.empty_like(svals) 

800 for k, sval in enumerate(svals): 

801 _, r2 = probplot(x, sval, dist=dist, fit=True) 

802 ppcc[k] = r2[-1] 

803 

804 if plot is not None: 

805 plot.plot(svals, ppcc, 'x') 

806 _add_axis_labels_title(plot, xlabel='Shape Values', 

807 ylabel='Prob Plot Corr. Coef.', 

808 title='(%s) PPCC Plot' % dist) 

809 

810 return svals, ppcc 

811 

812 

813def boxcox_llf(lmb, data): 

814 r"""The boxcox log-likelihood function. 

815 

816 Parameters 

817 ---------- 

818 lmb : scalar 

819 Parameter for Box-Cox transformation. See `boxcox` for details. 

820 data : array_like 

821 Data to calculate Box-Cox log-likelihood for. If `data` is 

822 multi-dimensional, the log-likelihood is calculated along the first 

823 axis. 

824 

825 Returns 

826 ------- 

827 llf : float or ndarray 

828 Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`, 

829 an array otherwise. 

830 

831 See Also 

832 -------- 

833 boxcox, probplot, boxcox_normplot, boxcox_normmax 

834 

835 Notes 

836 ----- 

837 The Box-Cox log-likelihood function is defined here as 

838 

839 .. math:: 

840 

841 llf = (\lambda - 1) \sum_i(\log(x_i)) - 

842 N/2 \log(\sum_i (y_i - \bar{y})^2 / N), 

843 

844 where ``y`` is the Box-Cox transformed input data ``x``. 

845 

846 Examples 

847 -------- 

848 >>> from scipy import stats 

849 >>> import matplotlib.pyplot as plt 

850 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes 

851 >>> np.random.seed(1245) 

852 

853 Generate some random variates and calculate Box-Cox log-likelihood values 

854 for them for a range of ``lmbda`` values: 

855 

856 >>> x = stats.loggamma.rvs(5, loc=10, size=1000) 

857 >>> lmbdas = np.linspace(-2, 10) 

858 >>> llf = np.zeros(lmbdas.shape, dtype=float) 

859 >>> for ii, lmbda in enumerate(lmbdas): 

860 ... llf[ii] = stats.boxcox_llf(lmbda, x) 

861 

862 Also find the optimal lmbda value with `boxcox`: 

863 

864 >>> x_most_normal, lmbda_optimal = stats.boxcox(x) 

865 

866 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a 

867 horizontal line to check that that's really the optimum: 

868 

869 >>> fig = plt.figure() 

870 >>> ax = fig.add_subplot(111) 

871 >>> ax.plot(lmbdas, llf, 'b.-') 

872 >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r') 

873 >>> ax.set_xlabel('lmbda parameter') 

874 >>> ax.set_ylabel('Box-Cox log-likelihood') 

875 

876 Now add some probability plots to show that where the log-likelihood is 

877 maximized the data transformed with `boxcox` looks closest to normal: 

878 

879 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' 

880 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): 

881 ... xt = stats.boxcox(x, lmbda=lmbda) 

882 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) 

883 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) 

884 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') 

885 ... ax_inset.set_xticklabels([]) 

886 ... ax_inset.set_yticklabels([]) 

887 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda) 

888 

889 >>> plt.show() 

890 

891 """ 

892 data = np.asarray(data) 

893 N = data.shape[0] 

894 if N == 0: 

895 return np.nan 

896 

897 logdata = np.log(data) 

898 

899 # Compute the variance of the transformed data. 

900 if lmb == 0: 

901 variance = np.var(logdata, axis=0) 

902 else: 

903 # Transform without the constant offset 1/lmb. The offset does 

904 # not affect the variance, and the subtraction of the offset can

905 # lead to loss of precision. 

906 variance = np.var(data**lmb / lmb, axis=0) 

907 

908 return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance) 

909 

910 
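# --- Illustrative sketch (editor's addition, not part of the original module).
# A direct (less numerically careful) transcription of the log-likelihood
# formula from the Notes of `boxcox_llf`, checked against the function; the
# demo name is hypothetical.

def _demo_boxcox_llf_formula(lmb=0.5):
    rng = np.random.RandomState(3)
    x = rng.gamma(shape=2.0, size=100)   # positive data
    y = (x**lmb - 1) / lmb if lmb != 0 else np.log(x)
    n = len(x)
    direct = ((lmb - 1) * np.sum(np.log(x))
              - n / 2 * np.log(np.sum((y - y.mean())**2) / n))
    assert np.isclose(direct, boxcox_llf(lmb, x))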

911def _boxcox_conf_interval(x, lmax, alpha): 

912 # Need to find the lambda for which 

913 # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 

914 fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) 

915 target = boxcox_llf(lmax, x) - fac 

916 

917 def rootfunc(lmbda, data, target): 

918 return boxcox_llf(lmbda, data) - target 

919 

920 # Find positive endpoint of interval in which answer is to be found 

921 newlm = lmax + 0.5 

922 N = 0 

923 while (rootfunc(newlm, x, target) > 0.0) and (N < 500): 

924 newlm += 0.1 

925 N += 1 

926 

927 if N == 500: 

928 raise RuntimeError("Could not find endpoint.") 

929 

930 lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) 

931 

932 # Now find negative interval in the same way 

933 newlm = lmax - 0.5 

934 N = 0 

935 while (rootfunc(newlm, x, target) > 0.0) and (N < 500): 

936 newlm -= 0.1 

937 N += 1 

938 

939 if N == 500: 

940 raise RuntimeError("Could not find endpoint.") 

941 

942 lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) 

943 return lmminus, lmplus 

944 

945 
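# --- Illustrative sketch (editor's addition, not part of the original module).
# At both endpoints returned by `_boxcox_conf_interval` the log-likelihood has
# dropped by 0.5 * chi2.ppf(1 - alpha, 1) from its maximum, which is the
# defining property of this profile-likelihood interval.  The demo name is
# hypothetical and the check assumes well-behaved (here gamma) data;
# `boxcox_normmax` is defined later in this module and resolves at call time.

def _demo_boxcox_conf_interval(alpha=0.05):
    rng = np.random.RandomState(4)
    x = rng.gamma(shape=2.0, size=200)
    lmax = boxcox_normmax(x, method='mle')
    lo, hi = _boxcox_conf_interval(x, lmax, alpha)
    drop = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
    assert np.isclose(boxcox_llf(lo, x), boxcox_llf(lmax, x) - drop)
    assert np.isclose(boxcox_llf(hi, x), boxcox_llf(lmax, x) - drop)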

946def boxcox(x, lmbda=None, alpha=None): 

947 r""" 

948 Return a dataset transformed by a Box-Cox power transformation. 

949 

950 Parameters 

951 ---------- 

952 x : ndarray 

953 Input array. Must be positive 1-dimensional. Must not be constant. 

954 lmbda : {None, scalar}, optional 

955 If `lmbda` is not None, do the transformation for that value. 

956 

957 If `lmbda` is None, find the lambda that maximizes the log-likelihood 

958 function and return it as the second output argument. 

959 alpha : {None, float}, optional 

960 If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence 

961 interval for `lmbda` as the third output argument. 

962 Must be between 0.0 and 1.0. 

963 

964 Returns 

965 ------- 

966 boxcox : ndarray 

967 Box-Cox power transformed array. 

968 maxlog : float, optional 

969 If the `lmbda` parameter is None, the second returned argument is 

970 the lambda that maximizes the log-likelihood function. 

971 (min_ci, max_ci) : tuple of float, optional 

972 If `lmbda` parameter is None and ``alpha`` is not None, this returned 

973 tuple of floats represents the minimum and maximum confidence limits 

974 given ``alpha``. 

975 

976 See Also 

977 -------- 

978 probplot, boxcox_normplot, boxcox_normmax, boxcox_llf 

979 

980 Notes 

981 ----- 

982 The Box-Cox transform is given by:: 

983 

984 y = (x**lmbda - 1) / lmbda, for lmbda != 0

985 log(x), for lmbda = 0 

986 

987 `boxcox` requires the input data to be positive. Sometimes a Box-Cox 

988 transformation provides a shift parameter to achieve this; `boxcox` does 

989 not. Such a shift parameter is equivalent to adding a positive constant to 

990 `x` before calling `boxcox`. 

991 

992 The confidence limits returned when ``alpha`` is provided give the interval 

993 where: 

994 

995 .. math:: 

996 

997 llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), 

998 

999 with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared 

1000 function. 

1001 

1002 References 

1003 ---------- 

1004 G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the 

1005 Royal Statistical Society B, 26, 211-252 (1964). 

1006 

1007 Examples 

1008 -------- 

1009 >>> from scipy import stats 

1010 >>> import matplotlib.pyplot as plt 

1011 

1012 We generate some random variates from a non-normal distribution and make a 

1013 probability plot for it, to show it is non-normal in the tails: 

1014 

1015 >>> fig = plt.figure() 

1016 >>> ax1 = fig.add_subplot(211) 

1017 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1018 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1) 

1019 >>> ax1.set_xlabel('') 

1020 >>> ax1.set_title('Probplot against normal distribution') 

1021 

1022 We now use `boxcox` to transform the data so it's closest to normal: 

1023 

1024 >>> ax2 = fig.add_subplot(212) 

1025 >>> xt, _ = stats.boxcox(x) 

1026 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2) 

1027 >>> ax2.set_title('Probplot after Box-Cox transformation') 

1028 

1029 >>> plt.show() 

1030 

1031 """ 

1032 x = np.asarray(x) 

1033 if x.ndim != 1: 

1034 raise ValueError("Data must be 1-dimensional.") 

1035 

1036 if x.size == 0: 

1037 return x 

1038 

1039 if np.all(x == x[0]): 

1040 raise ValueError("Data must not be constant.") 

1041 

1042 if any(x <= 0): 

1043 raise ValueError("Data must be positive.") 

1044 

1045 if lmbda is not None: # single transformation 

1046 return special.boxcox(x, lmbda) 

1047 

1048 # If lmbda=None, find the lmbda that maximizes the log-likelihood function. 

1049 lmax = boxcox_normmax(x, method='mle') 

1050 y = boxcox(x, lmax) 

1051 

1052 if alpha is None: 

1053 return y, lmax 

1054 else: 

1055 # Find confidence interval 

1056 interval = _boxcox_conf_interval(x, lmax, alpha) 

1057 return y, lmax, interval 

1058 

1059 
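# --- Illustrative sketch (editor's addition, not part of the original module).
# With an explicit ``lmbda``, `boxcox` is just the power transform from its
# Notes; a hand-rolled version matches, including the lmbda = 0 limit.  The
# demo name is hypothetical.

def _demo_boxcox_transform():
    x = np.array([0.5, 1.0, 2.0, 4.0])
    lmb = 0.3
    assert np.allclose(boxcox(x, lmbda=lmb), (x**lmb - 1) / lmb)
    assert np.allclose(boxcox(x, lmbda=0), np.log(x))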

1060def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'): 

1061 """Compute optimal Box-Cox transform parameter for input data. 

1062 

1063 Parameters 

1064 ---------- 

1065 x : array_like 

1066 Input array. 

1067 brack : 2-tuple, optional 

1068 The starting interval for a downhill bracket search with 

1069 `optimize.brent`. Note that this is in most cases not critical; the 

1070 final result is allowed to be outside this bracket. 

1071 method : str, optional 

1072 The method to determine the optimal transform parameter (`boxcox` 

1073 ``lmbda`` parameter). Options are: 

1074 

1075 'pearsonr' (default) 

1076 Maximizes the Pearson correlation coefficient between 

1077 ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be 

1078 normally-distributed. 

1079 

1080 'mle' 

1081 Maximizes the log-likelihood `boxcox_llf`. This is the method used

1082 in `boxcox`. 

1083 

1084 'all' 

1085 Use all optimization methods available, and return all results. 

1086 Useful to compare different methods. 

1087 

1088 Returns 

1089 ------- 

1090 maxlog : float or ndarray 

1091 The optimal transform parameter found. An array instead of a scalar 

1092 for ``method='all'``. 

1093 

1094 See Also 

1095 -------- 

1096 boxcox, boxcox_llf, boxcox_normplot 

1097 

1098 Examples 

1099 -------- 

1100 >>> from scipy import stats 

1101 >>> import matplotlib.pyplot as plt 

1102 >>> np.random.seed(1234) # make this example reproducible 

1103 

1104 Generate some data and determine optimal ``lmbda`` in various ways: 

1105 

1106 >>> x = stats.loggamma.rvs(5, size=30) + 5 

1107 >>> y, lmax_mle = stats.boxcox(x) 

1108 >>> lmax_pearsonr = stats.boxcox_normmax(x) 

1109 

1110 >>> lmax_mle 

1111 7.177... 

1112 >>> lmax_pearsonr 

1113 7.916... 

1114 >>> stats.boxcox_normmax(x, method='all') 

1115 array([ 7.91667384, 7.17718692]) 

1116 

1117 >>> fig = plt.figure() 

1118 >>> ax = fig.add_subplot(111) 

1119 >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax) 

1120 >>> ax.axvline(lmax_mle, color='r') 

1121 >>> ax.axvline(lmax_pearsonr, color='g', ls='--') 

1122 

1123 >>> plt.show() 

1124 

1125 """ 

1126 

1127 def _pearsonr(x, brack): 

1128 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

1129 xvals = distributions.norm.ppf(osm_uniform) 

1130 

1131 def _eval_pearsonr(lmbda, xvals, samps): 

1132 # This function computes the x-axis values of the probability plot 

1133 # and computes a linear regression (including the correlation) and 

1134 # returns ``1 - r`` so that a minimization function maximizes the 

1135 # correlation. 

1136 y = boxcox(samps, lmbda) 

1137 yvals = np.sort(y) 

1138 r, prob = stats.pearsonr(xvals, yvals) 

1139 return 1 - r 

1140 

1141 return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x)) 

1142 

1143 def _mle(x, brack): 

1144 def _eval_mle(lmb, data): 

1145 # function to minimize 

1146 return -boxcox_llf(lmb, data) 

1147 

1148 return optimize.brent(_eval_mle, brack=brack, args=(x,)) 

1149 

1150 def _all(x, brack): 

1151 maxlog = np.zeros(2, dtype=float) 

1152 maxlog[0] = _pearsonr(x, brack) 

1153 maxlog[1] = _mle(x, brack) 

1154 return maxlog 

1155 

1156 methods = {'pearsonr': _pearsonr, 

1157 'mle': _mle, 

1158 'all': _all} 

1159 if method not in methods.keys(): 

1160 raise ValueError("Method %s not recognized." % method) 

1161 

1162 optimfunc = methods[method] 

1163 return optimfunc(x, brack) 

1164 

1165 
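# --- Illustrative sketch (editor's addition, not part of the original module).
# With ``method='mle'`` the returned lambda maximizes `boxcox_llf`, so the
# log-likelihood at the optimum should be no smaller than at nearby lambdas.
# The demo name is hypothetical and assumes well-behaved (here gamma) data.

def _demo_boxcox_normmax_mle():
    rng = np.random.RandomState(5)
    x = rng.gamma(shape=2.0, size=100)
    lmax = boxcox_normmax(x, method='mle')
    assert boxcox_llf(lmax, x) >= boxcox_llf(lmax - 0.1, x)
    assert boxcox_llf(lmax, x) >= boxcox_llf(lmax + 0.1, x)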

1166def _normplot(method, x, la, lb, plot=None, N=80): 

1167 """Compute parameters for a Box-Cox or Yeo-Johnson normality plot, 

1168 optionally show it. See `boxcox_normplot` or `yeojohnson_normplot` for 

1169 details.""" 

1170 

1171 if method == 'boxcox': 

1172 title = 'Box-Cox Normality Plot' 

1173 transform_func = boxcox 

1174 else: 

1175 title = 'Yeo-Johnson Normality Plot' 

1176 transform_func = yeojohnson 

1177 

1178 x = np.asarray(x) 

1179 if x.size == 0: 

1180 return x 

1181 

1182 if lb <= la: 

1183 raise ValueError("`lb` has to be larger than `la`.") 

1184 

1185 lmbdas = np.linspace(la, lb, num=N) 

1186 ppcc = lmbdas * 0.0 

1187 for i, val in enumerate(lmbdas): 

1188 # Determine for each lmbda the square root of correlation coefficient 

1189 # of transformed x 

1190 z = transform_func(x, lmbda=val) 

1191 _, (_, _, r) = probplot(z, dist='norm', fit=True) 

1192 ppcc[i] = r 

1193 

1194 if plot is not None: 

1195 plot.plot(lmbdas, ppcc, 'x') 

1196 _add_axis_labels_title(plot, xlabel='$\\lambda$', 

1197 ylabel='Prob Plot Corr. Coef.', 

1198 title=title) 

1199 

1200 return lmbdas, ppcc 

1201 

1202 

1203def boxcox_normplot(x, la, lb, plot=None, N=80): 

1204 """Compute parameters for a Box-Cox normality plot, optionally show it. 

1205 

1206 A Box-Cox normality plot shows graphically what the best transformation 

1207 parameter is to use in `boxcox` to obtain a distribution that is close 

1208 to normal. 

1209 

1210 Parameters 

1211 ---------- 

1212 x : array_like 

1213 Input array. 

1214 la, lb : scalar 

1215 The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` 

1216 for Box-Cox transformations. These are also the limits of the 

1217 horizontal axis of the plot if that is generated. 

1218 plot : object, optional 

1219 If given, plots the quantiles and least squares fit. 

1220 `plot` is an object that has to have methods "plot" and "text". 

1221 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

1222 or a custom object with the same methods. 

1223 Default is None, which means that no plot is created. 

1224 N : int, optional 

1225 Number of points on the horizontal axis (equally distributed from 

1226 `la` to `lb`). 

1227 

1228 Returns 

1229 ------- 

1230 lmbdas : ndarray 

1231 The ``lmbda`` values for which a Box-Cox transform was done. 

1232 ppcc : ndarray 

1233 Probability Plot Correlation Coefficient, as obtained from `probplot`

1234 when fitting the Box-Cox transformed input `x` against a normal 

1235 distribution. 

1236 

1237 See Also 

1238 -------- 

1239 probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max 

1240 

1241 Notes 

1242 ----- 

1243 Even if `plot` is given, the figure is not shown or saved by 

1244 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` 

1245 should be used after calling `boxcox_normplot`.

1246 

1247 Examples 

1248 -------- 

1249 >>> from scipy import stats 

1250 >>> import matplotlib.pyplot as plt 

1251 

1252 Generate some non-normally distributed data, and create a Box-Cox plot: 

1253 

1254 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1255 >>> fig = plt.figure() 

1256 >>> ax = fig.add_subplot(111) 

1257 >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax) 

1258 

1259 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in 

1260 the same plot: 

1261 

1262 >>> _, maxlog = stats.boxcox(x) 

1263 >>> ax.axvline(maxlog, color='r') 

1264 

1265 >>> plt.show() 

1266 

1267 """ 

1268 return _normplot('boxcox', x, la, lb, plot, N) 

1269 

1270 

1271def yeojohnson(x, lmbda=None): 

1272 r""" 

1273 Return a dataset transformed by a Yeo-Johnson power transformation. 

1274 

1275 Parameters 

1276 ---------- 

1277 x : ndarray 

1278 Input array. Should be 1-dimensional. 

1279 lmbda : float, optional 

1280 If ``lmbda`` is ``None``, find the lambda that maximizes the 

1281 log-likelihood function and return it as the second output argument. 

1282 Otherwise the transformation is done for the given value. 

1283 

1284 Returns 

1285 ------- 

1286 yeojohnson: ndarray 

1287 Yeo-Johnson power transformed array. 

1288 maxlog : float, optional 

1289 If the `lmbda` parameter is None, the second returned argument is 

1290 the lambda that maximizes the log-likelihood function. 

1291 

1292 See Also 

1293 -------- 

1294 probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox 

1295 

1296 Notes 

1297 ----- 

1298 The Yeo-Johnson transform is given by:: 

1299 

1300 y = ((x + 1)**lmbda - 1) / lmbda, for x >= 0, lmbda != 0 

1301 log(x + 1), for x >= 0, lmbda = 0 

1302 -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda), for x < 0, lmbda != 2 

1303 -log(-x + 1), for x < 0, lmbda = 2 

1304 

1305 Unlike `boxcox`, `yeojohnson` does not require the input data to be 

1306 positive. 

1307 

1308 .. versionadded:: 1.2.0 

1309 

1310 

1311 References 

1312 ---------- 

1313 I. Yeo and R.A. Johnson, "A New Family of Power Transformations to 

1314 Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.

1315 

1316 

1317 Examples 

1318 -------- 

1319 >>> from scipy import stats 

1320 >>> import matplotlib.pyplot as plt 

1321 

1322 We generate some random variates from a non-normal distribution and make a 

1323 probability plot for it, to show it is non-normal in the tails: 

1324 

1325 >>> fig = plt.figure() 

1326 >>> ax1 = fig.add_subplot(211) 

1327 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1328 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1) 

1329 >>> ax1.set_xlabel('') 

1330 >>> ax1.set_title('Probplot against normal distribution') 

1331 

1332 We now use `yeojohnson` to transform the data so it's closest to normal: 

1333 

1334 >>> ax2 = fig.add_subplot(212) 

1335 >>> xt, lmbda = stats.yeojohnson(x) 

1336 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2) 

1337 >>> ax2.set_title('Probplot after Yeo-Johnson transformation') 

1338 

1339 >>> plt.show() 

1340 

1341 """ 

1342 

1343 x = np.asarray(x) 

1344 if x.size == 0: 

1345 return x 

1346 

1347 if np.issubdtype(x.dtype, np.complexfloating): 

1348 raise ValueError('Yeo-Johnson transformation is not defined for ' 

1349 'complex numbers.') 

1350 

1351 if np.issubdtype(x.dtype, np.integer): 

1352 x = x.astype(np.float64, copy=False) 

1353 

1354 if lmbda is not None: 

1355 return _yeojohnson_transform(x, lmbda) 

1356 

1357 # if lmbda=None, find the lmbda that maximizes the log-likelihood function. 

1358 lmax = yeojohnson_normmax(x) 

1359 y = _yeojohnson_transform(x, lmax) 

1360 

1361 return y, lmax 

1362 

1363 

1364def _yeojohnson_transform(x, lmbda): 

1365 """Return x transformed by the Yeo-Johnson power transform with given 

1366 parameter lmbda.""" 

1367 

1368 out = np.zeros_like(x) 

1369 pos = x >= 0 # binary mask 

1370 

1371 # when x >= 0 

1372 if abs(lmbda) < np.spacing(1.): 

1373 out[pos] = np.log1p(x[pos]) 

1374 else: # lmbda != 0 

1375 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda 

1376 

1377 # when x < 0 

1378 if abs(lmbda - 2) > np.spacing(1.): 

1379 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) 

1380 else: # lmbda == 2 

1381 out[~pos] = -np.log1p(-x[~pos]) 

1382 

1383 return out 

1384 

1385 
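# --- Illustrative sketch (editor's addition, not part of the original module).
# A direct transcription of the piecewise definition from the `yeojohnson`
# Notes, checked against the helper above for a lambda away from the special
# cases 0 and 2.  The demo name is hypothetical.

def _demo_yeojohnson_transform(lmb=0.5):
    x = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
    pos = x >= 0
    expected = np.empty_like(x)
    expected[pos] = ((x[pos] + 1)**lmb - 1) / lmb
    expected[~pos] = -((-x[~pos] + 1)**(2 - lmb) - 1) / (2 - lmb)
    assert np.allclose(_yeojohnson_transform(x, lmb), expected)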

1386def yeojohnson_llf(lmb, data): 

1387 r"""The yeojohnson log-likelihood function. 

1388 

1389 Parameters 

1390 ---------- 

1391 lmb : scalar 

1392 Parameter for Yeo-Johnson transformation. See `yeojohnson` for 

1393 details. 

1394 data : array_like 

1395 Data to calculate Yeo-Johnson log-likelihood for. If `data` is 

1396 multi-dimensional, the log-likelihood is calculated along the first 

1397 axis. 

1398 

1399 Returns 

1400 ------- 

1401 llf : float 

1402 Yeo-Johnson log-likelihood of `data` given `lmb`. 

1403 

1404 See Also 

1405 -------- 

1406 yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax 

1407 

1408 Notes 

1409 ----- 

1410 The Yeo-Johnson log-likelihood function is defined here as 

1411 

1412 .. math:: 

1413 

1414 llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)

1415 \sum_i \text{sign}(x_i) \log(|x_i| + 1),

1416

1417 where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson

1418 transformed input data ``x``.

1419 

1420 .. versionadded:: 1.2.0 

1421 

1422 Examples 

1423 -------- 

1424 >>> from scipy import stats 

1425 >>> import matplotlib.pyplot as plt 

1426 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes 

1427 >>> np.random.seed(1245) 

1428 

1429 Generate some random variates and calculate Yeo-Johnson log-likelihood 

1430 values for them for a range of ``lmbda`` values: 

1431 

1432 >>> x = stats.loggamma.rvs(5, loc=10, size=1000) 

1433 >>> lmbdas = np.linspace(-2, 10) 

1434 >>> llf = np.zeros(lmbdas.shape, dtype=float) 

1435 >>> for ii, lmbda in enumerate(lmbdas): 

1436 ... llf[ii] = stats.yeojohnson_llf(lmbda, x) 

1437 

1438 Also find the optimal lmbda value with `yeojohnson`: 

1439 

1440 >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x) 

1441 

1442 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a 

1443 horizontal line to check that that's really the optimum: 

1444 

1445 >>> fig = plt.figure() 

1446 >>> ax = fig.add_subplot(111) 

1447 >>> ax.plot(lmbdas, llf, 'b.-') 

1448 >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r') 

1449 >>> ax.set_xlabel('lmbda parameter') 

1450 >>> ax.set_ylabel('Yeo-Johnson log-likelihood') 

1451 

1452 Now add some probability plots to show that where the log-likelihood is 

1453 maximized the data transformed with `yeojohnson` looks closest to normal: 

1454 

1455 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' 

1456 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): 

1457 ... xt = stats.yeojohnson(x, lmbda=lmbda) 

1458 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) 

1459 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) 

1460 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') 

1461 ... ax_inset.set_xticklabels([]) 

1462 ... ax_inset.set_yticklabels([]) 

1463 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda) 

1464 

1465 >>> plt.show() 

1466 

1467 """ 

1468 data = np.asarray(data) 

1469 n_samples = data.shape[0] 

1470 

1471 if n_samples == 0: 

1472 return np.nan 

1473 

1474 trans = _yeojohnson_transform(data, lmb) 

1475 

1476 loglike = -n_samples / 2 * np.log(trans.var(axis=0)) 

1477 loglike += (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0) 

1478 

1479 return loglike 

1480 

1481 
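# --- Illustrative sketch (editor's addition, not part of the original module).
# A direct transcription of the log-likelihood from the Notes of
# `yeojohnson_llf` (including the leading minus sign), checked against the
# function.  The demo name is hypothetical.

def _demo_yeojohnson_llf(lmb=1.5):
    rng = np.random.RandomState(6)
    x = rng.normal(size=100)
    n = len(x)
    trans = _yeojohnson_transform(x, lmb)
    direct = (-n / 2 * np.log(trans.var())
              + (lmb - 1) * np.sum(np.sign(x) * np.log(np.abs(x) + 1)))
    assert np.isclose(direct, yeojohnson_llf(lmb, x))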

1482def yeojohnson_normmax(x, brack=(-2, 2)): 

1483 """ 

1484 Compute optimal Yeo-Johnson transform parameter. 

1485  

1486 Compute optimal Yeo-Johnson transform parameter for input data, using 

1487 maximum likelihood estimation. 

1488 

1489 Parameters 

1490 ---------- 

1491 x : array_like 

1492 Input array. 

1493 brack : 2-tuple, optional 

1494 The starting interval for a downhill bracket search with 

1495 `optimize.brent`. Note that this is in most cases not critical; the 

1496 final result is allowed to be outside this bracket. 

1497 

1498 Returns 

1499 ------- 

1500 maxlog : float 

1501 The optimal transform parameter found. 

1502 

1503 See Also 

1504 -------- 

1505 yeojohnson, yeojohnson_llf, yeojohnson_normplot 

1506 

1507 Notes 

1508 ----- 

1509 .. versionadded:: 1.2.0 

1510 

1511 Examples 

1512 -------- 

1513 >>> from scipy import stats 

1514 >>> import matplotlib.pyplot as plt 

1515 >>> np.random.seed(1234) # make this example reproducible 

1516 

1517 Generate some data and determine optimal ``lmbda`` 

1518 

1519 >>> x = stats.loggamma.rvs(5, size=30) + 5 

1520 >>> lmax = stats.yeojohnson_normmax(x) 

1521 

1522 >>> fig = plt.figure() 

1523 >>> ax = fig.add_subplot(111) 

1524 >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax) 

1525 >>> ax.axvline(lmax, color='r') 

1526 

1527 >>> plt.show() 

1528 

1529 """ 

1530 

1531 def _neg_llf(lmbda, data): 

1532 return -yeojohnson_llf(lmbda, data) 

1533 

1534 return optimize.brent(_neg_llf, brack=brack, args=(x,)) 

1535 

1536 

1537def yeojohnson_normplot(x, la, lb, plot=None, N=80): 

1538 """Compute parameters for a Yeo-Johnson normality plot, optionally show it. 

1539 

1540 A Yeo-Johnson normality plot shows graphically what the best 

1541 transformation parameter is to use in `yeojohnson` to obtain a 

1542 distribution that is close to normal. 

1543 

1544 Parameters 

1545 ---------- 

1546 x : array_like 

1547 Input array. 

1548 la, lb : scalar 

1549 The lower and upper bounds for the ``lmbda`` values to pass to 

1550 `yeojohnson` for Yeo-Johnson transformations. These are also the 

1551 limits of the horizontal axis of the plot if that is generated. 

1552 plot : object, optional 

1553 If given, plots the quantiles and least squares fit. 

1554 `plot` is an object that has to have methods "plot" and "text". 

1555 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

1556 or a custom object with the same methods. 

1557 Default is None, which means that no plot is created. 

1558 N : int, optional 

1559 Number of points on the horizontal axis (equally distributed from 

1560 `la` to `lb`). 

1561 

1562 Returns 

1563 ------- 

1564 lmbdas : ndarray 

1565 The ``lmbda`` values for which a Yeo-Johnson transform was done. 

1566 ppcc : ndarray 

1567 Probability Plot Correlation Coefficient, as obtained from `probplot`

1568 when fitting the Yeo-Johnson transformed input `x` against a normal

1569 distribution. 

1570 

1571 See Also 

1572 -------- 

1573 probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max 

1574 

1575 Notes 

1576 ----- 

1577 Even if `plot` is given, the figure is not shown or saved by 

1578 `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``

1579 should be used after calling `yeojohnson_normplot`.

1580 

1581 .. versionadded:: 1.2.0 

1582 

1583 Examples 

1584 -------- 

1585 >>> from scipy import stats 

1586 >>> import matplotlib.pyplot as plt 

1587 

1588 Generate some non-normally distributed data, and create a Yeo-Johnson plot: 

1589 

1590 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1591 >>> fig = plt.figure() 

1592 >>> ax = fig.add_subplot(111) 

1593 >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax) 

1594 

1595 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in 

1596 the same plot: 

1597 

1598 >>> _, maxlog = stats.yeojohnson(x) 

1599 >>> ax.axvline(maxlog, color='r') 

1600 

1601 >>> plt.show() 

1602 

1603 """ 

1604 return _normplot('yeojohnson', x, la, lb, plot, N) 

1605 

1606 

1607ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue')) 

1608 

1609def shapiro(x): 

1610 """ 

1611 Perform the Shapiro-Wilk test for normality. 

1612 

1613 The Shapiro-Wilk test tests the null hypothesis that the 

1614 data was drawn from a normal distribution. 

1615 

1616 Parameters 

1617 ---------- 

1618 x : array_like 

1619 Array of sample data. 

1620 

1621 Returns 

1622 ------- 

1623 statistic : float 

1624 The test statistic. 

1625 p-value : float 

1626 The p-value for the hypothesis test. 

1627 

1628 See Also 

1629 -------- 

1630 anderson : The Anderson-Darling test for normality 

1631 kstest : The Kolmogorov-Smirnov test for goodness of fit. 

1632 

1633 Notes 

1634 ----- 

1635 The algorithm used is described in [4]_ but censoring parameters as 

1636 described are not implemented. For N > 5000 the W test statistic is accurate 

1637 but the p-value may not be. 

1638 

1639 The chance of rejecting the null hypothesis when it is true is close to 5% 

1640 regardless of sample size. 

1641 

1642 References 

1643 ---------- 

1644 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm 

1645 .. [2] Shapiro, S. S. & Wilk, M. B. (1965). An analysis of variance test for

1646 normality (complete samples), Biometrika, Vol. 52, pp. 591-611. 

1647 .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk, 

1648 Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of 

1649 Statistical Modeling and Analytics, Vol. 2, pp. 21-33. 

1650 .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4. 

1651 

1652 Examples 

1653 -------- 

1654 >>> from scipy import stats 

1655 >>> np.random.seed(12345678) 

1656 >>> x = stats.norm.rvs(loc=5, scale=3, size=100) 

1657 >>> shapiro_test = stats.shapiro(x) 

1658 >>> shapiro_test 

1659 ShapiroResult(statistic=0.9772805571556091, pvalue=0.08144091814756393) 

1660 >>> shapiro_test.statistic 

1661 0.9772805571556091 

1662 >>> shapiro_test.pvalue 

1663 0.08144091814756393 

1664 

1665 """ 

1666 x = np.ravel(x) 

1667 

1668 N = len(x) 

1669 if N < 3: 

1670 raise ValueError("Data must be at least length 3.") 

1671 

1672 a = zeros(N, 'f') 

1673 init = 0 

1674 

1675 y = sort(x) 
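# statlib.swilk wraps the AS R94 routine: it returns the updated coefficient
# array a, the W statistic w, its p-value pw and an error code ifault
# (0 and 2 are treated as benign by the check below).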

1676 a, w, pw, ifault = statlib.swilk(y, a[:N//2], init) 

1677 if ifault not in [0, 2]: 

1678 warnings.warn("Input data for shapiro has range zero. The results " 

1679 "may not be accurate.") 

1680 if N > 5000: 

1681 warnings.warn("p-value may not be accurate for N > 5000.") 

1682 

1683 return ShapiroResult(w, pw) 

1684 

1685 

1686# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and 

1687# Some Comparisons", Journal of the American Statistical 

1688# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 

1689_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) 

1690_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) 

1691# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", 

1692# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. 

1693_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) 

1694# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based 

1695# on the Empirical Distribution Function.", Biometrika, 

1696# Vol. 66, Issue 3, Dec. 1979, pp 591-595. 

1697_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) 

1698 

1699 

1700AndersonResult = namedtuple('AndersonResult', ('statistic', 

1701 'critical_values', 

1702 'significance_level')) 

1703 

1704 

1705def anderson(x, dist='norm'): 

1706 """ 

1707 Anderson-Darling test for data coming from a particular distribution. 

1708 

1709 The Anderson-Darling test tests the null hypothesis that a sample is 

1710 drawn from a population that follows a particular distribution. 

1711 For the Anderson-Darling test, the critical values depend on 

1712 which distribution is being tested against. This function works 

1713 for normal, exponential, logistic, or Gumbel (Extreme Value 

1714 Type I) distributions. 

1715 

1716 Parameters 

1717 ---------- 

1718 x : array_like 

1719 Array of sample data. 

1720 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 

1721 'extreme1'}, optional 

1722 The type of distribution to test against. The default is 'norm'. 

1723 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the 

1724 same distribution. 

1725 

1726 Returns 

1727 ------- 

1728 statistic : float 

1729 The Anderson-Darling test statistic. 

1730 critical_values : list 

1731 The critical values for this distribution. 

1732 significance_level : list 

1733 The significance levels for the corresponding critical values 

1734 in percent. The function returns critical values for a 

1735 differing set of significance levels depending on the 

1736 distribution that is being tested against. 

1737 

1738 See Also 

1739 -------- 

1740 kstest : The Kolmogorov-Smirnov test for goodness-of-fit. 

1741 

1742 Notes 

1743 ----- 

1744 Critical values provided are for the following significance levels: 

1745 

1746 normal/exponential 

1747 15%, 10%, 5%, 2.5%, 1% 

1748 logistic 

1749 25%, 10%, 5%, 2.5%, 1%, 0.5% 

1750 Gumbel 

1751 25%, 10%, 5%, 2.5%, 1% 

1752 

1753 If the returned statistic is larger than these critical values then 

1754 for the corresponding significance level, the null hypothesis that 

1755 the data come from the chosen distribution can be rejected. 

1756 The returned statistic is referred to as 'A2' in the references. 

1757 

1758 References 

1759 ---------- 

1760 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm 

1761 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and 

1762 Some Comparisons, Journal of the American Statistical Association, 

1763 Vol. 69, pp. 730-737. 

1764 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit 

1765 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, 

1766 pp. 357-369. 

1767 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value 

1768 Distribution, Biometrika, Vol. 64, pp. 583-588. 

1769 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference 

1770 to Tests for Exponentiality , Technical Report No. 262, 

1771 Department of Statistics, Stanford University, Stanford, CA. 

1772 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution 

1773 Based on the Empirical Distribution Function, Biometrika, Vol. 66, 

1774 pp. 591-595. 

1775 
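Examples
--------
A minimal sketch of how the result can be read (the data, the seed and the
comparison at the 5% level are illustrative choices, not part of the
original documentation):

>>> from scipy import stats
>>> np.random.seed(1234)
>>> x = stats.norm.rvs(loc=10, scale=2, size=50)
>>> result = stats.anderson(x, dist='norm')
>>> # for dist='norm', critical_values[2] corresponds to the 5% level
>>> reject_at_5_percent = result.statistic > result.critical_values[2]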

1776 """ 

1777 if dist not in ['norm', 'expon', 'gumbel', 'gumbel_l', 

1778 'gumbel_r', 'extreme1', 'logistic']: 

1779 raise ValueError("Invalid distribution; dist must be 'norm', " 

1780 "'expon', 'gumbel', 'extreme1' or 'logistic'.") 

1781 y = sort(x) 

1782 xbar = np.mean(x, axis=0) 

1783 N = len(y) 

1784 if dist == 'norm': 

1785 s = np.std(x, ddof=1, axis=0) 

1786 w = (y - xbar) / s 

1787 logcdf = distributions.norm.logcdf(w) 

1788 logsf = distributions.norm.logsf(w) 

1789 sig = array([15, 10, 5, 2.5, 1]) 

1790 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3) 

1791 elif dist == 'expon': 

1792 w = y / xbar 

1793 logcdf = distributions.expon.logcdf(w) 

1794 logsf = distributions.expon.logsf(w) 

1795 sig = array([15, 10, 5, 2.5, 1]) 

1796 critical = around(_Avals_expon / (1.0 + 0.6/N), 3) 

1797 elif dist == 'logistic': 

1798 def rootfunc(ab, xj, N): 

1799 a, b = ab 

1800 tmp = (xj - a) / b 

1801 tmp2 = exp(tmp) 

1802 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N, 

1803 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N] 

1804 return array(val) 

1805 

1806 sol0 = array([xbar, np.std(x, ddof=1, axis=0)]) 

1807 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5) 

1808 w = (y - sol[0]) / sol[1] 

1809 logcdf = distributions.logistic.logcdf(w) 

1810 logsf = distributions.logistic.logsf(w) 

1811 sig = array([25, 10, 5, 2.5, 1, 0.5]) 

1812 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3) 

1813 elif dist == 'gumbel_r': 

1814 xbar, s = distributions.gumbel_r.fit(x) 

1815 w = (y - xbar) / s 

1816 logcdf = distributions.gumbel_r.logcdf(w) 

1817 logsf = distributions.gumbel_r.logsf(w) 

1818 sig = array([25, 10, 5, 2.5, 1]) 

1819 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 

1820 else: # (dist == 'gumbel') or (dist == 'gumbel_l') or (dist == 'extreme1') 

1821 xbar, s = distributions.gumbel_l.fit(x) 

1822 w = (y - xbar) / s 

1823 logcdf = distributions.gumbel_l.logcdf(w) 

1824 logsf = distributions.gumbel_l.logsf(w) 

1825 sig = array([25, 10, 5, 2.5, 1]) 

1826 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 

1827 
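# Anderson-Darling statistic for the fitted distribution F:
#   A^2 = -N - (1/N) * sum_{i=1..N} (2i - 1) * [log F(y_i) + log(1 - F(y_{N+1-i}))]
# computed below from the sorted, standardized sample w.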

1828 i = arange(1, N + 1) 

1829 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0) 

1830 

1831 return AndersonResult(A2, critical, sig) 

1832 

1833 

1834def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N): 

1835 """ 

1836 Compute A2akN equation 7 of Scholz and Stephens. 

1837 

1838 Parameters 

1839 ---------- 

1840 samples : sequence of 1-D array_like 

1841 Array of sample arrays. 

1842 Z : array_like 

1843 Sorted array of all observations. 

1844 Zstar : array_like 

1845 Sorted array of unique observations. 

1846 k : int 

1847 Number of samples. 

1848 n : array_like 

1849 Number of observations in each sample. 

1850 N : int 

1851 Total number of observations. 

1852 

1853 Returns 

1854 ------- 

1855 A2aKN : float 

1856 The A2aKN statistics of Scholz and Stephens 1987. 

1857 """ 

1858 

1859 A2akN = 0. 

1860 Z_ssorted_left = Z.searchsorted(Zstar, 'left') 

1861 if N == Zstar.size: 

1862 lj = 1. 

1863 else: 

1864 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left 

1865 Bj = Z_ssorted_left + lj / 2. 

1866 for i in arange(0, k): 

1867 s = np.sort(samples[i]) 

1868 s_ssorted_right = s.searchsorted(Zstar, side='right') 

1869 Mij = s_ssorted_right.astype(float) 

1870 fij = s_ssorted_right - s.searchsorted(Zstar, 'left') 

1871 Mij -= fij / 2. 

1872 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.) 

1873 A2akN += inner.sum() / n[i] 

1874 A2akN *= (N - 1.) / N 

1875 return A2akN 

1876 

1877 

1878def _anderson_ksamp_right(samples, Z, Zstar, k, n, N): 

1879 """ 

1880 Compute A2akN equation 6 of Scholz & Stephens. 

1881 

1882 Parameters 

1883 ---------- 

1884 samples : sequence of 1-D array_like 

1885 Array of sample arrays. 

1886 Z : array_like 

1887 Sorted array of all observations. 

1888 Zstar : array_like 

1889 Sorted array of unique observations. 

1890 k : int 

1891 Number of samples. 

1892 n : array_like 

1893 Number of observations in each sample. 

1894 N : int 

1895 Total number of observations. 

1896 

1897 Returns 

1898 ------- 

1899 A2KN : float 

1900 The A2KN statistics of Scholz and Stephens 1987. 

1901 """ 

1902 

1903 A2kN = 0. 

1904 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 

1905 'left') 

1906 Bj = lj.cumsum() 

1907 for i in arange(0, k): 

1908 s = np.sort(samples[i]) 

1909 Mij = s.searchsorted(Zstar[:-1], side='right') 

1910 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj)) 

1911 A2kN += inner.sum() / n[i] 

1912 return A2kN 

1913 

1914 

1915Anderson_ksampResult = namedtuple('Anderson_ksampResult', 

1916 ('statistic', 'critical_values', 

1917 'significance_level')) 

1918 

1919 

1920def anderson_ksamp(samples, midrank=True): 

1921 """The Anderson-Darling test for k-samples. 

1922 

1923 The k-sample Anderson-Darling test is a modification of the 

1924 one-sample Anderson-Darling test. It tests the null hypothesis 

1925 that k-samples are drawn from the same population without having 

1926 to specify the distribution function of that population. The 

1927 critical values depend on the number of samples. 

1928 

1929 Parameters 

1930 ---------- 

1931 samples : sequence of 1-D array_like 

1932 Array of sample data in arrays. 

1933 midrank : bool, optional 

1934 Type of Anderson-Darling test which is computed. Default 

1935 (True) is the midrank test applicable to continuous and 

1936 discrete populations. If False, the right side empirical 

1937 distribution is used. 

1938 

1939 Returns 

1940 ------- 

1941 statistic : float 

1942 Normalized k-sample Anderson-Darling test statistic. 

1943 critical_values : array 

1944 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%, 

1945 0.5%, 0.1%. 

1946 significance_level : float 

1947 An approximate significance level at which the null hypothesis for the 

1948 provided samples can be rejected. The value is floored / capped at 

1949 0.1% / 25%. 

1950 

1951 Raises 

1952 ------ 

1953 ValueError 

1954 If less than 2 samples are provided, a sample is empty, or no 

1955 distinct observations are in the samples. 

1956 

1957 See Also 

1958 -------- 

1959 ks_2samp : 2 sample Kolmogorov-Smirnov test 

1960 anderson : 1 sample Anderson-Darling test 

1961 

1962 Notes 

1963 ----- 

1964 [1]_ defines three versions of the k-sample Anderson-Darling test: 

1965 one for continuous distributions and two for discrete 

1966 distributions, in which ties between samples may occur. The 

1967 default of this routine is to compute the version based on the 

1968 midrank empirical distribution function. This test is applicable 

1969 to continuous and discrete data. If midrank is set to False, the 

1970 right side empirical distribution is used for a test for discrete 

1971 data. According to [1]_, the two discrete test statistics differ 

1972 only slightly if a few collisions due to round-off errors occur in 

1973 the test not adjusted for ties between samples. 

1974 

1975 The critical values corresponding to the significance levels from 0.01 

1976 to 0.25 are taken from [1]_. p-values are floored / capped 

1977 at 0.1% / 25%. Since the range of critical values might be extended in 

1978 future releases, it is recommended not to test ``p == 0.25``, but rather 

1979 ``p >= 0.25`` (analogously for the lower bound). 

1980 

1981 .. versionadded:: 0.14.0 

1982 

1983 References 

1984 ---------- 

1985 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample 

1986 Anderson-Darling Tests, Journal of the American Statistical 

1987 Association, Vol. 82, pp. 918-924. 

1988 

1989 Examples 

1990 -------- 

1991 >>> from scipy import stats 

1992 >>> np.random.seed(314159) 

1993 

1994 The null hypothesis that the two random samples come from the same 

1995 distribution can be rejected at the 5% level because the returned 

1996 test value is greater than the critical value for 5% (1.961) but 

1997 not at the 2.5% level. The interpolation gives an approximate 

1998 significance level of 3.2%: 

1999 

2000 >>> stats.anderson_ksamp([np.random.normal(size=50), 

2001 ... np.random.normal(loc=0.5, size=30)]) 

2002 (2.4615796189876105, 

2003 array([ 0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]), 

2004 0.03176687568842282) 

2005 

2006 

2007 The null hypothesis cannot be rejected for three samples from an 

2008 identical distribution. The reported p-value (25%) has been capped and 

2009 may not be very accurate (since it corresponds to the value 0.449 

2010 whereas the statistic is -0.731): 

2011 

2012 >>> stats.anderson_ksamp([np.random.normal(size=50), 

2013 ... np.random.normal(size=30), np.random.normal(size=20)]) 

2014 (-0.73091722665244196, 

2015 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856, 

2016 4.07210043, 5.56419101]), 

2017 0.25) 

2018 

2019 """ 

2020 k = len(samples) 

2021 if (k < 2): 

2022 raise ValueError("anderson_ksamp needs at least two samples") 

2023 

2024 samples = list(map(np.asarray, samples)) 

2025 Z = np.sort(np.hstack(samples)) 

2026 N = Z.size 

2027 Zstar = np.unique(Z) 

2028 if Zstar.size < 2: 

2029 raise ValueError("anderson_ksamp needs more than one distinct " 

2030 "observation") 

2031 

2032 n = np.array([sample.size for sample in samples]) 

2033 if any(n == 0): 

2034 raise ValueError("anderson_ksamp encountered sample without " 

2035 "observations") 

2036 

2037 if midrank: 

2038 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N) 

2039 else: 

2040 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N) 

2041 
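# Standardize A2kN using its null mean (k - 1) and the variance sigma_N^2
# from Scholz & Stephens (1987); H, h and g are the auxiliary sums that
# enter the variance formula.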

2042 H = (1. / n).sum() 

2043 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum() 

2044 h = hs_cs[-1] + 1 

2045 g = (hs_cs / arange(2, N)).sum() 

2046 

2047 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H 

2048 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6 

2049 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h 

2050 d = (2*h + 6)*k**2 - 4*h*k 

2051 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.)) 

2052 m = k - 1 

2053 A2 = (A2kN - m) / math.sqrt(sigmasq) 

2054 

2055 # The b_i values are the interpolation coefficients from Table 2 

2056 # of Scholz and Stephens 1987 

2057 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085]) 

2058 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615]) 

2059 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154]) 

2060 critical = b0 + b1 / math.sqrt(m) + b2 / m 

2061 

2062 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001]) 

2063 if A2 < critical.min(): 

2064 p = sig.max() 

2065 warnings.warn("p-value capped: true value larger than {}".format(p), 

2066 stacklevel=2) 

2067 elif A2 > critical.max(): 

2068 p = sig.min() 

2069 warnings.warn("p-value floored: true value smaller than {}".format(p), 

2070 stacklevel=2) 

2071 else: 

2072 # interpolation of probit of significance level 

2073 pf = np.polyfit(critical, log(sig), 2) 

2074 p = math.exp(np.polyval(pf, A2)) 

2075 

2076 return Anderson_ksampResult(A2, critical, p) 

2077 

2078 

2079AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue')) 

2080 

2081 

2082def ansari(x, y): 

2083 """ 

2084 Perform the Ansari-Bradley test for equal scale parameters. 

2085 

2086 The Ansari-Bradley test is a non-parametric test for the equality 

2087 of the scale parameter of the distributions from which two 

2088 samples were drawn. 

2089 

2090 Parameters 

2091 ---------- 

2092 x, y : array_like 

2093 Arrays of sample data. 

2094 

2095 Returns 

2096 ------- 

2097 statistic : float 

2098 The Ansari-Bradley test statistic. 

2099 pvalue : float 

2100 The p-value of the hypothesis test. 

2101 

2102 See Also 

2103 -------- 

2104 fligner : A non-parametric test for the equality of k variances 

2105 mood : A non-parametric test for the equality of two scale parameters 

2106 

2107 Notes 

2108 ----- 

2109 The p-value given is exact when the sample sizes are both less than 

2110 55 and there are no ties; otherwise a normal approximation for the 

2111 p-value is used. 

2112 

2113 References 

2114 ---------- 

2115 .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical 

2116 methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2. 

2117 
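Examples
--------
An illustrative sketch (the samples below are made up for demonstration
and are not part of the original documentation):

>>> from scipy import stats
>>> np.random.seed(1234)
>>> x = stats.norm.rvs(loc=0, scale=1, size=40)
>>> y = stats.norm.rvs(loc=0, scale=3, size=40)
>>> statistic, pvalue = stats.ansari(x, y)
>>> # a small p-value suggests that the two scale parameters differ
>>> scales_differ_at_5_percent = pvalue < 0.05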

2118 """ 

2119 x, y = asarray(x), asarray(y) 

2120 n = len(x) 

2121 m = len(y) 

2122 if m < 1: 

2123 raise ValueError("Not enough other observations.") 

2124 if n < 1: 

2125 raise ValueError("Not enough test observations.") 

2126 

2127 N = m + n 

2128 xy = r_[x, y] # combine 

2129 rank = stats.rankdata(xy) 

2130 symrank = amin(array((rank, N - rank + 1)), 0) 

2131 AB = np.sum(symrank[:n], axis=0) 

2132 uxy = unique(xy) 

2133 repeats = (len(uxy) != len(xy)) 

2134 exact = ((m < 55) and (n < 55) and not repeats) 

2135 if repeats and (m < 55 or n < 55): 

2136 warnings.warn("Ties preclude use of exact statistic.") 

2137 if exact: 

2138 astart, a1, ifault = statlib.gscale(n, m) 

2139 ind = AB - astart 

2140 total = np.sum(a1, axis=0) 

2141 if ind < len(a1)/2.0: 

2142 cind = int(ceil(ind)) 

2143 if ind == cind: 

2144 pval = 2.0 * np.sum(a1[:cind+1], axis=0) / total 

2145 else: 

2146 pval = 2.0 * np.sum(a1[:cind], axis=0) / total 

2147 else: 

2148 find = int(floor(ind)) 

2149 if ind == floor(ind): 

2150 pval = 2.0 * np.sum(a1[find:], axis=0) / total 

2151 else: 

2152 pval = 2.0 * np.sum(a1[find+1:], axis=0) / total 

2153 return AnsariResult(AB, min(1.0, pval)) 

2154 

2155 # otherwise compute normal approximation 

2156 if N % 2: # N odd 

2157 mnAB = n * (N+1.0)**2 / 4.0 / N 

2158 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2) 

2159 else: 

2160 mnAB = n * (N+2.0) / 4.0 

2161 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0) 

2162 if repeats: # adjust variance estimates 

2163 # compute np.sum(tj * rj**2,axis=0) 

2164 fac = np.sum(symrank**2, axis=0) 

2165 if N % 2: # N odd 

2166 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1)) 

2167 else: # N even 

2168 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1)) 

2169 

2170 z = (AB - mnAB) / sqrt(varAB) 

2171 pval = distributions.norm.sf(abs(z)) * 2.0 

2172 return AnsariResult(AB, pval) 

2173 

2174 

2175BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue')) 

2176 

2177 

2178def bartlett(*args): 

2179 """ 

2180 Perform Bartlett's test for equal variances. 

2181 

2182 Bartlett's test tests the null hypothesis that all input samples 

2183 are from populations with equal variances. For samples 

2184 from significantly non-normal populations, Levene's test 

2185 `levene` is more robust. 

2186 

2187 Parameters 

2188 ---------- 

2189 sample1, sample2,... : array_like 

2190 arrays of sample data. Only 1d arrays are accepted, they may have 

2191 different lengths. 

2192 

2193 Returns 

2194 ------- 

2195 statistic : float 

2196 The test statistic. 

2197 pvalue : float 

2198 The p-value of the test. 

2199 

2200 See Also 

2201 -------- 

2202 fligner : A non-parametric test for the equality of k variances 

2203 levene : A robust parametric test for equality of k variances 

2204 

2205 Notes 

2206 ----- 

2207 Conover et al. (1981) examine many of the existing parametric and 

2208 nonparametric tests by extensive simulations and they conclude that the 

2209 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 

2210 superior in terms of robustness to departures from normality and power 

2211 ([3]_). 

2212 

2213 References 

2214 ---------- 

2215 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm 

2216 

2217 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical 

2218 Methods, Eighth Edition, Iowa State University Press. 

2219 

2220 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2221 Hypothesis Testing based on Quadratic Inference Function. Technical 

2222 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2223 University. 

2224 

2225 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical 

2226 Tests. Proceedings of the Royal Society of London. Series A, 

2227 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282. 

2228 

2229 Examples 

2230 -------- 

2231 Test whether or not the lists `a`, `b` and `c` come from populations 

2232 with equal variances. 

2233 

2234 >>> from scipy.stats import bartlett 

2235 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2236 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2237 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2238 >>> stat, p = bartlett(a, b, c) 

2239 >>> p 

2240 1.1254782518834628e-05 

2241 

2242 The very small p-value suggests that the populations do not have equal 

2243 variances. 

2244 

2245 This is not surprising, given that the sample variance of `b` is much 

2246 larger than that of `a` and `c`: 

2247 

2248 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2249 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2250 """ 

2251 # Handle empty input and input that is not 1d 

2252 for a in args: 

2253 if np.asanyarray(a).size == 0: 

2254 return BartlettResult(np.nan, np.nan) 

2255 if np.asanyarray(a).ndim > 1: 

2256 raise ValueError('Samples must be one-dimensional.') 

2257 

2258 k = len(args) 

2259 if k < 2: 

2260 raise ValueError("Must enter at least two input sample vectors.") 
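# Bartlett's statistic:
#   T = [(Ntot - k) * ln(sp^2) - sum_i (Ni - 1) * ln(s_i^2)] /
#       [1 + (1 / (3*(k - 1))) * (sum_i 1/(Ni - 1) - 1/(Ntot - k))]
# where sp^2 is the pooled variance; under H0, T ~ chi2(k - 1).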

2261 Ni = zeros(k) 

2262 ssq = zeros(k, 'd') 

2263 for j in range(k): 

2264 Ni[j] = len(args[j]) 

2265 ssq[j] = np.var(args[j], ddof=1) 

2266 Ntot = np.sum(Ni, axis=0) 

2267 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k)) 

2268 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0) 

2269 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) - 

2270 1.0/(Ntot - k)) 

2271 T = numer / denom 

2272 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf 

2273 

2274 return BartlettResult(T, pval) 

2275 

2276 

2277LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue')) 

2278 

2279 

2280def levene(*args, **kwds): 

2281 """ 

2282 Perform Levene test for equal variances. 

2283 

2284 The Levene test tests the null hypothesis that all input samples 

2285 are from populations with equal variances. Levene's test is an 

2286 alternative to Bartlett's test `bartlett` in the case where 

2287 there are significant deviations from normality. 

2288 

2289 Parameters 

2290 ---------- 

2291 sample1, sample2, ... : array_like 

2292 The sample data, possibly with different lengths. Only one-dimensional 

2293 samples are accepted. 

2294 center : {'mean', 'median', 'trimmed'}, optional 

2295 Which function of the data to use in the test. The default 

2296 is 'median'. 

2297 proportiontocut : float, optional 

2298 When `center` is 'trimmed', this gives the proportion of data points 

2299 to cut from each end. (See `scipy.stats.trim_mean`.) 

2300 Default is 0.05. 

2301 

2302 Returns 

2303 ------- 

2304 statistic : float 

2305 The test statistic. 

2306 pvalue : float 

2307 The p-value for the test. 

2308 

2309 Notes 

2310 ----- 

2311 Three variations of Levene's test are possible. The possibilities 

2312 and their recommended usages are: 

2313 

2314 * 'median' : Recommended for skewed (non-normal) distributions. 

2315 * 'mean' : Recommended for symmetric, moderate-tailed distributions. 

2316 * 'trimmed' : Recommended for heavy-tailed distributions. 

2317 

2318 The test version using the mean was proposed in the original article 

2319 of Levene ([2]_) while the median and trimmed mean have been studied by 

2320 Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe 

2321 test. 

2322 

2323 References 

2324 ---------- 

2325 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm 

2326 .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: 

2327 Essays in Honor of Harold Hotelling, I. Olkin et al. eds., 

2328 Stanford University Press, pp. 278-292. 

2329 .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American 

2330 Statistical Association, 69, 364-367 

2331 

2332 Examples 

2333 -------- 

2334 Test whether or not the lists `a`, `b` and `c` come from populations 

2335 with equal variances. 

2336 

2337 >>> from scipy.stats import levene 

2338 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2339 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2340 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2341 >>> stat, p = levene(a, b, c) 

2342 >>> p 

2343 0.002431505967249681 

2344 

2345 The small p-value suggests that the populations do not have equal 

2346 variances. 

2347 

2348 This is not surprising, given that the sample variance of `b` is much 

2349 larger than that of `a` and `c`: 

2350 

2351 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2352 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2353 """ 

2354 # Handle keyword arguments. 

2355 center = 'median' 

2356 proportiontocut = 0.05 

2357 for kw, value in kwds.items(): 

2358 if kw not in ['center', 'proportiontocut']: 

2359 raise TypeError("levene() got an unexpected keyword " 

2360 "argument '%s'" % kw) 

2361 if kw == 'center': 

2362 center = value 

2363 else: 

2364 proportiontocut = value 

2365 

2366 k = len(args) 

2367 if k < 2: 

2368 raise ValueError("Must enter at least two input sample vectors.") 

2369 # check for 1d input 

2370 for j in range(k): 

2371 if np.asanyarray(args[j]).ndim > 1: 

2372 raise ValueError('Samples must be one-dimensional.') 

2373 

2374 Ni = zeros(k) 

2375 Yci = zeros(k, 'd') 

2376 

2377 if center not in ['mean', 'median', 'trimmed']: 

2378 raise ValueError("Keyword argument <center> must be 'mean', 'median'" 

2379 " or 'trimmed'.") 

2380 

2381 if center == 'median': 

2382 func = lambda x: np.median(x, axis=0) 

2383 elif center == 'mean': 

2384 func = lambda x: np.mean(x, axis=0) 

2385 else: # center == 'trimmed' 

2386 args = tuple(stats.trimboth(np.sort(arg), proportiontocut) 

2387 for arg in args) 

2388 func = lambda x: np.mean(x, axis=0) 

2389 

2390 for j in range(k): 

2391 Ni[j] = len(args[j]) 

2392 Yci[j] = func(args[j]) 

2393 Ntot = np.sum(Ni, axis=0) 

2394 
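# Levene/Brown-Forsythe statistic:
#   W = ((Ntot - k) / (k - 1)) * sum_i Ni * (Zbar_i - Zbar)^2
#       / sum_i sum_j (Z_ij - Zbar_i)^2
# with Z_ij = |x_ij - center(x_i)|; under H0, W ~ F(k - 1, Ntot - k).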

2395 # compute Zij's 

2396 Zij = [None] * k 

2397 for i in range(k): 

2398 Zij[i] = abs(asarray(args[i]) - Yci[i]) 

2399 

2400 # compute Zbari 

2401 Zbari = zeros(k, 'd') 

2402 Zbar = 0.0 

2403 for i in range(k): 

2404 Zbari[i] = np.mean(Zij[i], axis=0) 

2405 Zbar += Zbari[i] * Ni[i] 

2406 

2407 Zbar /= Ntot 

2408 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0) 

2409 

2410 # compute denom_variance 

2411 dvar = 0.0 

2412 for i in range(k): 

2413 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0) 

2414 

2415 denom = (k - 1.0) * dvar 

2416 

2417 W = numer / denom 

2418 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf 

2419 return LeveneResult(W, pval) 

2420 

2421 

2422def binom_test(x, n=None, p=0.5, alternative='two-sided'): 

2423 """ 

2424 Perform a test that the probability of success is p. 

2425 

2426 This is an exact, two-sided test of the null hypothesis 

2427 that the probability of success in a Bernoulli experiment 

2428 is `p`. 

2429 

2430 Parameters 

2431 ---------- 

2432 x : int or array_like 

2433 The number of successes, or if x has length 2, it is the 

2434 number of successes and the number of failures. 

2435 n : int 

2436 The number of trials. This is ignored if x gives both the 

2437 number of successes and failures. 

2438 p : float, optional 

2439 The hypothesized probability of success. ``0 <= p <= 1``. The 

2440 default value is ``p = 0.5``. 

2441 alternative : {'two-sided', 'greater', 'less'}, optional 

2442 Indicates the alternative hypothesis. The default value is 

2443 'two-sided'. 

2444 

2445 Returns 

2446 ------- 

2447 p-value : float 

2448 The p-value of the hypothesis test. 

2449 

2450 References 

2451 ---------- 

2452 .. [1] https://en.wikipedia.org/wiki/Binomial_test 

2453 

2454 Examples 

2455 -------- 

2456 >>> from scipy import stats 

2457 

2458 A car manufacturer claims that no more than 10% of their cars are unsafe. 

2459 15 cars are inspected for safety; 3 are found to be unsafe. Test the 

2460 manufacturer's claim: 

2461 

2462 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater') 

2463 0.18406106910639114 

2464 

2465 The null hypothesis cannot be rejected at the 5% level of significance 

2466 because the returned p-value is greater than the critical value of 5%. 

2467 

2468 """ 

2469 x = atleast_1d(x).astype(np.int_) 

2470 if len(x) == 2: 

2471 n = x[1] + x[0] 

2472 x = x[0] 

2473 elif len(x) == 1: 

2474 x = x[0] 

2475 if n is None or n < x: 

2476 raise ValueError("n must be >= x") 

2477 n = np.int_(n) 

2478 else: 

2479 raise ValueError("Incorrect length for x.") 

2480 

2481 if (p > 1.0) or (p < 0.0): 

2482 raise ValueError("p must be in range [0,1]") 

2483 

2484 if alternative not in ('two-sided', 'less', 'greater'): 

2485 raise ValueError("alternative not recognized\n" 

2486 "should be 'two-sided', 'less' or 'greater'") 

2487 

2488 if alternative == 'less': 

2489 pval = distributions.binom.cdf(x, n, p) 

2490 return pval 

2491 

2492 if alternative == 'greater': 

2493 pval = distributions.binom.sf(x-1, n, p) 

2494 return pval 

2495 

2496 # if alternative was neither 'less' nor 'greater', then it's 'two-sided' 

2497 d = distributions.binom.pmf(x, n, p) 

2498 rerr = 1 + 1e-7 
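# Two-sided p-value: add to the observed tail the mass of the opposite
# tail, i.e. of outcomes whose pmf is no larger than pmf(x) (within the
# relative tolerance rerr); y below counts those outcomes.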

2499 if x == p * n: 

2500 # special case as shortcut, would also be handled by `else` below 

2501 pval = 1. 

2502 elif x < p * n: 

2503 i = np.arange(np.ceil(p * n), n+1) 

2504 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 

2505 pval = (distributions.binom.cdf(x, n, p) + 

2506 distributions.binom.sf(n - y, n, p)) 

2507 else: 

2508 i = np.arange(np.floor(p*n) + 1) 

2509 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 

2510 pval = (distributions.binom.cdf(y-1, n, p) + 

2511 distributions.binom.sf(x-1, n, p)) 

2512 

2513 return min(1.0, pval) 

2514 

2515 

2516def _apply_func(x, g, func): 

2517 # g is list of indices into x 

2518 # separating x into different groups 

2519 # func should be applied over the groups 

2520 g = unique(r_[0, g, len(x)]) 

2521 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)] 

2522 

2523 return asarray(output) 

2524 

2525 

2526FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue')) 

2527 

2528 

2529def fligner(*args, **kwds): 

2530 """ 

2531 Perform Fligner-Killeen test for equality of variance. 

2532 

2533 Fligner's test tests the null hypothesis that all input samples 

2534 are from populations with equal variances. Fligner-Killeen's test is 

2535 distribution free when populations are identical [2]_. 

2536 

2537 Parameters 

2538 ---------- 

2539 sample1, sample2, ... : array_like 

2540 Arrays of sample data. Need not be the same length. 

2541 center : {'mean', 'median', 'trimmed'}, optional 

2542 Keyword argument controlling which function of the data is used in 

2543 computing the test statistic. The default is 'median'. 

2544 proportiontocut : float, optional 

2545 When `center` is 'trimmed', this gives the proportion of data points 

2546 to cut from each end. (See `scipy.stats.trim_mean`.) 

2547 Default is 0.05. 

2548 

2549 Returns 

2550 ------- 

2551 statistic : float 

2552 The test statistic. 

2553 pvalue : float 

2554 The p-value for the hypothesis test. 

2555 

2556 See Also 

2557 -------- 

2558 bartlett : A parametric test for equality of k variances in normal samples 

2559 levene : A robust parametric test for equality of k variances 

2560 

2561 Notes 

2562 ----- 

2563 As with Levene's test there are three variants of Fligner's test that 

2564 differ by the measure of central tendency used in the test. See `levene` 

2565 for more information. 

2566 

2567 Conover et al. (1981) examine many of the existing parametric and 

2568 nonparametric tests by extensive simulations and they conclude that the 

2569 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 

2570 superior in terms of robustness to departures from normality and power [3]_. 

2571 

2572 References 

2573 ---------- 

2574 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2575 Hypothesis Testing based on Quadratic Inference Function. Technical 

2576 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2577 University. 

2578 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf 

2579 

2580 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample 

2581 tests for scale. 'Journal of the American Statistical Association.' 

2582 71(353), 210-213. 

2583 

2584 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2585 Hypothesis Testing based on Quadratic Inference Function. Technical 

2586 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2587 University. 

2588 

2589 .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A 

2590 comparative study of tests for homogeneity of variances, with 

2591 applications to the outer continental shelf bidding data. 

2592 Technometrics, 23(4), 351-361. 

2593 

2594 Examples 

2595 -------- 

2596 Test whether or not the lists `a`, `b` and `c` come from populations 

2597 with equal variances. 

2598 

2599 >>> from scipy.stats import fligner 

2600 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2601 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2602 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2603 >>> stat, p = fligner(a, b, c) 

2604 >>> p 

2605 0.00450826080004775 

2606 

2607 The small p-value suggests that the populations do not have equal 

2608 variances. 

2609 

2610 This is not surprising, given that the sample variance of `b` is much 

2611 larger than that of `a` and `c`: 

2612 

2613 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2614 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2615 """ 

2616 # Handle empty input 

2617 for a in args: 

2618 if np.asanyarray(a).size == 0: 

2619 return FlignerResult(np.nan, np.nan) 

2620 

2621 # Handle keyword arguments. 

2622 center = 'median' 

2623 proportiontocut = 0.05 

2624 for kw, value in kwds.items(): 

2625 if kw not in ['center', 'proportiontocut']: 

2626 raise TypeError("fligner() got an unexpected keyword " 

2627 "argument '%s'" % kw) 

2628 if kw == 'center': 

2629 center = value 

2630 else: 

2631 proportiontocut = value 

2632 

2633 k = len(args) 

2634 if k < 2: 

2635 raise ValueError("Must enter at least two input sample vectors.") 

2636 

2637 if center not in ['mean', 'median', 'trimmed']: 

2638 raise ValueError("Keyword argument <center> must be 'mean', 'median'" 

2639 " or 'trimmed'.") 

2640 

2641 if center == 'median': 

2642 func = lambda x: np.median(x, axis=0) 

2643 elif center == 'mean': 

2644 func = lambda x: np.mean(x, axis=0) 

2645 else: # center == 'trimmed' 

2646 args = tuple(stats.trimboth(arg, proportiontocut) for arg in args) 

2647 func = lambda x: np.mean(x, axis=0) 

2648 

2649 Ni = asarray([len(args[j]) for j in range(k)]) 

2650 Yci = asarray([func(args[j]) for j in range(k)]) 

2651 Ntot = np.sum(Ni, axis=0) 

2652 # compute Zij's 

2653 Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)] 

2654 allZij = [] 

2655 g = [0] 

2656 for i in range(k): 

2657 allZij.extend(list(Zij[i])) 

2658 g.append(len(allZij)) 

2659 

2660 ranks = stats.rankdata(allZij) 
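# Fligner-Killeen normal scores: a_i = Phi^{-1}(0.5 + rank_i / (2*(Ntot + 1))),
# i.e. standard normal quantiles at the shifted rank fractions.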

2661 a = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5) 

2662 

2663 # compute Aibar 

2664 Aibar = _apply_func(a, g, np.sum) / Ni 

2665 anbar = np.mean(a, axis=0) 

2666 varsq = np.var(a, axis=0, ddof=1) 

2667 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq 

2668 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf 

2669 return FlignerResult(Xsq, pval) 

2670 

2671 

2672def mood(x, y, axis=0): 

2673 """ 

2674 Perform Mood's test for equal scale parameters. 

2675 

2676 Mood's two-sample test for scale parameters is a non-parametric 

2677 test for the null hypothesis that two samples are drawn from the 

2678 same distribution with the same scale parameter. 

2679 

2680 Parameters 

2681 ---------- 

2682 x, y : array_like 

2683 Arrays of sample data. 

2684 axis : int, optional 

2685 The axis along which the samples are tested. `x` and `y` can be of 

2686 different length along `axis`. 

2687 If `axis` is None, `x` and `y` are flattened and the test is done on 

2688 all values in the flattened arrays. 

2689 

2690 Returns 

2691 ------- 

2692 z : scalar or ndarray 

2693 The z-score for the hypothesis test. For 1-D inputs a scalar is 

2694 returned. 

2695 p-value : scalar or ndarray 

2696 The p-value for the hypothesis test. 

2697 

2698 See Also 

2699 -------- 

2700 fligner : A non-parametric test for the equality of k variances 

2701 ansari : A non-parametric test for the equality of 2 variances 

2702 bartlett : A parametric test for equality of k variances in normal samples 

2703 levene : A parametric test for equality of k variances 

2704 

2705 Notes 

2706 ----- 

2707 The data are assumed to be drawn from probability distributions ``f(x)`` 

2708 and ``f(x/s) / s`` respectively, for some probability density function f. 

2709 The null hypothesis is that ``s == 1``. 

2710 

2711 For multi-dimensional arrays, if the inputs are of shapes 

2712 ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the 

2713 resulting z and p values will have shape ``(n0, n2, n3)``. Note that 

2714 ``n1`` and ``m1`` don't have to be equal, but the other dimensions do. 

2715 

2716 Examples 

2717 -------- 

2718 >>> from scipy import stats 

2719 >>> np.random.seed(1234) 

2720 >>> x2 = np.random.randn(2, 45, 6, 7) 

2721 >>> x1 = np.random.randn(2, 30, 6, 7) 

2722 >>> z, p = stats.mood(x1, x2, axis=1) 

2723 >>> p.shape 

2724 (2, 6, 7) 

2725 

2726 Find the number of points where the difference in scale is not significant: 

2727 

2728 >>> (p > 0.1).sum() 

2729 74 

2730 

2731 Perform the test with different scales: 

2732 

2733 >>> x1 = np.random.randn(2, 30) 

2734 >>> x2 = np.random.randn(2, 35) * 10.0 

2735 >>> stats.mood(x1, x2, axis=1) 

2736 (array([-5.7178125 , -5.25342163]), array([ 1.07904114e-08, 1.49299218e-07])) 

2737 

2738 """ 

2739 x = np.asarray(x, dtype=float) 

2740 y = np.asarray(y, dtype=float) 

2741 

2742 if axis is None: 

2743 x = x.flatten() 

2744 y = y.flatten() 

2745 axis = 0 

2746 

2747 # Determine shape of the result arrays 

2748 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) 

2749 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if 

2750 ax != axis])): 

2751 raise ValueError("Dimensions of x and y on all axes except `axis` " 

2752 "should match") 

2753 

2754 n = x.shape[axis] 

2755 m = y.shape[axis] 

2756 N = m + n 

2757 if N < 3: 

2758 raise ValueError("Not enough observations.") 

2759 

2760 xy = np.concatenate((x, y), axis=axis) 

2761 if axis != 0: 

2762 xy = np.rollaxis(xy, axis) 

2763 

2764 xy = xy.reshape(xy.shape[0], -1) 

2765 

2766 # Generalized to the n-dimensional case by adding the axis argument, and 

2767 # using for loops, since rankdata is not vectorized. For improving 

2768 # performance consider vectorizing rankdata function. 

2769 all_ranks = np.zeros_like(xy) 

2770 for j in range(xy.shape[1]): 

2771 all_ranks[:, j] = stats.rankdata(xy[:, j]) 

2772 

2773 Ri = all_ranks[:n] 
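# Mood's statistic: M = sum over the x-sample of (R_i - (N + 1)/2)^2, where
# R_i are the ranks of x in the pooled sample; mnM and varM below are its
# exact mean and variance under the null hypothesis.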

2774 M = np.sum((Ri - (N + 1.0) / 2)**2, axis=0) 

2775 # Approx stat. 

2776 mnM = n * (N * N - 1.0) / 12 

2777 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 

2778 z = (M - mnM) / sqrt(varM) 

2779 

2780 # sf for right tail, cdf for left tail. Factor 2 for two-sidedness 

2781 z_pos = z > 0 

2782 pval = np.zeros_like(z) 

2783 pval[z_pos] = 2 * distributions.norm.sf(z[z_pos]) 

2784 pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos]) 

2785 

2786 if res_shape == (): 

2787 # Return scalars, not 0-D arrays 

2788 z = z[0] 

2789 pval = pval[0] 

2790 else: 

2791 z.shape = res_shape 

2792 pval.shape = res_shape 

2793 

2794 return z, pval 

2795 

2796 

2797WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue')) 

2798 

2799 

2800def wilcoxon(x, y=None, zero_method="wilcox", correction=False, 

2801 alternative="two-sided", mode='auto'): 

2802 """ 

2803 Calculate the Wilcoxon signed-rank test. 

2804 

2805 The Wilcoxon signed-rank test tests the null hypothesis that two 

2806 related paired samples come from the same distribution. In particular, 

2807 it tests whether the distribution of the differences x - y is symmetric 

2808 about zero. It is a non-parametric version of the paired T-test. 

2809 

2810 Parameters 

2811 ---------- 

2812 x : array_like 

2813 Either the first set of measurements (in which case `y` is the second 

2814 set of measurements), or the differences between two sets of 

2815 measurements (in which case `y` is not to be specified.) Must be 

2816 one-dimensional. 

2817 y : array_like, optional 

2818 Either the second set of measurements (if `x` is the first set of 

2819 measurements), or not specified (if `x` is the differences between 

2820 two sets of measurements.) Must be one-dimensional. 

2821 zero_method : {'pratt', 'wilcox', 'zsplit'}, optional 

2822 The following options are available (default is 'wilcox'): 

2823  

2824 * 'pratt': Includes zero-differences in the ranking process, 

2825 but drops the ranks of the zeros, see [3]_ (more conservative). 

2826 * 'wilcox': Discards all zero-differences, the default. 

2827 * 'zsplit': Includes zero-differences in the ranking process and  

2828 splits the zero rank between positive and negative ones. 

2829 correction : bool, optional 

2830 If True, apply continuity correction by adjusting the Wilcoxon rank 

2831 statistic by 0.5 towards the mean value when computing the 

2832 z-statistic if a normal approximation is used. Default is False. 

2833 alternative : {"two-sided", "greater", "less"}, optional 

2834 The alternative hypothesis to be tested, see Notes. Default is 

2835 "two-sided". 

2836 mode : {"auto", "exact", "approx"} 

2837 Method to calculate the p-value, see Notes. Default is "auto". 

2838 

2839 Returns 

2840 ------- 

2841 statistic : float 

2842 If `alternative` is "two-sided", the sum of the ranks of the 

2843 differences above or below zero, whichever is smaller. 

2844 Otherwise the sum of the ranks of the differences above zero. 

2845 pvalue : float 

2846 The p-value for the test depending on `alternative` and `mode`. 

2847 

2848 See Also 

2849 -------- 

2850 kruskal, mannwhitneyu 

2851 

2852 Notes 

2853 ----- 

2854 The test has been introduced in [4]_. Given n independent samples 

2855 (xi, yi) from a bivariate distribution (i.e. paired samples), 

2856 it computes the differences di = xi - yi. One assumption of the test 

2857 is that the differences are symmetric, see [2]_. 

2858 The two-sided test has the null hypothesis that the median of the 

2859 differences is zero against the alternative that it is different from 

2860 zero. The one-sided test has the null hypothesis that the median is  

2861 positive against the alternative that it is negative  

2862 (``alternative == 'less'``), or vice versa (``alternative == 'greater'``). 

2863 

2864 To derive the p-value, the exact distribution (``mode == 'exact'``) 

2865 can be used for sample sizes of up to 25. The default ``mode == 'auto'`` 

2866 uses the exact distribution if there are at most 25 observations and no 

2867 ties; otherwise a normal approximation is used (``mode == 'approx'``). 

2868 

2869 The treatment of ties can be controlled by the parameter `zero_method`. 

2870 If ``zero_method == 'pratt'``, the normal approximation is adjusted as in 

2871 [5]_. A typical rule is to require that n > 20 ([2]_, p. 383). 

2872 

2873 References 

2874 ---------- 

2875 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test 

2876 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971. 

2877 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed 

2878 Rank Procedures, Journal of the American Statistical Association, 

2879 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526` 

2880 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods, 

2881 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968` 

2882 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank 

2883 Sampling Distribution When Zero Differences are Present, 

2884 Journal of the American Statistical Association, Vol. 62, 1967, 

2885 pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917` 

2886 

2887 Examples 

2888 -------- 

2889 In [4]_, the differences in height between cross- and self-fertilized 

2890 corn plants is given as follows: 

2891 

2892 >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75] 

2893 

2894 Cross-fertilized plants appear to be higher. To test the null 

2895 hypothesis that there is no height difference, we can apply the 

2896 two-sided test: 

2897 

2898 >>> from scipy.stats import wilcoxon 

2899 >>> w, p = wilcoxon(d) 

2900 >>> w, p 

2901 (24.0, 0.041259765625) 

2902 

2903 Hence, we would reject the null hypothesis at a significance level of 5%, 

2904 concluding that there is a difference in height between the groups. 

2905 To confirm that the median of the differences can be assumed to be 

2906 positive, we use: 

2907 

2908 >>> w, p = wilcoxon(d, alternative='greater') 

2909 >>> w, p 

2910 (96.0, 0.0206298828125) 

2911 

2912 This shows that the null hypothesis that the median is negative can be 

2913 rejected at a significance level of 5% in favor of the alternative that 

2914 the median is greater than zero. The p-values above are exact. Using the 

2915 normal approximation gives very similar values: 

2916 

2917 >>> w, p = wilcoxon(d, mode='approx') 

2918 >>> w, p 

2919 (24.0, 0.04088813291185591) 

2920 

2921 Note that the statistic changed to 96 in the one-sided case (the sum 

2922 of ranks of positive differences) whereas it is 24 in the two-sided 

2923 case (the minimum of sum of ranks above and below zero). 

2924 

2925 """ 

2926 if mode not in ["auto", "approx", "exact"]: 

2927 raise ValueError("mode must be either 'auto', 'approx' or 'exact'") 

2928 

2929 if zero_method not in ["wilcox", "pratt", "zsplit"]: 

2930 raise ValueError("Zero method must be either 'wilcox' " 

2931 "or 'pratt' or 'zsplit'") 

2932 

2933 if alternative not in ["two-sided", "less", "greater"]: 

2934 raise ValueError("Alternative must be either 'two-sided', " 

2935 "'greater' or 'less'") 

2936 

2937 if y is None: 

2938 d = asarray(x) 

2939 if d.ndim > 1: 

2940 raise ValueError('Sample x must be one-dimensional.') 

2941 else: 

2942 x, y = map(asarray, (x, y)) 

2943 if x.ndim > 1 or y.ndim > 1: 

2944 raise ValueError('Samples x and y must be one-dimensional.') 

2945 if len(x) != len(y): 

2946 raise ValueError('The samples x and y must have the same length.') 

2947 d = x - y 

2948 

2949 if mode == "auto": 

2950 if len(d) <= 25: 

2951 mode = "exact" 

2952 else: 

2953 mode = "approx" 

2954 

2955 n_zero = np.sum(d == 0) 

2956 if n_zero > 0 and mode == "exact": 

2957 mode = "approx" 

2958 warnings.warn("Exact p-value calculation does not work if there are " 

2959 "ties. Switching to normal approximation.") 

2960 

2961 if mode == "approx": 

2962 if zero_method in ["wilcox", "pratt"]: 

2963 if n_zero == len(d): 

2964 raise ValueError("zero_method 'wilcox' and 'pratt' do not " 

2965 "work if x - y is zero for all elements.") 

2966 if zero_method == "wilcox": 

2967 # Keep all non-zero differences 

2968 d = compress(np.not_equal(d, 0), d) 

2969 

2970 count = len(d) 

2971 if count < 10 and mode == "approx": 

2972 warnings.warn("Sample size too small for normal approximation.") 

2973 

2974 r = stats.rankdata(abs(d)) 

2975 r_plus = np.sum((d > 0) * r) 

2976 r_minus = np.sum((d < 0) * r) 

2977 

2978 if zero_method == "zsplit": 

2979 r_zero = np.sum((d == 0) * r) 

2980 r_plus += r_zero / 2. 

2981 r_minus += r_zero / 2. 

2982 

2983 # return min for two-sided test, but r_plus for one-sided test 

2984 # the literature is not consistent here 

2985 # r_plus is more informative since r_plus + r_minus = count*(count+1)/2, 

2986 # i.e. the sum of the ranks, so r_minus and the min can be inferred 

2987 # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.) 

2988 # [3] uses the r_plus for the one-sided test, keep min for two-sided test 

2989 # to keep backwards compatibility 

2990 if alternative == "two-sided": 

2991 T = min(r_plus, r_minus) 

2992 else: 

2993 T = r_plus 

2994 

2995 if mode == "approx": 

2996 mn = count * (count + 1.) * 0.25 

2997 se = count * (count + 1.) * (2. * count + 1.) 
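# Normal approximation: mn = count*(count+1)/4 is the null mean of T and,
# at this point, se holds 24 * Var(T); the zero/tie corrections and the
# final sqrt(se / 24) are applied below.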

2998 

2999 if zero_method == "pratt": 

3000 r = r[d != 0] 

3001 # normal approximation needs to be adjusted, see Cureton (1967) 

3002 mn -= n_zero * (n_zero + 1.) * 0.25 

3003 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.) 

3004 

3005 replist, repnum = find_repeats(r) 

3006 if repnum.size != 0: 

3007 # Correction for repeated elements. 

3008 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() 

3009 

3010 se = sqrt(se / 24) 

3011 

3012 # apply continuity correction if applicable 

3013 d = 0 

3014 if correction: 

3015 if alternative == "two-sided": 

3016 d = 0.5 * np.sign(T - mn) 

3017 elif alternative == "less": 

3018 d = -0.5 

3019 else: 

3020 d = 0.5 

3021 

3022 # compute statistic and p-value using normal approximation 

3023 z = (T - mn - d) / se 

3024 if alternative == "two-sided": 

3025 prob = 2. * distributions.norm.sf(abs(z)) 

3026 elif alternative == "greater": 

3027 # large T = r_plus indicates x is greater than y; i.e. 

3028 # accept alternative in that case and return small p-value (sf) 

3029 prob = distributions.norm.sf(z) 

3030 else: 

3031 prob = distributions.norm.cdf(z) 

3032 elif mode == "exact": 

3033 # get frequencies cnt of the possible positive ranksums r_plus 

3034 cnt = _get_wilcoxon_distr(count) 
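# cnt[i] is the number of the 2**count equally likely sign assignments
# whose positive rank sum equals i; exact p-values are tail sums of cnt
# divided by 2**count.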

3035 # note: r_plus is int (ties not allowed), need int for slices below 

3036 r_plus = int(r_plus) 

3037 if alternative == "two-sided": 

3038 if r_plus == (len(cnt) - 1) // 2: 

3039 # r_plus is the center of the distribution. 

3040 prob = 1.0 

3041 else: 

3042 p_less = np.sum(cnt[:r_plus + 1]) / 2**count 

3043 p_greater = np.sum(cnt[r_plus:]) / 2**count 

3044 prob = 2*min(p_greater, p_less) 

3045 elif alternative == "greater": 

3046 prob = np.sum(cnt[r_plus:]) / 2**count 

3047 else: 

3048 prob = np.sum(cnt[:r_plus + 1]) / 2**count 

3049 

3050 return WilcoxonResult(T, prob) 

3051 

3052 

3053def median_test(*args, **kwds): 

3054 """ 

3055 Perform a Mood's median test. 

3056 

3057 Test that two or more samples come from populations with the same median. 

3058 

3059 Let ``n = len(args)`` be the number of samples. The "grand median" of 

3060 all the data is computed, and a contingency table is formed by 

3061 classifying the values in each sample as being above or below the grand 

3062 median. The contingency table, along with `correction` and `lambda_`, 

3063 are passed to `scipy.stats.chi2_contingency` to compute the test statistic 

3064 and p-value. 

3065 

3066 Parameters 

3067 ---------- 

3068 sample1, sample2, ... : array_like 

3069 The set of samples. There must be at least two samples. 

3070 Each sample must be a one-dimensional sequence containing at least 

3071 one value. The samples are not required to have the same length. 

3072 ties : str, optional 

3073 Determines how values equal to the grand median are classified in 

3074 the contingency table. The string must be one of:: 

3075 

3076 "below": 

3077 Values equal to the grand median are counted as "below". 

3078 "above": 

3079 Values equal to the grand median are counted as "above". 

3080 "ignore": 

3081 Values equal to the grand median are not counted. 

3082 

3083 The default is "below". 

3084 correction : bool, optional 

3085 If True, *and* there are just two samples, apply Yates' correction 

3086 for continuity when computing the test statistic associated with 

3087 the contingency table. Default is True. 

3088 lambda_ : float or str, optional 

3089 By default, the statistic computed in this test is Pearson's 

3090 chi-squared statistic. `lambda_` allows a statistic from the 

3091 Cressie-Read power divergence family to be used instead. See 

3092 `power_divergence` for details. 

3093 Default is 1 (Pearson's chi-squared statistic). 

3094 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3095 Defines how to handle when input contains nan. 'propagate' returns nan, 

3096 'raise' throws an error, 'omit' performs the calculations ignoring nan 

3097 values. Default is 'propagate'. 

3098 

3099 Returns 

3100 ------- 

3101 stat : float 

3102 The test statistic. The statistic that is returned is determined by 

3103 `lambda_`. The default is Pearson's chi-squared statistic. 

3104 p : float 

3105 The p-value of the test. 

3106 m : float 

3107 The grand median. 

3108 table : ndarray 

3109 The contingency table. The shape of the table is (2, n), where 

3110 n is the number of samples. The first row holds the counts of the 

3111 values above the grand median, and the second row holds the counts 

3112 of the values below the grand median. The table allows further 

3113 analysis with, for example, `scipy.stats.chi2_contingency`, or with 

3114 `scipy.stats.fisher_exact` if there are two samples, without having 

3115 to recompute the table. If ``nan_policy`` is "propagate" and there 

3116 are nans in the input, the return value for ``table`` is ``None``. 

3117 

3118 See Also 

3119 -------- 

3120 kruskal : Compute the Kruskal-Wallis H-test for independent samples. 

3121 mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y. 

3122 

3123 Notes 

3124 ----- 

3125 .. versionadded:: 0.15.0 

3126 

3127 References 

3128 ---------- 

3129 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill 

3130 (1950), pp. 394-399. 

3131 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010). 

3132 See Sections 8.12 and 10.15. 

3133 

3134 Examples 

3135 -------- 

3136 A biologist runs an experiment in which there are three groups of plants. 

3137 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants. 

3138 Each plant produces a number of seeds. The seed counts for each group 

3139 are:: 

3140 

3141 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49 

3142 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99 

3143 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84 

3144 

3145 The following code applies Mood's median test to these samples. 

3146 

3147 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49] 

3148 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99] 

3149 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84] 

3150 >>> from scipy.stats import median_test 

3151 >>> stat, p, med, tbl = median_test(g1, g2, g3) 

3152 

3153 The median is 

3154 

3155 >>> med 

3156 34.0 

3157 

3158 and the contingency table is 

3159 

3160 >>> tbl 

3161 array([[ 5, 10, 7], 

3162 [11, 5, 10]]) 

3163 

3164 `p` is too large to conclude that the medians are not the same: 

3165 

3166 >>> p 

3167 0.12609082774093244 

3168 

3169 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to 

3170 `median_test`. 

3171 

3172 >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood") 

3173 >>> p 

3174 0.12224779737117837 

3175 

3176 The median occurs several times in the data, so we'll get a different 

3177 result if, for example, ``ties="above"`` is used: 

3178 

3179 >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above") 

3180 >>> p 

3181 0.063873276069553273 

3182 

3183 >>> tbl 

3184 array([[ 5, 11, 9], 

3185 [11, 4, 8]]) 

3186 

3187 This example demonstrates that if the data set is not large and there 

3188 are values equal to the median, the p-value can be sensitive to the 

3189 choice of `ties`. 

3190 
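When only two samples are given, the returned 2-by-2 table can be reused for
further analysis without recomputing it. As an illustrative sketch (output
values not shown here), the table can be passed directly to
`scipy.stats.fisher_exact`:

>>> stat, p, med, tbl = median_test(g1, g2)
>>> from scipy.stats import fisher_exact
>>> oddsratio, p_exact = fisher_exact(tbl)
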

3191 """ 

3192 ties = kwds.pop('ties', 'below') 

3193 correction = kwds.pop('correction', True) 

3194 lambda_ = kwds.pop('lambda_', None) 

3195 nan_policy = kwds.pop('nan_policy', 'propagate') 

3196 

3197 if len(kwds) > 0: 

3198 bad_kwd = list(kwds.keys())[0]  # dict views are not directly indexable 

3199 raise TypeError("median_test() got an unexpected keyword " 

3200 "argument %r" % bad_kwd) 

3201 

3202 if len(args) < 2: 

3203 raise ValueError('median_test requires two or more samples.') 

3204 

3205 ties_options = ['below', 'above', 'ignore'] 

3206 if ties not in ties_options: 

3207 raise ValueError("invalid 'ties' option '%s'; 'ties' must be one " 

3208 "of: %s" % (ties, str(ties_options)[1:-1])) 

3209 

3210 data = [np.asarray(arg) for arg in args] 

3211 

3212 # Validate the sizes and shapes of the arguments. 

3213 for k, d in enumerate(data): 

3214 if d.size == 0: 

3215 raise ValueError("Sample %d is empty. All samples must " 

3216 "contain at least one value." % (k + 1)) 

3217 if d.ndim != 1: 

3218 raise ValueError("Sample %d has %d dimensions. All " 

3219 "samples must be one-dimensional sequences." % 

3220 (k + 1, d.ndim)) 

3221 

3222 cdata = np.concatenate(data) 

3223 contains_nan, nan_policy = _contains_nan(cdata, nan_policy) 

3224 if contains_nan and nan_policy == 'propagate': 

3225 return np.nan, np.nan, np.nan, None 

3226 

3227 if contains_nan: 

3228 grand_median = np.median(cdata[~np.isnan(cdata)]) 

3229 else: 

3230 grand_median = np.median(cdata) 

3231 # When the minimum version of numpy supported by scipy is 1.9.0, 

3232 # the above if/else statement can be replaced by the single line: 

3233 # grand_median = np.nanmedian(cdata) 

3234 

3235 # Create the contingency table. 

3236 table = np.zeros((2, len(data)), dtype=np.int64) 

3237 for k, sample in enumerate(data): 

3238 sample = sample[~np.isnan(sample)] 

3239 

3240 nabove = count_nonzero(sample > grand_median) 

3241 nbelow = count_nonzero(sample < grand_median) 

3242 nequal = sample.size - (nabove + nbelow) 

3243 table[0, k] += nabove 

3244 table[1, k] += nbelow 

3245 if ties == "below": 

3246 table[1, k] += nequal 

3247 elif ties == "above": 

3248 table[0, k] += nequal 

3249 

3250 # Check that no row or column of the table is all zero. 

3251 # Such a table can not be given to chi2_contingency, because it would have 

3252 # a zero in the table of expected frequencies. 

3253 rowsums = table.sum(axis=1) 

3254 if rowsums[0] == 0: 

3255 raise ValueError("All values are below the grand median (%r)." % 

3256 grand_median) 

3257 if rowsums[1] == 0: 

3258 raise ValueError("All values are above the grand median (%r)." % 

3259 grand_median) 

3260 if ties == "ignore": 

3261 # We already checked that each sample has at least one value, but it 

3262 # is possible that all those values equal the grand median. If `ties` 

3263 # is "ignore", that would result in a column of zeros in `table`. We 

3264 # check for that case here. 

3265 zero_cols = np.nonzero((table == 0).all(axis=0))[0] 

3266 if len(zero_cols) > 0: 

3267 msg = ("All values in sample %d are equal to the grand " 

3268 "median (%r), so they are ignored, resulting in an " 

3269 "empty sample." % (zero_cols[0] + 1, grand_median)) 

3270 raise ValueError(msg) 

3271 

3272 stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_, 

3273 correction=correction) 

3274 return stat, p, grand_median, table 

3275 

3276 

3277def _circfuncs_common(samples, high, low, nan_policy='propagate'): 

3278 # Ensure samples are array-like and size is not zero 

3279 samples = np.asarray(samples) 

3280 if samples.size == 0: 

3281 return np.nan, np.asarray(np.nan), np.asarray(np.nan), None 

3282 

3283 # Recast samples as radians that range between 0 and 2 pi and calculate 

3284 # the sine and cosine 

3285 sin_samp = sin((samples - low)*2.*pi / (high - low)) 

3286 cos_samp = cos((samples - low)*2.*pi / (high - low)) 

3287 
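# Worked example (illustrative only, not from the original source): with
# low=0 and high=24 (e.g. hours of the day), a sample of 6 maps to
# (6 - 0)*2*pi/(24 - 0) = pi/2 and 18 maps to 3*pi/2, so samples that are a
# whole period apart land on the same point of the unit circle.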

3288 # Apply the NaN policy 

3289 contains_nan, nan_policy = _contains_nan(samples, nan_policy) 

3290 if contains_nan and nan_policy == 'omit': 

3291 mask = np.isnan(samples) 

3292 # Set the sines and cosines that are NaN to zero 

3293 sin_samp[mask] = 0.0 

3294 cos_samp[mask] = 0.0 

3295 else: 

3296 mask = None 

3297 

3298 return samples, sin_samp, cos_samp, mask 

3299 

3300 

3301def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'): 

3302 """ 

3303 Compute the circular mean for samples in a range. 

3304 

3305 Parameters 

3306 ---------- 

3307 samples : array_like 

3308 Input array. 

3309 high : float or int, optional 

3310 High boundary for circular mean range. Default is ``2*pi``. 

3311 low : float or int, optional 

3312 Low boundary for circular mean range. Default is 0. 

3313 axis : int, optional 

3314 Axis along which means are computed. The default is to compute 

3315 the mean of the flattened array. 

3316 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3317 Defines how to handle the case when the input contains nan: 'propagate'

3318 returns nan, 'raise' raises an error, and 'omit' performs the calculations

3319 ignoring nan values. Default is 'propagate'.

3320 

3321 Returns 

3322 ------- 

3323 circmean : float 

3324 Circular mean. 

3325 

3326 Examples 

3327 -------- 

3328 >>> import numpy as np
>>> from scipy.stats import circmean 

3329 >>> circmean([0.1, 2*np.pi+0.2, 6*np.pi+0.3]) 

3330 0.2 

3331 

3332 >>> from scipy.stats import circmean 

3333 >>> circmean([0.2, 1.4, 2.6], high=1, low=0) 

3334 0.4 

3335 
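As a further illustrative sketch (not part of the original examples), NaN
values can be ignored with ``nan_policy='omit'``; the circular mean of the
two remaining angles is their bisector:

>>> np.isclose(circmean([0.1, 0.2, np.nan], nan_policy='omit'), 0.15)
True
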

3336 """ 

3337 samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low, 

3338 nan_policy=nan_policy) 

3339 sin_sum = sin_samp.sum(axis=axis) 

3340 cos_sum = cos_samp.sum(axis=axis) 

3341 res = arctan2(sin_sum, cos_sum) 

3342 
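# arctan2 returns angles in (-pi, pi]; the block below shifts negative
# (non-NaN) angles by 2*pi so the mean lies in [0, 2*pi) before it is
# rescaled to the [low, high) interval.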

3343 mask_nan = ~np.isnan(res) 

3344 if mask_nan.ndim > 0: 

3345 mask = res[mask_nan] < 0 

3346 else: 

3347 mask = res < 0 

3348 

3349 if mask.ndim > 0: 

3350 mask_nan[mask_nan] = mask 

3351 res[mask_nan] += 2*pi 

3352 elif mask: 

3353 res += 2*pi 

3354 

3355 # Set output to NaN if no samples went into the mean 

3356 if nmask is not None: 

3357 if nmask.all(): 

3358 res = np.full(shape=res.shape, fill_value=np.nan) 

3359 else: 

3360 # Find out if any of the axis that are being averaged consist 

3361 # entirely of NaN. If one exists, set the result (res) to NaN 

3362 nshape = 0 if axis is None else axis 

3363 smask = nmask.shape[nshape] == nmask.sum(axis=axis) 

3364 if smask.any(): 

3365 res[smask] = np.nan 

3366 

3367 return res*(high - low)/2.0/pi + low 

3368 

3369 

3370def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'): 

3371 """ 

3372 Compute the circular variance for samples assumed to be in a range. 

3373 

3374 Parameters 

3375 ---------- 

3376 samples : array_like 

3377 Input array. 

3378 high : float or int, optional 

3379 High boundary for circular variance range. Default is ``2*pi``. 

3380 low : float or int, optional 

3381 Low boundary for circular variance range. Default is 0. 

3382 axis : int, optional 

3383 Axis along which variances are computed. The default is to compute 

3384 the variance of the flattened array. 

3385 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3386 Defines how to handle the case when the input contains nan: 'propagate'

3387 returns nan, 'raise' raises an error, and 'omit' performs the calculations

3388 ignoring nan values. Default is 'propagate'.

3389 

3390 Returns 

3391 ------- 

3392 circvar : float 

3393 Circular variance. 

3394 

3395 Notes 

3396 ----- 

3397 This uses a definition of circular variance that in the limit of small 

3398 angles returns a number close to the 'linear' variance. 

3399 

3400 Examples 

3401 -------- 

3402 >>> import numpy as np
>>> from scipy.stats import circvar 

3403 >>> circvar([0, 2*np.pi/3, 5*np.pi/3]) 

3404 2.19722457734 

3405 
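As an illustrative sketch of the note above (not part of the original
examples), for tightly clustered angles the circular variance approaches
the ordinary variance:

>>> small = np.array([0.0, 0.01, 0.02])
>>> np.isclose(circvar(small), np.var(small), rtol=1e-3)
True
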

3406 """ 

3407 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low, 

3408 nan_policy=nan_policy) 

3409 if mask is None: 

3410 sin_mean = sin_samp.mean(axis=axis) 

3411 cos_mean = cos_samp.mean(axis=axis) 

3412 else: 
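# With nan_policy='omit', average only over the non-NaN entries; a slice
# that is entirely NaN has nsum == 0, which is replaced by NaN so that the
# result propagates NaN for that slice.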

3413 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float)) 

3414 nsum[nsum == 0] = np.nan 

3415 sin_mean = sin_samp.sum(axis=axis) / nsum 

3416 cos_mean = cos_samp.sum(axis=axis) / nsum 

3417 R = hypot(sin_mean, cos_mean) 

3418 

3419 return ((high - low)/2.0/pi)**2 * 2 * log(1/R) 

3420 

3421 

3422def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'): 

3423 """ 

3424 Compute the circular standard deviation for samples assumed to be in the 

3425 range [low, high]. 

3426 

3427 Parameters 

3428 ---------- 

3429 samples : array_like 

3430 Input array. 

3431 high : float or int, optional 

3432 High boundary for circular standard deviation range. 

3433 Default is ``2*pi``. 

3434 low : float or int, optional 

3435 Low boundary for circular standard deviation range. Default is 0. 

3436 axis : int, optional 

3437 Axis along which standard deviations are computed. The default is 

3438 to compute the standard deviation of the flattened array. 

3439 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3440 Defines how to handle the case when the input contains nan: 'propagate'

3441 returns nan, 'raise' raises an error, and 'omit' performs the calculations

3442 ignoring nan values. Default is 'propagate'.

3443 

3444 Returns 

3445 ------- 

3446 circstd : float 

3447 Circular standard deviation. 

3448 

3449 Notes 

3450 ----- 

3451 This uses a definition of circular standard deviation that in the limit of 

3452 small angles returns a number close to the 'linear' standard deviation. 

3453 

3454 Examples 

3455 -------- 

3456 >>> import numpy as np
>>> from scipy.stats import circstd 

3457 >>> circstd([0, 0.1*np.pi/2, 0.001*np.pi, 0.03*np.pi/2]) 

3458 0.063564063306 

3459 
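Similarly to `circvar`, an illustrative sketch (not part of the original
examples) of the small-angle behaviour described in the notes:

>>> small = np.array([0.0, 0.01, 0.02])
>>> np.isclose(circstd(small), np.std(small), rtol=1e-3)
True
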

3460 """ 

3461 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low, 

3462 nan_policy=nan_policy) 

3463 if mask is None: 

3464 sin_mean = sin_samp.mean(axis=axis) 

3465 cos_mean = cos_samp.mean(axis=axis) 

3466 else: 

3467 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float)) 

3468 nsum[nsum == 0] = np.nan 

3469 sin_mean = sin_samp.sum(axis=axis) / nsum 

3470 cos_mean = cos_samp.sum(axis=axis) / nsum 

3471 R = hypot(sin_mean, cos_mean) 

3472 

3473 return ((high - low)/2.0/pi) * sqrt(-2*log(R))