Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Additional statistics functions with support for masked arrays. 

3 

4""" 

5 

6# Original author (2007): Pierre GF Gerard-Marchant 

7 

8 

9__all__ = ['compare_medians_ms', 

10 'hdquantiles', 'hdmedian', 'hdquantiles_sd', 

11 'idealfourths', 

12 'median_cihs','mjci','mquantiles_cimj', 

13 'rsh', 

14 'trimmed_mean_ci',] 

15 

16 

17import numpy as np 

18from numpy import float_, int_, ndarray 

19 

20import numpy.ma as ma 

21from numpy.ma import MaskedArray 

22 

23from . import mstats_basic as mstats 

24 

25from scipy.stats.distributions import norm, beta, t, binom 

26 

27 

28def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,): 

29 """ 

30 Computes quantile estimates with the Harrell-Davis method. 

31 

32 The quantile estimates are calculated as a weighted linear combination 

33 of order statistics. 

34 

35 Parameters 

36 ---------- 

37 data : array_like 

38 Data array. 

39 prob : sequence, optional 

40 Sequence of quantiles to compute. 

41 axis : int or None, optional 

42 Axis along which to compute the quantiles. If None, use a flattened 

43 array. 

44 var : bool, optional 

45 Whether to return the variance of the estimate. 

46 

47 Returns 

48 ------- 

49 hdquantiles : MaskedArray 

50 A (p,) array of quantiles (if `var` is False), or a (2,p) array of 

51 quantiles and variances (if `var` is True), where ``p`` is the 

52 number of quantiles. 

53 

54 See Also 

55 -------- 

56 hdquantiles_sd 

57 

58 """ 

59 def _hd_1D(data,prob,var): 

60 "Computes the HD quantiles for a 1D array. Returns nan for invalid data." 

61 xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) 

62 # Don't use length here, in case we have a numpy scalar 

63 n = xsorted.size 

64 

65 hd = np.empty((2,len(prob)), float_) 

66 if n < 2: 

67 hd.flat = np.nan 

68 if var: 

69 return hd 

70 return hd[0] 

71 

72 v = np.arange(n+1) / float(n) 

73 betacdf = beta.cdf 

74 for (i,p) in enumerate(prob): 

75 _w = betacdf(v, (n+1)*p, (n+1)*(1-p)) 

76 w = _w[1:] - _w[:-1] 

77 hd_mean = np.dot(w, xsorted) 

78 hd[0,i] = hd_mean 

79 # 

80 hd[1,i] = np.dot(w, (xsorted-hd_mean)**2) 

81 # 

82 hd[0, prob == 0] = xsorted[0] 

83 hd[0, prob == 1] = xsorted[-1] 

84 if var: 

85 hd[1, prob == 0] = hd[1, prob == 1] = np.nan 

86 return hd 

87 return hd[0] 

88 # Initialization & checks 

89 data = ma.array(data, copy=False, dtype=float_) 

90 p = np.array(prob, copy=False, ndmin=1) 

91 # Computes quantiles along axis (or globally) 

92 if (axis is None) or (data.ndim == 1): 

93 result = _hd_1D(data, p, var) 

94 else: 

95 if data.ndim > 2: 

96 raise ValueError("Array 'data' must be at most two dimensional, " 

97 "but got data.ndim = %d" % data.ndim) 

98 result = ma.apply_along_axis(_hd_1D, axis, data, p, var) 

99 

100 return ma.fix_invalid(result, copy=False) 

101 

102 

103def hdmedian(data, axis=-1, var=False): 

104 """ 

105 Returns the Harrell-Davis estimate of the median along the given axis. 

106 

107 Parameters 

108 ---------- 

109 data : ndarray 

110 Data array. 

111 axis : int, optional 

112 Axis along which to compute the quantiles. If None, use a flattened 

113 array. 

114 var : bool, optional 

115 Whether to return the variance of the estimate. 

116 

117 Returns 

118 ------- 

119 hdmedian : MaskedArray 

120 The median values. If ``var=True``, the variance is returned inside 

121 the masked array. E.g. for a 1-D array the shape change from (1,) to 

122 (2,). 

123 

124 """ 

125 result = hdquantiles(data,[0.5], axis=axis, var=var) 

126 return result.squeeze() 

127 

128 

129def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None): 

130 """ 

131 The standard error of the Harrell-Davis quantile estimates by jackknife. 

132 

133 Parameters 

134 ---------- 

135 data : array_like 

136 Data array. 

137 prob : sequence, optional 

138 Sequence of quantiles to compute. 

139 axis : int, optional 

140 Axis along which to compute the quantiles. If None, use a flattened 

141 array. 

142 

143 Returns 

144 ------- 

145 hdquantiles_sd : MaskedArray 

146 Standard error of the Harrell-Davis quantile estimates. 

147 

148 See Also 

149 -------- 

150 hdquantiles 

151 

152 """ 

153 def _hdsd_1D(data, prob): 

154 "Computes the std error for 1D arrays." 

155 xsorted = np.sort(data.compressed()) 

156 n = len(xsorted) 

157 

158 hdsd = np.empty(len(prob), float_) 

159 if n < 2: 

160 hdsd.flat = np.nan 

161 

162 vv = np.arange(n) / float(n-1) 

163 betacdf = beta.cdf 

164 

165 for (i,p) in enumerate(prob): 

166 _w = betacdf(vv, (n+1)*p, (n+1)*(1-p)) 

167 w = _w[1:] - _w[:-1] 

168 mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)), 

169 list(range(k+1,n))].astype(int_)]) 

170 for k in range(n)], dtype=float_) 

171 mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1) 

172 hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n)) 

173 return hdsd 

174 

175 # Initialization & checks 

176 data = ma.array(data, copy=False, dtype=float_) 

177 p = np.array(prob, copy=False, ndmin=1) 

178 # Computes quantiles along axis (or globally) 

179 if (axis is None): 

180 result = _hdsd_1D(data, p) 

181 else: 

182 if data.ndim > 2: 

183 raise ValueError("Array 'data' must be at most two dimensional, " 

184 "but got data.ndim = %d" % data.ndim) 

185 result = ma.apply_along_axis(_hdsd_1D, axis, data, p) 

186 

187 return ma.fix_invalid(result, copy=False).ravel() 

188 

189 

190def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True), 

191 alpha=0.05, axis=None): 

192 """ 

193 Selected confidence interval of the trimmed mean along the given axis. 

194 

195 Parameters 

196 ---------- 

197 data : array_like 

198 Input data. 

199 limits : {None, tuple}, optional 

200 None or a two item tuple. 

201 Tuple of the percentages to cut on each side of the array, with respect 

202 to the number of unmasked data, as floats between 0. and 1. If ``n`` 

203 is the number of unmasked data before trimming, then 

204 (``n * limits[0]``)th smallest data and (``n * limits[1]``)th 

205 largest data are masked. The total number of unmasked data after 

206 trimming is ``n * (1. - sum(limits))``. 

207 The value of one limit can be set to None to indicate an open interval. 

208 

209 Defaults to (0.2, 0.2). 

210 inclusive : (2,) tuple of boolean, optional 

211 If relative==False, tuple indicating whether values exactly equal to 

212 the absolute limits are allowed. 

213 If relative==True, tuple indicating whether the number of data being 

214 masked on each side should be rounded (True) or truncated (False). 

215 

216 Defaults to (True, True). 

217 alpha : float, optional 

218 Confidence level of the intervals. 

219 

220 Defaults to 0.05. 

221 axis : int, optional 

222 Axis along which to cut. If None, uses a flattened version of `data`. 

223 

224 Defaults to None. 

225 

226 Returns 

227 ------- 

228 trimmed_mean_ci : (2,) ndarray 

229 The lower and upper confidence intervals of the trimmed data. 

230 

231 """ 

232 data = ma.array(data, copy=False) 

233 trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis) 

234 tmean = trimmed.mean(axis) 

235 tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis) 

236 df = trimmed.count(axis) - 1 

237 tppf = t.ppf(1-alpha/2.,df) 

238 return np.array((tmean - tppf*tstde, tmean+tppf*tstde)) 

239 

240 

241def mjci(data, prob=[0.25,0.5,0.75], axis=None): 

242 """ 

243 Returns the Maritz-Jarrett estimators of the standard error of selected 

244 experimental quantiles of the data. 

245 

246 Parameters 

247 ---------- 

248 data : ndarray 

249 Data array. 

250 prob : sequence, optional 

251 Sequence of quantiles to compute. 

252 axis : int or None, optional 

253 Axis along which to compute the quantiles. If None, use a flattened 

254 array. 

255 

256 """ 

257 def _mjci_1D(data, p): 

258 data = np.sort(data.compressed()) 

259 n = data.size 

260 prob = (np.array(p) * n + 0.5).astype(int_) 

261 betacdf = beta.cdf 

262 

263 mj = np.empty(len(prob), float_) 

264 x = np.arange(1,n+1, dtype=float_) / n 

265 y = x - 1./n 

266 for (i,m) in enumerate(prob): 

267 W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m) 

268 C1 = np.dot(W,data) 

269 C2 = np.dot(W,data**2) 

270 mj[i] = np.sqrt(C2 - C1**2) 

271 return mj 

272 

273 data = ma.array(data, copy=False) 

274 if data.ndim > 2: 

275 raise ValueError("Array 'data' must be at most two dimensional, " 

276 "but got data.ndim = %d" % data.ndim) 

277 

278 p = np.array(prob, copy=False, ndmin=1) 

279 # Computes quantiles along axis (or globally) 

280 if (axis is None): 

281 return _mjci_1D(data, p) 

282 else: 

283 return ma.apply_along_axis(_mjci_1D, axis, data, p) 

284 

285 

286def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None): 

287 """ 

288 Computes the alpha confidence interval for the selected quantiles of the 

289 data, with Maritz-Jarrett estimators. 

290 

291 Parameters 

292 ---------- 

293 data : ndarray 

294 Data array. 

295 prob : sequence, optional 

296 Sequence of quantiles to compute. 

297 alpha : float, optional 

298 Confidence level of the intervals. 

299 axis : int or None, optional 

300 Axis along which to compute the quantiles. 

301 If None, use a flattened array. 

302 

303 Returns 

304 ------- 

305 ci_lower : ndarray 

306 The lower boundaries of the confidence interval. Of the same length as 

307 `prob`. 

308 ci_upper : ndarray 

309 The upper boundaries of the confidence interval. Of the same length as 

310 `prob`. 

311 

312 """ 

313 alpha = min(alpha, 1 - alpha) 

314 z = norm.ppf(1 - alpha/2.) 

315 xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) 

316 smj = mjci(data, prob, axis=axis) 

317 return (xq - z * smj, xq + z * smj) 

318 

319 

320def median_cihs(data, alpha=0.05, axis=None): 

321 """ 

322 Computes the alpha-level confidence interval for the median of the data. 

323 

324 Uses the Hettmasperger-Sheather method. 

325 

326 Parameters 

327 ---------- 

328 data : array_like 

329 Input data. Masked values are discarded. The input should be 1D only, 

330 or `axis` should be set to None. 

331 alpha : float, optional 

332 Confidence level of the intervals. 

333 axis : int or None, optional 

334 Axis along which to compute the quantiles. If None, use a flattened 

335 array. 

336 

337 Returns 

338 ------- 

339 median_cihs 

340 Alpha level confidence interval. 

341 

342 """ 

343 def _cihs_1D(data, alpha): 

344 data = np.sort(data.compressed()) 

345 n = len(data) 

346 alpha = min(alpha, 1-alpha) 

347 k = int(binom._ppf(alpha/2., n, 0.5)) 

348 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) 

349 if gk < 1-alpha: 

350 k -= 1 

351 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) 

352 gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5) 

353 I = (gk - 1 + alpha)/(gk - gkk) 

354 lambd = (n-k) * I / float(k + (n-2*k)*I) 

355 lims = (lambd*data[k] + (1-lambd)*data[k-1], 

356 lambd*data[n-k-1] + (1-lambd)*data[n-k]) 

357 return lims 

358 data = ma.array(data, copy=False) 

359 # Computes quantiles along axis (or globally) 

360 if (axis is None): 

361 result = _cihs_1D(data, alpha) 

362 else: 

363 if data.ndim > 2: 

364 raise ValueError("Array 'data' must be at most two dimensional, " 

365 "but got data.ndim = %d" % data.ndim) 

366 result = ma.apply_along_axis(_cihs_1D, axis, data, alpha) 

367 

368 return result 

369 

370 

371def compare_medians_ms(group_1, group_2, axis=None): 

372 """ 

373 Compares the medians from two independent groups along the given axis. 

374 

375 The comparison is performed using the McKean-Schrader estimate of the 

376 standard error of the medians. 

377 

378 Parameters 

379 ---------- 

380 group_1 : array_like 

381 First dataset. Has to be of size >=7. 

382 group_2 : array_like 

383 Second dataset. Has to be of size >=7. 

384 axis : int, optional 

385 Axis along which the medians are estimated. If None, the arrays are 

386 flattened. If `axis` is not None, then `group_1` and `group_2` 

387 should have the same shape. 

388 

389 Returns 

390 ------- 

391 compare_medians_ms : {float, ndarray} 

392 If `axis` is None, then returns a float, otherwise returns a 1-D 

393 ndarray of floats with a length equal to the length of `group_1` 

394 along `axis`. 

395 

396 """ 

397 (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis)) 

398 (std_1, std_2) = (mstats.stde_median(group_1, axis=axis), 

399 mstats.stde_median(group_2, axis=axis)) 

400 W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2) 

401 return 1 - norm.cdf(W) 

402 

403 

404def idealfourths(data, axis=None): 

405 """ 

406 Returns an estimate of the lower and upper quartiles. 

407 

408 Uses the ideal fourths algorithm. 

409 

410 Parameters 

411 ---------- 

412 data : array_like 

413 Input array. 

414 axis : int, optional 

415 Axis along which the quartiles are estimated. If None, the arrays are 

416 flattened. 

417 

418 Returns 

419 ------- 

420 idealfourths : {list of floats, masked array} 

421 Returns the two internal values that divide `data` into four parts 

422 using the ideal fourths algorithm either along the flattened array 

423 (if `axis` is None) or along `axis` of `data`. 

424 

425 """ 

426 def _idf(data): 

427 x = data.compressed() 

428 n = len(x) 

429 if n < 3: 

430 return [np.nan,np.nan] 

431 (j,h) = divmod(n/4. + 5/12.,1) 

432 j = int(j) 

433 qlo = (1-h)*x[j-1] + h*x[j] 

434 k = n - j 

435 qup = (1-h)*x[k] + h*x[k-1] 

436 return [qlo, qup] 

437 data = ma.sort(data, axis=axis).view(MaskedArray) 

438 if (axis is None): 

439 return _idf(data) 

440 else: 

441 return ma.apply_along_axis(_idf, axis, data) 

442 

443 

444def rsh(data, points=None): 

445 """ 

446 Evaluates Rosenblatt's shifted histogram estimators for each data point. 

447 

448 Rosenblatt's estimator is a centered finite-difference approximation to the 

449 derivative of the empirical cumulative distribution function. 

450 

451 Parameters 

452 ---------- 

453 data : sequence 

454 Input data, should be 1-D. Masked values are ignored. 

455 points : sequence or None, optional 

456 Sequence of points where to evaluate Rosenblatt shifted histogram. 

457 If None, use the data. 

458 

459 """ 

460 data = ma.array(data, copy=False) 

461 if points is None: 

462 points = data 

463 else: 

464 points = np.array(points, copy=False, ndmin=1) 

465 

466 if data.ndim != 1: 

467 raise AttributeError("The input array should be 1D only !") 

468 

469 n = data.count() 

470 r = idealfourths(data, axis=None) 

471 h = 1.2 * (r[-1]-r[0]) / n**(1./5) 

472 nhi = (data[:,None] <= points[None,:] + h).sum(0) 

473 nlo = (data[:,None] < points[None,:] - h).sum(0) 

474 return (nhi-nlo) / (2.*n*h)