'''Multiple Testing and P-Value Correction

Author: Josef Perktold
License: BSD-3
'''

from collections import OrderedDict

import numpy as np

from statsmodels.stats._knockoff import RegressionFDR

__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr',
           'multipletests', 'NullDistribution', 'RegressionFDR']

# ==============================================
#
# Part 1: Multiple Tests and P-Value Correction
#
# ==============================================


def _ecdf(x):
    '''no frills empirical cdf used in fdrcorrection
    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)
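
# For three sorted p-values, for example, _ecdf returns
# array([1/3., 2/3., 1.]) - the i/nobs factors that the step-up FDR
# procedures below compare against alpha.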

multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar',
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs'],
               ]

multitest_alias = OrderedDict()
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]
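
# Each alias maps to its canonical key, e.g. multitest_alias['bonf'] == 'b'
# and multitest_alias['fdr_i'] == 'fdr_bh', so `multipletests` accepts any
# spelling listed in `_alias_list`.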


def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False,
                  returnsorted=False):
67 """ 

68 Test results and p-value correction for multiple tests 

69 

70 Parameters 

71 ---------- 

72 pvals : array_like, 1-d 

73 uncorrected p-values. Must be 1-dimensional. 

74 alpha : float 

75 FWER, family-wise error rate, e.g. 0.1 

76 method : str 

77 Method used for testing and adjustment of pvalues. Can be either the 

78 full name or initial letters. Available methods are: 

79 

80 - `bonferroni` : one-step correction 

81 - `sidak` : one-step correction 

82 - `holm-sidak` : step down method using Sidak adjustments 

83 - `holm` : step-down method using Bonferroni adjustments 

84 - `simes-hochberg` : step-up method (independent) 

85 - `hommel` : closed method based on Simes tests (non-negative) 

86 - `fdr_bh` : Benjamini/Hochberg (non-negative) 

87 - `fdr_by` : Benjamini/Yekutieli (negative) 

88 - `fdr_tsbh` : two stage fdr correction (non-negative) 

89 - `fdr_tsbky` : two stage fdr correction (non-negative) 

90 

91 is_sorted : bool 

92 If False (default), the p_values will be sorted, but the corrected 

93 pvalues are in the original order. If True, then it assumed that the 

94 pvalues are already sorted in ascending order. 

95 returnsorted : bool 

96 not tested, return sorted p-values instead of original sequence 

97 

98 Returns 

99 ------- 

100 reject : ndarray, boolean 

101 true for hypothesis that can be rejected for given alpha 

102 pvals_corrected : ndarray 

103 p-values corrected for multiple tests 

104 alphacSidak: float 

105 corrected alpha for Sidak method 

106 alphacBonf: float 

107 corrected alpha for Bonferroni method 

108 

109 Notes 

110 ----- 

111 There may be API changes for this function in the future. 

112 

113 Except for 'fdr_twostage', the p-value correction is independent of the 

114 alpha specified as argument. In these cases the corrected p-values 

115 can also be compared with a different alpha. In the case of 'fdr_twostage', 

116 the corrected p-values are specific to the given alpha, see 

117 ``fdrcorrection_twostage``. 

118 

119 The 'fdr_gbs' procedure is not verified against another package, p-values 

120 are derived from scratch and are not derived in the reference. In Monte 

121 Carlo experiments the method worked correctly and maintained the false 

122 discovery rate. 

123 

124 All procedures that are included, control FWER or FDR in the independent 

125 case, and most are robust in the positively correlated case. 

126 

127 `fdr_gbs`: high power, fdr control for independent case and only small 

128 violation in positively correlated case 

129 

130 **Timing**: 

131 

132 Most of the time with large arrays is spent in `argsort`. When 

133 we want to calculate the p-value for several methods, then it is more 

134 efficient to presort the pvalues, and put the results back into the 

135 original order outside of the function. 

136 

137 Method='hommel' is very slow for large arrays, since it requires the 

138 evaluation of n partitions, where n is the number of p-values. 
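
    Examples
    --------
    A minimal usage sketch: correct a handful of p-values with the default
    Holm-Sidak method and again with Benjamini/Hochberg FDR.

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import multipletests
    >>> pvals = np.array([0.001, 0.008, 0.039, 0.041])
    >>> reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
    ...     pvals, alpha=0.05, method='hs')
    >>> reject_bh, pvals_bh, _, _ = multipletests(pvals, alpha=0.05,
    ...                                           method='fdr_bh')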

    """
    import gc
    pvals = np.asarray(pvals)
    alphaf = alpha  # family-wise alpha

    if not is_sorted:
        sortind = np.argsort(pvals)
        pvals = np.take(pvals, sortind)

    ntests = len(pvals)
    alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests)
    alphacBonf = alphaf / float(ntests)
    if method.lower() in ['b', 'bonf', 'bonferroni']:
        reject = pvals <= alphacBonf
        pvals_corrected = pvals * float(ntests)

    elif method.lower() in ['s', 'sidak']:
        reject = pvals <= alphacSidak
        pvals_corrected = 1 - np.power((1. - pvals), ntests)

    elif method.lower() in ['hs', 'holm-sidak']:
        alphacSidak_all = 1 - np.power((1. - alphaf),
                                       1. / np.arange(ntests, 0, -1))
        notreject = pvals > alphacSidak_all
        del alphacSidak_all

        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        del notreject

        pvals_corrected_raw = 1 - np.power((1. - pvals),
                                           np.arange(ntests, 0, -1))
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw

    elif method.lower() in ['h', 'holm']:
        notreject = pvals > alphaf / np.arange(ntests, 0, -1)
        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw
        gc.collect()

    elif method.lower() in ['sh', 'simes-hochberg']:
        alphash = alphaf / np.arange(ntests, 0, -1)
        reject = pvals <= alphash
        rejind = np.nonzero(reject)
        if rejind[0].size > 0:
            rejectmax = np.max(np.nonzero(reject))
            reject[:rejectmax] = True
        pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals
        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw

    elif method.lower() in ['ho', 'hommel']:
        # we need a copy because we overwrite it in a loop
        a = pvals.copy()
        for m in range(ntests, 1, -1):
            cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.))
            a[-m:] = np.maximum(a[-m:], cim)
            a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim))
        pvals_corrected = a
        reject = a <= alphaf

    elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='indep',
                                                is_sorted=True)
    elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='n',
                                                is_sorted=True)
    elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bky',
                                                         is_sorted=True)[:2]
    elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bh',
                                                         is_sorted=True)[:2]

    elif method.lower() in ['fdr_gbs']:
        # adaptive step-down in Gavrilov, Benjamini, Sarkar,
        # Annals of Statistics 2009
        ii = np.arange(1, ntests + 1)
        q = (ntests + 1. - ii) / ii * pvals / (1. - pvals)
        # running maximum enforces the step-up monotonicity requirement
        pvals_corrected_raw = np.maximum.accumulate(q)

        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw
        reject = pvals_corrected <= alpha

    else:
        raise ValueError('method not recognized')

    if pvals_corrected is not None:  # not necessary anymore
        pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf

def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''p-value correction for false discovery rate

    This covers Benjamini/Hochberg for independent or positively correlated
    tests and Benjamini/Yekutieli for general or negatively correlated tests.
    Both are available in the function `multipletests`, as method=`fdr_bh`,
    resp. `fdr_by`.

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'indep', 'negcorr'}

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true hypotheses, then
    alpha should be set to alpha * m/m_0 where m is the number of tests,
    given by the p-values, and m_0 is an estimate of the number of true
    hypotheses. (see Benjamini, Krieger and Yekutieli)

    The two-stage method of Benjamini, Krieger and Yekutieli that estimates
    the number of false hypotheses is available as
    ``fdrcorrection_twostage``.

    Method names can be abbreviated to the first letter, 'i' or 'p' for
    fdr_bh and 'n' for fdr_by.
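
    Examples
    --------
    A minimal sketch of direct use; ``multipletests`` with
    ``method='fdr_bh'`` or ``method='fdr_by'`` is the usual entry point.

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import fdrcorrection
    >>> pvals = np.array([0.001, 0.008, 0.039, 0.041])
    >>> rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05,
    ...                                           method='indep')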

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        # harmonic-number constant c(m) of Benjamini/Yekutieli
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected

def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of the
    number of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two stage method of Benjamini and Hochberg

    iter : bool
        If True, iterate the two-stage procedure until the number of
        rejections no longer changes; if False (default), perform a single
        second stage.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR
    m0 : int
        ntest - rej, estimated number of true hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha; they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (``fdrcorrection`` with method='indep')
    corrected for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method (with
    iter=False) was the most robust to the presence of positive correlation.

    TODO: What should be returned?
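
    Examples
    --------
    A minimal sketch; unlike the one-stage corrections, the corrected
    p-values returned here are tied to the ``alpha`` that was passed in.

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import fdrcorrection_twostage
    >>> pvals = np.array([0.001, 0.008, 0.039, 0.041])
    >>> rej, pvals_corr, m0, alpha_stages = fdrcorrection_twostage(
    ...     pvals, alpha=0.05, method='bky')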

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        # no second stage needed: nothing or everything was rejected
        pvalscorr = pvalscorr * fact
        ri = r1
    else:
        ri_old = r1
        while True:
            ntests0 = 1.0 * ntests - ri_old
            alpha_star = alpha_prime * ntests / ntests0
            alpha_stages.append(alpha_star)
            rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star,
                                           method='indep', is_sorted=True)
            ri = rej.sum()
            if (not iter) or ri == ri_old:
                break
            elif ri < ri_old:
                # prevent cycles and endless loops
                raise RuntimeError("rejection count decreased between stages")
            ri_old = ri

        # adjust pvalscorr to reflect the estimated number of non-null
        # cases; the decision is then pvalscorr <= alpha
        pvalscorr *= ntests0 * 1.0 / ntests
        if method == 'bky':
            pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages

def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30):
456 """ 

457 Calculate local FDR values for a list of Z-scores. 

458 

459 Parameters 

460 ---------- 

461 zscores : array_like 

462 A vector of Z-scores 

463 null_proportion : float 

464 The assumed proportion of true null hypotheses 

465 null_pdf : function mapping reals to positive reals 

466 The density of null Z-scores; if None, use standard normal 

467 deg : int 

468 The maximum exponent in the polynomial expansion of the 

469 density of non-null Z-scores 

470 nbins : int 

471 The number of bins for estimating the marginal density 

472 of Z-scores. 

473 

474 Returns 

475 ------- 

476 fdr : array_like 

477 A vector of FDR values 

478 

479 References 

480 ---------- 

481 B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups 

482 Model. Statistical Science 23:1, 1-22. 

483 

484 Examples 

485 -------- 

486 Basic use (the null Z-scores are taken to be standard normal): 

487 

488 >>> from statsmodels.stats.multitest import local_fdr 

489 >>> import numpy as np 

490 >>> zscores = np.random.randn(30) 

491 >>> fdr = local_fdr(zscores) 

492 

493 Use a Gaussian null distribution estimated from the data: 

494 

495 >>> null = EmpiricalNull(zscores) 

496 >>> fdr = local_fdr(zscores, null_pdf=null.pdf) 

497 """ 


    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat,
             family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr

class NullDistribution(object):
545 """ 

546 Estimate a Gaussian distribution for the null Z-scores. 

547 

548 The observed Z-scores consist of both null and non-null values. 

549 The fitted distribution of null Z-scores is Gaussian, but may have 

550 non-zero mean and/or non-unit scale. 

551 

552 Parameters 

553 ---------- 

554 zscores : array_like 

555 The observed Z-scores. 

556 null_lb : float 

557 Z-scores between `null_lb` and `null_ub` are all considered to be 

558 true null hypotheses. 

559 null_ub : float 

560 See `null_lb`. 

561 estimate_mean : bool 

562 If True, estimate the mean of the distribution. If False, the 

563 mean is fixed at zero. 

564 estimate_scale : bool 

565 If True, estimate the scale of the distribution. If False, the 

566 scale parameter is fixed at 1. 

567 estimate_null_proportion : bool 

568 If True, estimate the proportion of true null hypotheses (i.e. 

569 the proportion of z-scores with expected value zero). If False, 

570 this parameter is fixed at 1. 

571 

572 Attributes 

573 ---------- 

574 mean : float 

575 The estimated mean of the empirical null distribution 

576 sd : float 

577 The estimated standard deviation of the empirical null distribution 

578 null_proportion : float 

579 The estimated proportion of true null hypotheses among all hypotheses 

580 

581 References 

582 ---------- 

583 B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups 

584 Model. Statistical Science 23:1, 1-22. 

585 

586 Notes 

587 ----- 

588 See also: 

589 

590 http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr 
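
    Examples
    --------
    A minimal sketch: fit the empirical null on simulated Z-scores and
    feed its density into ``local_fdr``.

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import NullDistribution, local_fdr
    >>> zscores = np.random.randn(500)
    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)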

    """

    def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True,
                 estimate_scale=True, estimate_null_proportion=False):

        # Extract the null Z-scores
        ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub))
        if len(ii) == 0:
            raise RuntimeError("No Z-scores fall between null_lb and null_ub")
        zscores0 = zscores[ii]

        # Number of Z-scores, and null Z-scores
        n_zs, n_zs0 = len(zscores), len(zscores0)

        # Unpack and transform the parameters to the natural scale, hold
        # parameters fixed as specified.
        def xform(params):

            mean = 0.
            sd = 1.
            prob = 1.

            ii = 0
            if estimate_mean:
                mean = params[ii]
                ii += 1
            if estimate_scale:
                sd = np.exp(params[ii])
                ii += 1
            if estimate_null_proportion:
                prob = 1 / (1 + np.exp(-params[ii]))

            return mean, sd, prob

        from scipy.stats.distributions import norm

        def fun(params):
            """
            Negative log-likelihood of z-scores.

            The parameters are packed into a vector:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) -
                            norm.cdf((null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval

        # Estimate the parameters
        from scipy.optimize import minimize
        # starting values are mean = 0, scale = 1, p0 ~ 1
        mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead")
        mean, sd, prob = xform(mz['x'])

        self.mean = mean
        self.sd = sd
        self.null_proportion = prob

    # The fitted null density function
    def pdf(self, zscores):
        """
        Evaluates the fitted empirical null Z-score density.

        Parameters
        ----------
        zscores : scalar or array_like
            The point or points at which the density is to be
            evaluated.

        Returns
        -------
        The empirical null Z-score density evaluated at the given
        points.
        """

        zval = (zscores - self.mean) / self.sd
        return np.exp(-0.5*zval**2 - np.log(self.sd) - 0.5*np.log(2*np.pi))