1""" 

2 

3Accelerated Failure Time (AFT) Model with empirical likelihood inference. 

4 

5AFT regression analysis is applicable when the researcher has access 

6to a randomly right censored dependent variable, a matrix of exogenous 

7variables and an indicatior variable (delta) that takes a value of 0 if the 

8observation is censored and 1 otherwise. 

9 

10AFT References 

11-------------- 

12 

13Stute, W. (1993). "Consistent Estimation Under Random Censorship when 

14Covariables are Present." Journal of Multivariate Analysis. 

15Vol. 45. Iss. 1. 89-103 

16 

17EL and AFT References 

18--------------------- 

19 

20Zhou, Kim And Bathke. "Empirical Likelihood Analysis for the Heteroskedastic 

21Accelerated Failure Time Model." Manuscript: 

22URL: www.ms.uky.edu/~mai/research/CasewiseEL20080724.pdf 

23 

24Zhou, M. (2005). Empirical Likelihood Ratio with Arbitrarily Censored/ 

25Truncated Data by EM Algorithm. Journal of Computational and Graphical 

26Statistics. 14:3, 643-656. 

27 

28 

29""" 

import warnings

import numpy as np
from scipy import optimize
from scipy.stats import chi2

from statsmodels.regression.linear_model import OLS, WLS
from statsmodels.tools import add_constant
from statsmodels.tools.sm_exceptions import IterationLimitWarning

from .descriptive import _OptFuncts

class OptAFT(_OptFuncts):
    """
    Provides optimization functions used in estimating and conducting
    inference in an AFT model.

    Methods
    -------
    _opt_wtd_nuis_regress:
        Function optimized over nuisance parameters to compute
        the profile likelihood.

    _EM_test:
        Uses the modified EM algorithm of Zhou (2005) to maximize the
        likelihood of a parameter vector.
    """
    def __init__(self):
        pass

    def _opt_wtd_nuis_regress(self, test_vals):
        """
        A function that is optimized over nuisance parameters to conduct a
        hypothesis test for the parameters of interest.

        Parameters
        ----------
        test_vals : 1d array
            The regression coefficients of the model.  This includes both
            the nuisance parameters and the parameters of interest.

        Returns
        -------
        llr : float
            -2 times the log-likelihood of the nuisance parameters and the
            hypothesized value of the parameter(s) of interest.
        """
        test_params = test_vals.reshape(self.model.nvar, 1)
        est_vect = self.model.uncens_exog * (self.model.uncens_endog -
                                             np.dot(self.model.uncens_exog,
                                                    test_params))
        eta_star = self._modif_newton(np.zeros(self.model.nvar), est_vect,
                                      self.model._fit_weights)
        denom = np.sum(self.model._fit_weights) + np.dot(eta_star, est_vect.T)
        self.new_weights = self.model._fit_weights / denom
        return -1 * np.sum(np.log(self.new_weights))

    def _EM_test(self, nuisance_params, params=None, param_nums=None,
                 b0_vals=None, F=None, survidx=None, uncens_nobs=None,
                 numcensbelow=None, km=None, uncensored=None, censored=None,
                 maxiter=None, ftol=None):
        """
        Uses the EM algorithm to compute the maximum likelihood of a test.

        Parameters
        ----------
        nuisance_params : 1d array
            Vector of values to be used as nuisance parameters.

        maxiter : int
            Maximum number of iterations in the EM algorithm for a
            parameter vector.

        Returns
        -------
        -2 times the log-likelihood ratio at the hypothesized values and
        nuisance parameters.

        Notes
        -----
        The optional parameters are provided by the test_beta function.
        """
        iters = 0
        params[param_nums] = b0_vals

        nuis_param_index = np.int_(np.delete(np.arange(self.model.nvar),
                                             param_nums))
        params[nuis_param_index] = nuisance_params
        to_test = params.reshape(self.model.nvar, 1)
        opt_res = np.inf
        diff = np.inf
        while iters < maxiter and diff > ftol:
            F = F.flatten()
            death = np.cumsum(F[::-1])
            survivalprob = death[::-1]
            surv_point_mat = np.dot(F.reshape(-1, 1),
                                    1. / survivalprob[survidx].reshape(1, -1))
            surv_point_mat = add_constant(surv_point_mat)
            summed_wts = np.cumsum(surv_point_mat, axis=1)
            wts = summed_wts[np.int_(np.arange(uncens_nobs)),
                             numcensbelow[uncensored]]
            # ^ E step
            # See Zhou (2005), Section 3.
            self.model._fit_weights = wts
            new_opt_res = self._opt_wtd_nuis_regress(to_test)
            # ^ Uncensored weights' contribution to the likelihood value.
            F = self.new_weights
            # ^ M step
            diff = np.abs(new_opt_res - opt_res)
            opt_res = new_opt_res
            iters = iters + 1
        death = np.cumsum(F.flatten()[::-1])
        survivalprob = death[::-1]
        llike = -opt_res + np.sum(np.log(survivalprob[survidx]))
        wtd_km = km.flatten() / np.sum(km)
        survivalmax = np.cumsum(wtd_km[::-1])[::-1]
        llikemax = np.sum(np.log(wtd_km[uncensored])) + \
            np.sum(np.log(survivalmax[censored]))
        if iters == maxiter:
            warnings.warn('The EM reached the maximum number of iterations',
                          IterationLimitWarning)
        return -2 * (llike - llikemax)

    def _ci_limits_beta(self, b0, param_num=None):
        """
        Returns the difference between the log-likelihood for a
        parameter and some critical value.

        Parameters
        ----------
        b0 : float
            Value of a regression parameter

        param_num : int
            Parameter index of b0
        """
        return self.test_beta([b0], [param_num])[0] - self.r0


class emplikeAFT(object):
    """
    Class for estimating and conducting inference in an AFT model.

    Parameters
    ----------
    endog: nx1 array
        Response variables that are subject to random censoring

    exog: nxk array
        Matrix of covariates

    censors: nx1 array
        Array with entries 0 or 1.  0 indicates a response that was
        censored.

    Attributes
    ----------
    nobs: float
        Number of observations

    endog: ndarray
        Endog array

    exog: ndarray
        Exogenous variable matrix

    censors: ndarray
        Censors array, with max(endog) set to uncensored

    nvar: float
        Number of exogenous variables

    uncens_nobs: float
        Number of uncensored observations

    uncens_endog: ndarray
        Uncensored response variables

    uncens_exog: ndarray
        Exogenous variables of the uncensored observations

    Methods
    -------
    fit:
        Fits the model and returns a results instance that provides
        estimation (params) and inference (test_beta, ci_beta).

    Notes
    -----
    The data is immediately sorted in order of increasing endogenous
    variables.

    The last observation is assumed to be uncensored, which makes
    estimation and inference possible.
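
    Examples
    --------
    A minimal usage sketch (illustrative), mirroring the heart-dataset
    examples in test_beta:

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> data = sm.datasets.heart.load(as_pandas=False)
    >>> model = sm.emplike.emplikeAFT(np.log10(data.endog),
    ...                               sm.add_constant(data.exog),
    ...                               data.censors)
    >>> res = model.fit()  # returns an AFTResults instance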

232 """ 

233 def __init__(self, endog, exog, censors): 

234 self.nobs = np.shape(exog)[0] 

235 self.endog = endog.reshape(self.nobs, 1) 

236 self.exog = exog.reshape(self.nobs, -1) 

237 self.censors = censors.reshape(self.nobs, 1) 

238 self.nvar = self.exog.shape[1] 

239 idx = np.lexsort((-self.censors[:, 0], self.endog[:, 0])) 

240 self.endog = self.endog[idx] 

241 self.exog = self.exog[idx] 

242 self.censors = self.censors[idx] 

243 self.censors[-1] = 1 # Sort in init, not in function 

244 self.uncens_nobs = int(np.sum(self.censors)) 

245 mask = self.censors.ravel().astype(bool) 

246 self.uncens_endog = self.endog[mask, :].reshape(-1, 1) 

247 self.uncens_exog = self.exog[mask, :] 

248 

249 

    def _is_tied(self, endog, censors):
        """
        Indicates if an observation takes the same value as the next
        ordered observation.

        Parameters
        ----------
        endog: ndarray
            Model's endogenous variable
        censors: ndarray
            Array indicating censoring

        Returns
        -------
        indic_ties: ndarray
            ties[i] = 1 if endog[i] == endog[i+1] and
            censors[i] == censors[i+1]
        """
        nobs = int(self.nobs)
        endog_idx = endog[np.arange(nobs - 1)] == (
            endog[np.arange(nobs - 1) + 1])
        censors_idx = censors[np.arange(nobs - 1)] == (
            censors[np.arange(nobs - 1) + 1])
        indic_ties = endog_idx * censors_idx  # Both true
        return np.int_(indic_ties)

    def _km_w_ties(self, tie_indic, untied_km):
        """
        Computes the Kaplan-Meier estimate at each observation, taking into
        account ties in the data.

        Parameters
        ----------
        tie_indic: 1d array
            Indicates if the ith observation is the same as the (i+1)th
        untied_km: 1d array
            Kaplan-Meier estimates at each observation assuming no ties.
        """
        # TODO: Vectorize, even though it is only 1 pass through for any
        # function call
        num_same = 1
        idx_nums = []
        for obs_num in np.arange(int(self.nobs - 1))[::-1]:
            if tie_indic[obs_num] == 1:
                idx_nums.append(obs_num)
                num_same = num_same + 1
                untied_km[obs_num] = untied_km[obs_num + 1]
            elif tie_indic[obs_num] == 0 and num_same > 1:
                idx_nums.append(max(idx_nums) + 1)
                idx_nums = np.asarray(idx_nums)
                untied_km[idx_nums] = untied_km[idx_nums]
                num_same = 1
                idx_nums = []
        return untied_km.reshape(self.nobs, 1)

    def _make_km(self, endog, censors):
        """
        Computes the Kaplan-Meier estimate for the weights in the AFT model.

        Parameters
        ----------
        endog: nx1 array
            Array of response variables
        censors: nx1 array
            Censor-indicating variable

        Returns
        -------
        Kaplan-Meier estimate for each observation

        Notes
        -----
        This function makes calls to _is_tied and _km_w_ties to handle ties
        in the data.  If a censored observation and an uncensored observation
        have the same value, it is assumed that the uncensored one occurred
        first.
        """
        nobs = self.nobs
        num = (nobs - (np.arange(nobs) + 1.))
        denom = (nobs - (np.arange(nobs) + 1.) + 1.)
        km = (num / denom).reshape(nobs, 1)
        km = km ** np.abs(censors - 1.)
        km = np.cumprod(km)  # If no ties, this is the Kaplan-Meier estimate
        tied = self._is_tied(endog, censors)
        wtd_km = self._km_w_ties(tied, km)
        return (censors / wtd_km).reshape(nobs, 1)

    def fit(self):
        """
        Fits an AFT model and returns a results instance.

        Parameters
        ----------
        None

        Returns
        -------
        Results instance.

        Notes
        -----
        To avoid dividing by zero, max(endog) is assumed to be uncensored.
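
        Examples
        --------
        Illustrative sketch, using the heart data as in test_beta:

        >>> import statsmodels.api as sm
        >>> import numpy as np
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> model = sm.emplike.emplikeAFT(np.log10(data.endog), data.exog,
        ...                               data.censors)
        >>> res = model.fit()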

355 """ 

356 return AFTResults(self) 

357 

    def predict(self, params, endog=None):
        """
        Returns the linear prediction np.dot(endog, params), where ``endog``
        defaults to the model's (sorted) endog array.
        """
        if endog is None:
            endog = self.endog
        return np.dot(endog, params)


class AFTResults(OptAFT):
    def __init__(self, model):
        self.model = model

    def params(self):
        """
        Fits an AFT model and returns the parameters.

        Parameters
        ----------
        None

        Returns
        -------
        Fitted params

        Notes
        -----
        To avoid dividing by zero, max(endog) is assumed to be uncensored.
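
        Examples
        --------
        Illustrative sketch (model construction as in test_beta):

        >>> res = model.fit()
        >>> params = res.params()  # WLS estimates with Kaplan-Meier weights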

385 """ 

386 self.model.modif_censors = np.copy(self.model.censors) 

387 self.model.modif_censors[-1] = 1 

388 wts = self.model._make_km(self.model.endog, self.model.modif_censors) 

389 res = WLS(self.model.endog, self.model.exog, wts).fit() 

390 params = res.params 

391 return params 

392 

    def test_beta(self, b0_vals, param_nums, ftol=10 ** -5, maxiter=30,
                  print_weights=1):
        """
        Returns the profile log-likelihood for the regression parameters
        'param_nums' at 'b0_vals'.

        Parameters
        ----------
        b0_vals: list
            The values of the parameters to be tested

        param_nums: list
            Which parameters are to be tested

        maxiter: int, optional
            How many iterations to use in the EM algorithm.  Default is 30

        ftol: float, optional
            The function tolerance for the EM optimization.
            Default is 10**-5

        print_weights: bool
            If true, returns the weights that maximize the profile
            log-likelihood.  Default is 1 (True)

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of the test.

        Notes
        -----
        The function will warn if the EM reaches maxiter.  However, when
        optimizing over nuisance parameters, it is possible to reach a
        maximum number of inner iterations for a specific value of the
        nuisance parameters while the results of the function are still
        valid.  This usually occurs when the optimization over the nuisance
        parameters selects parameter values that yield a log-likelihood
        ratio close to infinity.

        Examples
        --------
        >>> import statsmodels.api as sm
        >>> import numpy as np

        # Test that the parameter is 0 in a one-regressor,
        # no-intercept model
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, x, cens)
        >>> res = model.test_beta([0], [0])
        >>> res
        (1.4657739632606308, 0.22601365256959183)

        # Test that the slope is 0 in a model with an intercept
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, sm.add_constant(x), cens)
        >>> res = model.test_beta([0], [1])
        >>> res
        (4.623487775078047, 0.031537049752572731)
        """
        censors = self.model.censors
        endog = self.model.endog
        exog = self.model.exog
        uncensored = (censors == 1).flatten()
        censored = (censors == 0).flatten()
        uncens_endog = endog[uncensored]
        uncens_exog = exog[uncensored, :]
        reg_model = OLS(uncens_endog, uncens_exog).fit()
        llr, pval, new_weights = reg_model.el_test(
            b0_vals, param_nums, return_weights=True)  # Needs to be changed
        km = self.model._make_km(endog, censors).flatten()  # when merged
        uncens_nobs = self.model.uncens_nobs
        F = np.asarray(new_weights).reshape(uncens_nobs)
        # ^ Step 0 of the EM algorithm: initial weights
        params = self.params()
        survidx = np.where(censors == 0)
        survidx = survidx[0] - np.arange(len(survidx[0]))
        numcensbelow = np.int_(np.cumsum(1 - censors))
        if len(param_nums) == len(params):
            llr = self._EM_test([], F=F, params=params,
                                param_nums=param_nums,
                                b0_vals=b0_vals, survidx=survidx,
                                uncens_nobs=uncens_nobs,
                                numcensbelow=numcensbelow, km=km,
                                uncensored=uncensored, censored=censored,
                                ftol=ftol, maxiter=25)
            return llr, chi2.sf(llr, self.model.nvar)
        else:
            x0 = np.delete(params, param_nums)
            try:
                res = optimize.fmin(self._EM_test, x0,
                                    (params, param_nums, b0_vals, F, survidx,
                                     uncens_nobs, numcensbelow, km,
                                     uncensored, censored, maxiter, ftol),
                                    full_output=1, disp=0)
                llr = res[1]
                return llr, chi2.sf(llr, len(param_nums))
            except np.linalg.LinAlgError:
                return np.inf, 0

    def ci_beta(self, param_num, beta_high, beta_low, sig=.05):
        """
        Returns the confidence interval for a regression
        parameter in the AFT model.

        Parameters
        ----------
        param_num: int
            Parameter number of interest

        beta_high: float
            Upper bound for the confidence interval

        beta_low: float
            Lower bound for the confidence interval

        sig: float, optional
            Significance level.  Default is .05

        Notes
        -----
        If the function returns the error 'f(a) and f(b) must have
        different signs', consider widening the search area by adjusting
        beta_low and beta_high.

        Also note that this process is computationally intensive.  There
        are 4 levels of optimization/solving.  From outer to inner:

        1) Solving so that llr - critical value = 0
        2) Maximizing over the nuisance parameters
        3) Using the EM algorithm at each value of the nuisance parameters
        4) Using the _modif_newton optimizer at each iteration
           of the EM algorithm.

        Also, for very unlikely nuisance parameters, it is possible for
        the EM algorithm not to converge.  This is not an indicator
        that the solver did not find the correct solution.  It just means
        that for a specific iteration of the nuisance parameters, the
        optimizer was unable to converge.

        If the user desires to verify the success of the optimization,
        it is recommended to test the limits using test_beta.
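
        Examples
        --------
        Illustrative sketch; the bracket (beta_low=-1, beta_high=1) is an
        assumed search interval and may need widening (see Notes):

        >>> import statsmodels.api as sm
        >>> import numpy as np
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> model = sm.emplike.emplikeAFT(np.log10(data.endog),
        ...                               sm.add_constant(data.exog),
        ...                               data.censors)
        >>> res = model.fit()
        >>> lower, upper = res.ci_beta(1, 1., -1.)  # 95% CI for the slope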

546 """ 

547 params = self.params() 

548 self.r0 = chi2.ppf(1 - sig, 1) 

549 ll = optimize.brentq(self._ci_limits_beta, beta_low, 

550 params[param_num], (param_num)) 

551 ul = optimize.brentq(self._ci_limits_beta, 

552 params[param_num], beta_high, (param_num)) 

553 return ll, ul