Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Implementation of proportional hazards regression models for duration 

3data that may be censored ("Cox models"). 

4 

5References 

6---------- 

7T Therneau (1996). Extending the Cox model. Technical report. 

8http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288 

9 

10G Rodriguez (2005). Non-parametric estimation in survival models. 

11http://data.princeton.edu/pop509/NonParametricSurvival.pdf 

12 

13B Gillespie (2006). Checking the assumptions in the Cox proportional 

14hazards model. 

15http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf 

16""" 

17import numpy as np 

18 

19from statsmodels.base import model 

20import statsmodels.base.model as base 

21from statsmodels.tools.decorators import cache_readonly 

22from statsmodels.compat.pandas import Appender 

23 

24 

25_predict_docstring = """ 

26 Returns predicted values from the proportional hazards 

27 regression model. 

28 

29 Parameters 

30 ----------%(params_doc)s 

31 exog : array_like 

32 Data to use as `exog` in forming predictions. If not 

33 provided, the `exog` values from the model used to fit the 

34 data are used.%(cov_params_doc)s 

35 endog : array_like 

36 Duration (time) values at which the predictions are made. 

37 Only used if pred_type is either 'cumhaz' or 'surv'. If 

38 using model `exog`, defaults to model `endog` (time), but 

39 may be provided explicitly to make predictions at 

40 alternative times. 

41 strata : array_like 

42 A vector of stratum values used to form the predictions. 

43 Not used (may be 'None') if pred_type is 'lhr' or 'hr'. 

44 If `exog` is None, the model stratum values are used. If 

45 `exog` is not None and pred_type is 'surv' or 'cumhaz', 

46 stratum values must be provided (unless there is only one 

47 stratum). 

48 offset : array_like 

49 Offset values used to create the predicted values. 

50 pred_type : str 

51 If 'lhr', returns log hazard ratios, if 'hr' returns 

52 hazard ratios, if 'surv' returns the survival function, if 

53 'cumhaz' returns the cumulative hazard function. 

54 

55 Returns 

56 ------- 

57 A bunch containing two fields: `predicted_values` and 

58 `standard_errors`. 

59 

60 Notes 

61 ----- 

62 Standard errors are only returned when predicting the log 

63 hazard ratio (pred_type is 'lhr'). 

64 

65 Types `surv` and `cumhaz` require estimation of the cumulative 

66 hazard function. 

67""" 

68 

69_predict_params_doc = """ 

70 params : array_like 

71 The proportional hazards model parameters.""" 

72 

73_predict_cov_params_docstring = """ 

74 cov_params : array_like 

75 The covariance matrix of the estimated `params` vector, 

76 used to obtain prediction errors if pred_type='lhr', 

77 otherwise optional.""" 

78 

79 

80 

81class PHSurvivalTime(object): 

82 

83 def __init__(self, time, status, exog, strata=None, entry=None, 

84 offset=None): 

85 """ 

86 Represent a collection of survival times with possible 

87 stratification and left truncation. 

88 

89 Parameters 

90 ---------- 

91 time : array_like 

92 The times at which either the event (failure) occurs or 

93 the observation is censored. 

94 status : array_like 

95 Indicates whether the event (failure) occurs at `time` 

96 (`status` is 1), or if `time` is a censoring time (`status` 

97 is 0). 

98 exog : array_like 

99 The exogeneous (covariate) data matrix, cases are rows and 

100 variables are columns. 

101 strata : array_like 

102 Grouping variable defining the strata. If None, all 

103 observations are in a single stratum. 

104 entry : array_like 

105 Entry (left truncation) times. The observation is not 

106 part of the risk set for times before the entry time. If 

107 None, the entry time is treated as being zero, which 

108 gives no left truncation. The entry time must be less 

109 than or equal to `time`. 

110 offset : array_like 

111 An optional array of offsets 

112 """ 

113 

114 # Default strata 

115 if strata is None: 

116 strata = np.zeros(len(time), dtype=np.int32) 

117 

118 # Default entry times 

119 if entry is None: 

120 entry = np.zeros(len(time)) 

121 

122 # Parameter validity checks. 

123 n1, n2, n3, n4 = len(time), len(status), len(strata),\ 

124 len(entry) 

125 nv = [n1, n2, n3, n4] 

126 if max(nv) != min(nv): 

127 raise ValueError("endog, status, strata, and " + 

128 "entry must all have the same length") 

129 if min(time) < 0: 

130 raise ValueError("endog must be non-negative") 

131 if min(entry) < 0: 

132 raise ValueError("entry time must be non-negative") 

133 

134 # In Stata, this is entry >= time, in R it is >. 

135 if np.any(entry > time): 

136 raise ValueError("entry times may not occur " + 

137 "after event or censoring times") 

138 

139 # Get the row indices for the cases in each stratum 

140 stu = np.unique(strata) 

141 #sth = {x: [] for x in stu} # needs >=2.7 

142 sth = dict([(x, []) for x in stu]) 

143 for i,k in enumerate(strata): 

144 sth[k].append(i) 

145 stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu] 

146 stratum_names = stu 

147 

148 # Remove strata with no events 

149 ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0] 

150 self.nstrat_orig = len(stratum_rows) 

151 stratum_rows = [stratum_rows[i] for i in ix] 

152 stratum_names = [stratum_names[i] for i in ix] 

153 

154 # The number of strata 

155 nstrat = len(stratum_rows) 

156 self.nstrat = nstrat 

157 

158 # Remove subjects whose entry time occurs after the last event 

159 # in their stratum. 

160 for stx,ix in enumerate(stratum_rows): 

161 last_failure = max(time[ix][status[ix] == 1]) 

162 

163 # Stata uses < here, R uses <= 

164 ii = [i for i,t in enumerate(entry[ix]) if 

165 t <= last_failure] 

166 stratum_rows[stx] = stratum_rows[stx][ii] 

167 

168 # Remove subjects who are censored before the first event in 

169 # their stratum. 

170 for stx,ix in enumerate(stratum_rows): 

171 first_failure = min(time[ix][status[ix] == 1]) 

172 

173 ii = [i for i,t in enumerate(time[ix]) if 

174 t >= first_failure] 

175 stratum_rows[stx] = stratum_rows[stx][ii] 

176 

177 # Order by time within each stratum 

178 for stx,ix in enumerate(stratum_rows): 

179 ii = np.argsort(time[ix]) 

180 stratum_rows[stx] = stratum_rows[stx][ii] 

181 

182 if offset is not None: 

183 self.offset_s = [] 

184 for stx in range(nstrat): 

185 self.offset_s.append(offset[stratum_rows[stx]]) 

186 else: 

187 self.offset_s = None 

188 

189 # Number of informative subjects 

190 self.n_obs = sum([len(ix) for ix in stratum_rows]) 

191 

192 # Split everything by stratum 

193 self.time_s = [] 

194 self.exog_s = [] 

195 self.status_s = [] 

196 self.entry_s = [] 

197 for ix in stratum_rows: 

198 self.time_s.append(time[ix]) 

199 self.exog_s.append(exog[ix,:]) 

200 self.status_s.append(status[ix]) 

201 self.entry_s.append(entry[ix]) 

202 

203 self.stratum_rows = stratum_rows 

204 self.stratum_names = stratum_names 

205 

206 # Precalculate some indices needed to fit Cox models. 

207 # Distinct failure times within a stratum are always taken to 

208 # be sorted in ascending order. 

209 # 

210 # ufailt_ix[stx][k] is a list of indices for subjects who fail 

211 # at the k^th sorted unique failure time in stratum stx 

212 # 

213 # risk_enter[stx][k] is a list of indices for subjects who 

214 # enter the risk set at the k^th sorted unique failure time in 

215 # stratum stx 

216 # 

217 # risk_exit[stx][k] is a list of indices for subjects who exit 

218 # the risk set at the k^th sorted unique failure time in 

219 # stratum stx 

220 self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\ 

221 [], [], [], [] 

222 

223 for stx in range(self.nstrat): 

224 

225 # All failure times 

226 ift = np.flatnonzero(self.status_s[stx] == 1) 

227 ft = self.time_s[stx][ift] 

228 

229 # Unique failure times 

230 uft = np.unique(ft) 

231 nuft = len(uft) 

232 

233 # Indices of cases that fail at each unique failure time 

234 #uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7 

235 uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6 

236 uft_ix = [[] for k in range(nuft)] 

237 for ix,ti in zip(ift,ft): 

238 uft_ix[uft_map[ti]].append(ix) 

239 

240 # Indices of cases (failed or censored) that enter the 

241 # risk set at each unique failure time. 

242 risk_enter1 = [[] for k in range(nuft)] 

243 for i,t in enumerate(self.time_s[stx]): 

244 ix = np.searchsorted(uft, t, "right") - 1 

245 if ix >= 0: 

246 risk_enter1[ix].append(i) 

247 

248 # Indices of cases (failed or censored) that exit the 

249 # risk set at each unique failure time. 

250 risk_exit1 = [[] for k in range(nuft)] 

251 for i,t in enumerate(self.entry_s[stx]): 

252 ix = np.searchsorted(uft, t) 

253 risk_exit1[ix].append(i) 

254 

255 self.ufailt.append(uft) 

256 self.ufailt_ix.append([np.asarray(x, dtype=np.int32) 

257 for x in uft_ix]) 

258 self.risk_enter.append([np.asarray(x, dtype=np.int32) 

259 for x in risk_enter1]) 

260 self.risk_exit.append([np.asarray(x, dtype=np.int32) 

261 for x in risk_exit1]) 

262 

263 

264class PHReg(model.LikelihoodModel): 

265 """ 

266 Cox Proportional Hazards Regression Model 

267 

268 The Cox PH Model is for right censored data. 

269 

270 Parameters 

271 ---------- 

272 endog : array_like 

273 The observed times (event or censoring) 

274 exog : 2D array_like 

275 The covariates or exogeneous variables 

276 status : array_like 

277 The censoring status values; status=1 indicates that an 

278 event occurred (e.g. failure or death), status=0 indicates 

279 that the observation was right censored. If None, defaults 

280 to status=1 for all cases. 

281 entry : array_like 

282 The entry times, if left truncation occurs 

283 strata : array_like 

284 Stratum labels. If None, all observations are taken to be 

285 in a single stratum. 

286 ties : str 

287 The method used to handle tied times, must be either 'breslow' 

288 or 'efron'. 

289 offset : array_like 

290 Array of offset values 

291 missing : str 

292 The method used to handle missing data 

293 

294 Notes 

295 ----- 

296 Proportional hazards regression models should not include an 

297 explicit or implicit intercept. The effect of an intercept is 

298 not identified using the partial likelihood approach. 

299 

300 `endog`, `event`, `strata`, `entry`, and the first dimension 

301 of `exog` all must have the same length 

302 """ 

303 

304 def __init__(self, endog, exog, status=None, entry=None, 

305 strata=None, offset=None, ties='breslow', 

306 missing='drop', **kwargs): 

307 

308 # Default is no censoring 

309 if status is None: 

310 status = np.ones(len(endog)) 

311 

312 super(PHReg, self).__init__(endog, exog, status=status, 

313 entry=entry, strata=strata, 

314 offset=offset, missing=missing, 

315 **kwargs) 

316 

317 # endog and exog are automatically converted, but these are 

318 # not 

319 if self.status is not None: 

320 self.status = np.asarray(self.status) 

321 if self.entry is not None: 

322 self.entry = np.asarray(self.entry) 

323 if self.strata is not None: 

324 self.strata = np.asarray(self.strata) 

325 if self.offset is not None: 

326 self.offset = np.asarray(self.offset) 

327 

328 self.surv = PHSurvivalTime(self.endog, self.status, 

329 self.exog, self.strata, 

330 self.entry, self.offset) 

331 self.nobs = len(self.endog) 

332 self.groups = None 

333 

334 # TODO: not used? 

335 self.missing = missing 

336 

337 self.df_resid = (np.float(self.exog.shape[0] - 

338 np.linalg.matrix_rank(self.exog))) 

339 self.df_model = np.float(np.linalg.matrix_rank(self.exog)) 

340 

341 ties = ties.lower() 

342 if ties not in ("efron", "breslow"): 

343 raise ValueError("`ties` must be either `efron` or " + 

344 "`breslow`") 

345 

346 self.ties = ties 

347 

348 @classmethod 

349 def from_formula(cls, formula, data, status=None, entry=None, 

350 strata=None, offset=None, subset=None, 

351 ties='breslow', missing='drop', *args, **kwargs): 

352 """ 

353 Create a proportional hazards regression model from a formula 

354 and dataframe. 

355 

356 Parameters 

357 ---------- 

358 formula : str or generic Formula object 

359 The formula specifying the model 

360 data : array_like 

361 The data for the model. See Notes. 

362 status : array_like 

363 The censoring status values; status=1 indicates that an 

364 event occurred (e.g. failure or death), status=0 indicates 

365 that the observation was right censored. If None, defaults 

366 to status=1 for all cases. 

367 entry : array_like 

368 The entry times, if left truncation occurs 

369 strata : array_like 

370 Stratum labels. If None, all observations are taken to be 

371 in a single stratum. 

372 offset : array_like 

373 Array of offset values 

374 subset : array_like 

375 An array-like object of booleans, integers, or index 

376 values that indicate the subset of df to use in the 

377 model. Assumes df is a `pandas.DataFrame` 

378 ties : str 

379 The method used to handle tied times, must be either 'breslow' 

380 or 'efron'. 

381 missing : str 

382 The method used to handle missing data 

383 args : extra arguments 

384 These are passed to the model 

385 kwargs : extra keyword arguments 

386 These are passed to the model with one exception. The 

387 ``eval_env`` keyword is passed to patsy. It can be either a 

388 :class:`patsy:patsy.EvalEnvironment` object or an integer 

389 indicating the depth of the namespace to use. For example, the 

390 default ``eval_env=0`` uses the calling namespace. If you wish 

391 to use a "clean" environment set ``eval_env=-1``. 

392 

393 Returns 

394 ------- 

395 model : PHReg model instance 

396 """ 

397 

398 # Allow array arguments to be passed by column name. 

399 if isinstance(status, str): 

400 status = data[status] 

401 if isinstance(entry, str): 

402 entry = data[entry] 

403 if isinstance(strata, str): 

404 strata = data[strata] 

405 if isinstance(offset, str): 

406 offset = data[offset] 

407 

408 import re 

409 terms = re.split(r"[+\-~]", formula) 

410 for term in terms: 

411 term = term.strip() 

412 if term in ("0", "1"): 

413 import warnings 

414 warnings.warn("PHReg formulas should not include any '0' or '1' terms") 

415 

416 mod = super(PHReg, cls).from_formula(formula, data, 

417 status=status, entry=entry, strata=strata, 

418 offset=offset, subset=subset, ties=ties, 

419 missing=missing, drop_cols=["Intercept"], *args, 

420 **kwargs) 

421 

422 return mod 

423 

424 def fit(self, groups=None, **args): 

425 """ 

426 Fit a proportional hazards regression model. 

427 

428 Parameters 

429 ---------- 

430 groups : array_like 

431 Labels indicating groups of observations that may be 

432 dependent. If present, the standard errors account for 

433 this dependence. Does not affect fitted values. 

434 

435 Returns 

436 ------- 

437 PHRegResults 

438 Returns a results instance. 

439 """ 

440 

441 # TODO process for missing values 

442 if groups is not None: 

443 if len(groups) != len(self.endog): 

444 msg = ("len(groups) = %d and len(endog) = %d differ" % 

445 (len(groups), len(self.endog))) 

446 raise ValueError(msg) 

447 self.groups = np.asarray(groups) 

448 else: 

449 self.groups = None 

450 

451 if 'disp' not in args: 

452 args['disp'] = False 

453 fit_rslts = super(PHReg, self).fit(**args) 

454 

455 if self.groups is None: 

456 cov_params = fit_rslts.cov_params() 

457 else: 

458 cov_params = self.robust_covariance(fit_rslts.params) 

459 

460 results = PHRegResults(self, fit_rslts.params, cov_params) 

461 

462 return results 

463 

464 def fit_regularized(self, method="elastic_net", alpha=0., 

465 start_params=None, refit=False, **kwargs): 

466 r""" 

467 Return a regularized fit to a linear regression model. 

468 

469 Parameters 

470 ---------- 

471 method : {'elastic_net'} 

472 Only the `elastic_net` approach is currently implemented. 

473 alpha : scalar or array_like 

474 The penalty weight. If a scalar, the same penalty weight 

475 applies to all variables in the model. If a vector, it 

476 must have the same length as `params`, and contains a 

477 penalty weight for each coefficient. 

478 start_params : array_like 

479 Starting values for `params`. 

480 refit : bool 

481 If True, the model is refit using only the variables that 

482 have non-zero coefficients in the regularized fit. The 

483 refitted model is not regularized. 

484 **kwargs 

485 Additional keyword arguments used to fit the model. 

486 

487 Returns 

488 ------- 

489 PHRegResults 

490 Returns a results instance. 

491 

492 Notes 

493 ----- 

494 The penalty is the ``elastic net`` penalty, which is a 

495 combination of L1 and L2 penalties. 

496 

497 The function that is minimized is: 

498 

499 .. math:: 

500 

501 -loglike/n + alpha*((1-L1\_wt)*|params|_2^2/2 + L1\_wt*|params|_1) 

502 

503 where :math:`|*|_1` and :math:`|*|_2` are the L1 and L2 norms. 

504 

505 Post-estimation results are based on the same data used to 

506 select variables, hence may be subject to overfitting biases. 

507 

508 The elastic_net method uses the following keyword arguments: 

509 

510 maxiter : int 

511 Maximum number of iterations 

512 L1_wt : float 

513 Must be in [0, 1]. The L1 penalty has weight L1_wt and the 

514 L2 penalty has weight 1 - L1_wt. 

515 cnvrg_tol : float 

516 Convergence threshold for line searches 

517 zero_tol : float 

518 Coefficients below this threshold are treated as zero. 

519 """ 

520 

521 from statsmodels.base.elastic_net import fit_elasticnet 

522 

523 if method != "elastic_net": 

524 raise ValueError("method for fit_regularied must be elastic_net") 

525 

526 defaults = {"maxiter" : 50, "L1_wt" : 1, "cnvrg_tol" : 1e-10, 

527 "zero_tol" : 1e-10} 

528 defaults.update(kwargs) 

529 

530 return fit_elasticnet(self, method=method, 

531 alpha=alpha, 

532 start_params=start_params, 

533 refit=refit, 

534 **defaults) 

535 

536 

537 def loglike(self, params): 

538 """ 

539 Returns the log partial likelihood function evaluated at 

540 `params`. 

541 """ 

542 

543 if self.ties == "breslow": 

544 return self.breslow_loglike(params) 

545 elif self.ties == "efron": 

546 return self.efron_loglike(params) 

547 

548 def score(self, params): 

549 """ 

550 Returns the score function evaluated at `params`. 

551 """ 

552 

553 if self.ties == "breslow": 

554 return self.breslow_gradient(params) 

555 elif self.ties == "efron": 

556 return self.efron_gradient(params) 

557 

558 def hessian(self, params): 

559 """ 

560 Returns the Hessian matrix of the log partial likelihood 

561 function evaluated at `params`. 

562 """ 

563 

564 if self.ties == "breslow": 

565 return self.breslow_hessian(params) 

566 else: 

567 return self.efron_hessian(params) 

568 

569 def breslow_loglike(self, params): 

570 """ 

571 Returns the value of the log partial likelihood function 

572 evaluated at `params`, using the Breslow method to handle tied 

573 times. 

574 """ 

575 

576 surv = self.surv 

577 

578 like = 0. 

579 

580 # Loop over strata 

581 for stx in range(surv.nstrat): 

582 

583 uft_ix = surv.ufailt_ix[stx] 

584 exog_s = surv.exog_s[stx] 

585 nuft = len(uft_ix) 

586 

587 linpred = np.dot(exog_s, params) 

588 if surv.offset_s is not None: 

589 linpred += surv.offset_s[stx] 

590 linpred -= linpred.max() 

591 e_linpred = np.exp(linpred) 

592 

593 xp0 = 0. 

594 

595 # Iterate backward through the unique failure times. 

596 for i in range(nuft)[::-1]: 

597 

598 # Update for new cases entering the risk set. 

599 ix = surv.risk_enter[stx][i] 

600 xp0 += e_linpred[ix].sum() 

601 

602 # Account for all cases that fail at this point. 

603 ix = uft_ix[i] 

604 like += (linpred[ix] - np.log(xp0)).sum() 

605 

606 # Update for cases leaving the risk set. 

607 ix = surv.risk_exit[stx][i] 

608 xp0 -= e_linpred[ix].sum() 

609 

610 return like 

611 

612 def efron_loglike(self, params): 

613 """ 

614 Returns the value of the log partial likelihood function 

615 evaluated at `params`, using the Efron method to handle tied 

616 times. 

617 """ 

618 

619 surv = self.surv 

620 

621 like = 0. 

622 

623 # Loop over strata 

624 for stx in range(surv.nstrat): 

625 

626 # exog and linear predictor for this stratum 

627 exog_s = surv.exog_s[stx] 

628 linpred = np.dot(exog_s, params) 

629 if surv.offset_s is not None: 

630 linpred += surv.offset_s[stx] 

631 linpred -= linpred.max() 

632 e_linpred = np.exp(linpred) 

633 

634 xp0 = 0. 

635 

636 # Iterate backward through the unique failure times. 

637 uft_ix = surv.ufailt_ix[stx] 

638 nuft = len(uft_ix) 

639 for i in range(nuft)[::-1]: 

640 

641 # Update for new cases entering the risk set. 

642 ix = surv.risk_enter[stx][i] 

643 xp0 += e_linpred[ix].sum() 

644 xp0f = e_linpred[uft_ix[i]].sum() 

645 

646 # Account for all cases that fail at this point. 

647 ix = uft_ix[i] 

648 like += linpred[ix].sum() 

649 

650 m = len(ix) 

651 J = np.arange(m, dtype=np.float64) / m 

652 like -= np.log(xp0 - J*xp0f).sum() 

653 

654 # Update for cases leaving the risk set. 

655 ix = surv.risk_exit[stx][i] 

656 xp0 -= e_linpred[ix].sum() 

657 

658 return like 

659 

660 def breslow_gradient(self, params): 

661 """ 

662 Returns the gradient of the log partial likelihood, using the 

663 Breslow method to handle tied times. 

664 """ 

665 

666 surv = self.surv 

667 

668 grad = 0. 

669 

670 # Loop over strata 

671 for stx in range(surv.nstrat): 

672 

673 # Indices of subjects in the stratum 

674 strat_ix = surv.stratum_rows[stx] 

675 

676 # Unique failure times in the stratum 

677 uft_ix = surv.ufailt_ix[stx] 

678 nuft = len(uft_ix) 

679 

680 # exog and linear predictor for the stratum 

681 exog_s = surv.exog_s[stx] 

682 linpred = np.dot(exog_s, params) 

683 if surv.offset_s is not None: 

684 linpred += surv.offset_s[stx] 

685 linpred -= linpred.max() 

686 e_linpred = np.exp(linpred) 

687 

688 xp0, xp1 = 0., 0. 

689 

690 # Iterate backward through the unique failure times. 

691 for i in range(nuft)[::-1]: 

692 

693 # Update for new cases entering the risk set. 

694 ix = surv.risk_enter[stx][i] 

695 if len(ix) > 0: 

696 v = exog_s[ix,:] 

697 xp0 += e_linpred[ix].sum() 

698 xp1 += (e_linpred[ix][:,None] * v).sum(0) 

699 

700 # Account for all cases that fail at this point. 

701 ix = uft_ix[i] 

702 grad += (exog_s[ix,:] - xp1 / xp0).sum(0) 

703 

704 # Update for cases leaving the risk set. 

705 ix = surv.risk_exit[stx][i] 

706 if len(ix) > 0: 

707 v = exog_s[ix,:] 

708 xp0 -= e_linpred[ix].sum() 

709 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 

710 

711 return grad 

712 

713 def efron_gradient(self, params): 

714 """ 

715 Returns the gradient of the log partial likelihood evaluated 

716 at `params`, using the Efron method to handle tied times. 

717 """ 

718 

719 surv = self.surv 

720 

721 grad = 0. 

722 

723 # Loop over strata 

724 for stx in range(surv.nstrat): 

725 

726 # Indices of cases in the stratum 

727 strat_ix = surv.stratum_rows[stx] 

728 

729 # exog and linear predictor of the stratum 

730 exog_s = surv.exog_s[stx] 

731 linpred = np.dot(exog_s, params) 

732 if surv.offset_s is not None: 

733 linpred += surv.offset_s[stx] 

734 linpred -= linpred.max() 

735 e_linpred = np.exp(linpred) 

736 

737 xp0, xp1 = 0., 0. 

738 

739 # Iterate backward through the unique failure times. 

740 uft_ix = surv.ufailt_ix[stx] 

741 nuft = len(uft_ix) 

742 for i in range(nuft)[::-1]: 

743 

744 # Update for new cases entering the risk set. 

745 ix = surv.risk_enter[stx][i] 

746 if len(ix) > 0: 

747 v = exog_s[ix,:] 

748 xp0 += e_linpred[ix].sum() 

749 xp1 += (e_linpred[ix][:,None] * v).sum(0) 

750 ixf = uft_ix[i] 

751 if len(ixf) > 0: 

752 v = exog_s[ixf,:] 

753 xp0f = e_linpred[ixf].sum() 

754 xp1f = (e_linpred[ixf][:,None] * v).sum(0) 

755 

756 # Consider all cases that fail at this point. 

757 grad += v.sum(0) 

758 

759 m = len(ixf) 

760 J = np.arange(m, dtype=np.float64) / m 

761 numer = xp1 - np.outer(J, xp1f) 

762 denom = xp0 - np.outer(J, xp0f) 

763 ratio = numer / denom 

764 rsum = ratio.sum(0) 

765 grad -= rsum 

766 

767 # Update for cases leaving the risk set. 

768 ix = surv.risk_exit[stx][i] 

769 if len(ix) > 0: 

770 v = exog_s[ix,:] 

771 xp0 -= e_linpred[ix].sum() 

772 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 

773 

774 return grad 

775 

776 def breslow_hessian(self, params): 

777 """ 

778 Returns the Hessian of the log partial likelihood evaluated at 

779 `params`, using the Breslow method to handle tied times. 

780 """ 

781 

782 surv = self.surv 

783 

784 hess = 0. 

785 

786 # Loop over strata 

787 for stx in range(surv.nstrat): 

788 

789 uft_ix = surv.ufailt_ix[stx] 

790 nuft = len(uft_ix) 

791 

792 exog_s = surv.exog_s[stx] 

793 

794 linpred = np.dot(exog_s, params) 

795 if surv.offset_s is not None: 

796 linpred += surv.offset_s[stx] 

797 linpred -= linpred.max() 

798 e_linpred = np.exp(linpred) 

799 

800 xp0, xp1, xp2 = 0., 0., 0. 

801 

802 # Iterate backward through the unique failure times. 

803 for i in range(nuft)[::-1]: 

804 

805 # Update for new cases entering the risk set. 

806 ix = surv.risk_enter[stx][i] 

807 if len(ix) > 0: 

808 xp0 += e_linpred[ix].sum() 

809 v = exog_s[ix,:] 

810 xp1 += (e_linpred[ix][:,None] * v).sum(0) 

811 elx = e_linpred[ix] 

812 xp2 += np.einsum("ij,ik,i->jk", v, v, elx) 

813 

814 # Account for all cases that fail at this point. 

815 m = len(uft_ix[i]) 

816 hess += m*(xp2 / xp0 - np.outer(xp1, xp1) / xp0**2) 

817 

818 # Update for new cases entering the risk set. 

819 ix = surv.risk_exit[stx][i] 

820 if len(ix) > 0: 

821 xp0 -= e_linpred[ix].sum() 

822 v = exog_s[ix,:] 

823 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 

824 elx = e_linpred[ix] 

825 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx) 

826 return -hess 

827 

828 def efron_hessian(self, params): 

829 """ 

830 Returns the Hessian matrix of the partial log-likelihood 

831 evaluated at `params`, using the Efron method to handle tied 

832 times. 

833 """ 

834 

835 surv = self.surv 

836 

837 hess = 0. 

838 

839 # Loop over strata 

840 for stx in range(surv.nstrat): 

841 

842 exog_s = surv.exog_s[stx] 

843 

844 linpred = np.dot(exog_s, params) 

845 if surv.offset_s is not None: 

846 linpred += surv.offset_s[stx] 

847 linpred -= linpred.max() 

848 e_linpred = np.exp(linpred) 

849 

850 xp0, xp1, xp2 = 0., 0., 0. 

851 

852 # Iterate backward through the unique failure times. 

853 uft_ix = surv.ufailt_ix[stx] 

854 nuft = len(uft_ix) 

855 for i in range(nuft)[::-1]: 

856 

857 # Update for new cases entering the risk set. 

858 ix = surv.risk_enter[stx][i] 

859 if len(ix) > 0: 

860 xp0 += e_linpred[ix].sum() 

861 v = exog_s[ix,:] 

862 xp1 += (e_linpred[ix][:,None] * v).sum(0) 

863 elx = e_linpred[ix] 

864 xp2 += np.einsum("ij,ik,i->jk", v, v, elx) 

865 

866 ixf = uft_ix[i] 

867 if len(ixf) > 0: 

868 v = exog_s[ixf,:] 

869 xp0f = e_linpred[ixf].sum() 

870 xp1f = (e_linpred[ixf][:,None] * v).sum(0) 

871 elx = e_linpred[ixf] 

872 xp2f = np.einsum("ij,ik,i->jk", v, v, elx) 

873 

874 # Account for all cases that fail at this point. 

875 m = len(uft_ix[i]) 

876 J = np.arange(m, dtype=np.float64) / m 

877 c0 = xp0 - J*xp0f 

878 hess += xp2 * np.sum(1 / c0) 

879 hess -= xp2f * np.sum(J / c0) 

880 mat = (xp1[None, :] - np.outer(J, xp1f)) / c0[:, None] 

881 hess -= np.einsum("ij,ik->jk", mat, mat) 

882 

883 # Update for new cases entering the risk set. 

884 ix = surv.risk_exit[stx][i] 

885 if len(ix) > 0: 

886 xp0 -= e_linpred[ix].sum() 

887 v = exog_s[ix,:] 

888 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 

889 elx = e_linpred[ix] 

890 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx) 

891 

892 return -hess 

893 

894 def robust_covariance(self, params): 

895 """ 

896 Returns a covariance matrix for the proportional hazards model 

897 regresion coefficient estimates that is robust to certain 

898 forms of model misspecification. 

899 

900 Parameters 

901 ---------- 

902 params : ndarray 

903 The parameter vector at which the covariance matrix is 

904 calculated. 

905 

906 Returns 

907 ------- 

908 The robust covariance matrix as a square ndarray. 

909 

910 Notes 

911 ----- 

912 This function uses the `groups` argument to determine groups 

913 within which observations may be dependent. The covariance 

914 matrix is calculated using the Huber-White "sandwich" approach. 

915 """ 

916 

917 if self.groups is None: 

918 raise ValueError("`groups` must be specified to calculate the robust covariance matrix") 

919 

920 hess = self.hessian(params) 

921 

922 score_obs = self.score_residuals(params) 

923 

924 # Collapse 

925 grads = {} 

926 for i,g in enumerate(self.groups): 

927 if g not in grads: 

928 grads[g] = 0. 

929 grads[g] += score_obs[i, :] 

930 grads = np.asarray(list(grads.values())) 

931 

932 mat = grads[None, :, :] 

933 mat = mat.T * mat 

934 mat = mat.sum(1) 

935 

936 hess_inv = np.linalg.inv(hess) 

937 cmat = np.dot(hess_inv, np.dot(mat, hess_inv)) 

938 

939 return cmat 

940 

941 def score_residuals(self, params): 

942 """ 

943 Returns the score residuals calculated at a given vector of 

944 parameters. 

945 

946 Parameters 

947 ---------- 

948 params : ndarray 

949 The parameter vector at which the score residuals are 

950 calculated. 

951 

952 Returns 

953 ------- 

954 The score residuals, returned as a ndarray having the same 

955 shape as `exog`. 

956 

957 Notes 

958 ----- 

959 Observations in a stratum with no observed events have undefined 

960 score residuals, and contain NaN in the returned matrix. 

961 """ 

962 

963 surv = self.surv 

964 

965 score_resid = np.zeros(self.exog.shape, dtype=np.float64) 

966 

967 # Use to set undefined values to NaN. 

968 mask = np.zeros(self.exog.shape[0], dtype=np.int32) 

969 

970 w_avg = self.weighted_covariate_averages(params) 

971 

972 # Loop over strata 

973 for stx in range(surv.nstrat): 

974 

975 uft_ix = surv.ufailt_ix[stx] 

976 exog_s = surv.exog_s[stx] 

977 nuft = len(uft_ix) 

978 strat_ix = surv.stratum_rows[stx] 

979 

980 xp0 = 0. 

981 

982 linpred = np.dot(exog_s, params) 

983 if surv.offset_s is not None: 

984 linpred += surv.offset_s[stx] 

985 linpred -= linpred.max() 

986 e_linpred = np.exp(linpred) 

987 

988 at_risk_ix = set([]) 

989 

990 # Iterate backward through the unique failure times. 

991 for i in range(nuft)[::-1]: 

992 

993 # Update for new cases entering the risk set. 

994 ix = surv.risk_enter[stx][i] 

995 at_risk_ix |= set(ix) 

996 xp0 += e_linpred[ix].sum() 

997 

998 atr_ix = list(at_risk_ix) 

999 leverage = exog_s[atr_ix, :] - w_avg[stx][i, :] 

1000 

1001 # Event indicators 

1002 d = np.zeros(exog_s.shape[0]) 

1003 d[uft_ix[i]] = 1 

1004 

1005 # The increment in the cumulative hazard 

1006 dchaz = len(uft_ix[i]) / xp0 

1007 

1008 # Piece of the martingale residual 

1009 mrp = d[atr_ix] - e_linpred[atr_ix] * dchaz 

1010 

1011 # Update the score residuals 

1012 ii = strat_ix[atr_ix] 

1013 score_resid[ii,:] += leverage * mrp[:, None] 

1014 mask[ii] = 1 

1015 

1016 # Update for cases leaving the risk set. 

1017 ix = surv.risk_exit[stx][i] 

1018 at_risk_ix -= set(ix) 

1019 xp0 -= e_linpred[ix].sum() 

1020 

1021 jj = np.flatnonzero(mask == 0) 

1022 if len(jj) > 0: 

1023 score_resid[jj, :] = np.nan 

1024 

1025 return score_resid 

1026 

1027 def weighted_covariate_averages(self, params): 

1028 """ 

1029 Returns the hazard-weighted average of covariate values for 

1030 subjects who are at-risk at a particular time. 

1031 

1032 Parameters 

1033 ---------- 

1034 params : ndarray 

1035 Parameter vector 

1036 

1037 Returns 

1038 ------- 

1039 averages : list of ndarrays 

1040 averages[stx][i,:] is a row vector containing the weighted 

1041 average values (for all the covariates) of at-risk 

1042 subjects a the i^th largest observed failure time in 

1043 stratum `stx`, using the hazard multipliers as weights. 

1044 

1045 Notes 

1046 ----- 

1047 Used to calculate leverages and score residuals. 

1048 """ 

1049 

1050 surv = self.surv 

1051 

1052 averages = [] 

1053 xp0, xp1 = 0., 0. 

1054 

1055 # Loop over strata 

1056 for stx in range(surv.nstrat): 

1057 

1058 uft_ix = surv.ufailt_ix[stx] 

1059 exog_s = surv.exog_s[stx] 

1060 nuft = len(uft_ix) 

1061 

1062 average_s = np.zeros((len(uft_ix), exog_s.shape[1]), 

1063 dtype=np.float64) 

1064 

1065 linpred = np.dot(exog_s, params) 

1066 if surv.offset_s is not None: 

1067 linpred += surv.offset_s[stx] 

1068 linpred -= linpred.max() 

1069 e_linpred = np.exp(linpred) 

1070 

1071 # Iterate backward through the unique failure times. 

1072 for i in range(nuft)[::-1]: 

1073 

1074 # Update for new cases entering the risk set. 

1075 ix = surv.risk_enter[stx][i] 

1076 xp0 += e_linpred[ix].sum() 

1077 xp1 += np.dot(e_linpred[ix], exog_s[ix, :]) 

1078 

1079 average_s[i, :] = xp1 / xp0 

1080 

1081 # Update for cases leaving the risk set. 

1082 ix = surv.risk_exit[stx][i] 

1083 xp0 -= e_linpred[ix].sum() 

1084 xp1 -= np.dot(e_linpred[ix], exog_s[ix, :]) 

1085 

1086 averages.append(average_s) 

1087 

1088 return averages 

1089 

1090 def baseline_cumulative_hazard(self, params): 

1091 """ 

1092 Estimate the baseline cumulative hazard and survival 

1093 functions. 

1094 

1095 Parameters 

1096 ---------- 

1097 params : ndarray 

1098 The model parameters. 

1099 

1100 Returns 

1101 ------- 

1102 A list of triples (time, hazard, survival) containing the time 

1103 values and corresponding cumulative hazard and survival 

1104 function values for each stratum. 

1105 

1106 Notes 

1107 ----- 

1108 Uses the Nelson-Aalen estimator. 

1109 """ 

1110 

1111 # TODO: some disagreements with R, not the same algorithm but 

1112 # hard to deduce what R is doing. Our results are reasonable. 

1113 

1114 surv = self.surv 

1115 rslt = [] 

1116 

1117 # Loop over strata 

1118 for stx in range(surv.nstrat): 

1119 

1120 uft = surv.ufailt[stx] 

1121 uft_ix = surv.ufailt_ix[stx] 

1122 exog_s = surv.exog_s[stx] 

1123 nuft = len(uft_ix) 

1124 

1125 linpred = np.dot(exog_s, params) 

1126 if surv.offset_s is not None: 

1127 linpred += surv.offset_s[stx] 

1128 e_linpred = np.exp(linpred) 

1129 

1130 xp0 = 0. 

1131 h0 = np.zeros(nuft, dtype=np.float64) 

1132 

1133 # Iterate backward through the unique failure times. 

1134 for i in range(nuft)[::-1]: 

1135 

1136 # Update for new cases entering the risk set. 

1137 ix = surv.risk_enter[stx][i] 

1138 xp0 += e_linpred[ix].sum() 

1139 

1140 # Account for all cases that fail at this point. 

1141 ix = uft_ix[i] 

1142 h0[i] = len(ix) / xp0 

1143 

1144 # Update for cases leaving the risk set. 

1145 ix = surv.risk_exit[stx][i] 

1146 xp0 -= e_linpred[ix].sum() 

1147 

1148 cumhaz = np.cumsum(h0) - h0 

1149 current_strata_surv = np.exp(-cumhaz) 

1150 rslt.append([uft, cumhaz, current_strata_surv]) 

1151 

1152 return rslt 

1153 

1154 def baseline_cumulative_hazard_function(self, params): 

1155 """ 

1156 Returns a function that calculates the baseline cumulative 

1157 hazard function for each stratum. 

1158 

1159 Parameters 

1160 ---------- 

1161 params : ndarray 

1162 The model parameters. 

1163 

1164 Returns 

1165 ------- 

1166 A dict mapping stratum names to the estimated baseline 

1167 cumulative hazard function. 

1168 """ 

1169 

1170 from scipy.interpolate import interp1d 

1171 surv = self.surv 

1172 base = self.baseline_cumulative_hazard(params) 

1173 

1174 cumhaz_f = {} 

1175 for stx in range(surv.nstrat): 

1176 time_h = base[stx][0] 

1177 cumhaz = base[stx][1] 

1178 time_h = np.r_[-np.inf, time_h, np.inf] 

1179 cumhaz = np.r_[cumhaz[0], cumhaz, cumhaz[-1]] 

1180 func = interp1d(time_h, cumhaz, kind='zero') 

1181 cumhaz_f[self.surv.stratum_names[stx]] = func 

1182 

1183 return cumhaz_f 

1184 

1185 @Appender(_predict_docstring % { 

1186 'params_doc': _predict_params_doc, 

1187 'cov_params_doc': _predict_cov_params_docstring}) 

1188 def predict(self, params, exog=None, cov_params=None, endog=None, 

1189 strata=None, offset=None, pred_type="lhr"): 

1190 

1191 pred_type = pred_type.lower() 

1192 if pred_type not in ["lhr", "hr", "surv", "cumhaz"]: 

1193 msg = "Type %s not allowed for prediction" % pred_type 

1194 raise ValueError(msg) 

1195 

1196 class bunch: 

1197 predicted_values = None 

1198 standard_errors = None 

1199 ret_val = bunch() 

1200 

1201 # Do not do anything with offset here because we want to allow 

1202 # different offsets to be specified even if exog is the model 

1203 # exog. 

1204 exog_provided = True 

1205 if exog is None: 

1206 exog = self.exog 

1207 exog_provided = False 

1208 

1209 lhr = np.dot(exog, params) 

1210 if offset is not None: 

1211 lhr += offset 

1212 # Never use self.offset unless we are also using self.exog 

1213 elif self.offset is not None and not exog_provided: 

1214 lhr += self.offset 

1215 

1216 # Handle lhr and hr prediction first, since they do not make 

1217 # use of the hazard function. 

1218 

1219 if pred_type == "lhr": 

1220 ret_val.predicted_values = lhr 

1221 if cov_params is not None: 

1222 mat = np.dot(exog, cov_params) 

1223 va = (mat * exog).sum(1) 

1224 ret_val.standard_errors = np.sqrt(va) 

1225 return ret_val 

1226 

1227 hr = np.exp(lhr) 

1228 

1229 if pred_type == "hr": 

1230 ret_val.predicted_values = hr 

1231 return ret_val 

1232 

1233 # Makes sure endog is defined 

1234 if endog is None and exog_provided: 

1235 msg = "If `exog` is provided `endog` must be provided." 

1236 raise ValueError(msg) 

1237 # Use model endog if using model exog 

1238 elif endog is None and not exog_provided: 

1239 endog = self.endog 

1240 

1241 # Make sure strata is defined 

1242 if strata is None: 

1243 if exog_provided and self.surv.nstrat > 1: 

1244 raise ValueError("`strata` must be provided") 

1245 if self.strata is None: 

1246 strata = [self.surv.stratum_names[0],] * len(endog) 

1247 else: 

1248 strata = self.strata 

1249 

1250 cumhaz = np.nan * np.ones(len(endog), dtype=np.float64) 

1251 stv = np.unique(strata) 

1252 bhaz = self.baseline_cumulative_hazard_function(params) 

1253 for stx in stv: 

1254 ix = np.flatnonzero(strata == stx) 

1255 func = bhaz[stx] 

1256 cumhaz[ix] = func(endog[ix]) * hr[ix] 

1257 

1258 if pred_type == "cumhaz": 

1259 ret_val.predicted_values = cumhaz 

1260 

1261 elif pred_type == "surv": 

1262 ret_val.predicted_values = np.exp(-cumhaz) 

1263 

1264 return ret_val 

1265 

1266 def get_distribution(self, params): 

1267 """ 

1268 Returns a scipy distribution object corresponding to the 

1269 distribution of uncensored endog (duration) values for each 

1270 case. 

1271 

1272 Parameters 

1273 ---------- 

1274 params : array_like 

1275 The proportional hazards model parameters. 

1276 

1277 Returns 

1278 ------- 

1279 A list of objects of type scipy.stats.distributions.rv_discrete 

1280 

1281 Notes 

1282 ----- 

1283 The distributions are obtained from a simple discrete estimate 

1284 of the survivor function that puts all mass on the observed 

1285 failure times within a stratum. 

1286 """ 

1287 

1288 # TODO: this returns a Python list of rv_discrete objects, so 

1289 # nothing can be vectorized. It appears that rv_discrete does 

1290 # not allow vectorization. 

1291 

1292 surv = self.surv 

1293 bhaz = self.baseline_cumulative_hazard(params) 

1294 

1295 # The arguments to rv_discrete_float, first obtained by 

1296 # stratum 

1297 pk, xk = [], [] 

1298 

1299 for stx in range(self.surv.nstrat): 

1300 

1301 exog_s = surv.exog_s[stx] 

1302 

1303 linpred = np.dot(exog_s, params) 

1304 if surv.offset_s is not None: 

1305 linpred += surv.offset_s[stx] 

1306 e_linpred = np.exp(linpred) 

1307 

1308 # The unique failure times for this stratum (the support 

1309 # of the distribution). 

1310 pts = bhaz[stx][0] 

1311 

1312 # The individual cumulative hazards for everyone in this 

1313 # stratum. 

1314 ichaz = np.outer(e_linpred, bhaz[stx][1]) 

1315 

1316 # The individual survival functions. 

1317 usurv = np.exp(-ichaz) 

1318 z = np.zeros((usurv.shape[0], 1)) 

1319 usurv = np.concatenate((usurv, z), axis=1) 

1320 

1321 # The individual survival probability masses. 

1322 probs = -np.diff(usurv, 1) 

1323 

1324 pk.append(probs) 

1325 xk.append(np.outer(np.ones(probs.shape[0]), pts)) 

1326 

1327 # Pad to make all strata have the same shape 

1328 mxc = max([x.shape[1] for x in xk]) 

1329 for k in range(self.surv.nstrat): 

1330 if xk[k].shape[1] < mxc: 

1331 xk1 = np.zeros((xk[k].shape[0], mxc)) 

1332 pk1 = np.zeros((pk[k].shape[0], mxc)) 

1333 xk1[:, 0:xk[k].shape[1]] = xk[k] 

1334 pk1[:, 0:pk[k].shape[1]] = pk[k] 

1335 xk[k], pk[k] = xk1, pk1 

1336 

1337 # Put the support points and probabilities into single matrices 

1338 xka = np.nan * np.ones((len(self.endog), mxc)) 

1339 pka = np.ones((len(self.endog), mxc), dtype=np.float64) / mxc 

1340 for stx in range(self.surv.nstrat): 

1341 ix = self.surv.stratum_rows[stx] 

1342 xka[ix, :] = xk[stx] 

1343 pka[ix, :] = pk[stx] 

1344 

1345 dist = rv_discrete_float(xka, pka) 

1346 

1347 return dist 

1348 

1349 

1350class PHRegResults(base.LikelihoodModelResults): 

1351 ''' 

1352 Class to contain results of fitting a Cox proportional hazards 

1353 survival model. 

1354 

1355 PHregResults inherits from statsmodels.LikelihoodModelResults 

1356 

1357 Parameters 

1358 ---------- 

1359 See statsmodels.LikelihoodModelResults 

1360 

1361 Attributes 

1362 ---------- 

1363 model : class instance 

1364 PHreg model instance that called fit. 

1365 normalized_cov_params : ndarray 

1366 The sampling covariance matrix of the estimates 

1367 params : ndarray 

1368 The coefficients of the fitted model. Each coefficient is the 

1369 log hazard ratio corresponding to a 1 unit difference in a 

1370 single covariate while holding the other covariates fixed. 

1371 bse : ndarray 

1372 The standard errors of the fitted parameters. 

1373 

1374 See Also 

1375 -------- 

1376 statsmodels.LikelihoodModelResults 

1377 ''' 

1378 

1379 def __init__(self, model, params, cov_params, scale=1., covariance_type="naive"): 

1380 

1381 # There is no scale parameter, but we need it for 

1382 # meta-procedures that work with results. 

1383 

1384 self.covariance_type = covariance_type 

1385 self.df_resid = model.df_resid 

1386 self.df_model = model.df_model 

1387 

1388 super(PHRegResults, self).__init__(model, params, scale=1., 

1389 normalized_cov_params=cov_params) 

1390 

1391 @cache_readonly 

1392 def standard_errors(self): 

1393 """ 

1394 Returns the standard errors of the parameter estimates. 

1395 """ 

1396 return np.sqrt(np.diag(self.cov_params())) 

1397 

1398 @cache_readonly 

1399 def bse(self): 

1400 """ 

1401 Returns the standard errors of the parameter estimates. 

1402 """ 

1403 return self.standard_errors 

1404 

1405 def get_distribution(self): 

1406 """ 

1407 Returns a scipy distribution object corresponding to the 

1408 distribution of uncensored endog (duration) values for each 

1409 case. 

1410 

1411 Returns 

1412 ------- 

1413 A list of objects of type scipy.stats.distributions.rv_discrete 

1414 

1415 Notes 

1416 ----- 

1417 The distributions are obtained from a simple discrete estimate 

1418 of the survivor function that puts all mass on the observed 

1419 failure times within a stratum. 

1420 """ 

1421 

1422 return self.model.get_distribution(self.params) 

1423 

1424 @Appender(_predict_docstring % {'params_doc': '', 'cov_params_doc': ''}) 

1425 def predict(self, endog=None, exog=None, strata=None, 

1426 offset=None, transform=True, pred_type="lhr"): 

1427 return super(PHRegResults, self).predict(exog=exog, 

1428 transform=transform, 

1429 cov_params=self.cov_params(), 

1430 endog=endog, 

1431 strata=strata, 

1432 offset=offset, 

1433 pred_type=pred_type) 

1434 

1435 def _group_stats(self, groups): 

1436 """ 

1437 Descriptive statistics of the groups. 

1438 """ 

1439 gsizes = np.unique(groups, return_counts=True) 

1440 gsizes = gsizes[1] 

1441 return gsizes.min(), gsizes.max(), gsizes.mean(), len(gsizes) 

1442 

1443 @cache_readonly 

1444 def weighted_covariate_averages(self): 

1445 """ 

1446 The average covariate values within the at-risk set at each 

1447 event time point, weighted by hazard. 

1448 """ 

1449 return self.model.weighted_covariate_averages(self.params) 

1450 

1451 @cache_readonly 

1452 def score_residuals(self): 

1453 """ 

1454 A matrix containing the score residuals. 

1455 """ 

1456 return self.model.score_residuals(self.params) 

1457 

1458 @cache_readonly 

1459 def baseline_cumulative_hazard(self): 

1460 """ 

1461 A list (corresponding to the strata) containing the baseline 

1462 cumulative hazard function evaluated at the event points. 

1463 """ 

1464 return self.model.baseline_cumulative_hazard(self.params) 

1465 

1466 @cache_readonly 

1467 def baseline_cumulative_hazard_function(self): 

1468 """ 

1469 A list (corresponding to the strata) containing function 

1470 objects that calculate the cumulative hazard function. 

1471 """ 

1472 return self.model.baseline_cumulative_hazard_function(self.params) 

1473 

1474 @cache_readonly 

1475 def schoenfeld_residuals(self): 

1476 """ 

1477 A matrix containing the Schoenfeld residuals. 

1478 

1479 Notes 

1480 ----- 

1481 Schoenfeld residuals for censored observations are set to zero. 

1482 """ 

1483 

1484 surv = self.model.surv 

1485 w_avg = self.weighted_covariate_averages 

1486 

1487 # Initialize at NaN since rows that belong to strata with no 

1488 # events have undefined residuals. 

1489 sch_resid = np.nan*np.ones(self.model.exog.shape, dtype=np.float64) 

1490 

1491 # Loop over strata 

1492 for stx in range(surv.nstrat): 

1493 

1494 uft = surv.ufailt[stx] 

1495 exog_s = surv.exog_s[stx] 

1496 time_s = surv.time_s[stx] 

1497 strat_ix = surv.stratum_rows[stx] 

1498 

1499 ii = np.searchsorted(uft, time_s) 

1500 

1501 # These subjects are censored after the last event in 

1502 # their stratum, so have empty risk sets and undefined 

1503 # residuals. 

1504 jj = np.flatnonzero(ii < len(uft)) 

1505 

1506 sch_resid[strat_ix[jj], :] = exog_s[jj, :] - w_avg[stx][ii[jj], :] 

1507 

1508 jj = np.flatnonzero(self.model.status == 0) 

1509 sch_resid[jj, :] = np.nan 

1510 

1511 return sch_resid 

1512 

1513 @cache_readonly 

1514 def martingale_residuals(self): 

1515 """ 

1516 The martingale residuals. 

1517 """ 

1518 

1519 surv = self.model.surv 

1520 

1521 # Initialize at NaN since rows that belong to strata with no 

1522 # events have undefined residuals. 

1523 mart_resid = np.nan*np.ones(len(self.model.endog), dtype=np.float64) 

1524 

1525 cumhaz_f_list = self.baseline_cumulative_hazard_function 

1526 

1527 # Loop over strata 

1528 for stx in range(surv.nstrat): 

1529 

1530 cumhaz_f = cumhaz_f_list[stx] 

1531 

1532 exog_s = surv.exog_s[stx] 

1533 time_s = surv.time_s[stx] 

1534 

1535 linpred = np.dot(exog_s, self.params) 

1536 if surv.offset_s is not None: 

1537 linpred += surv.offset_s[stx] 

1538 e_linpred = np.exp(linpred) 

1539 

1540 ii = surv.stratum_rows[stx] 

1541 chaz = cumhaz_f(time_s) 

1542 mart_resid[ii] = self.model.status[ii] - e_linpred * chaz 

1543 

1544 return mart_resid 

1545 

1546 def summary(self, yname=None, xname=None, title=None, alpha=.05): 

1547 """ 

1548 Summarize the proportional hazards regression results. 

1549 

1550 Parameters 

1551 ---------- 

1552 yname : str, optional 

1553 Default is `y` 

1554 xname : list[str], optional 

1555 Names for the exogenous variables, default is `x#` for ## in p the 

1556 number of regressors. Must match the number of parameters in 

1557 the model 

1558 title : str, optional 

1559 Title for the top table. If not None, then this replaces 

1560 the default title 

1561 alpha : float 

1562 significance level for the confidence intervals 

1563 

1564 Returns 

1565 ------- 

1566 smry : Summary instance 

1567 this holds the summary tables and text, which can be 

1568 printed or converted to various output formats. 

1569 

1570 See Also 

1571 -------- 

1572 statsmodels.iolib.summary2.Summary : class to hold summary results 

1573 """ 

1574 

1575 from statsmodels.iolib import summary2 

1576 from collections import OrderedDict 

1577 smry = summary2.Summary() 

1578 float_format = "%8.3f" 

1579 

1580 info = OrderedDict() 

1581 info["Model:"] = "PH Reg" 

1582 if yname is None: 

1583 yname = self.model.endog_names 

1584 info["Dependent variable:"] = yname 

1585 info["Ties:"] = self.model.ties.capitalize() 

1586 info["Sample size:"] = str(self.model.surv.n_obs) 

1587 info["Num. events:"] = str(int(sum(self.model.status))) 

1588 

1589 if self.model.groups is not None: 

1590 mn, mx, avg, num = self._group_stats(self.model.groups) 

1591 info["Num groups:"] = "%.0f" % num 

1592 info["Min group size:"] = "%.0f" % mn 

1593 info["Max group size:"] = "%.0f" % mx 

1594 info["Avg group size:"] = "%.1f" % avg 

1595 

1596 if self.model.strata is not None: 

1597 mn, mx, avg, num = self._group_stats(self.model.strata) 

1598 info["Num strata:"] = "%.0f" % num 

1599 info["Min stratum size:"] = "%.0f" % mn 

1600 info["Max stratum size:"] = "%.0f" % mx 

1601 info["Avg stratum size:"] = "%.1f" % avg 

1602 

1603 smry.add_dict(info, align='l', float_format=float_format) 

1604 

1605 param = summary2.summary_params(self, alpha=alpha) 

1606 param = param.rename(columns={"Coef.": "log HR", 

1607 "Std.Err.": "log HR SE"}) 

1608 param.insert(2, "HR", np.exp(param["log HR"])) 

1609 a = "[%.3f" % (alpha / 2) 

1610 param.loc[:, a] = np.exp(param.loc[:, a]) 

1611 a = "%.3f]" % (1 - alpha / 2) 

1612 param.loc[:, a] = np.exp(param.loc[:, a]) 

1613 if xname is not None: 

1614 param.index = xname 

1615 smry.add_df(param, float_format=float_format) 

1616 smry.add_title(title=title, results=self) 

1617 smry.add_text("Confidence intervals are for the hazard ratios") 

1618 

1619 dstrat = self.model.surv.nstrat_orig - self.model.surv.nstrat 

1620 if dstrat > 0: 

1621 if dstrat == 1: 

1622 smry.add_text("1 stratum dropped for having no events") 

1623 else: 

1624 smry.add_text("%d strata dropped for having no events" % dstrat) 

1625 

1626 if self.model.entry is not None: 

1627 n_entry = sum(self.model.entry != 0) 

1628 if n_entry == 1: 

1629 smry.add_text("1 observation has a positive entry time") 

1630 else: 

1631 smry.add_text("%d observations have positive entry times" % n_entry) 

1632 

1633 if self.model.groups is not None: 

1634 smry.add_text("Standard errors account for dependence within groups") 

1635 

1636 if hasattr(self, "regularized"): 

1637 smry.add_text("Standard errors do not account for the regularization") 

1638 

1639 return smry 

1640 

1641 

1642class rv_discrete_float(object): 

1643 """ 

1644 A class representing a collection of discrete distributions. 

1645 

1646 Parameters 

1647 ---------- 

1648 xk : 2d array_like 

1649 The support points, should be non-decreasing within each 

1650 row. 

1651 pk : 2d array_like 

1652 The probabilities, should sum to one within each row. 

1653 

1654 Notes 

1655 ----- 

1656 Each row of `xk`, and the corresponding row of `pk` describe a 

1657 discrete distribution. 

1658 

1659 `xk` and `pk` should both be two-dimensional ndarrays. Each row 

1660 of `pk` should sum to 1. 

1661 

1662 This class is used as a substitute for scipy.distributions. 

1663 rv_discrete, since that class does not allow non-integer support 

1664 points, or vectorized operations. 

1665 

1666 Only a limited number of methods are implemented here compared to 

1667 the other scipy distribution classes. 

1668 """ 

1669 

1670 def __init__(self, xk, pk): 

1671 

1672 self.xk = xk 

1673 self.pk = pk 

1674 self.cpk = np.cumsum(self.pk, axis=1) 

1675 

1676 def rvs(self): 

1677 """ 

1678 Returns a random sample from the discrete distribution. 

1679 

1680 A vector is returned containing a single draw from each row of 

1681 `xk`, using the probabilities of the corresponding row of `pk` 

1682 """ 

1683 

1684 n = self.xk.shape[0] 

1685 u = np.random.uniform(size=n) 

1686 

1687 ix = (self.cpk < u[:, None]).sum(1) 

1688 ii = np.arange(n, dtype=np.int32) 

1689 return self.xk[(ii,ix)] 

1690 

1691 def mean(self): 

1692 """ 

1693 Returns a vector containing the mean values of the discrete 

1694 distributions. 

1695 

1696 A vector is returned containing the mean value of each row of 

1697 `xk`, using the probabilities in the corresponding row of 

1698 `pk`. 

1699 """ 

1700 

1701 return (self.xk * self.pk).sum(1) 

1702 

1703 def var(self): 

1704 """ 

1705 Returns a vector containing the variances of the discrete 

1706 distributions. 

1707 

1708 A vector is returned containing the variance for each row of 

1709 `xk`, using the probabilities in the corresponding row of 

1710 `pk`. 

1711 """ 

1712 

1713 mn = self.mean() 

1714 xkc = self.xk - mn[:, None] 

1715 

1716 return (self.pk * (self.xk - xkc)**2).sum(1) 

1717 

1718 def std(self): 

1719 """ 

1720 Returns a vector containing the standard deviations of the 

1721 discrete distributions. 

1722 

1723 A vector is returned containing the standard deviation for 

1724 each row of `xk`, using the probabilities in the corresponding 

1725 row of `pk`. 

1726 """ 

1727 

1728 return np.sqrt(self.var())