Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2The RegressionFDR class implements the 'Knockoff' approach for 

3controlling false discovery rates (FDR) in regression analysis. 

4 

5The knockoff approach does not require standard errors. Thus one 

6application is to provide inference for parameter estimates that are 

7not smooth functions of the data. For example, the knockoff approach 

8can be used to do inference for parameter estimates obtained from the 

9LASSO, of from stepwise variable selection. 

10 

11The knockoff approach controls FDR for parameter estimates that may be 

12dependent, such as coefficient estimates in a multiple regression 

13model. 

14 

15The knockoff approach is applicable whenever the test statistic can be 

16computed entirely from x'y and x'x, where x is the design matrix and y 

17is the vector of responses. 

18 

19Reference 

20--------- 

21Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False 

22Discovery Rate via Knockoffs. Annals of Statistics 43:5. 

23http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf 

24""" 

25 

26import numpy as np 

27import pandas as pd 

28from statsmodels.iolib import summary2 

29 

30 

31class RegressionFDR(object): 

32 """ 

33 Control FDR in a regression procedure. 

34 

35 Parameters 

36 ---------- 

37 endog : array_like 

38 The dependent variable of the regression 

39 exog : array_like 

40 The independent variables of the regression 

41 regeffects : RegressionEffects instance 

42 An instance of a RegressionEffects class that can compute 

43 effect sizes for the regression coefficients. 

44 method : str 

45 The approach used to assess and control FDR, currently 

46 must be 'knockoff'. 

47 

48 Returns 

49 ------- 

50 Returns an instance of the RegressionFDR class. The `fdr` attribute 

51 holds the estimated false discovery rates. 

52 

53 Notes 

54 ----- 

55 This class Implements the knockoff method of Barber and Candes. 

56 This is an approach for controlling the FDR of a variety of 

57 regression estimation procedures, including correlation 

58 coefficients, OLS regression, OLS with forward selection, and 

59 LASSO regression. 

60 

61 For other approaches to FDR control in regression, see the 

62 statsmodels.stats.multitest module. Methods provided in that 

63 module use Z-scores or p-values, and therefore require standard 

64 errors for the coefficient estimates to be available. 

65 

66 The default method for constructing the augmented design matrix is 

67 the 'equivariant' approach, set `design_method='sdp'` to use an 

68 alternative approach involving semidefinite programming. See 

69 Barber and Candes for more information about both approaches. The 

70 sdp approach requires that the cvxopt package be installed. 

71 """ 

72 

73 def __init__(self, endog, exog, regeffects, method="knockoff", 

74 **kwargs): 

75 

76 if hasattr(exog, "columns"): 

77 self.xnames = exog.columns 

78 else: 

79 self.xnames = ["x%d" % j for j in range(exog.shape[1])] 

80 

81 exog = np.asarray(exog) 

82 endog = np.asarray(endog) 

83 

84 if "design_method" not in kwargs: 

85 kwargs["design_method"] = "equi" 

86 

87 nobs, nvar = exog.shape 

88 

89 if kwargs["design_method"] == "equi": 

90 exog1, exog2, _ = _design_knockoff_equi(exog) 

91 elif kwargs["design_method"] == "sdp": 

92 exog1, exog2, _ = _design_knockoff_sdp(exog) 

93 endog = endog - np.mean(endog) 

94 

95 self.endog = endog 

96 self.exog = np.concatenate((exog1, exog2), axis=1) 

97 self.exog1 = exog1 

98 self.exog2 = exog2 

99 

100 self.stats = regeffects.stats(self) 

101 

102 unq, inv, cnt = np.unique(self.stats, return_inverse=True, 

103 return_counts=True) 

104 

105 # The denominator of the FDR 

106 cc = np.cumsum(cnt) 

107 denom = len(self.stats) - cc + cnt 

108 denom[denom < 1] = 1 

109 

110 # The numerator of the FDR 

111 ii = np.searchsorted(unq, -unq, side='right') - 1 

112 numer = cc[ii] 

113 numer[ii < 0] = 0 

114 

115 # The knockoff+ estimated FDR 

116 fdrp = (1 + numer) / denom 

117 

118 # The knockoff estimated FDR 

119 fdr = numer / denom 

120 

121 self.fdr = fdr[inv] 

122 self.fdrp = fdrp[inv] 

123 self._ufdr = fdr 

124 self._unq = unq 

125 

126 df = pd.DataFrame(index=self.xnames) 

127 df["Stat"] = self.stats 

128 df["FDR+"] = self.fdrp 

129 df["FDR"] = self.fdr 

130 self.fdr_df = df 

131 

132 def threshold(self, tfdr): 

133 """ 

134 Returns the threshold statistic for a given target FDR. 

135 """ 

136 

137 if np.min(self._ufdr) <= tfdr: 

138 return self._unq[self._ufdr <= tfdr][0] 

139 else: 

140 return np.inf 

141 

142 def summary(self): 

143 

144 summ = summary2.Summary() 

145 summ.add_title("Regression FDR results") 

146 summ.add_df(self.fdr_df) 

147 

148 return summ 

149 

150 

151def _design_knockoff_sdp(exog): 

152 """ 

153 Use semidefinite programming to construct a knockoff design 

154 matrix. 

155 

156 Requires cvxopt to be installed. 

157 """ 

158 

159 try: 

160 from cvxopt import solvers, matrix 

161 except ImportError: 

162 raise ValueError("SDP knockoff designs require installation of cvxopt") 

163 

164 nobs, nvar = exog.shape 

165 

166 # Standardize exog 

167 xnm = np.sum(exog**2, 0) 

168 xnm = np.sqrt(xnm) 

169 exog = exog / xnm 

170 

171 Sigma = np.dot(exog.T, exog) 

172 

173 c = matrix(-np.ones(nvar)) 

174 

175 h0 = np.concatenate((np.zeros(nvar), np.ones(nvar))) 

176 h0 = matrix(h0) 

177 G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0) 

178 G0 = matrix(G0) 

179 

180 h1 = 2 * Sigma 

181 h1 = matrix(h1) 

182 i, j = np.diag_indices(nvar) 

183 G1 = np.zeros((nvar*nvar, nvar)) 

184 G1[i*nvar + j, i] = 1 

185 G1 = matrix(G1) 

186 

187 solvers.options['show_progress'] = False 

188 sol = solvers.sdp(c, G0, h0, [G1], [h1]) 

189 sl = np.asarray(sol['x']).ravel() 

190 

191 xcov = np.dot(exog.T, exog) 

192 exogn = _get_knmat(exog, xcov, sl) 

193 

194 return exog, exogn, sl 

195 

196 

197def _design_knockoff_equi(exog): 

198 """ 

199 Construct an equivariant design matrix for knockoff analysis. 

200 

201 Follows the 'equi-correlated knockoff approach of equation 2.4 in 

202 Barber and Candes. 

203 

204 Constructs a pair of design matrices exogs, exogn such that exogs 

205 is a scaled/centered version of the input matrix exog, exogn is 

206 another matrix of the same shape with cov(exogn) = cov(exogs), and 

207 the covariances between corresponding columns of exogn and exogs 

208 are as small as possible. 

209 """ 

210 

211 nobs, nvar = exog.shape 

212 

213 if nobs < 2*nvar: 

214 msg = "The equivariant knockoff can ony be used when n >= 2*p" 

215 raise ValueError(msg) 

216 

217 # Standardize exog 

218 xnm = np.sum(exog**2, 0) 

219 xnm = np.sqrt(xnm) 

220 exog = exog / xnm 

221 

222 xcov = np.dot(exog.T, exog) 

223 ev, _ = np.linalg.eig(xcov) 

224 evmin = np.min(ev) 

225 

226 sl = min(2*evmin, 1) 

227 sl = sl * np.ones(nvar) 

228 

229 exogn = _get_knmat(exog, xcov, sl) 

230 

231 return exog, exogn, sl 

232 

233 

234def _get_knmat(exog, xcov, sl): 

235 # Utility function, see equation 2.2 of Barber & Candes. 

236 

237 nobs, nvar = exog.shape 

238 

239 ash = np.linalg.inv(xcov) 

240 ash *= -np.outer(sl, sl) 

241 i, j = np.diag_indices(nvar) 

242 ash[i, j] += 2 * sl 

243 

244 umat = np.random.normal(size=(nobs, nvar)) 

245 u, _ = np.linalg.qr(exog) 

246 umat -= np.dot(u, np.dot(u.T, umat)) 

247 umat, _ = np.linalg.qr(umat) 

248 

249 ashr, xc, _ = np.linalg.svd(ash, 0) 

250 ashr *= np.sqrt(xc) 

251 ashr = ashr.T 

252 

253 ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T 

254 exogn = exog - ex + np.dot(umat, ashr) 

255 

256 return exogn