Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/_knockoff.py : 9%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2The RegressionFDR class implements the 'Knockoff' approach for
3controlling false discovery rates (FDR) in regression analysis.
5The knockoff approach does not require standard errors. Thus one
6application is to provide inference for parameter estimates that are
7not smooth functions of the data. For example, the knockoff approach
8can be used to do inference for parameter estimates obtained from the
9LASSO, of from stepwise variable selection.
11The knockoff approach controls FDR for parameter estimates that may be
12dependent, such as coefficient estimates in a multiple regression
13model.
15The knockoff approach is applicable whenever the test statistic can be
16computed entirely from x'y and x'x, where x is the design matrix and y
17is the vector of responses.
19Reference
20---------
21Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False
22Discovery Rate via Knockoffs. Annals of Statistics 43:5.
23http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf
24"""
26import numpy as np
27import pandas as pd
28from statsmodels.iolib import summary2
31class RegressionFDR(object):
32 """
33 Control FDR in a regression procedure.
35 Parameters
36 ----------
37 endog : array_like
38 The dependent variable of the regression
39 exog : array_like
40 The independent variables of the regression
41 regeffects : RegressionEffects instance
42 An instance of a RegressionEffects class that can compute
43 effect sizes for the regression coefficients.
44 method : str
45 The approach used to assess and control FDR, currently
46 must be 'knockoff'.
48 Returns
49 -------
50 Returns an instance of the RegressionFDR class. The `fdr` attribute
51 holds the estimated false discovery rates.
53 Notes
54 -----
55 This class Implements the knockoff method of Barber and Candes.
56 This is an approach for controlling the FDR of a variety of
57 regression estimation procedures, including correlation
58 coefficients, OLS regression, OLS with forward selection, and
59 LASSO regression.
61 For other approaches to FDR control in regression, see the
62 statsmodels.stats.multitest module. Methods provided in that
63 module use Z-scores or p-values, and therefore require standard
64 errors for the coefficient estimates to be available.
66 The default method for constructing the augmented design matrix is
67 the 'equivariant' approach, set `design_method='sdp'` to use an
68 alternative approach involving semidefinite programming. See
69 Barber and Candes for more information about both approaches. The
70 sdp approach requires that the cvxopt package be installed.
71 """
73 def __init__(self, endog, exog, regeffects, method="knockoff",
74 **kwargs):
76 if hasattr(exog, "columns"):
77 self.xnames = exog.columns
78 else:
79 self.xnames = ["x%d" % j for j in range(exog.shape[1])]
81 exog = np.asarray(exog)
82 endog = np.asarray(endog)
84 if "design_method" not in kwargs:
85 kwargs["design_method"] = "equi"
87 nobs, nvar = exog.shape
89 if kwargs["design_method"] == "equi":
90 exog1, exog2, _ = _design_knockoff_equi(exog)
91 elif kwargs["design_method"] == "sdp":
92 exog1, exog2, _ = _design_knockoff_sdp(exog)
93 endog = endog - np.mean(endog)
95 self.endog = endog
96 self.exog = np.concatenate((exog1, exog2), axis=1)
97 self.exog1 = exog1
98 self.exog2 = exog2
100 self.stats = regeffects.stats(self)
102 unq, inv, cnt = np.unique(self.stats, return_inverse=True,
103 return_counts=True)
105 # The denominator of the FDR
106 cc = np.cumsum(cnt)
107 denom = len(self.stats) - cc + cnt
108 denom[denom < 1] = 1
110 # The numerator of the FDR
111 ii = np.searchsorted(unq, -unq, side='right') - 1
112 numer = cc[ii]
113 numer[ii < 0] = 0
115 # The knockoff+ estimated FDR
116 fdrp = (1 + numer) / denom
118 # The knockoff estimated FDR
119 fdr = numer / denom
121 self.fdr = fdr[inv]
122 self.fdrp = fdrp[inv]
123 self._ufdr = fdr
124 self._unq = unq
126 df = pd.DataFrame(index=self.xnames)
127 df["Stat"] = self.stats
128 df["FDR+"] = self.fdrp
129 df["FDR"] = self.fdr
130 self.fdr_df = df
132 def threshold(self, tfdr):
133 """
134 Returns the threshold statistic for a given target FDR.
135 """
137 if np.min(self._ufdr) <= tfdr:
138 return self._unq[self._ufdr <= tfdr][0]
139 else:
140 return np.inf
142 def summary(self):
144 summ = summary2.Summary()
145 summ.add_title("Regression FDR results")
146 summ.add_df(self.fdr_df)
148 return summ
151def _design_knockoff_sdp(exog):
152 """
153 Use semidefinite programming to construct a knockoff design
154 matrix.
156 Requires cvxopt to be installed.
157 """
159 try:
160 from cvxopt import solvers, matrix
161 except ImportError:
162 raise ValueError("SDP knockoff designs require installation of cvxopt")
164 nobs, nvar = exog.shape
166 # Standardize exog
167 xnm = np.sum(exog**2, 0)
168 xnm = np.sqrt(xnm)
169 exog = exog / xnm
171 Sigma = np.dot(exog.T, exog)
173 c = matrix(-np.ones(nvar))
175 h0 = np.concatenate((np.zeros(nvar), np.ones(nvar)))
176 h0 = matrix(h0)
177 G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0)
178 G0 = matrix(G0)
180 h1 = 2 * Sigma
181 h1 = matrix(h1)
182 i, j = np.diag_indices(nvar)
183 G1 = np.zeros((nvar*nvar, nvar))
184 G1[i*nvar + j, i] = 1
185 G1 = matrix(G1)
187 solvers.options['show_progress'] = False
188 sol = solvers.sdp(c, G0, h0, [G1], [h1])
189 sl = np.asarray(sol['x']).ravel()
191 xcov = np.dot(exog.T, exog)
192 exogn = _get_knmat(exog, xcov, sl)
194 return exog, exogn, sl
197def _design_knockoff_equi(exog):
198 """
199 Construct an equivariant design matrix for knockoff analysis.
201 Follows the 'equi-correlated knockoff approach of equation 2.4 in
202 Barber and Candes.
204 Constructs a pair of design matrices exogs, exogn such that exogs
205 is a scaled/centered version of the input matrix exog, exogn is
206 another matrix of the same shape with cov(exogn) = cov(exogs), and
207 the covariances between corresponding columns of exogn and exogs
208 are as small as possible.
209 """
211 nobs, nvar = exog.shape
213 if nobs < 2*nvar:
214 msg = "The equivariant knockoff can ony be used when n >= 2*p"
215 raise ValueError(msg)
217 # Standardize exog
218 xnm = np.sum(exog**2, 0)
219 xnm = np.sqrt(xnm)
220 exog = exog / xnm
222 xcov = np.dot(exog.T, exog)
223 ev, _ = np.linalg.eig(xcov)
224 evmin = np.min(ev)
226 sl = min(2*evmin, 1)
227 sl = sl * np.ones(nvar)
229 exogn = _get_knmat(exog, xcov, sl)
231 return exog, exogn, sl
234def _get_knmat(exog, xcov, sl):
235 # Utility function, see equation 2.2 of Barber & Candes.
237 nobs, nvar = exog.shape
239 ash = np.linalg.inv(xcov)
240 ash *= -np.outer(sl, sl)
241 i, j = np.diag_indices(nvar)
242 ash[i, j] += 2 * sl
244 umat = np.random.normal(size=(nobs, nvar))
245 u, _ = np.linalg.qr(exog)
246 umat -= np.dot(u, np.dot(u.T, umat))
247 umat, _ = np.linalg.qr(umat)
249 ashr, xc, _ = np.linalg.svd(ash, 0)
250 ashr *= np.sqrt(xc)
251 ashr = ashr.T
253 ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T
254 exogn = exog - ex + np.dot(umat, ashr)
256 return exogn