Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/regression/quantile_regression.py : 23%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
3'''
4Quantile regression model
6Model parameters are estimated using iterated reweighted least squares. The
7asymptotic covariance matrix estimated using kernel density estimation.
9Author: Vincent Arel-Bundock
10License: BSD-3
11Created: 2013-03-19
13The original IRLS function was written for Matlab by Shapour Mohammadi,
14University of Tehran, 2008 (shmohammadi@gmail.com), with some lines based on
15code written by James P. Lesage in Applied Econometrics Using MATLAB(1999).PP.
1673-4. Translated to python with permission from original author by Christian
17Prinoth (christian at prinoth dot name).
18'''
20import numpy as np
21import warnings
22import scipy.stats as stats
23from numpy.linalg import pinv
24from scipy.stats import norm
25from statsmodels.tools.decorators import cache_readonly
26from statsmodels.regression.linear_model import (RegressionModel,
27 RegressionResults,
28 RegressionResultsWrapper)
29from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
30 IterationLimitWarning)
32class QuantReg(RegressionModel):
33 '''Quantile Regression
35 Estimate a quantile regression model using iterative reweighted least
36 squares.
38 Parameters
39 ----------
40 endog : array or dataframe
41 endogenous/response variable
42 exog : array or dataframe
43 exogenous/explanatory variable(s)
45 Notes
46 -----
47 The Least Absolute Deviation (LAD) estimator is a special case where
48 quantile is set to 0.5 (q argument of the fit method).
50 The asymptotic covariance matrix is estimated following the procedure in
51 Greene (2008, p.407-408), using either the logistic or gaussian kernels
52 (kernel argument of the fit method).
54 References
55 ----------
56 General:
58 * Birkes, D. and Y. Dodge(1993). Alternative Methods of Regression, John Wiley and Sons.
59 * Green,W. H. (2008). Econometric Analysis. Sixth Edition. International Student Edition.
60 * Koenker, R. (2005). Quantile Regression. New York: Cambridge University Press.
61 * LeSage, J. P.(1999). Applied Econometrics Using MATLAB,
63 Kernels (used by the fit method):
65 * Green (2008) Table 14.2
67 Bandwidth selection (used by the fit method):
69 * Bofinger, E. (1975). Estimation of a density function using order statistics. Australian Journal of Statistics 17: 1-17.
70 * Chamberlain, G. (1994). Quantile regression, censoring, and the structure of wages. In Advances in Econometrics, Vol. 1: Sixth World Congress, ed. C. A. Sims, 171-209. Cambridge: Cambridge University Press.
71 * Hall, P., and S. Sheather. (1988). On the distribution of the Studentized quantile. Journal of the Royal Statistical Society, Series B 50: 381-391.
73 Keywords: Least Absolute Deviation(LAD) Regression, Quantile Regression,
74 Regression, Robust Estimation.
75 '''
77 def __init__(self, endog, exog, **kwargs):
78 super(QuantReg, self).__init__(endog, exog, **kwargs)
80 def whiten(self, data):
81 """
82 QuantReg model whitener does nothing: returns data.
83 """
84 return data
86 def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
87 max_iter=1000, p_tol=1e-6, **kwargs):
88 """
89 Solve by Iterative Weighted Least Squares
91 Parameters
92 ----------
93 q : float
94 Quantile must be between 0 and 1
95 vcov : str, method used to calculate the variance-covariance matrix
96 of the parameters. Default is ``robust``:
98 - robust : heteroskedasticity robust standard errors (as suggested
99 in Greene 6th edition)
100 - iid : iid errors (as in Stata 12)
102 kernel : str, kernel to use in the kernel density estimation for the
103 asymptotic covariance matrix:
105 - epa: Epanechnikov
106 - cos: Cosine
107 - gau: Gaussian
108 - par: Parzene
110 bandwidth : str, Bandwidth selection method in kernel density
111 estimation for asymptotic covariance estimate (full
112 references in QuantReg docstring):
114 - hsheather: Hall-Sheather (1988)
115 - bofinger: Bofinger (1975)
116 - chamberlain: Chamberlain (1994)
117 """
119 if q < 0 or q > 1:
120 raise Exception('p must be between 0 and 1')
122 kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
123 if kernel not in kern_names:
124 raise Exception("kernel must be one of " + ', '.join(kern_names))
125 else:
126 kernel = kernels[kernel]
128 if bandwidth == 'hsheather':
129 bandwidth = hall_sheather
130 elif bandwidth == 'bofinger':
131 bandwidth = bofinger
132 elif bandwidth == 'chamberlain':
133 bandwidth = chamberlain
134 else:
135 raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")
137 endog = self.endog
138 exog = self.exog
139 nobs = self.nobs
140 exog_rank = np.linalg.matrix_rank(self.exog)
141 self.rank = exog_rank
142 self.df_model = float(self.rank - self.k_constant)
143 self.df_resid = self.nobs - self.rank
144 n_iter = 0
145 xstar = exog
147 beta = np.ones(exog_rank)
148 # TODO: better start, initial beta is used only for convergence check
150 # Note the following does not work yet,
151 # the iteration loop always starts with OLS as initial beta
152 # if start_params is not None:
153 # if len(start_params) != rank:
154 # raise ValueError('start_params has wrong length')
155 # beta = start_params
156 # else:
157 # # start with OLS
158 # beta = np.dot(np.linalg.pinv(exog), endog)
160 diff = 10
161 cycle = False
163 history = dict(params = [], mse=[])
164 while n_iter < max_iter and diff > p_tol and not cycle:
165 n_iter += 1
166 beta0 = beta
167 xtx = np.dot(xstar.T, exog)
168 xty = np.dot(xstar.T, endog)
169 beta = np.dot(pinv(xtx), xty)
170 resid = endog - np.dot(exog, beta)
172 mask = np.abs(resid) < .000001
173 resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
174 resid = np.where(resid < 0, q * resid, (1-q) * resid)
175 resid = np.abs(resid)
176 xstar = exog / resid[:, np.newaxis]
177 diff = np.max(np.abs(beta - beta0))
178 history['params'].append(beta)
179 history['mse'].append(np.mean(resid*resid))
181 if (n_iter >= 300) and (n_iter % 100 == 0):
182 # check for convergence circle, should not happen
183 for ii in range(2, 10):
184 if np.all(beta == history['params'][-ii]):
185 cycle = True
186 warnings.warn("Convergence cycle detected", ConvergenceWarning)
187 break
189 if n_iter == max_iter:
190 warnings.warn("Maximum number of iterations (" + str(max_iter) +
191 ") reached.", IterationLimitWarning)
193 e = endog - np.dot(exog, beta)
194 # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
195 # h = 0.9 * np.std(e) / (nobs**0.2)
196 # Instead, we calculate bandwidth as in Stata 12
197 iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
198 h = bandwidth(nobs, q)
199 h = min(np.std(endog),
200 iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))
202 fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))
204 if vcov == 'robust':
205 d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
206 xtxi = pinv(np.dot(exog.T, exog))
207 xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
208 vcov = xtxi @ xtdx @ xtxi
209 elif vcov == 'iid':
210 vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
211 else:
212 raise Exception("vcov must be 'robust' or 'iid'")
214 lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)
216 lfit.q = q
217 lfit.iterations = n_iter
218 lfit.sparsity = 1. / fhat0
219 lfit.bandwidth = h
220 lfit.history = history
222 return RegressionResultsWrapper(lfit)
225def _parzen(u):
226 z = np.where(np.abs(u) <= .5, 4./3 - 8. * u**2 + 8. * np.abs(u)**3,
227 8. * (1 - np.abs(u))**3 / 3.)
228 z[np.abs(u) > 1] = 0
229 return z
232kernels = {}
233kernels['biw'] = lambda u: 15. / 16 * (1 - u**2)**2 * np.where(np.abs(u) <= 1, 1, 0)
234kernels['cos'] = lambda u: np.where(np.abs(u) <= .5, 1 + np.cos(2 * np.pi * u), 0)
235kernels['epa'] = lambda u: 3. / 4 * (1-u**2) * np.where(np.abs(u) <= 1, 1, 0)
236kernels['gau'] = lambda u: norm.pdf(u)
237kernels['par'] = _parzen
238#kernels['bet'] = lambda u: np.where(np.abs(u) <= 1, .75 * (1 - u) * (1 + u), 0)
239#kernels['log'] = lambda u: logistic.pdf(u) * (1 - logistic.pdf(u))
240#kernels['tri'] = lambda u: np.where(np.abs(u) <= 1, 1 - np.abs(u), 0)
241#kernels['trw'] = lambda u: 35. / 32 * (1 - u**2)**3 * np.where(np.abs(u) <= 1, 1, 0)
242#kernels['uni'] = lambda u: 1. / 2 * np.where(np.abs(u) <= 1, 1, 0)
245def hall_sheather(n, q, alpha=.05):
246 z = norm.ppf(q)
247 num = 1.5 * norm.pdf(z)**2.
248 den = 2. * z**2. + 1.
249 h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3)
250 return h
253def bofinger(n, q):
254 num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4
255 den = (2 * norm.ppf(q)**2 + 1)**2
256 h = n**(-1. / 5) * (num / den)**(1. / 5)
257 return h
260def chamberlain(n, q, alpha=.05):
261 return norm.ppf(1 - alpha / 2) * np.sqrt(q*(1 - q) / n)
264class QuantRegResults(RegressionResults):
265 '''Results instance for the QuantReg model'''
267 @cache_readonly
268 def prsquared(self):
269 q = self.q
270 endog = self.model.endog
271 e = self.resid
272 e = np.where(e < 0, (1 - q) * e, q * e)
273 e = np.abs(e)
274 ered = endog - stats.scoreatpercentile(endog, q * 100)
275 ered = np.where(ered < 0, (1 - q) * ered, q * ered)
276 ered = np.abs(ered)
277 return 1 - np.sum(e) / np.sum(ered)
279 #@cache_readonly
280 def scale(self):
281 return 1.
283 @cache_readonly
284 def bic(self):
285 return np.nan
287 @cache_readonly
288 def aic(self):
289 return np.nan
291 @cache_readonly
292 def llf(self):
293 return np.nan
295 @cache_readonly
296 def rsquared(self):
297 return np.nan
299 @cache_readonly
300 def rsquared_adj(self):
301 return np.nan
303 @cache_readonly
304 def mse(self):
305 return np.nan
307 @cache_readonly
308 def mse_model(self):
309 return np.nan
311 @cache_readonly
312 def mse_total(self):
313 return np.nan
315 @cache_readonly
316 def centered_tss(self):
317 return np.nan
319 @cache_readonly
320 def uncentered_tss(self):
321 return np.nan
323 @cache_readonly
324 def HC0_se(self):
325 raise NotImplementedError
327 @cache_readonly
328 def HC1_se(self):
329 raise NotImplementedError
331 @cache_readonly
332 def HC2_se(self):
333 raise NotImplementedError
335 @cache_readonly
336 def HC3_se(self):
337 raise NotImplementedError
339 def summary(self, yname=None, xname=None, title=None, alpha=.05):
340 """Summarize the Regression Results
342 Parameters
343 ----------
344 yname : str, optional
345 Default is `y`
346 xname : list[str], optional
347 Names for the exogenous variables. Default is `var_##` for ## in
348 the number of regressors. Must match the number of parameters
349 in the model
350 title : str, optional
351 Title for the top table. If not None, then this replaces the
352 default title
353 alpha : float
354 significance level for the confidence intervals
356 Returns
357 -------
358 smry : Summary instance
359 this holds the summary tables and text, which can be printed or
360 converted to various output formats.
362 See Also
363 --------
364 statsmodels.iolib.summary.Summary : class to hold summary results
365 """
366 eigvals = self.eigenvals
367 condno = self.condition_number
369 top_left = [('Dep. Variable:', None),
370 ('Model:', None),
371 ('Method:', ['Least Squares']),
372 ('Date:', None),
373 ('Time:', None)
374 ]
376 top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
377 ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
378 ('Sparsity:', ["%#8.4g" % self.sparsity]),
379 ('No. Observations:', None),
380 ('Df Residuals:', None),
381 ('Df Model:', None)
382 ]
384 if title is None:
385 title = self.model.__class__.__name__ + ' ' + "Regression Results"
387 # create summary table instance
388 from statsmodels.iolib.summary import Summary
389 smry = Summary()
390 smry.add_table_2cols(self, gleft=top_left, gright=top_right,
391 yname=yname, xname=xname, title=title)
392 smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
393 use_t=self.use_t)
395 # add warnings/notes, added to text format only
396 etext = []
397 if eigvals[-1] < 1e-10:
398 wstr = "The smallest eigenvalue is %6.3g. This might indicate "
399 wstr += "that there are\n"
400 wstr += "strong multicollinearity problems or that the design "
401 wstr += "matrix is singular."
402 wstr = wstr % eigvals[-1]
403 etext.append(wstr)
404 elif condno > 1000: # TODO: what is recommended
405 wstr = "The condition number is large, %6.3g. This might "
406 wstr += "indicate that there are\n"
407 wstr += "strong multicollinearity or other numerical "
408 wstr += "problems."
409 wstr = wstr % condno
410 etext.append(wstr)
412 if etext:
413 smry.add_extra_txt(etext)
415 return smry