1"""
2Multivariate Conditional and Unconditional Kernel Density Estimation
3with Mixed Data Types
5References
6----------
7[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
8 Princeton University Press. (2007)
9[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation
10 and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
11 http://dx.doi.org/10.1561/0800000009
12[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
13 with Categorical and Continuous Data." Working Paper. (2000)
14[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional
15 Distributions Annals of Economics and Finance 5, 211-235 (2004)
16[5] Liu, R., Yang, L. "Kernel estimation of multivariate
17 cumulative distribution function."
18 Journal of Nonparametric Statistics (2008)
19[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
20 with Categorical and Continuous Data." Working Paper
21[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
22 regression" Statistica Sinica 14(2004), pp. 485-512
23[8] Racine, J.: "Consistent Significance Testing for Nonparametric
24 Regression" Journal of Business & Economics Statistics
25[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
26 Categorical Predictor Variables in Nonparametric Regression
27 Models", 2006, Econometric Reviews 25, 523-544
29"""
# TODO: make default behavior efficient=True above a certain n_obs
import copy

import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles

from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \
    LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func


__all__ = ['KernelReg', 'KernelCensoredReg']
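

# A minimal usage sketch (illustrative only; the data below are simulated and
# the variable names are assumptions, not part of this module):
#
#     import numpy as np
#     from statsmodels.nonparametric.kernel_regression import KernelReg
#
#     rng = np.random.RandomState(12345)
#     x = rng.uniform(-2, 2, size=200)
#     y = np.sin(x) + 0.2 * rng.standard_normal(200)
#     # Local-constant (Nadaraya-Watson) fit with least-squares CV bandwidth
#     model = KernelReg(endog=y, exog=x, var_type='c', reg_type='lc')
#     y_hat, marginal_effects = model.fit()
#     print(model.bw, model.r_squared())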


class KernelReg(GenericKDE):
    """
    Nonparametric kernel regression class.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``.
    Note that the "local constant" type of regression provided here is also
    known as Nadaraya-Watson kernel regression; "local linear" is an extension
    of that which suffers less from bias issues at the edge of the support. Note
    that specifying a custom kernel works only with "local linear" kernel
    regression. For example, a custom ``tricube`` kernel yields LOESS regression.

    Parameters
    ----------
    endog : array_like
        This is the dependent variable.
    exog : array_like
        The training data for the independent variable(s).
        Each element in the list is a separate variable.
    var_type : str
        The type of the variables, one character per variable:

        - c: continuous
        - u: unordered (discrete)
        - o: ordered (discrete)

    reg_type : {'lc', 'll'}, optional
        Type of regression estimator. 'lc' means local constant and
        'll' local linear estimator. Default is 'll'.
    bw : str or array_like, optional
        Either a user-specified bandwidth or the method for bandwidth
        selection. If a string, valid values are 'cv_ls' (least-squares
        cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
        Default is 'cv_ls'. A user-specified bandwidth must have as many
        entries as the number of variables.
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.
    """
    def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls',
                 ckertype='gaussian', okertype='wangryzin',
                 ukertype='aitchisonaitken', defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not isinstance(bw, str):
            bw = np.asarray(bw)
            if len(bw) != self.k_vars:
                raise ValueError('bw must have the same dimension as the '
                                 'number of variables.')
        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def _compute_reg_bw(self, bw):
        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            return np.asarray(bw)
        else:
            # The user specified a bandwidth selection method e.g. 'cv_ls'
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == 'cv_ls':
                res = self.cv_loo
            else:  # bw == 'aic'
                res = self.aic_hurvich
            X = np.std(self.exog, axis=0)
            h0 = 1.06 * X * \
                 self.nobs ** (- 1. / (4 + np.size(self.exog, axis=1)))

            func = self.est[self.reg_type]
            bw_estimated = optimize.fmin(res, x0=h0, args=(func, ),
                                         maxiter=1e3, maxfun=1e3, disp=0)
            return bw_estimated

    def _est_loc_linear(self, bw, endog, exog, data_predict):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like
            The point at which the density is estimated; length K, where K is
            the number of variables.

        Returns
        -------
        D_x : array_like
            The value of the conditional mean at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype,
                   tosum=False) / float(nobs)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]
        #ix_cont = np.arange(self.k_vars)  # Use all vars instead of continuous only
        # Note: because ix_cont was defined here such that it selected all
        # columns, I removed the indexing with it from exog/data_predict.

        # Convert ker to a 2-D array to make matrix operations below work
        ker = ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def _est_loc_constant(self, bw, endog, exog, data_predict):
        """
        Local constant estimator of g(x) in the regression
        ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Array of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D or 2D array_like
            The point(s) at which the density is estimated.

        Returns
        -------
        G : ndarray
            The value of the conditional mean at `data_predict`.
        B_x : ndarray
            The marginal effects.
        """
        ker_x = gpke(bw, data=exog, data_predict=data_predict,
                     var_type=self.var_type,
                     ckertype=self.ckertype,
                     ukertype=self.ukertype,
                     okertype=self.okertype,
                     tosum=False)
        ker_x = np.reshape(ker_x, np.shape(endog))
        G_numer = (ker_x * endog).sum(axis=0)
        G_denom = ker_x.sum(axis=0)
        G = G_numer / G_denom
        nobs = exog.shape[0]
        f_x = G_denom / float(nobs)
        ker_xc = gpke(bw, data=exog, data_predict=data_predict,
                      var_type=self.var_type,
                      ckertype='d_gaussian',
                      #okertype='wangryzin_reg',
                      tosum=False)

        ker_xc = ker_xc[:, np.newaxis]
        d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs)  #* np.prod(bw[:, ix_cont]))
        d_fx = -ker_xc.sum(axis=0) / float(nobs)  #* np.prod(bw[:, ix_cont]))
        B_x = d_mx / f_x - G * d_fx / f_x
        B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2)
        #B_x = (f_x * d_mx - m_x * d_fx) / (f_x ** 2)
        return G, B_x

    def aic_hurvich(self, bw, func=None):
        """
        Computes the AIC Hurvich criterion for the estimation of the bandwidth.

        Parameters
        ----------
        bw : str or array_like
            See the ``bw`` parameter of `KernelReg` for details.
        func : None
            Unused here; included in the signature for compatibility with
            `cv_loo`.

        Returns
        -------
        aic : ndarray
            The AIC Hurvich criterion, one element for each variable.

        References
        ----------
        See ch. 2 in [1] and p. 35 in [2].
        """
        H = np.empty((self.nobs, self.nobs))
        for j in range(self.nobs):
            H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j, :],
                           ckertype=self.ckertype, ukertype=self.ukertype,
                           okertype=self.okertype, var_type=self.var_type,
                           tosum=False)

        denom = H.sum(axis=1)
        H = H / denom
        gx = KernelReg(endog=self.endog, exog=self.exog, var_type=self.var_type,
                       reg_type=self.reg_type, bw=bw,
                       defaults=EstimatorSettings(efficient=False)).fit()[0]
        gx = np.reshape(gx, (self.nobs, 1))
        sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs)

        frac = (1 + np.trace(H) / float(self.nobs)) / \
               (1 - (np.trace(H) + 2) / float(self.nobs))
        #siga = np.dot(self.endog.T, (I - H).T)
        #sigb = np.dot((I - H), self.endog)
        #sigma = np.dot(siga, sigb) / float(self.nobs)
        aic = np.log(sigma) + frac
        return aic
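
    # Bandwidth selection sketch (illustrative only; `y` and `x` are assumed
    # to be user-supplied arrays, not defined in this module): passing
    # bw='aic' minimizes `aic_hurvich`, bw='cv_ls' minimizes `cv_loo`, and an
    # explicit array skips data-driven selection entirely.
    #
    #     KernelReg(y, x, var_type='c', reg_type='lc', bw='aic')
    #     KernelReg(y, x, var_type='c', reg_type='lc', bw='cv_ls')
    #     KernelReg(y, x, var_type='c', reg_type='lc', bw=[0.5])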

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable function
            Returns the estimator of g(x). Can be either ``_est_loc_constant``
            (local constant) or ``_est_loc_linear`` (local linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This function
        is minimized by compute_bw to calculate the optimal value of `bw`.

        For details see p. 35 in [2]

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.
        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :])[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def r_squared(self):
        r"""
        Returns the R-Squared for the nonparametric regression.

        Notes
        -----
        For more details see p. 45 in [2].
        The R-Squared is calculated by:

        .. math:: R^{2}=\frac{\left[\sum_{i=1}^{n}
            (Y_{i}-\bar{y})(\hat{Y_{i}}-\bar{y})\right]^{2}}{\sum_{i=1}^{n}
            (Y_{i}-\bar{y})^{2}\sum_{i=1}^{n}(\hat{Y_{i}}-\bar{y})^{2}},

        where :math:`\hat{Y_{i}}` is the mean calculated in `fit` at the exog
        points.
        """
        Y = np.squeeze(self.endog)
        Yhat = self.fit()[0]
        Y_bar = np.mean(Yhat)
        R2_numer = (((Y - Y_bar) * (Yhat - Y_bar)).sum())**2
        R2_denom = ((Y - Y_bar)**2).sum(axis=0) * \
                   ((Yhat - Y_bar)**2).sum(axis=0)
        return R2_numer / R2_denom

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.

        Parameters
        ----------
        data_predict : array_like, optional
            Points at which to return the mean and marginal effects. If not
            given, ``data_predict == exog``.

        Returns
        -------
        mean : ndarray
            The regression result for the mean (i.e. the actual curve).
        mfx : ndarray
            The marginal effects, i.e. the partial derivatives of the mean.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :])
            mean[i] = mean_mfx[0]
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx
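
    # Prediction sketch (illustrative only; `model` is assumed to be a fitted
    # KernelReg instance as in the module-level example above):
    #
    #     x_grid = np.linspace(-2, 2, 50)
    #     mean, mfx = model.fit(data_predict=x_grid)
    #     # `mean` is the estimated E[y|X] on the grid; `mfx` holds the
    #     # estimated partial derivatives, one column per variable.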

    def sig_test(self, var_pos, nboot=50, nested_res=25, pivot=False):
        """
        Significance test for the variables in the regression.

        Parameters
        ----------
        var_pos : sequence
            The position of the variable in exog to be tested.
        nboot : int, optional
            Number of bootstrap samples used to determine the distribution
            of the test statistic.
        nested_res : int, optional
            Number of nested resamples used to calculate lambda
            (continuous variables only, and only when `pivot` is True).
        pivot : bool, optional
            Whether to pivot the test statistic (continuous variables only).

        Returns
        -------
        sig : str
            The level of significance:

            - `*` : at 90% confidence level
            - `**` : at 95% confidence level
            - `***` : at 99% confidence level
            - "Not Significant" : if not significant
        """
        var_pos = np.asarray(var_pos)
        ix_cont, ix_ord, ix_unord = _get_type_pos(self.var_type)
        if np.any(ix_cont[var_pos]):
            if np.any(ix_ord[var_pos]) or np.any(ix_unord[var_pos]):
                raise ValueError("Discrete variable in hypothesis. Must be continuous")

            Sig = TestRegCoefC(self, var_pos, nboot, nested_res, pivot)
        else:
            Sig = TestRegCoefD(self, var_pos, nboot)

        return Sig.sig
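
    # Significance-test sketch (illustrative only; `model` is assumed to be a
    # fitted KernelReg instance and variable 0 to be continuous):
    #
    #     print(model.sig_test([0], nboot=100))
    #     # -> one of "Not Significant", "*", "**", "***"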

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples: N = " + str(self.nobs) + "\n"
        rpr += "Variable types: " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KernelReg'
        class_vars = (self.var_type, self.k_vars, self.reg_type)
        return class_type, class_vars

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion.

        The minimum of the standard deviation and interquartile range / 1.349.

        References
        ----------
        See the user guide for the np package in R.
        The notes on the bwscaling option in npreg, npudens and npcdens
        contain a discussion of this measure of dispersion.
        """
        data = data[:, 1:]
        return _compute_min_std_IQR(data)


class KernelCensoredReg(KernelReg):
    """
    Nonparametric censored regression.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``,
    where y is left-censored. The left-censored variable Y is defined as
    ``Y = min {Y', L}`` where ``L`` is the value at which ``Y`` is censored
    and ``Y'`` is the true value of the variable.

    Parameters
    ----------
    endog : list with one element which is array_like
        This is the dependent variable.
    exog : list
        The training data for the independent variable(s).
        Each element in the list is a separate variable.
    var_type : str
        The type of the variables, one character per variable:

        - c: continuous
        - u: unordered (discrete)
        - o: ordered (discrete)

    reg_type : {'lc', 'll'}
        Type of regression estimator: 'lc' is local constant and
        'll' is local linear.
    bw : str or array_like
        Either a user-specified bandwidth or the method for bandwidth
        selection: 'cv_ls' (cross-validation least squares) or
        'aic' (AIC Hurvich estimator).
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    censor_val : float
        Value at which the dependent variable is censored.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.
    """
    def __init__(self, endog, exog, var_type, reg_type, bw='cv_ls',
                 ckertype='gaussian',
                 ukertype='aitchison_aitken_reg',
                 okertype='wangryzin_reg',
                 censor_val=0, defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        self.censor_val = censor_val
        if self.censor_val is not None:
            self.censored(censor_val)
        else:
            self.W_in = np.ones((self.nobs, 1))

        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def censored(self, censor_val):
        # see pp. 341-344 in [1]
        self.d = (self.endog != censor_val) * 1.
        ix = np.argsort(np.squeeze(self.endog))
        self.sortix = ix
        self.sortix_rev = np.zeros(ix.shape, int)
        self.sortix_rev[ix] = np.arange(len(ix))
        self.endog = np.squeeze(self.endog[ix])
        self.endog = _adjust_shape(self.endog, 1)
        self.exog = np.squeeze(self.exog[ix])
        self.d = np.squeeze(self.d[ix])
        self.W_in = np.empty((self.nobs, 1))
        for i in range(1, self.nobs + 1):
            P = 1
            for j in range(1, i):
                P *= ((self.nobs - j) / (float(self.nobs) - j + 1)) ** self.d[j - 1]
            self.W_in[i - 1, 0] = P * self.d[i - 1] / (float(self.nobs) - i + 1)

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelCensoredReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples: nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types: " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _est_loc_linear(self, bw, endog, exog, data_predict, W):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like
            The point at which the density is estimated; length K, where K is
            the number of variables.
        W : 2D array_like
            The censoring weights, one weight per observation.

        Returns
        -------
        D_x : array_like
            The value of the conditional mean at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype, tosum=False)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]

        # Convert ker to a 2-D array to make matrix operations below work
        ker = W * ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable function
            Returns the estimator of g(x). Can be either ``_est_loc_constant``
            (local constant) or ``_est_loc_linear`` (local linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This function
        is minimized by compute_bw to calculate the optimal value of `bw`.

        For details see p. 35 in [2]

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.
        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        LOO_W = LeaveOneOut(self.W_in).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            w = next(LOO_W)
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :], W=w)[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :],
                            W=self.W_in)
            mean[i] = mean_mfx[0]
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx
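
    # Censored-regression sketch (illustrative only; `y` and `x` are assumed
    # to be user-supplied arrays as in the module-level example). Observations
    # exactly equal to `censor_val` are flagged as censored and re-weighted by
    # the weights built in `censored()`:
    #
    #     y_cens = np.clip(y, 0.0, None)  # values below 0 recorded as 0
    #     cmodel = KernelCensoredReg(y_cens, x, var_type='c', reg_type='ll',
    #                                censor_val=0.0)
    #     mean_c, mfx_c = cmodel.fit()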


class TestRegCoefC(object):
    """
    Significance test for continuous variables in a nonparametric regression.

    The null hypothesis is ``dE(Y|X)/dX_i = 0`` for each tested variable
    ``X_i``; the alternative hypothesis is ``dE(Y|X)/dX_i != 0``.

    Parameters
    ----------
    model : KernelReg instance
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of integers, array_like
        Index of position of the continuous variables to be tested
        for significance. E.g. (1, 3, 5) jointly tests variables at
        positions 1, 3 and 5 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400.
    nested_res : int
        Number of nested resamples used to calculate lambda.
        Only used when the pivot option is enabled.
    pivot : bool
        Pivot the test statistic by dividing by its standard error.
        This significantly increases computational time, but pivotal
        statistics have more desirable properties (see references).

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested:

        - "Not Significant": Not significant at the 90% confidence level;
          fails to reject the null.
        - "*": Significant at the 90% confidence level.
        - "**": Significant at the 95% confidence level.
        - "***": Significant at the 99% confidence level.

    Notes
    -----
    This class allows testing of joint hypotheses as long as all variables
    are continuous.

    References
    ----------
    Racine, J. "Consistent Significance Testing for Nonparametric Regression."
    Journal of Business & Economic Statistics.

    Chapter 12 in [1].
    """
    # Significance of continuous vars in nonparametric regression
    # Racine: Consistent Significance Testing for Nonparametric Regression
    # Journal of Business & Economic Statistics
    def __init__(self, model, test_vars, nboot=400, nested_res=400,
                 pivot=False):
        self.nboot = nboot
        self.nres = nested_res
        self.test_vars = test_vars
        self.model = model
        self.bw = model.bw
        self.var_type = model.var_type
        self.k_vars = len(self.var_type)
        self.endog = model.endog
        self.exog = model.exog
        self.gx = model.est[model.reg_type]
        self.test_vars = test_vars
        self.pivot = pivot
        self.run()

    def run(self):
        self.test_stat = self._compute_test_stat(self.endog, self.exog)
        self.sig = self._compute_sig()

    def _compute_test_stat(self, Y, X):
        """
        Computes the test statistic. See p.371 in [8].
        """
        lam = self._compute_lambda(Y, X)
        t = lam
        if self.pivot:
            se_lam = self._compute_se_lambda(Y, X)
            t = lam / float(se_lam)

        return t

    def _compute_lambda(self, Y, X):
        """Computes only lambda -- the main part of the test statistic"""
        n = np.shape(X)[0]
        Y = _adjust_shape(Y, 1)
        X = _adjust_shape(X, self.k_vars)
        b = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                      defaults=EstimatorSettings(efficient=False)).fit()[1]

        b = b[:, self.test_vars]
        b = np.reshape(b, (n, len(self.test_vars)))
        #fct = np.std(b)  # Pivot the statistic by dividing by SE
        fct = 1.  # Do not Pivot -- Bootstrapping works better if Pivot
        lam = ((b / fct) ** 2).sum() / float(n)
        return lam

    def _compute_se_lambda(self, Y, X):
        """
        Calculates the SE of lambda by nested resampling.
        Used to pivot the statistic.
        Bootstrapping works better with estimating pivotal statistics
        but slows down computation significantly.
        """
        n = np.shape(Y)[0]
        lam = np.empty(shape=(self.nres,))
        for i in range(self.nres):
            ind = np.random.randint(0, n, size=(n, 1))
            Y1 = Y[ind, 0]
            X1 = X[ind, :]
            lam[i] = self._compute_lambda(Y1, X1)

        se_lambda = np.std(lam)
        return se_lambda

    def _compute_sig(self):
        """
        Computes the significance value for the variable(s) tested.

        The empirical distribution of the test statistic is obtained through
        bootstrapping the sample. The null hypothesis is rejected if the test
        statistic is larger than the 90, 95, 99 percentiles.
        """
        t_dist = np.empty(shape=(self.nboot, ))
        Y = self.endog
        X = copy.deepcopy(self.exog)
        n = np.shape(Y)[0]

        X[:, self.test_vars] = np.mean(X[:, self.test_vars], axis=0)
        # Calculate the restricted mean. See p. 372 in [8]
        M = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                      defaults=EstimatorSettings(efficient=False)).fit()[0]
        M = np.reshape(M, (n, 1))
        e = Y - M
        e = e - np.mean(e)  # recenter residuals
        for i in range(self.nboot):
            ind = np.random.randint(0, n, size=(n, 1))
            e_boot = e[ind, 0]
            Y_boot = M + e_boot
            t_dist[i] = self._compute_test_stat(Y_boot, self.exog)

        self.t_dist = t_dist
        sig = "Not Significant"
        if self.test_stat > mquantiles(t_dist, 0.9):
            sig = "*"
        if self.test_stat > mquantiles(t_dist, 0.95):
            sig = "**"
        if self.test_stat > mquantiles(t_dist, 0.99):
            sig = "***"

        return sig


class TestRegCoefD(TestRegCoefC):
    """
    Significance test for the categorical variables in a nonparametric
    regression.

    Parameters
    ----------
    model : Instance of KernelReg class
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of one element
        Index of position of the discrete variable to be tested
        for significance. E.g. (3,) tests the variable at
        position 3 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400.

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested:

        - "Not Significant": Not significant at the 90% confidence level;
          fails to reject the null.
        - "*": Significant at the 90% confidence level.
        - "**": Significant at the 95% confidence level.
        - "***": Significant at the 99% confidence level.

    Notes
    -----
    This class currently does not allow joint hypotheses.
    Only one variable can be tested at a time.

    References
    ----------
    See [9] and chapter 12 in [1].
    """

    def _compute_test_stat(self, Y, X):
        """Computes the test statistic"""

        dom_x = np.sort(np.unique(self.exog[:, self.test_vars]))

        n = np.shape(X)[0]
        model = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                          defaults=EstimatorSettings(efficient=False))
        X1 = copy.deepcopy(X)
        X1[:, self.test_vars] = 0

        m0 = model.fit(data_predict=X1)[0]
        m0 = np.reshape(m0, (n, 1))
        zvec = np.zeros((n, 1))  # noqa:E741
        for i in dom_x[1:]:
            X1[:, self.test_vars] = i
            m1 = model.fit(data_predict=X1)[0]
            m1 = np.reshape(m1, (n, 1))
            zvec += (m1 - m0) ** 2  # noqa:E741

        avg = zvec.sum(axis=0) / float(n)
        return avg

    def _compute_sig(self):
        """Calculates the significance level of the variable tested"""

        m = self._est_cond_mean()
        Y = self.endog
        X = self.exog
        n = np.shape(X)[0]
        u = Y - m
        u = u - np.mean(u)  # center
        fct1 = (1 - 5**0.5) / 2.
        fct2 = (1 + 5**0.5) / 2.
        u1 = fct1 * u
        u2 = fct2 * u
        r = fct2 / (5 ** 0.5)
        I_dist = np.empty((self.nboot, 1))
        for j in range(self.nboot):
            u_boot = copy.deepcopy(u2)

            prob = np.random.uniform(0, 1, size=(n, 1))
            ind = prob < r
            u_boot[ind] = u1[ind]
            Y_boot = m + u_boot
            I_dist[j] = self._compute_test_stat(Y_boot, X)

        sig = "Not Significant"
        if self.test_stat > mquantiles(I_dist, 0.9):
            sig = "*"
        if self.test_stat > mquantiles(I_dist, 0.95):
            sig = "**"
        if self.test_stat > mquantiles(I_dist, 0.99):
            sig = "***"

        return sig

    def _est_cond_mean(self):
        """
        Calculates the expected conditional mean
        m(X, Z=l) for all possible l.
        """
        self.dom_x = np.sort(np.unique(self.exog[:, self.test_vars]))
        X = copy.deepcopy(self.exog)
        m = 0
        for i in self.dom_x:
            X[:, self.test_vars] = i
            m += self.model.fit(data_predict=X)[0]

        m = m / float(len(self.dom_x))
        m = np.reshape(m, (np.shape(self.exog)[0], 1))
        return m