Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/sandbox/nonparametric/kernels.py : 21%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3"""
4This models contains the Kernels for Kernel smoothing.
6Hopefully in the future they may be reused/extended for other kernel based
7method
9References:
10----------
12Pointwise Kernel Confidence Bounds
13(smoothconf)
14http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html
15"""
17# pylint: disable-msg=C0103
18# pylint: disable-msg=W0142
19# pylint: disable-msg=E1101
20# pylint: disable-msg=E0611
21from statsmodels.compat.python import lzip, lfilter
22import numpy as np
23import scipy.integrate
24from scipy.special import factorial
25from numpy import exp, multiply, square, divide, subtract, inf
28class NdKernel(object):
29 """Generic N-dimensial kernel
31 Parameters
32 ----------
33 n : int
34 The number of series for kernel estimates
35 kernels : list
36 kernels
38 Can be constructed from either
39 a) a list of n kernels which will be treated as
40 indepent marginals on a gaussian copula (specified by H)
41 or b) a single univariate kernel which will be applied radially to the
42 mahalanobis distance defined by H.
44 In the case of the Gaussian these are both equivalent, and the second constructiong
45 is prefered.
46 """
47 def __init__(self, n, kernels = None, H = None):
48 if kernels is None:
49 kernels = Gaussian()
51 self._kernels = kernels
52 self.weights = None
54 if H is None:
55 H = np.matrix( np.identity(n))
57 self._H = H
58 self._Hrootinv = np.linalg.cholesky( H.I )
60 def getH(self):
61 """Getter for kernel bandwidth, H"""
62 return self._H
64 def setH(self, value):
65 """Setter for kernel bandwidth, H"""
66 self._H = value
68 H = property(getH, setH, doc="Kernel bandwidth matrix")
70 def density(self, xs, x):
72 n = len(xs)
73 #xs = self.in_domain( xs, xs, x )[0]
75 if len(xs)>0: ## Need to do product of marginal distributions
76 #w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
77 #vectorized does not work:
78 if self.weights is not None:
79 w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights)/sum(self.weights)
80 else:
81 w = np.mean(self((xs-x) * self._Hrootinv )) #transposed
82 #w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed
83 return w
84 else:
85 return np.nan
87 def _kernweight(self, x ):
88 """returns the kernel weight for the independent multivariate kernel"""
89 if isinstance( self._kernels, CustomKernel ):
90 ## Radial case
91 #d = x.T * x
92 #x is matrix, 2d, element wise sqrt looks wrong
93 #d = np.sqrt( x.T * x )
94 x = np.asarray(x)
95 #d = np.sqrt( (x * x).sum(-1) )
96 d = (x * x).sum(-1)
97 return self._kernels( np.asarray(d) )
99 def __call__(self, x):
100 """
101 This simply returns the value of the kernel function at x
103 Does the same as weight if the function is normalised
104 """
105 return self._kernweight(x)
108class CustomKernel(object):
109 """
110 Generic 1D Kernel object.
111 Can be constructed by selecting a standard named Kernel,
112 or providing a lambda expression and domain.
113 The domain allows some algorithms to run faster for finite domain kernels.
114 """
115 # MC: Not sure how this will look in the end - or even still exist.
116 # Main purpose of this is to allow custom kernels and to allow speed up
117 # from finite support.
119 def __init__(self, shape, h = 1.0, domain = None, norm = None):
120 """
121 shape should be a function taking and returning numeric type.
123 For sanity it should always return positive or zero but this is not
124 enforced in case you want to do weird things. Bear in mind that the
125 statistical tests etc. may not be valid for non-positive kernels.
127 The bandwidth of the kernel is supplied as h.
129 You may specify a domain as a list of 2 values [min, max], in which case
130 kernel will be treated as zero outside these values. This will speed up
131 calculation.
133 You may also specify the normalisation constant for the supplied Kernel.
134 If you do this number will be stored and used as the normalisation
135 without calculation. It is recommended you do this if you know the
136 constant, to speed up calculation. In particular if the shape function
137 provided is already normalised you should provide norm = 1.0.
139 Warning: I think several calculations assume that the kernel is
140 normalized. No tests for non-normalized kernel.
141 """
142 self._normconst = norm # a value or None, if None, then calculate
143 self.domain = domain
144 self.weights = None
145 if callable(shape):
146 self._shape = shape
147 else:
148 raise TypeError("shape must be a callable object/function")
149 self._h = h
150 self._L2Norm = None
151 self._kernel_var = None
152 self._normal_reference_constant = None
153 self._order = None
155 def geth(self):
156 """Getter for kernel bandwidth, h"""
157 return self._h
158 def seth(self, value):
159 """Setter for kernel bandwidth, h"""
160 self._h = value
161 h = property(geth, seth, doc="Kernel Bandwidth")
163 def in_domain(self, xs, ys, x):
164 """
165 Returns the filtered (xs, ys) based on the Kernel domain centred on x
166 """
167 # Disable black-list functions: filter used for speed instead of
168 # list-comprehension
169 # pylint: disable-msg=W0141
170 def isInDomain(xy):
171 """Used for filter to check if point is in the domain"""
172 u = (xy[0]-x)/self.h
173 return u >= self.domain[0] and u <= self.domain[1]
175 if self.domain is None:
176 return (xs, ys)
177 else:
178 filtered = lfilter(isInDomain, lzip(xs, ys))
179 if len(filtered) > 0:
180 xs, ys = lzip(*filtered)
181 return (xs, ys)
182 else:
183 return ([], [])
185 def density(self, xs, x):
186 """Returns the kernel density estimate for point x based on x-values
187 xs
188 """
189 xs = np.asarray(xs)
190 n = len(xs) # before in_domain?
191 if self.weights is not None:
192 xs, weights = self.in_domain( xs, self.weights, x )
193 else:
194 xs = self.in_domain( xs, xs, x )[0]
195 xs = np.asarray(xs)
196 #print 'len(xs)', len(xs), x
197 if xs.ndim == 1:
198 xs = xs[:,None]
199 if len(xs)>0:
200 h = self.h
201 if self.weights is not None:
202 w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1)
203 else:
204 w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0)
205 return w
206 else:
207 return np.nan
209 def density_var(self, density, nobs):
210 """approximate pointwise variance for kernel density
212 not verified
214 Parameters
215 ----------
216 density : array_lie
217 pdf of the kernel density
218 nobs : int
219 number of observations used in the KDE estimation
221 Returns
222 -------
223 kde_var : ndarray
224 estimated variance of the density estimate
226 Notes
227 -----
228 This uses the asymptotic normal approximation to the distribution of
229 the density estimate.
230 """
231 return np.asarray(density) * self.L2Norm / self.h / nobs
233 def density_confint(self, density, nobs, alpha=0.05):
234 """approximate pointwise confidence interval for kernel density
236 The confidence interval is centered at the estimated density and
237 ignores the bias of the density estimate.
239 not verified
241 Parameters
242 ----------
243 density : array_lie
244 pdf of the kernel density
245 nobs : int
246 number of observations used in the KDE estimation
248 Returns
249 -------
250 conf_int : ndarray
251 estimated confidence interval of the density estimate, lower bound
252 in first column and upper bound in second column
254 Notes
255 -----
256 This uses the asymptotic normal approximation to the distribution of
257 the density estimate. The lower bound can be negative for density
258 values close to zero.
259 """
260 from scipy import stats
261 crit = stats.norm.isf(alpha / 2.)
262 density = np.asarray(density)
263 half_width = crit * np.sqrt(self.density_var(density, nobs))
264 conf_int = np.column_stack((density - half_width, density + half_width))
265 return conf_int
267 def smooth(self, xs, ys, x):
268 """Returns the kernel smoothing estimate for point x based on x-values
269 xs and y-values ys.
270 Not expected to be called by the user.
271 """
272 xs, ys = self.in_domain(xs, ys, x)
274 if len(xs)>0:
275 w = np.sum(self((xs-x)/self.h))
276 #TODO: change the below to broadcasting when shape is sorted
277 v = np.sum([yy*self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
278 return v / w
279 else:
280 return np.nan
282 def smoothvar(self, xs, ys, x):
283 """Returns the kernel smoothing estimate of the variance at point x.
284 """
285 xs, ys = self.in_domain(xs, ys, x)
287 if len(xs) > 0:
288 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
289 sqresid = square( subtract(ys, fittedvals) )
290 w = np.sum(self((xs-x)/self.h))
291 v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
292 return v / w
293 else:
294 return np.nan
296 def smoothconf(self, xs, ys, x, alpha=0.05):
297 """Returns the kernel smoothing estimate with confidence 1sigma bounds
298 """
299 xs, ys = self.in_domain(xs, ys, x)
301 if len(xs) > 0:
302 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
303 #fittedvals = self.smooth(xs, ys, x) # x or xs in Haerdle
304 sqresid = square(
305 subtract(ys, fittedvals)
306 )
307 w = np.sum(self((xs-x)/self.h))
308 #var = sqresid.sum() / (len(sqresid) - 0) # nonlocal var ? JP just trying
309 v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
310 var = v / w
311 sd = np.sqrt(var)
312 K = self.L2Norm
313 yhat = self.smooth(xs, ys, x)
314 from scipy import stats
315 crit = stats.norm.isf(alpha / 2)
316 err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
317 return (yhat - err, yhat, yhat + err)
318 else:
319 return (np.nan, np.nan, np.nan)
321 @property
322 def L2Norm(self):
323 """Returns the integral of the square of the kernal from -inf to inf"""
324 if self._L2Norm is None:
325 L2Func = lambda x: (self.norm_const*self._shape(x))**2
326 if self.domain is None:
327 self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0]
328 else:
329 self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0],
330 self.domain[1])[0]
331 return self._L2Norm
333 @property
334 def norm_const(self):
335 """
336 Normalising constant for kernel (integral from -inf to inf)
337 """
338 if self._normconst is None:
339 if self.domain is None:
340 quadres = scipy.integrate.quad(self._shape, -inf, inf)
341 else:
342 quadres = scipy.integrate.quad(self._shape, self.domain[0],
343 self.domain[1])
344 self._normconst = 1.0/(quadres[0])
345 return self._normconst
347 @property
348 def kernel_var(self):
349 """Returns the second moment of the kernel"""
350 if self._kernel_var is None:
351 func = lambda x: x**2 * self.norm_const * self._shape(x)
352 if self.domain is None:
353 self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0]
354 else:
355 self._kernel_var = scipy.integrate.quad(func, self.domain[0],
356 self.domain[1])[0]
357 return self._kernel_var
359 def moments(self, n):
361 if n > 2:
362 msg = "Only first and second moment currently implemented"
363 raise NotImplementedError(msg)
365 if n == 1:
366 return 0
368 if n == 2:
369 return self.kernel_var
371 @property
372 def normal_reference_constant(self):
373 """
374 Constant used for silverman normal reference asymtotic bandwidth
375 calculation.
377 C = 2((pi^(1/2)*(nu!)^3 R(k))/(2nu(2nu)!kap_nu(k)^2))^(1/(2nu+1))
378 nu = kernel order
379 kap_nu = nu'th moment of kernel
380 R = kernel roughness (square of L^2 norm)
382 Note: L2Norm property returns square of norm.
383 """
384 nu = self._order
386 if not nu == 2:
387 msg = "Only implemented for second order kernels"
388 raise NotImplementedError(msg)
390 if self._normal_reference_constant is None:
391 C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm
392 C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2)
393 C = 2*C**(1.0/(2*nu+1))
394 self._normal_reference_constant = C
396 return self._normal_reference_constant
399 def weight(self, x):
400 """This returns the normalised weight at distance x"""
401 return self.norm_const*self._shape(x)
403 def __call__(self, x):
404 """
405 This simply returns the value of the kernel function at x
407 Does the same as weight if the function is normalised
408 """
409 return self._shape(x)
412class Uniform(CustomKernel):
413 def __init__(self, h=1.0):
414 CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape), h=h,
415 domain=[-1.0, 1.0], norm = 1.0)
416 self._L2Norm = 0.5
417 self._kernel_var = 1. / 3
418 self._order = 2
421class Triangular(CustomKernel):
422 def __init__(self, h=1.0):
423 CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h,
424 domain=[-1.0, 1.0], norm = 1.0)
425 self._L2Norm = 2.0/3.0
426 self._kernel_var = 1. / 6
427 self._order = 2
430class Epanechnikov(CustomKernel):
431 def __init__(self, h=1.0):
432 CustomKernel.__init__(self, shape=lambda x: 0.75*(1 - x*x), h=h,
433 domain=[-1.0, 1.0], norm = 1.0)
434 self._L2Norm = 0.6
435 self._kernel_var = 0.2
436 self._order = 2
439class Biweight(CustomKernel):
440 def __init__(self, h=1.0):
441 CustomKernel.__init__(self, shape=lambda x: 0.9375*(1 - x*x)**2, h=h,
442 domain=[-1.0, 1.0], norm = 1.0)
443 self._L2Norm = 5.0/7.0
444 self._kernel_var = 1. / 7
445 self._order = 2
447 def smooth(self, xs, ys, x):
448 """Returns the kernel smoothing estimate for point x based on x-values
449 xs and y-values ys.
450 Not expected to be called by the user.
452 Special implementation optimised for Biweight.
453 """
454 xs, ys = self.in_domain(xs, ys, x)
456 if len(xs) > 0:
457 w = np.sum(square(subtract(1, square(divide(subtract(xs, x),
458 self.h)))))
459 v = np.sum(multiply(ys, square(subtract(1, square(divide(
460 subtract(xs, x), self.h))))))
461 return v / w
462 else:
463 return np.nan
465 def smoothvar(self, xs, ys, x):
466 """
467 Returns the kernel smoothing estimate of the variance at point x.
468 """
469 xs, ys = self.in_domain(xs, ys, x)
471 if len(xs) > 0:
472 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
473 rs = square(subtract(ys, fittedvals))
474 w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
475 self.h)))))
476 v = np.sum(multiply(rs, square(subtract(1, square(divide(
477 subtract(xs, x), self.h))))))
478 return v / w
479 else:
480 return np.nan
482 def smoothconf_(self, xs, ys, x):
483 """Returns the kernel smoothing estimate with confidence 1sigma bounds
484 """
485 xs, ys = self.in_domain(xs, ys, x)
487 if len(xs) > 0:
488 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
489 rs = square(subtract(ys, fittedvals))
490 w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
491 self.h)))))
492 v = np.sum(multiply(rs, square(subtract(1, square(divide(
493 subtract(xs, x), self.h))))))
494 var = v / w
495 sd = np.sqrt(var)
496 K = self.L2Norm
497 yhat = self.smooth(xs, ys, x)
498 err = sd * K / np.sqrt(0.9375 * w * self.h)
499 return (yhat - err, yhat, yhat + err)
500 else:
501 return (np.nan, np.nan, np.nan)
503class Triweight(CustomKernel):
504 def __init__(self, h=1.0):
505 CustomKernel.__init__(self, shape=lambda x: 1.09375*(1 - x*x)**3, h=h,
506 domain=[-1.0, 1.0], norm = 1.0)
507 self._L2Norm = 350.0/429.0
508 self._kernel_var = 1. / 9
509 self._order = 2
512class Gaussian(CustomKernel):
513 """
514 Gaussian (Normal) Kernel
516 K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2)
517 """
518 def __init__(self, h=1.0):
519 CustomKernel.__init__(self, shape = lambda x: 0.3989422804014327 *
520 np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0)
521 self._L2Norm = 1.0/(2.0*np.sqrt(np.pi))
522 self._kernel_var = 1.0
523 self._order = 2
525 def smooth(self, xs, ys, x):
526 """Returns the kernel smoothing estimate for point x based on x-values
527 xs and y-values ys.
528 Not expected to be called by the user.
530 Special implementation optimised for Gaussian.
531 """
532 w = np.sum(exp(multiply(square(divide(subtract(xs, x),
533 self.h)),-0.5)))
534 v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x),
535 self.h)), -0.5))))
536 return v/w
538class Cosine(CustomKernel):
539 """
540 Cosine Kernel
542 K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0
543 """
544 def __init__(self, h=1.0):
545 CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 *
546 np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0], norm = 1.0)
547 self._L2Norm = np.pi**2/16.0
548 self._kernel_var = 0.1894305308612978 # = 1 - 8 / np.pi**2
549 self._order = 2
552class Cosine2(CustomKernel):
553 """
554 Cosine2 Kernel
556 K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5
558 Note: this is the same Cosine kernel that Stata uses
559 """
560 def __init__(self, h=1.0):
561 CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x)
562 , h=h, domain=[-0.5, 0.5], norm = 1.0)
563 self._L2Norm = 1.5
564 self._kernel_var = 0.03267274151216444 # = 1/12. - 0.5 / np.pi**2
565 self._order = 2