Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Univariate Kernel Density Estimators 

3 

4References 

5---------- 

6Racine, Jeff. (2008) "Nonparametric Econometrics: A Primer," Foundation and 

7 Trends in Econometrics: Vol 3: No 1, pp1-88. 

8 http://dx.doi.org/10.1561/0800000009 

9 

10https://en.wikipedia.org/wiki/Kernel_%28statistics%29 

11 

12Silverman, B.W. Density Estimation for Statistics and Data Analysis. 

13""" 

14import numpy as np 

15from scipy import integrate, stats 

16from statsmodels.sandbox.nonparametric import kernels 

17from statsmodels.tools.decorators import cache_readonly 

18from statsmodels.tools.validation import array_like 

19from . import bandwidths 

20from .kdetools import (forrt, revrt, silverman_transform) 

21from .linbin import fast_linbin 

22 

23#### Kernels Switch for estimators #### 

24 

25kernel_switch = dict(gau=kernels.Gaussian, epa=kernels.Epanechnikov, 

26 uni=kernels.Uniform, tri=kernels.Triangular, 

27 biw=kernels.Biweight, triw=kernels.Triweight, 

28 cos=kernels.Cosine, cos2=kernels.Cosine2) 

29 

30def _checkisfit(self): 

31 try: 

32 self.density 

33 except: 

34 raise ValueError("Call fit to fit the density first") 

35 

36 

37#### Kernel Density Estimator Class ### 

38 

39 

40class KDEUnivariate(object): 

41 """ 

42 Univariate Kernel Density Estimator. 

43 

44 Parameters 

45 ---------- 

46 endog : array_like 

47 The variable for which the density estimate is desired. 

48 

49 Notes 

50 ----- 

51 If cdf, sf, cumhazard, or entropy are computed, they are computed based on 

52 the definition of the kernel rather than the FFT approximation, even if 

53 the density is fit with FFT = True. 

54 

55 `KDEUnivariate` is much faster than `KDEMultivariate`, due to its FFT-based 

56 implementation. It should be preferred for univariate, continuous data. 

57 `KDEMultivariate` also supports mixed data. 

58 

59 See Also 

60 -------- 

61 KDEMultivariate 

62 kdensity, kdensityfft 

63 

64 Examples 

65 -------- 

66 >>> import statsmodels.api as sm 

67 >>> import matplotlib.pyplot as plt 

68 

69 >>> nobs = 300 

70 >>> np.random.seed(1234) # Seed random generator 

71 >>> dens = sm.nonparametric.KDEUnivariate(np.random.normal(size=nobs)) 

72 >>> dens.fit() 

73 >>> plt.plot(dens.cdf) 

74 >>> plt.show() 

75 """ 

76 

77 def __init__(self, endog): 

78 self.endog = array_like(endog, "endog", ndim=1, contiguous=True) 

79 

80 def fit(self, kernel="gau", bw="normal_reference", fft=True, weights=None, 

81 gridsize=None, adjust=1, cut=3, clip=(-np.inf, np.inf)): 

82 """ 

83 Attach the density estimate to the KDEUnivariate class. 

84 

85 Parameters 

86 ---------- 

87 kernel : str 

88 The Kernel to be used. Choices are: 

89 

90 - "biw" for biweight 

91 - "cos" for cosine 

92 - "epa" for Epanechnikov 

93 - "gau" for Gaussian. 

94 - "tri" for triangular 

95 - "triw" for triweight 

96 - "uni" for uniform 

97 

98 bw : str, float 

99 The bandwidth to use. Choices are: 

100 

101 - "scott" - 1.059 * A * nobs ** (-1/5.), where A is 

102 `min(std(X),IQR/1.34)` 

103 - "silverman" - .9 * A * nobs ** (-1/5.), where A is 

104 `min(std(X),IQR/1.34)` 

105 - "normal_reference" - C * A * nobs ** (-1/5.), where C is 

106 calculated from the kernel. Equivalent (up to 2 dp) to the 

107 "scott" bandwidth for gaussian kernels. See bandwidths.py 

108 - If a float is given, it is the bandwidth. 

109 

110 fft : bool 

111 Whether or not to use FFT. FFT implementation is more 

112 computationally efficient. However, only the Gaussian kernel 

113 is implemented. If FFT is False, then a 'nobs' x 'gridsize' 

114 intermediate array is created. 

115 gridsize : int 

116 If gridsize is None, max(len(X), 50) is used. 

117 cut : float 

118 Defines the length of the grid past the lowest and highest values 

119 of X so that the kernel goes to zero. The end points are 

120 -/+ cut*bw*{min(X) or max(X)} 

121 adjust : float 

122 An adjustment factor for the bw. Bandwidth becomes bw * adjust. 

123 """ 

124 try: 

125 bw = float(bw) 

126 self.bw_method = "user-given" 

127 except: 

128 self.bw_method = bw 

129 endog = self.endog 

130 

131 if fft: 

132 if kernel != "gau": 

133 msg = "Only gaussian kernel is available for fft" 

134 raise NotImplementedError(msg) 

135 if weights is not None: 

136 msg = "Weights are not implemented for fft" 

137 raise NotImplementedError(msg) 

138 density, grid, bw = kdensityfft(endog, kernel=kernel, bw=bw, 

139 adjust=adjust, weights=weights, gridsize=gridsize, 

140 clip=clip, cut=cut) 

141 else: 

142 density, grid, bw = kdensity(endog, kernel=kernel, bw=bw, 

143 adjust=adjust, weights=weights, gridsize=gridsize, 

144 clip=clip, cut=cut) 

145 self.density = density 

146 self.support = grid 

147 self.bw = bw 

148 self.kernel = kernel_switch[kernel](h=bw) # we instantiate twice, 

149 # should this passed to funcs? 

150 # put here to ensure empty cache after re-fit with new options 

151 self.kernel.weights = weights 

152 if weights is not None: 

153 self.kernel.weights /= weights.sum() 

154 self._cache = {} 

155 

156 @cache_readonly 

157 def cdf(self): 

158 """ 

159 Returns the cumulative distribution function evaluated at the support. 

160 

161 Notes 

162 ----- 

163 Will not work if fit has not been called. 

164 """ 

165 _checkisfit(self) 

166 kern = self.kernel 

167 if kern.domain is None: # TODO: test for grid point at domain bound 

168 a,b = -np.inf,np.inf 

169 else: 

170 a,b = kern.domain 

171 func = lambda x,s: kern.density(s,x) 

172 

173 support = self.support 

174 support = np.r_[a,support] 

175 gridsize = len(support) 

176 endog = self.endog 

177 probs = [integrate.quad(func, support[i - 1], support[i], 

178 args=endog)[0] for i in range(1, gridsize)] 

179 return np.cumsum(probs) 

180 

181 @cache_readonly 

182 def cumhazard(self): 

183 """ 

184 Returns the hazard function evaluated at the support. 

185 

186 Notes 

187 ----- 

188 Will not work if fit has not been called. 

189 """ 

190 _checkisfit(self) 

191 return -np.log(self.sf) 

192 

193 @cache_readonly 

194 def sf(self): 

195 """ 

196 Returns the survival function evaluated at the support. 

197 

198 Notes 

199 ----- 

200 Will not work if fit has not been called. 

201 """ 

202 _checkisfit(self) 

203 return 1 - self.cdf 

204 

205 @cache_readonly 

206 def entropy(self): 

207 """ 

208 Returns the differential entropy evaluated at the support 

209 

210 Notes 

211 ----- 

212 Will not work if fit has not been called. 1e-12 is added to each 

213 probability to ensure that log(0) is not called. 

214 """ 

215 _checkisfit(self) 

216 

217 def entr(x,s): 

218 pdf = kern.density(s,x) 

219 return pdf*np.log(pdf+1e-12) 

220 

221 kern = self.kernel 

222 

223 if kern.domain is not None: 

224 a, b = self.domain 

225 else: 

226 a, b = -np.inf, np.inf 

227 endog = self.endog 

228 #TODO: below could run into integr problems, cf. stats.dist._entropy 

229 return -integrate.quad(entr, a, b, args=(endog,))[0] 

230 

231 @cache_readonly 

232 def icdf(self): 

233 """ 

234 Inverse Cumulative Distribution (Quantile) Function 

235 

236 Notes 

237 ----- 

238 Will not work if fit has not been called. Uses 

239 `scipy.stats.mstats.mquantiles`. 

240 """ 

241 _checkisfit(self) 

242 gridsize = len(self.density) 

243 return stats.mstats.mquantiles(self.endog, np.linspace(0, 1, gridsize)) 

244 

245 def evaluate(self, point): 

246 """ 

247 Evaluate density at a single point. 

248 

249 Parameters 

250 ---------- 

251 point : float 

252 Point at which to evaluate the density. 

253 """ 

254 _checkisfit(self) 

255 return self.kernel.density(self.endog, point) 

256 

257 

258#### Kernel Density Estimator Functions #### 

259 

260def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, 

261 adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True): 

262 """ 

263 Rosenblatt-Parzen univariate kernel density estimator. 

264 

265 Parameters 

266 ---------- 

267 X : array_like 

268 The variable for which the density estimate is desired. 

269 kernel : str 

270 The Kernel to be used. Choices are 

271 - "biw" for biweight 

272 - "cos" for cosine 

273 - "epa" for Epanechnikov 

274 - "gau" for Gaussian. 

275 - "tri" for triangular 

276 - "triw" for triweight 

277 - "uni" for uniform 

278 bw : str, float 

279 "scott" - 1.059 * A * nobs ** (-1/5.), where A is min(std(X),IQR/1.34) 

280 "silverman" - .9 * A * nobs ** (-1/5.), where A is min(std(X),IQR/1.34) 

281 If a float is given, it is the bandwidth. 

282 weights : array or None 

283 Optional weights. If the X value is clipped, then this weight is 

284 also dropped. 

285 gridsize : int 

286 If gridsize is None, max(len(X), 50) is used. 

287 adjust : float 

288 An adjustment factor for the bw. Bandwidth becomes bw * adjust. 

289 clip : tuple 

290 Observations in X that are outside of the range given by clip are 

291 dropped. The number of observations in X is then shortened. 

292 cut : float 

293 Defines the length of the grid past the lowest and highest values of X 

294 so that the kernel goes to zero. The end points are 

295 -/+ cut*bw*{min(X) or max(X)} 

296 retgrid : bool 

297 Whether or not to return the grid over which the density is estimated. 

298 

299 Returns 

300 ------- 

301 density : ndarray 

302 The densities estimated at the grid points. 

303 grid : ndarray, optional 

304 The grid points at which the density is estimated. 

305 

306 Notes 

307 ----- 

308 Creates an intermediate (`gridsize` x `nobs`) array. Use FFT for a more 

309 computationally efficient version. 

310 """ 

311 X = np.asarray(X) 

312 if X.ndim == 1: 

313 X = X[:, None] 

314 clip_x = np.logical_and(X > clip[0], X < clip[1]) 

315 X = X[clip_x] 

316 

317 nobs = len(X) # after trim 

318 

319 if gridsize is None: 

320 gridsize = max(nobs,50) # do not need to resize if no FFT 

321 

322 # handle weights 

323 if weights is None: 

324 weights = np.ones(nobs) 

325 q = nobs 

326 else: 

327 # ensure weights is a numpy array 

328 weights = np.asarray(weights) 

329 

330 if len(weights) != len(clip_x): 

331 msg = "The length of the weights must be the same as the given X." 

332 raise ValueError(msg) 

333 weights = weights[clip_x.squeeze()] 

334 q = weights.sum() 

335 

336 # Get kernel object corresponding to selection 

337 kern = kernel_switch[kernel]() 

338 

339 # if bw is None, select optimal bandwidth for kernel 

340 try: 

341 bw = float(bw) 

342 except: 

343 bw = bandwidths.select_bandwidth(X, bw, kern) 

344 bw *= adjust 

345 

346 a = np.min(X, axis=0) - cut * bw 

347 b = np.max(X, axis=0) + cut * bw 

348 grid = np.linspace(a, b, gridsize) 

349 

350 k = (X.T - grid[:, None])/bw # uses broadcasting to make a gridsize x nobs 

351 

352 # set kernel bandwidth 

353 kern.seth(bw) 

354 

355 # truncate to domain 

356 if kern.domain is not None: # will not work for piecewise kernels like parzen 

357 z_lo, z_high = kern.domain 

358 domain_mask = (k < z_lo) | (k > z_high) 

359 k = kern(k) # estimate density 

360 k[domain_mask] = 0 

361 else: 

362 k = kern(k) # estimate density 

363 

364 k[k < 0] = 0 # get rid of any negative values, do we need this? 

365 

366 dens = np.dot(k, weights)/(q*bw) 

367 

368 if retgrid: 

369 return dens, grid, bw 

370 else: 

371 return dens, bw 

372 

373def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, 

374 adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True): 

375 """ 

376 Rosenblatt-Parzen univariate kernel density estimator 

377 

378 Parameters 

379 ---------- 

380 X : array_like 

381 The variable for which the density estimate is desired. 

382 kernel : str 

383 ONLY GAUSSIAN IS CURRENTLY IMPLEMENTED. 

384 "bi" for biweight 

385 "cos" for cosine 

386 "epa" for Epanechnikov, default 

387 "epa2" for alternative Epanechnikov 

388 "gau" for Gaussian. 

389 "par" for Parzen 

390 "rect" for rectangular 

391 "tri" for triangular 

392 bw : str, float 

393 "scott" - 1.059 * A * nobs ** (-1/5.), where A is min(std(X),IQR/1.34) 

394 "silverman" - .9 * A * nobs ** (-1/5.), where A is min(std(X),IQR/1.34) 

395 If a float is given, it is the bandwidth. 

396 weights : array or None 

397 WEIGHTS ARE NOT CURRENTLY IMPLEMENTED. 

398 Optional weights. If the X value is clipped, then this weight is 

399 also dropped. 

400 gridsize : int 

401 If gridsize is None, min(len(X), 512) is used. Note that the provided 

402 number is rounded up to the next highest power of 2. 

403 adjust : float 

404 An adjustment factor for the bw. Bandwidth becomes bw * adjust. 

405 clip : tuple 

406 Observations in X that are outside of the range given by clip are 

407 dropped. The number of observations in X is then shortened. 

408 cut : float 

409 Defines the length of the grid past the lowest and highest values of X 

410 so that the kernel goes to zero. The end points are 

411 -/+ cut*bw*{X.min() or X.max()} 

412 retgrid : bool 

413 Whether or not to return the grid over which the density is estimated. 

414 

415 Returns 

416 ------- 

417 density : ndarray 

418 The densities estimated at the grid points. 

419 grid : ndarray, optional 

420 The grid points at which the density is estimated. 

421 

422 Notes 

423 ----- 

424 Only the default kernel is implemented. Weights are not implemented yet. 

425 This follows Silverman (1982) with changes suggested by Jones and Lotwick 

426 (1984). However, the discretization step is replaced by linear binning 

427 of Fan and Marron (1994). This should be extended to accept the parts 

428 that are dependent only on the data to speed things up for 

429 cross-validation. 

430 

431 References 

432 ---------- 

433 Fan, J. and J.S. Marron. (1994) `Fast implementations of nonparametric 

434 curve estimators`. Journal of Computational and Graphical Statistics. 

435 3.1, 35-56. 

436 Jones, M.C. and H.W. Lotwick. (1984) `Remark AS R50: A Remark on Algorithm 

437 AS 176. Kernal Density Estimation Using the Fast Fourier Transform`. 

438 Journal of the Royal Statistical Society. Series C. 33.1, 120-2. 

439 Silverman, B.W. (1982) `Algorithm AS 176. Kernel density estimation using 

440 the Fast Fourier Transform. Journal of the Royal Statistical Society. 

441 Series C. 31.2, 93-9. 

442 """ 

443 X = np.asarray(X) 

444 X = X[np.logical_and(X > clip[0], X < clip[1])] # will not work for two columns. 

445 # will affect underlying data? 

446 

447 # Get kernel object corresponding to selection 

448 kern = kernel_switch[kernel]() 

449 

450 try: 

451 bw = float(bw) 

452 except: 

453 bw = bandwidths.select_bandwidth(X, bw, kern) # will cross-val fit this pattern? 

454 bw *= adjust 

455 

456 nobs = len(X) # after trim 

457 

458 # 1 Make grid and discretize the data 

459 if gridsize is None: 

460 gridsize = np.max((nobs, 512.)) 

461 gridsize = 2**np.ceil(np.log2(gridsize)) # round to next power of 2 

462 

463 a = np.min(X) - cut * bw 

464 b = np.max(X) + cut * bw 

465 grid,delta = np.linspace(a, b, int(gridsize), retstep=True) 

466 RANGE = b - a 

467 

468#TODO: Fix this? 

469# This is the Silverman binning function, but I believe it's buggy (SS) 

470# weighting according to Silverman 

471# count = counts(X,grid) 

472# binned = np.zeros_like(grid) #xi_{k} in Silverman 

473# j = 0 

474# for k in range(int(gridsize-1)): 

475# if count[k]>0: # there are points of X in the grid here 

476# Xingrid = X[j:j+count[k]] # get all these points 

477# # get weights at grid[k],grid[k+1] 

478# binned[k] += np.sum(grid[k+1]-Xingrid) 

479# binned[k+1] += np.sum(Xingrid-grid[k]) 

480# j += count[k] 

481# binned /= (nobs)*delta**2 # normalize binned to sum to 1/delta 

482 

483#NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING 

484 binned = fast_linbin(X, a, b, gridsize) / (delta * nobs) 

485 

486 # step 2 compute FFT of the weights, using Munro (1976) FFT convention 

487 y = forrt(binned) 

488 

489 # step 3 and 4 for optimal bw compute zstar and the density estimate f 

490 # do not have to redo the above if just changing bw, ie., for cross val 

491 

492#NOTE: silverman_transform is the closed form solution of the FFT of the 

493#gaussian kernel. Not yet sure how to generalize it. 

494 zstar = silverman_transform(bw, gridsize, RANGE)*y # 3.49 in Silverman 

495 # 3.50 w Gaussian kernel 

496 f = revrt(zstar) 

497 if retgrid: 

498 return f, grid, bw 

499 else: 

500 return f, bw 

501 

502if __name__ == "__main__": 

503 import numpy as np 

504 np.random.seed(12345) 

505 xi = np.random.randn(100) 

506 f,grid, bw1 = kdensity(xi, kernel="gau", bw=.372735, retgrid=True) 

507 f2, bw2 = kdensityfft(xi, kernel="gau", bw="silverman",retgrid=False) 

508 

509# do some checking vs. silverman algo. 

510# you need denes.f, http://lib.stat.cmu.edu/apstat/176 

511#NOTE: I (SS) made some changes to the Fortran 

512# and the FFT stuff from Munro http://lib.stat.cmu.edu/apstat/97o 

513# then compile everything and link to denest with f2py 

514#Make pyf file as usual, then compile shared object 

515#f2py denest.f -m denest2 -h denest.pyf 

516#edit pyf 

517#-c flag makes it available to other programs, fPIC builds a shared library 

518#/usr/bin/gfortran -Wall -c -fPIC fft.f 

519#f2py -c denest.pyf ./fft.o denest.f 

520 

521 try: 

522 from denest2 import denest # @UnresolvedImport 

523 a = -3.4884382032045504 

524 b = 4.3671504686785605 

525 RANGE = b - a 

526 bw = bandwidths.bw_silverman(xi) 

527 

528 ft,smooth,ifault,weights,smooth1 = denest(xi,a,b,bw,np.zeros(512),np.zeros(512),0, 

529 np.zeros(512), np.zeros(512)) 

530# We use a different binning algo, so only accurate up to 3 decimal places 

531 np.testing.assert_almost_equal(f2, smooth, 3) 

532#NOTE: for debugging 

533# y2 = forrt(weights) 

534# RJ = np.arange(512/2+1) 

535# FAC1 = 2*(np.pi*bw/RANGE)**2 

536# RJFAC = RJ**2*FAC1 

537# BC = 1 - RJFAC/(6*(bw/((b-a)/M))**2) 

538# FAC = np.exp(-RJFAC)/BC 

539# SMOOTH = np.r_[FAC,FAC[1:-1]] * y2 

540 

541# dens = revrt(SMOOTH) 

542 

543 except: 

544# ft = np.loadtxt('./ft_silver.csv') 

545# smooth = np.loadtxt('./smooth_silver.csv') 

546 print("Did not get the estimates from the Silverman algorithm")