Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2from scipy.stats import scoreatpercentile as sap 

3 

4from statsmodels.compat.pandas import Substitution 

5from statsmodels.sandbox.nonparametric import kernels 

6 

7def _select_sigma(X): 

8 """ 

9 Returns the smaller of std(X, ddof=1) or normalized IQR(X) over axis 0. 

10 

11 References 

12 ---------- 

13 Silverman (1986) p.47 

14 """ 

15# normalize = norm.ppf(.75) - norm.ppf(.25) 

16 normalize = 1.349 

17# IQR = np.subtract.reduce(percentile(X, [75,25], 

18# axis=axis), axis=axis)/normalize 

19 IQR = (sap(X, 75) - sap(X, 25))/normalize 

20 return np.minimum(np.std(X, axis=0, ddof=1), IQR) 

21 

22 

23## Univariate Rule of Thumb Bandwidths ## 

24def bw_scott(x, kernel=None): 

25 """ 

26 Scott's Rule of Thumb 

27 

28 Parameters 

29 ---------- 

30 x : array_like 

31 Array for which to get the bandwidth 

32 kernel : CustomKernel object 

33 Unused 

34 

35 Returns 

36 ------- 

37 bw : float 

38 The estimate of the bandwidth 

39 

40 Notes 

41 ----- 

42 Returns 1.059 * A * n ** (-1/5.) where :: 

43 

44 A = min(std(x, ddof=1), IQR/1.349) 

45 IQR = np.subtract.reduce(np.percentile(x, [75,25])) 

46 

47 References 

48 ---------- 

49 

50 Scott, D.W. (1992) Multivariate Density Estimation: Theory, Practice, and 

51 Visualization. 

52 """ 

53 A = _select_sigma(x) 

54 n = len(x) 

55 return 1.059 * A * n ** (-0.2) 

56 

57def bw_silverman(x, kernel=None): 

58 """ 

59 Silverman's Rule of Thumb 

60 

61 Parameters 

62 ---------- 

63 x : array_like 

64 Array for which to get the bandwidth 

65 kernel : CustomKernel object 

66 Unused 

67 

68 Returns 

69 ------- 

70 bw : float 

71 The estimate of the bandwidth 

72 

73 Notes 

74 ----- 

75 Returns .9 * A * n ** (-1/5.) where :: 

76 

77 A = min(std(x, ddof=1), IQR/1.349) 

78 IQR = np.subtract.reduce(np.percentile(x, [75,25])) 

79 

80 References 

81 ---------- 

82 

83 Silverman, B.W. (1986) `Density Estimation.` 

84 """ 

85 A = _select_sigma(x) 

86 n = len(x) 

87 return .9 * A * n ** (-0.2) 

88 

89 

90def bw_normal_reference(x, kernel=kernels.Gaussian): 

91 """ 

92 Plug-in bandwidth with kernel specific constant based on normal reference. 

93 

94 This bandwidth minimizes the mean integrated square error if the true 

95 distribution is the normal. This choice is an appropriate bandwidth for 

96 single peaked distributions that are similar to the normal distribution. 

97 

98 Parameters 

99 ---------- 

100 x : array_like 

101 Array for which to get the bandwidth 

102 kernel : CustomKernel object 

103 Used to calculate the constant for the plug-in bandwidth. 

104 

105 Returns 

106 ------- 

107 bw : float 

108 The estimate of the bandwidth 

109 

110 Notes 

111 ----- 

112 Returns C * A * n ** (-1/5.) where :: 

113 

114 A = min(std(x, ddof=1), IQR/1.349) 

115 IQR = np.subtract.reduce(np.percentile(x, [75,25])) 

116 C = constant from Hansen (2009) 

117 

118 When using a Gaussian kernel this is equivalent to the 'scott' bandwidth up 

119 to two decimal places. This is the accuracy to which the 'scott' constant is 

120 specified. 

121 

122 References 

123 ---------- 

124 

125 Silverman, B.W. (1986) `Density Estimation.` 

126 Hansen, B.E. (2009) `Lecture Notes on Nonparametrics.` 

127 """ 

128 C = kernel.normal_reference_constant 

129 A = _select_sigma(x) 

130 n = len(x) 

131 return C * A * n ** (-0.2) 

132 

133## Plug-In Methods ## 

134 

135## Least Squares Cross-Validation ## 

136 

137## Helper Functions ## 

138 

139bandwidth_funcs = { 

140 "scott": bw_scott, 

141 "silverman": bw_silverman, 

142 "normal_reference": bw_normal_reference, 

143} 

144 

145 

146@Substitution(", ".join(sorted(bandwidth_funcs.keys()))) 

147def select_bandwidth(x, bw, kernel): 

148 """ 

149 Selects bandwidth for a selection rule bw 

150 

151 this is a wrapper around existing bandwidth selection rules 

152 

153 Parameters 

154 ---------- 

155 x : array_like 

156 Array for which to get the bandwidth 

157 bw : str 

158 name of bandwidth selection rule, currently supported are: 

159 %s 

160 kernel : not used yet 

161 

162 Returns 

163 ------- 

164 bw : float 

165 The estimate of the bandwidth 

166 """ 

167 bw = bw.lower() 

168 if bw not in bandwidth_funcs: 

169 raise ValueError("Bandwidth %s not understood" % bw) 

170 bandwidth = bandwidth_funcs[bw](x, kernel) 

171 if np.any(bandwidth == 0): 

172 # eventually this can fall back on another selection criterion. 

173 err = "Selected KDE bandwidth is 0. Cannot estimate density." 

174 raise RuntimeError(err) 

175 else: 

176 return bandwidth