"""
Support and standalone functions for Robust Linear Models

References
----------
PJ Huber. 'Robust Statistics' John Wiley and Sons, Inc., New York, 1981.

R Venables, B Ripley. 'Modern Applied Statistics in S'
    Springer, New York, 2002.
"""
import numpy as np
from scipy.stats import norm as Gaussian

from . import norms
from statsmodels.tools import tools
from statsmodels.tools.validation import array_like, float_like


def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called as center(a). The axis argument will be
        applied via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """

    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if callable(center):
        # Apply the centering callable along the requested axis; an empty
        # array falls back to a center of 0.0.
        center = np.apply_over_axes(center, a, axis) if a.size else 0.0
    else:
        # A non-callable center is used directly, as documented above.
        center = float_like(center, 'center')

    return np.median(np.abs(a - center) / c, axis=axis)
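# Example (illustrative): with the default normalization constant c, the MAD
# is a consistent estimator of the standard deviation for normal data, e.g.
#
#     >>> x = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.])
#     >>> mad(x)    # median(|x - 5|) = 2, divided by c ~ 0.6745
#     2.9652...     # approximately 2 / 0.6745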

class Huber(object):
    """
    Huber's proposal 2 for estimating location and scale jointly.

    Parameters
    ----------
    c : float, optional
        Threshold used in the Huber psi function (and hence in
        chi = psi**2). Default value is 1.5.
    tol : float, optional
        Tolerance for convergence. Default value is 1e-08.
    maxiter : int, optional
        Maximum number of iterations. Default value is 30.
    norm : statsmodels.robust.norms.RobustNorm, optional
        A robust norm used in M estimator of location. If None,
        the location estimator defaults to a one-step
        fixed point version of the M-estimator using Huber's T.

    Methods
    -------
    call
        Return joint estimates of Huber's scale and location.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> chem_data = np.array([2.20, 2.20, 2.4, 2.4, 2.5, 2.7, 2.8, 2.9, 3.03,
    ...        3.03, 3.10, 3.37, 3.4, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.7, 3.7,
    ...        3.77, 5.28, 28.95])
    >>> sm.robust.scale.huber(chem_data)
    (array(3.2054980819923693), array(0.67365260010478967))
    """


    def __init__(self, c=1.5, tol=1.0e-08, maxiter=30, norm=None):
        self.c = c
        self.maxiter = maxiter
        self.tol = tol
        self.norm = norm
        tmp = 2 * Gaussian.cdf(c) - 1
        self.gamma = tmp + c**2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)
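        # gamma = E[psi_c(Z)**2] for standard normal Z, where
        # psi_c(z) = clip(z, -c, c); it is the consistency constant used in
        # the proposal 2 scale update in _estimate_both.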


    def __call__(self, a, mu=None, initscale=None, axis=0):
        """
        Compute Huber's proposal 2 estimate of scale, using an optional
        initial value of scale and an optional estimate of mu. If mu
        is supplied, it is not reestimated.

        Parameters
        ----------
        a : ndarray
            1d array
        mu : float or None, optional
            If the location mu is supplied then it is not reestimated.
            Default is None, which means that it is estimated.
        initscale : float or None, optional
            A first guess on scale. If initscale is None then the
            standardized median absolute deviation of a is used.

        Notes
        -----
        `Huber` minimizes the function

        sum(psi((a[i]-mu)/scale)**2)

        as a function of (mu, scale), where

        psi(x) = np.clip(x, -self.c, self.c)
        """

        a = np.asarray(a)
        if mu is None:
            n = a.shape[0] - 1
            mu = np.median(a, axis=axis)
            est_mu = True
        else:
            n = a.shape[0]
            est_mu = False

        if initscale is None:
            scale = mad(a, axis=axis)
        else:
            scale = initscale
        scale = tools.unsqueeze(scale, axis, a.shape)
        mu = tools.unsqueeze(mu, axis, a.shape)
        return self._estimate_both(a, scale, mu, axis, est_mu, n)


    def _estimate_both(self, a, scale, mu, axis, est_mu, n):
        """
        Estimate scale and location simultaneously with the following
        pseudo_loop:

        while not_converged:
            mu, scale = estimate_location(a, scale, mu), estimate_scale(a, scale, mu)

        where estimate_location is an M-estimator and estimate_scale
        implements the check used in Section 5.5 of Venables & Ripley
        """  # noqa:E501

        for _ in range(self.maxiter):
            # Estimate the mean along a given axis
            if est_mu:
                if self.norm is None:
                    # This is a one-step fixed-point estimator
                    # if self.norm == norms.HuberT
                    # It should be faster than using norms.HuberT
                    nmu = np.clip(a, mu - self.c * scale,
                                  mu + self.c * scale).sum(axis) / a.shape[axis]
                else:
                    nmu = norms.estimate_location(a, scale, self.norm, axis,
                                                  mu, self.maxiter, self.tol)
            else:
                # Effectively, do nothing
                nmu = mu.squeeze()
            nmu = tools.unsqueeze(nmu, axis, a.shape)

            subset = np.less_equal(np.abs((a - mu) / scale), self.c)
            card = subset.sum(axis)
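            # Proposal 2 scale update: solve
            #     n * gamma * s**2 = sum_{not clipped} (a - nmu)**2
            #                        + (#clipped) * c**2 * s**2
            # for s, giving the closed-form ratio computed below.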

            scale_num = np.sum(subset * (a - nmu)**2, axis)
            scale_denom = n * self.gamma - (a.shape[axis] - card) * self.c**2
            nscale = np.sqrt(scale_num / scale_denom)
            nscale = tools.unsqueeze(nscale, axis, a.shape)

            test1 = np.all(np.less_equal(np.abs(scale - nscale),
                                         nscale * self.tol))
            test2 = np.all(
                np.less_equal(np.abs(mu - nmu), nscale * self.tol))
            if not (test1 and test2):
                mu = nmu
                scale = nscale
            else:
                return nmu.squeeze(), nscale.squeeze()
        raise ValueError('joint estimation of location and scale failed '
                         'to converge in %d iterations' % self.maxiter)


huber = Huber()

class HuberScale(object):
    r"""
    Huber's scaling for fitting robust linear models.

    Huber's scale is intended to be used as the scale estimate in the
    IRLS algorithm and is slightly different than the `Huber` class.

    Parameters
    ----------
    d : float, optional
        d is the tuning constant for Huber's scale. Default is 2.5
    tol : float, optional
        The convergence tolerance
    maxiter : int, optional
        The maximum number of iterations. The default is 30.

    Methods
    -------
    call
        Returns Huber's scale computed as below

    Notes
    -----
    Huber's scale is the iterative solution to

    scale_(i+1)**2 = 1/(n*h)*sum(chi(r/sigma_i))*sigma_i**2

    where the Huber function is

    chi(x) = (x**2)/2       for \|x\| < d
    chi(x) = (d**2)/2       for \|x\| >= d

    and the Huber constant

    h = (n-p)/n*(d**2 + (1-d**2)*scipy.stats.norm.cdf(d)
        - .5 - d/sqrt(2*pi)*exp(-0.5*d**2))
    """

    def __init__(self, d=2.5, tol=1e-08, maxiter=30):
        self.d = d
        self.tol = tol
        self.maxiter = maxiter

    def __call__(self, df_resid, nobs, resid):
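        """
        Compute Huber's scale estimate from the current residuals.

        Parameters
        ----------
        df_resid : float
            Residual degrees of freedom of the model, n - p.
        nobs : float
            Number of observations, n.
        resid : ndarray
            Residuals of the current fit.

        Returns
        -------
        float
            Huber's scale estimate: the fixed point of the iteration
            described in the class Notes, started from the MAD of the
            residuals.
        """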

        h = df_resid / nobs * (
            self.d ** 2
            + (1 - self.d ** 2) * Gaussian.cdf(self.d)
            - .5 - self.d / (np.sqrt(2 * np.pi)) * np.exp(-.5 * self.d ** 2)
        )
        s = mad(resid)

        def subset(x):
            return np.less(np.abs(resid / x), self.d)

        def chi(s):
            return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * \
                (self.d ** 2 / 2)

        scalehist = [np.inf, s]
        niter = 1
        while (np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
                and niter < self.maxiter):
            nscale = np.sqrt(1 / (nobs * h) * np.sum(chi(scalehist[-1])) *
                             scalehist[-1] ** 2)
            scalehist.append(nscale)
            niter += 1
        # TODO: raise on convergence failure?
        return scalehist[-1]



hubers_scale = HuberScale()
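# Example (illustrative): HuberScale is typically supplied to RLM.fit as the
# scale estimator used inside IRLS; `endog` and `exog` stand for the user's
# data arrays.
#
#     >>> import statsmodels.api as sm
#     >>> model = sm.RLM(endog, exog, M=sm.robust.norms.HuberT())
#     >>> results = model.fit(scale_est=sm.robust.scale.HuberScale())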