1"""
2Support and standalone functions for Robust Linear Models
4References
5----------
6PJ Huber. 'Robust Statistics' John Wiley and Sons, Inc., New York, 1981.
8R Venables, B Ripley. 'Modern Applied Statistics in S'
9 Springer, New York, 2002.
10"""
11import numpy as np
12from scipy.stats import norm as Gaussian
13from . import norms
14from statsmodels.tools import tools
15from statsmodels.tools.validation import array_like, float_like
def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
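
    Examples
    --------
    A minimal illustration; the value depends on the normalization
    constant `c` and is rounded here for readability:

    >>> import numpy as np
    >>> from statsmodels.robust.scale import mad
    >>> x = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.])
    >>> print(round(mad(x), 4))
    2.9652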
41 """
42 a = array_like(a, 'a', ndim=None)
43 c = float_like(c, 'c')
44 if callable(center) and a.size:
45 center = np.apply_over_axes(center, a, axis)
46 else:
47 center = 0.0
49 return np.median((np.abs(a-center)) / c, axis=axis)


class Huber(object):
    """
    Huber's proposal 2 for estimating location and scale jointly.

    Parameters
    ----------
    c : float, optional
        Threshold used in the Huber function, chi = psi**2. Default value
        is 1.5.
    tol : float, optional
        Tolerance for convergence. Default value is 1e-08.
    maxiter : int, optional
        Maximum number of iterations. Default value is 30.
    norm : statsmodels.robust.norms.RobustNorm, optional
        A robust norm used in the M estimator of location. If None,
        the location estimator defaults to a one-step
        fixed point version of the M-estimator using Huber's T.

    Methods
    -------
    call
        Return joint estimates of Huber's scale and location.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> chem_data = np.array([2.20, 2.20, 2.4, 2.4, 2.5, 2.7, 2.8, 2.9, 3.03,
    ...        3.03, 3.10, 3.37, 3.4, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.7, 3.7,
    ...        3.77, 5.28, 28.95])
    >>> sm.robust.scale.huber(chem_data)
    (array(3.2054980819923693), array(0.67365260010478967))
    """

    def __init__(self, c=1.5, tol=1.0e-08, maxiter=30, norm=None):
        self.c = c
        self.maxiter = maxiter
        self.tol = tol
        self.norm = norm
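        # The next two lines compute gamma = E[psi_c(Z)**2] for Z ~ N(0, 1),
        # where psi_c(x) = clip(x, -c, c); this is the usual consistency
        # correction so that the scale estimate is consistent at the normal
        # distribution.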
        tmp = 2 * Gaussian.cdf(c) - 1
        self.gamma = tmp + c**2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)

    def __call__(self, a, mu=None, initscale=None, axis=0):
        """
        Compute Huber's proposal 2 estimate of scale, using an optional
        initial value of scale and an optional estimate of mu. If mu
        is supplied, it is not reestimated.

        Parameters
        ----------
        a : ndarray
            1d array
        mu : float or None, optional
            If the location mu is supplied then it is not reestimated.
            Default is None, which means that it is estimated.
        initscale : float or None, optional
            A first guess on scale. If initscale is None then the standardized
            median absolute deviation of a is used.

        Notes
        -----
        `Huber` minimizes the function

            sum(psi((a[i]-mu)/scale)**2)

        as a function of (mu, scale), where

            psi(x) = np.clip(x, -self.c, self.c)
        """
        a = np.asarray(a)
        if mu is None:
            n = a.shape[0] - 1
            mu = np.median(a, axis=axis)
            est_mu = True
        else:
            n = a.shape[0]
            mu = mu
            est_mu = False

        if initscale is None:
            scale = mad(a, axis=axis)
        else:
            scale = initscale
        scale = tools.unsqueeze(scale, axis, a.shape)
        mu = tools.unsqueeze(mu, axis, a.shape)
        return self._estimate_both(a, scale, mu, axis, est_mu, n)

    def _estimate_both(self, a, scale, mu, axis, est_mu, n):
        """
        Estimate scale and location simultaneously with the following
        pseudo_loop:

        while not_converged:
            mu, scale = estimate_location(a, scale, mu), estimate_scale(a, scale, mu)

        where estimate_location is an M-estimator and estimate_scale implements
        the check used in Section 5.5 of Venables & Ripley
        """  # noqa:E501
        for _ in range(self.maxiter):
            # Estimate the mean along a given axis
            if est_mu:
                if self.norm is None:
                    # This is a one-step fixed-point estimator
                    # if self.norm == norms.HuberT
                    # It should be faster than using norms.HuberT
                    nmu = np.clip(a, mu - self.c * scale,
                                  mu + self.c * scale).sum(axis) / a.shape[axis]
                else:
                    nmu = norms.estimate_location(a, scale, self.norm, axis,
                                                  mu, self.maxiter, self.tol)
            else:
                # Effectively, do nothing
                nmu = mu.squeeze()
            nmu = tools.unsqueeze(nmu, axis, a.shape)
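
            # Scale step (cf. Venables & Ripley, Section 5.5): sum the squared
            # residuals of the observations that fall within +/- c * scale and
            # divide by n * gamma minus c**2 times the number of clipped
            # observations.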
            subset = np.less_equal(np.abs((a - mu) / scale), self.c)
            card = subset.sum(axis)

            scale_num = np.sum(subset * (a - nmu)**2, axis)
            scale_denom = (n * self.gamma - (a.shape[axis] - card) * self.c**2)
            nscale = np.sqrt(scale_num / scale_denom)
            nscale = tools.unsqueeze(nscale, axis, a.shape)

            test1 = np.alltrue(np.less_equal(np.abs(scale - nscale),
                                             nscale * self.tol))
            test2 = np.alltrue(
                np.less_equal(np.abs(mu - nmu), nscale * self.tol))
            if not (test1 and test2):
                mu = nmu
                scale = nscale
            else:
                return nmu.squeeze(), nscale.squeeze()
        raise ValueError('joint estimation of location and scale failed '
                         'to converge in %d iterations' % self.maxiter)


huber = Huber()


class HuberScale(object):
    r"""
    Huber's scaling for fitting robust linear models.

    Huber's scale is intended to be used as the scale estimate in the
    IRLS algorithm and is slightly different than the `Huber` class.

    Parameters
    ----------
    d : float, optional
        d is the tuning constant for Huber's scale. Default is 2.5.
    tol : float, optional
        The convergence tolerance.
    maxiter : int, optional
        The maximum number of iterations. The default is 30.

    Methods
    -------
    call
        Returns Huber's scale computed as below.

    Notes
    -----
    Huber's scale is the iterative solution to

        scale_(i+1)**2 = 1/(n*h) * sum(chi(r/sigma_i)) * sigma_i**2

    where the Huber function is

        chi(x) = (x**2)/2       for \|x\| < d
        chi(x) = (d**2)/2       for \|x\| >= d

    and the Huber constant is

        h = (n-p)/n * (d**2 + (1 - d**2)*scipy.stats.norm.cdf(d)
            - .5 - d/sqrt(2*pi)*exp(-0.5*d**2))
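
    Examples
    --------
    A minimal, illustrative call with made-up residuals; the module-level
    instance `hubers_scale` is used here:

    >>> import numpy as np
    >>> from statsmodels.robust.scale import hubers_scale
    >>> resid = np.array([-1.5, -0.3, 0.1, 0.2, 0.4, 0.6, 8.0])
    >>> s = hubers_scale(df_resid=5, nobs=7, resid=resid)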
222 """
    def __init__(self, d=2.5, tol=1e-08, maxiter=30):
        self.d = d
        self.tol = tol
        self.maxiter = maxiter

    def __call__(self, df_resid, nobs, resid):
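        """
        Compute Huber's scale estimate for the given residuals.

        `df_resid` is the residual degrees of freedom (n - p in the class
        Notes), `nobs` is the number of observations, and `resid` is the
        array of residuals; the iteration is the one described in the
        class Notes.
        """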
        h = df_resid / nobs * (
            self.d ** 2
            + (1 - self.d ** 2) * Gaussian.cdf(self.d)
            - .5 - self.d / (np.sqrt(2 * np.pi)) * np.exp(-.5 * self.d ** 2)
        )
        s = mad(resid)

        def subset(x):
            return np.less(np.abs(resid / x), self.d)

        def chi(s):
            return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * \
                (self.d ** 2 / 2)
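
        # Fixed-point iteration on the scale (see the class Notes): start from
        # the MAD of the residuals and stop when successive estimates differ
        # by less than `tol` or `maxiter` iterations have been performed.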
        scalehist = [np.inf, s]
        niter = 1
        while (np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
               and niter < self.maxiter):
            nscale = np.sqrt(1 / (nobs * h) * np.sum(chi(scalehist[-1])) *
                             scalehist[-1] ** 2)
            scalehist.append(nscale)
            niter += 1
        # TODO: raise on convergence failure?
        return scalehist[-1]


hubers_scale = HuberScale()