Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from collections import namedtuple 

2 

3import numpy as np 

4 

5from . import distributions 

6 

7 

8__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes'] 

9 

10LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept', 

11 'rvalue', 'pvalue', 

12 'stderr')) 

13 

14 

15def linregress(x, y=None): 

16 """ 

17 Calculate a linear least-squares regression for two sets of measurements. 

18 

19 Parameters 

20 ---------- 

21 x, y : array_like 

22 Two sets of measurements. Both arrays should have the same length. If 

23 only `x` is given (and ``y=None``), then it must be a two-dimensional 

24 array where one dimension has length 2. The two sets of measurements 

25 are then found by splitting the array along the length-2 dimension. In 

26 the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is 

27 equivalent to ``linregress(x[0], x[1])``. 

28 

29 Returns 

30 ------- 

31 slope : float 

32 Slope of the regression line. 

33 intercept : float 

34 Intercept of the regression line. 

35 rvalue : float 

36 Correlation coefficient. 

37 pvalue : float 

38 Two-sided p-value for a hypothesis test whose null hypothesis is 

39 that the slope is zero, using Wald Test with t-distribution of 

40 the test statistic. 

41 stderr : float 

42 Standard error of the estimated gradient. 

43 

44 See also 

45 -------- 

46 :func:`scipy.optimize.curve_fit` : Use non-linear 

47 least squares to fit a function to data. 

48 :func:`scipy.optimize.leastsq` : Minimize the sum of 

49 squares of a set of equations. 

50 

51 Notes 

52 ----- 

53 Missing values are considered pair-wise: if a value is missing in `x`, 

54 the corresponding value in `y` is masked. 

55 

56 Examples 

57 -------- 

58 >>> import matplotlib.pyplot as plt 

59 >>> from scipy import stats 

60 

61 Generate some data: 

62 

63 >>> np.random.seed(12345678) 

64 >>> x = np.random.random(10) 

65 >>> y = 1.6*x + np.random.random(10) 

66 

67 Perform the linear regression: 

68 

69 >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) 

70 >>> print("slope: %f intercept: %f" % (slope, intercept)) 

71 slope: 1.944864 intercept: 0.268578 

72 

73 To get coefficient of determination (R-squared): 

74 

75 >>> print("R-squared: %f" % r_value**2) 

76 R-squared: 0.735498 

77 

78 Plot the data along with the fitted line: 

79 

80 >>> plt.plot(x, y, 'o', label='original data') 

81 >>> plt.plot(x, intercept + slope*x, 'r', label='fitted line') 

82 >>> plt.legend() 

83 >>> plt.show() 

84 

85 Example for the case where only x is provided as a 2x2 array: 

86 

87 >>> x = np.array([[0, 1], [0, 2]]) 

88 >>> r = stats.linregress(x) 

89 >>> r.slope, r.intercept 

90 (2.0, 0.0) 

91 

92 """ 

93 TINY = 1.0e-20 

94 if y is None: # x is a (2, N) or (N, 2) shaped array_like 

95 x = np.asarray(x) 

96 if x.shape[0] == 2: 

97 x, y = x 

98 elif x.shape[1] == 2: 

99 x, y = x.T 

100 else: 

101 msg = ("If only `x` is given as input, it has to be of shape " 

102 "(2, N) or (N, 2), provided shape was %s" % str(x.shape)) 

103 raise ValueError(msg) 

104 else: 

105 x = np.asarray(x) 

106 y = np.asarray(y) 

107 

108 if x.size == 0 or y.size == 0: 

109 raise ValueError("Inputs must not be empty.") 

110 

111 n = len(x) 

112 xmean = np.mean(x, None) 

113 ymean = np.mean(y, None) 

114 

115 # average sum of squares: 

116 ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat 

117 r_num = ssxym 

118 r_den = np.sqrt(ssxm * ssym) 

119 if r_den == 0.0: 

120 r = 0.0 

121 else: 

122 r = r_num / r_den 

123 # test for numerical error propagation 

124 if r > 1.0: 

125 r = 1.0 

126 elif r < -1.0: 

127 r = -1.0 

128 

129 df = n - 2 

130 slope = r_num / ssxm 

131 intercept = ymean - slope*xmean 

132 if n == 2: 

133 # handle case when only two points are passed in 

134 if y[0] == y[1]: 

135 prob = 1.0 

136 else: 

137 prob = 0.0 

138 sterrest = 0.0 

139 else: 

140 t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY))) 

141 prob = 2 * distributions.t.sf(np.abs(t), df) 

142 sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df) 

143 

144 return LinregressResult(slope, intercept, r, prob, sterrest) 

145 

146 

147def theilslopes(y, x=None, alpha=0.95): 

148 r""" 

149 Computes the Theil-Sen estimator for a set of points (x, y). 

150 

151 `theilslopes` implements a method for robust linear regression. It 

152 computes the slope as the median of all slopes between paired values. 

153 

154 Parameters 

155 ---------- 

156 y : array_like 

157 Dependent variable. 

158 x : array_like or None, optional 

159 Independent variable. If None, use ``arange(len(y))`` instead. 

160 alpha : float, optional 

161 Confidence degree between 0 and 1. Default is 95% confidence. 

162 Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are 

163 interpreted as "find the 90% confidence interval". 

164 

165 Returns 

166 ------- 

167 medslope : float 

168 Theil slope. 

169 medintercept : float 

170 Intercept of the Theil line, as ``median(y) - medslope*median(x)``. 

171 lo_slope : float 

172 Lower bound of the confidence interval on `medslope`. 

173 up_slope : float 

174 Upper bound of the confidence interval on `medslope`. 

175 

176 See also 

177 -------- 

178 siegelslopes : a similar technique using repeated medians 

179 

180 Notes 

181 ----- 

182 The implementation of `theilslopes` follows [1]_. The intercept is 

183 not defined in [1]_, and here it is defined as ``median(y) - 

184 medslope*median(x)``, which is given in [3]_. Other definitions of 

185 the intercept exist in the literature. A confidence interval for 

186 the intercept is not given as this question is not addressed in 

187 [1]_. 

188 

189 References 

190 ---------- 

191 .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau", 

192 J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968. 

193 .. [2] H. Theil, "A rank-invariant method of linear and polynomial 

194 regression analysis I, II and III", Nederl. Akad. Wetensch., Proc. 

195 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950. 

196 .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed., 

197 John Wiley and Sons, New York, pp. 493. 

198 

199 Examples 

200 -------- 

201 >>> from scipy import stats 

202 >>> import matplotlib.pyplot as plt 

203 

204 >>> x = np.linspace(-5, 5, num=150) 

205 >>> y = x + np.random.normal(size=x.size) 

206 >>> y[11:15] += 10 # add outliers 

207 >>> y[-5:] -= 7 

208 

209 Compute the slope, intercept and 90% confidence interval. For comparison, 

210 also compute the least-squares fit with `linregress`: 

211 

212 >>> res = stats.theilslopes(y, x, 0.90) 

213 >>> lsq_res = stats.linregress(x, y) 

214 

215 Plot the results. The Theil-Sen regression line is shown in red, with the 

216 dashed red lines illustrating the confidence interval of the slope (note 

217 that the dashed red lines are not the confidence interval of the regression 

218 as the confidence interval of the intercept is not included). The green 

219 line shows the least-squares fit for comparison. 

220 

221 >>> fig = plt.figure() 

222 >>> ax = fig.add_subplot(111) 

223 >>> ax.plot(x, y, 'b.') 

224 >>> ax.plot(x, res[1] + res[0] * x, 'r-') 

225 >>> ax.plot(x, res[1] + res[2] * x, 'r--') 

226 >>> ax.plot(x, res[1] + res[3] * x, 'r--') 

227 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-') 

228 >>> plt.show() 

229 

230 """ 

231 # We copy both x and y so we can use _find_repeats. 

232 y = np.array(y).flatten() 

233 if x is None: 

234 x = np.arange(len(y), dtype=float) 

235 else: 

236 x = np.array(x, dtype=float).flatten() 

237 if len(x) != len(y): 

238 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x))) 

239 

240 # Compute sorted slopes only when deltax > 0 

241 deltax = x[:, np.newaxis] - x 

242 deltay = y[:, np.newaxis] - y 

243 slopes = deltay[deltax > 0] / deltax[deltax > 0] 

244 slopes.sort() 

245 medslope = np.median(slopes) 

246 medinter = np.median(y) - medslope * np.median(x) 

247 # Now compute confidence intervals 

248 if alpha > 0.5: 

249 alpha = 1. - alpha 

250 

251 z = distributions.norm.ppf(alpha / 2.) 

252 # This implements (2.6) from Sen (1968) 

253 _, nxreps = _find_repeats(x) 

254 _, nyreps = _find_repeats(y) 

255 nt = len(slopes) # N in Sen (1968) 

256 ny = len(y) # n in Sen (1968) 

257 # Equation 2.6 in Sen (1968): 

258 sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) - 

259 sum(k * (k-1) * (2*k + 5) for k in nxreps) - 

260 sum(k * (k-1) * (2*k + 5) for k in nyreps)) 

261 # Find the confidence interval indices in `slopes` 

262 sigma = np.sqrt(sigsq) 

263 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1) 

264 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0) 

265 delta = slopes[[Rl, Ru]] 

266 return medslope, medinter, delta[0], delta[1] 

267 

268 

269def _find_repeats(arr): 

270 # This function assumes it may clobber its input. 

271 if len(arr) == 0: 

272 return np.array(0, np.float64), np.array(0, np.intp) 

273 

274 # XXX This cast was previously needed for the Fortran implementation, 

275 # should we ditch it? 

276 arr = np.asarray(arr, np.float64).ravel() 

277 arr.sort() 

278 

279 # Taken from NumPy 1.9's np.unique. 

280 change = np.concatenate(([True], arr[1:] != arr[:-1])) 

281 unique = arr[change] 

282 change_idx = np.concatenate(np.nonzero(change) + ([arr.size],)) 

283 freq = np.diff(change_idx) 

284 atleast2 = freq > 1 

285 return unique[atleast2], freq[atleast2] 

286 

287 

288def siegelslopes(y, x=None, method="hierarchical"): 

289 r""" 

290 Computes the Siegel estimator for a set of points (x, y). 

291 

292 `siegelslopes` implements a method for robust linear regression 

293 using repeated medians (see [1]_) to fit a line to the points (x, y). 

294 The method is robust to outliers with an asymptotic breakdown point 

295 of 50%. 

296 

297 Parameters 

298 ---------- 

299 y : array_like 

300 Dependent variable. 

301 x : array_like or None, optional 

302 Independent variable. If None, use ``arange(len(y))`` instead. 

303 method : {'hierarchical', 'separate'} 

304 If 'hierarchical', estimate the intercept using the estimated 

305 slope ``medslope`` (default option). 

306 If 'separate', estimate the intercept independent of the estimated 

307 slope. See Notes for details. 

308 

309 Returns 

310 ------- 

311 medslope : float 

312 Estimate of the slope of the regression line. 

313 medintercept : float 

314 Estimate of the intercept of the regression line. 

315 

316 See also 

317 -------- 

318 theilslopes : a similar technique without repeated medians 

319 

320 Notes 

321 ----- 

322 With ``n = len(y)``, compute ``m_j`` as the median of 

323 the slopes from the point ``(x[j], y[j])`` to all other `n-1` points. 

324 ``medslope`` is then the median of all slopes ``m_j``. 

325 Two ways are given to estimate the intercept in [1]_ which can be chosen 

326 via the parameter ``method``. 

327 The hierarchical approach uses the estimated slope ``medslope`` 

328 and computes ``medintercept`` as the median of ``y - medslope*x``. 

329 The other approach estimates the intercept separately as follows: for 

330 each point ``(x[j], y[j])``, compute the intercepts of all the `n-1` 

331 lines through the remaining points and take the median ``i_j``. 

332 ``medintercept`` is the median of the ``i_j``. 

333 

334 The implementation computes `n` times the median of a vector of size `n` 

335 which can be slow for large vectors. There are more efficient algorithms 

336 (see [2]_) which are not implemented here. 

337 

338 References 

339 ---------- 

340 .. [1] A. Siegel, "Robust Regression Using Repeated Medians", 

341 Biometrika, Vol. 69, pp. 242-244, 1982. 

342 

343 .. [2] A. Stein and M. Werman, "Finding the repeated median regression 

344 line", Proceedings of the Third Annual ACM-SIAM Symposium on 

345 Discrete Algorithms, pp. 409-413, 1992. 

346 

347 Examples 

348 -------- 

349 >>> from scipy import stats 

350 >>> import matplotlib.pyplot as plt 

351 

352 >>> x = np.linspace(-5, 5, num=150) 

353 >>> y = x + np.random.normal(size=x.size) 

354 >>> y[11:15] += 10 # add outliers 

355 >>> y[-5:] -= 7 

356 

357 Compute the slope and intercept. For comparison, also compute the 

358 least-squares fit with `linregress`: 

359 

360 >>> res = stats.siegelslopes(y, x) 

361 >>> lsq_res = stats.linregress(x, y) 

362 

363 Plot the results. The Siegel regression line is shown in red. The green 

364 line shows the least-squares fit for comparison. 

365 

366 >>> fig = plt.figure() 

367 >>> ax = fig.add_subplot(111) 

368 >>> ax.plot(x, y, 'b.') 

369 >>> ax.plot(x, res[1] + res[0] * x, 'r-') 

370 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-') 

371 >>> plt.show() 

372 

373 """ 

374 if method not in ['hierarchical', 'separate']: 

375 raise ValueError("method can only be 'hierarchical' or 'separate'") 

376 y = np.asarray(y).ravel() 

377 if x is None: 

378 x = np.arange(len(y), dtype=float) 

379 else: 

380 x = np.asarray(x, dtype=float).ravel() 

381 if len(x) != len(y): 

382 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x))) 

383 

384 deltax = x[:, np.newaxis] - x 

385 deltay = y[:, np.newaxis] - y 

386 slopes, intercepts = [], [] 

387 

388 for j in range(len(x)): 

389 id_nonzero = deltax[j, :] != 0 

390 slopes_j = deltay[j, id_nonzero] / deltax[j, id_nonzero] 

391 medslope_j = np.median(slopes_j) 

392 slopes.append(medslope_j) 

393 if method == 'separate': 

394 z = y*x[j] - y[j]*x 

395 medintercept_j = np.median(z[id_nonzero] / deltax[j, id_nonzero]) 

396 intercepts.append(medintercept_j) 

397 

398 medslope = np.median(np.asarray(slopes)) 

399 if method == "separate": 

400 medinter = np.median(np.asarray(intercepts)) 

401 else: 

402 medinter = np.median(y - medslope*x) 

403 

404 return medslope, medinter