Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/_lilliefors.py : 41%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and
3exponential distributions.
5`kstest_fit` is provided as a top-level function to access both tests.
6`kstest_normal` and `kstest_exponential` are provided as convenience functions
7with the appropriate test as the default.
8`lilliefors` is provided as an alias for `kstest_fit`.
10Created on Sat Oct 01 13:16:49 2011
12Author: Josef Perktold
13License: BSD-3
15pvalues for Lilliefors test are based on formula and table in
17An Analytic Approximation to the Distribution of Lilliefors's Test Statistic
18for Normality
19Author(s): Gerard E. Dallal and Leland WilkinsonSource: The American
20Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296
21Published by: American Statistical Association
22Stable URL: http://www.jstor.org/stable/2684607 .
24On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown
25Hubert W. Lilliefors
26Journal of the American Statistical Association, Vol. 62, No. 318.
27(Jun., 1967), pp. 399-402.
29---
31Updated 2017-07-23
32Jacob C. Kimmel
34Ref:
35Lilliefors, H.W.
36On the Kolmogorov-Smirnov test for the exponential distribution with mean
37unknown. Journal of the American Statistical Association, Vol 64, No. 325.
38(1969), pp. 387–389.
39"""
40from functools import partial
42import numpy as np
43from scipy import stats
45from ._lilliefors_critical_values import (critical_values,
46 asymp_critical_values,
47 PERCENTILES)
48from .tabledist import TableDist
51def _make_asymptotic_function(params):
52 """
53 Generates an asymptotic distribution callable from a param matrix
55 Polynomial is a[0] * x**(-1/2) + a[1] * x**(-1) + a[2] * x**(-3/2)
57 Parameters
58 ----------
59 params : ndarray
60 Array with shape (nalpha, 3) where nalpha is the number of
61 significance levels
62 """
64 def f(n):
65 poly = np.array([1, np.log(n), np.log(n) ** 2])
66 return np.exp(poly.dot(params.T))
68 return f
def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Calculate statistic for the Kolmogorov-Smirnov test for goodness of fit

    This calculates the test statistic for a test of the distribution G(x) of
    an observed variable against a given distribution F(x). Under the null
    hypothesis the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str or callable
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    args : tuple, sequence
        distribution parameters for call to cdf

    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----
    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    In contrast to scipy.stats.kstest, this function only calculates the
    statistic which can be used either as distance measure or to implement
    case specific p-values.
    """
    nobs = float(len(x))
    # Resolve the cdf argument to a plain callable.
    if isinstance(cdf, str):
        cdf_fn = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf_fn = cdf.cdf
    else:
        cdf_fn = cdf
    sorted_x = np.sort(x)
    cdf_at_x = cdf_fn(sorted_x, *args)
    # Upper and lower staircase values of the empirical CDF at each point.
    ecdf_hi = np.arange(1.0, nobs + 1) / nobs
    ecdf_lo = np.arange(0.0, nobs) / nobs
    d_above = (ecdf_hi - cdf_at_x).max()
    d_below = (cdf_at_x - ecdf_lo).max()
    if alternative == 'greater':
        return d_above
    if alternative == 'less':
        return d_below
    return max(d_above, d_below)
def get_lilliefors_table(dist='norm'):
    """
    Generates tables for significance levels of Lilliefors test statistics

    Tables for available normal and exponential distribution testing,
    as specified in Lilliefors references above

    Parameters
    ----------
    dist : str
        distribution being tested in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values

    Raises
    ------
    ValueError
        If ``dist`` is not 'norm' or 'exp'.
    """
    # function just to keep things together
    # for this test alpha is sf probability, i.e. right tail probability
    alpha = 1 - np.array(PERCENTILES) / 100.0
    alpha = alpha[::-1]
    dist = 'normal' if dist == 'norm' else dist
    if dist not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")
    cv_data = critical_values[dist]
    acv_data = asymp_critical_values[dist]

    # BUG FIX: ``dtype=np.float`` relied on the deprecated NumPy alias that
    # was removed in NumPy 1.24; the builtin ``float`` is the exact
    # replacement (both mean float64 here).
    size = np.array(sorted(cv_data), dtype=float)
    crit_lf = np.array([cv_data[key] for key in sorted(cv_data)])
    # reverse columns so they line up with the ascending ``alpha`` above
    crit_lf = crit_lf[:, ::-1]

    asym_params = np.array([acv_data[key] for key in sorted(acv_data)])
    asymp_fn = _make_asymptotic_function(asym_params[::-1])

    lf = TableDist(alpha, size, crit_lf, asymptotic=asymp_fn)
    return lf
# Module-level critical-value tables, built once at import time and reused
# by kstest_fit for p-value lookup.
lilliefors_table_norm = get_lilliefors_table(dist='norm')
lilliefors_table_expon = get_lilliefors_table(dist='exp')
def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1 which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore it does not check whether the pvalue is in
    the valid range.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    DallalWilkinson1986
    """
    # todo: check boundaries, valid range for n and Dmax
    if n > 100:
        # Dallal-Wilkinson large-sample rescaling. BUG FIX: the original
        # ``d_max *= ...`` mutated a caller-supplied ndarray in place
        # (``d_max`` is documented as array_like); rebind out-of-place so
        # the argument is left untouched.
        d_max = d_max * (n / 100.) ** 0.49
        n = 100
    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval
def kstest_fit(x, dist='norm', pvalmethod=None):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality. if `dist = 'exp'` `table` is
        always used. 'approx' uses the approximation formula of Dalal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of `table` is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Raises
    ------
    ValueError
        If ``dist`` is not 'norm' or 'exp', if ``pvalmethod`` is not None,
        'approx' or 'table', or if there are too few observations.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[1] and c[2] are
    coefficients that are shared all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are be returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see lilliefors_critical_value_simulation.py in
    the test directory.
    """
    if pvalmethod is None:
        pvalmethod = 'approx'
        import warnings
        msg = 'The default pvalmethod will change from "approx" to "table" ' \
              'after 0.11. The "table" method uses values from a very large ' \
              'simulation and is more accurate. Explicitly set this ' \
              'parameter to "approx" or "table" to silence this warning'
        warnings.warn(msg, FutureWarning)
    elif pvalmethod not in ('approx', 'table'):
        # BUG FIX: previously any unrecognized string silently behaved like
        # 'table'; fail fast so typos are caught by the caller.
        raise ValueError("Invalid pvalmethod parameter, must be 'approx' "
                         "or 'table'")

    x = np.asarray(x)
    nobs = len(x)

    # Standardize with the estimated parameters for the chosen distribution.
    if dist == 'norm':
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        # 'approx' is only valid for the normal case
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {0} requires at least {1} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # the approximation is only valid for pval < 0.1; otherwise use table
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval
# Public aliases: `lilliefors` is the historical name for kstest_fit, and the
# kstest_* convenience names pre-select the distribution being tested.
lilliefors = kstest_fit
kstest_normal = kstest_fit
kstest_exponential = partial(kstest_fit, dist='exp')