Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/sandbox/stats/runs.py : 13%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1'''runstest
3formulas for mean and var of runs taken from SAS manual NPAR tests, also idea
4for runstest_1samp and runstest_2samp
6Description in NIST handbook and dataplot does not explain their expected
7values, or variance
9Note:
10There are (at least) two definitions of runs used in literature. The classical
11definition which is also used here, is that runs are sequences of identical
12observations separated by observations with different realizations.
13The second definition allows for overlapping runs, or runs where counting a
14run is also started after a run of a fixed length of the same kind.
17TODO
18* add one-sided tests where possible or where it makes sense
20'''
22import numpy as np
23from scipy import stats
24from scipy.special import comb
25import warnings
27class Runs(object):
28 '''class for runs in a binary sequence
31 Parameters
32 ----------
33 x : array_like, 1d
34 data array,
37 Notes
38 -----
39 This was written as a more general class for runs. This has some redundant
40 calculations when only the runs_test is used.
42 TODO: make it lazy
44 The runs test could be generalized to more than 1d if there is a use case
45 for it.
47 This should be extended once I figure out what the distribution of runs
48 of any length k is.
50 The exact distribution for the runs test is also available but not yet
51 verified.
53 '''
55 def __init__(self, x):
56 self.x = np.asarray(x)
58 self.runstart = runstart = np.nonzero(np.diff(np.r_[[-np.inf], x, [np.inf]]))[0]
59 self.runs = runs = np.diff(runstart)
60 self.runs_sign = runs_sign = x[runstart[:-1]]
61 self.runs_pos = runs[runs_sign==1]
62 self.runs_neg = runs[runs_sign==0]
63 self.runs_freqs = np.bincount(runs)
64 self.n_runs = len(self.runs)
65 self.n_pos = (x==1).sum()
67 def runs_test(self, correction=True):
68 '''basic version of runs test
70 Parameters
71 ----------
72 correction : bool
73 Following the SAS manual, for samplesize below 50, the test
74 statistic is corrected by 0.5. This can be turned off with
75 correction=False, and was included to match R, tseries, which
76 does not use any correction.
78 pvalue based on normal distribution, with integer correction
80 '''
81 self.npo = npo = (self.runs_pos).sum()
82 self.nne = nne = (self.runs_neg).sum()
84 #n_r = self.n_runs
85 n = npo + nne
86 npn = npo * nne
87 rmean = 2. * npn / n + 1
88 rvar = 2. * npn * (2.*npn - n) / n**2. / (n-1.)
89 rstd = np.sqrt(rvar)
90 rdemean = self.n_runs - rmean
91 if n >= 50 or not correction:
92 z = rdemean
93 else:
94 if rdemean > 0.5:
95 z = rdemean - 0.5
96 elif rdemean < 0.5:
97 z = rdemean + 0.5
98 else:
99 z = 0.
101 z /= rstd
102 pval = 2 * stats.norm.sf(np.abs(z))
103 return z, pval
105def runstest_1samp(x, cutoff='mean', correction=True):
106 '''use runs test on binary discretized data above/below cutoff
108 Parameters
109 ----------
110 x : array_like
111 data, numeric
112 cutoff : {'mean', 'median'} or number
113 This specifies the cutoff to split the data into large and small
114 values.
115 correction : bool
116 Following the SAS manual, for samplesize below 50, the test
117 statistic is corrected by 0.5. This can be turned off with
118 correction=False, and was included to match R, tseries, which
119 does not use any correction.
121 Returns
122 -------
123 z_stat : float
124 test statistic, asymptotically normally distributed
125 p-value : float
126 p-value, reject the null hypothesis if it is below an type 1 error
127 level, alpha .
129 '''
131 if cutoff == 'mean':
132 cutoff = np.mean(x)
133 elif cutoff == 'median':
134 cutoff = np.median(x)
135 xindicator = (x >= cutoff).astype(int)
136 return Runs(xindicator).runs_test(correction=correction)
138def runstest_2samp(x, y=None, groups=None, correction=True):
139 '''Wald-Wolfowitz runstest for two samples
141 This tests whether two samples come from the same distribution.
143 Parameters
144 ----------
145 x : array_like
146 data, numeric, contains either one group, if y is also given, or
147 both groups, if additionally a group indicator is provided
148 y : array_like (optional)
149 data, numeric
150 groups : array_like
151 group labels or indicator the data for both groups is given in a
152 single 1-dimensional array, x. If group labels are not [0,1], then
153 correction : bool
154 Following the SAS manual, for samplesize below 50, the test
155 statistic is corrected by 0.5. This can be turned off with
156 correction=False, and was included to match R, tseries, which
157 does not use any correction.
159 Returns
160 -------
161 z_stat : float
162 test statistic, asymptotically normally distributed
163 p-value : float
164 p-value, reject the null hypothesis if it is below an type 1 error
165 level, alpha .
168 Notes
169 -----
170 Wald-Wolfowitz runs test.
172 If there are ties, then then the test statistic and p-value that is
173 reported, is based on the higher p-value between sorting all tied
174 observations of the same group
177 This test is intended for continuous distributions
178 SAS has treatment for ties, but not clear, and sounds more complicated
179 (minimum and maximum possible runs prevent use of argsort)
180 (maybe it's not so difficult, idea: add small positive noise to first
181 one, run test, then to the other, run test, take max(?) p-value - DONE
182 This gives not the minimum and maximum of the number of runs, but should
183 be close. Not true, this is close to minimum but far away from maximum.
184 maximum number of runs would use alternating groups in the ties.)
185 Maybe adding random noise would be the better approach.
187 SAS has exact distribution for sample size <=30, does not look standard
188 but should be easy to add.
190 currently two-sided test only
192 This has not been verified against a reference implementation. In a short
193 Monte Carlo simulation where both samples are normally distribute, the test
194 seems to be correctly sized for larger number of observations (30 or
195 larger), but conservative (i.e. reject less often than nominal) with a
196 sample size of 10 in each group.
198 See Also
199 --------
200 runs_test_1samp
201 Runs
202 RunsProb
204 '''
205 x = np.asarray(x)
206 if y is not None:
207 y = np.asarray(y)
208 groups = np.concatenate((np.zeros(len(x)), np.ones(len(y))))
209 # note reassigning x
210 x = np.concatenate((x, y))
211 gruni = np.arange(2)
212 elif groups is not None:
213 gruni = np.unique(groups)
214 if gruni.size != 2: # pylint: disable=E1103
215 raise ValueError('not exactly two groups specified')
216 #require groups to be numeric ???
217 else:
218 raise ValueError('either y or groups is necessary')
220 xargsort = np.argsort(x)
221 #check for ties
222 x_sorted = x[xargsort]
223 x_diff = np.diff(x_sorted) # used for detecting and handling ties
224 if x_diff.min() == 0:
225 print('ties detected') #replace with warning
226 x_mindiff = x_diff[x_diff > 0].min()
227 eps = x_mindiff/2.
228 xx = x.copy() #do not change original, just in case
230 xx[groups==gruni[0]] += eps
231 xargsort = np.argsort(xx)
232 xindicator = groups[xargsort]
233 z0, p0 = Runs(xindicator).runs_test(correction=correction)
235 xx[groups==gruni[0]] -= eps #restore xx = x
236 xx[groups==gruni[1]] += eps
237 xargsort = np.argsort(xx)
238 xindicator = groups[xargsort]
239 z1, p1 = Runs(xindicator).runs_test(correction=correction)
241 idx = np.argmax([p0,p1])
242 return [z0, z1][idx], [p0, p1][idx]
244 else:
245 xindicator = groups[xargsort]
246 return Runs(xindicator).runs_test(correction=correction)
249class TotalRunsProb(object):
250 '''class for the probability distribution of total runs
252 This is the exact probability distribution for the (Wald-Wolfowitz)
253 runs test. The random variable is the total number of runs if the
254 sample has (n0, n1) observations of groups 0 and 1.
257 Notes
258 -----
259 Written as a class so I can store temporary calculations, but I do not
260 think it matters much.
262 Formulas taken from SAS manual for one-sided significance level.
264 Could be converted to a full univariate distribution, subclassing
265 scipy.stats.distributions.
267 *Status*
268 Not verified yet except for mean.
272 '''
274 def __init__(self, n0, n1):
275 self.n0 = n0
276 self.n1 = n1
277 self.n = n = n0 + n1
278 self.comball = comb(n, n1)
280 def runs_prob_even(self, r):
281 n0, n1 = self.n0, self.n1
282 tmp0 = comb(n0-1, r//2-1)
283 tmp1 = comb(n1-1, r//2-1)
284 return tmp0 * tmp1 * 2. / self.comball
286 def runs_prob_odd(self, r):
287 n0, n1 = self.n0, self.n1
288 k = (r+1)//2
289 tmp0 = comb(n0-1, k-1)
290 tmp1 = comb(n1-1, k-2)
291 tmp3 = comb(n0-1, k-2)
292 tmp4 = comb(n1-1, k-1)
293 return (tmp0 * tmp1 + tmp3 * tmp4) / self.comball
295 def pdf(self, r):
296 r = np.asarray(r)
297 r_isodd = np.mod(r, 2) > 0
298 r_odd = r[r_isodd]
299 r_even = r[~r_isodd]
300 runs_pdf = np.zeros(r.shape)
301 runs_pdf[r_isodd] = self.runs_prob_odd(r_odd)
302 runs_pdf[~r_isodd] = self.runs_prob_even(r_even)
303 return runs_pdf
306 def cdf(self, r):
307 r_ = np.arange(2,r+1)
308 cdfval = self.runs_prob_even(r_[::2]).sum()
309 cdfval += self.runs_prob_odd(r_[1::2]).sum()
310 return cdfval
313class RunsProb(object):
314 '''distribution of success runs of length k or more (classical definition)
316 The underlying process is assumed to be a sequence of Bernoulli trials
317 of a given length n.
319 not sure yet, how to interpret or use the distribution for runs
320 of length k or more.
322 Musseli also has longest success run, and waiting time distribution
323 negative binomial of order k and geometric of order k
325 need to compare with Godpole
327 need a MonteCarlo function to do some quick tests before doing more
330 '''
334 def pdf(self, x, k, n, p):
335 '''distribution of success runs of length k or more
337 Parameters
338 ----------
339 x : float
340 count of runs of length n
341 k : int
342 length of runs
343 n : int
344 total number of observations or trials
345 p : float
346 probability of success in each Bernoulli trial
348 Returns
349 -------
350 pdf : float
351 probability that x runs of length of k are observed
353 Notes
354 -----
355 not yet vectorized
357 References
358 ----------
359 Muselli 1996, theorem 3
360 '''
362 q = 1-p
363 m = np.arange(x, (n+1)//(k+1)+1)[:,None]
364 terms = (-1)**(m-x) * comb(m, x) * p**(m*k) * q**(m-1) \
365 * (comb(n - m*k, m - 1) + q * comb(n - m*k, m))
366 return terms.sum(0)
368 def pdf_nb(self, x, k, n, p):
369 pass
370 #y = np.arange(m-1, n-mk+1
372'''
373>>> [np.sum([RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]) for k in range(16)]
374[0.99999332193894064, 0.99999999999999367, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
375>>> [(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)]
376[6.9998931510341809, 4.1406249999999929, 2.4414062500000075, 1.4343261718749996, 0.83923339843749856, 0.48875808715820324, 0.28312206268310569, 0.1629814505577086, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
377>>> np.array([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)])/11
378array([ 0.63635392, 0.37642045, 0.22194602, 0.13039329, 0.07629395,
379 0.04443255, 0.02573837, 0.0148165 , 0. , 0. ,
380 0. , 0. , 0. , 0. , 0. , 0. ])
381>>> np.diff([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)][::-1])
382array([ 0. , 0. , 0. , 0. , 0. ,
383 0. , 0. , 0.16298145, 0.12014061, 0.20563602,
384 0.35047531, 0.59509277, 1.00708008, 1.69921875, 2.85926815])
385'''
389def median_test_ksample(x, groups):
390 '''chisquare test for equality of median/location
392 This tests whether all groups have the same fraction of observations
393 above the median.
395 Parameters
396 ----------
397 x : array_like
398 data values stacked for all groups
399 groups : array_like
400 group labels or indicator
402 Returns
403 -------
404 stat : float
405 test statistic
406 pvalue : float
407 pvalue from the chisquare distribution
408 others ????
409 currently some test output, table and expected
411 '''
412 x = np.asarray(x)
413 gruni = np.unique(groups)
414 xli = [x[groups==group] for group in gruni]
415 xmedian = np.median(x)
416 counts_larger = np.array([(xg > xmedian).sum() for xg in xli])
417 counts = np.array([len(xg) for xg in xli])
418 counts_smaller = counts - counts_larger
419 nobs = counts.sum()
420 n_larger = (x > xmedian).sum()
421 n_smaller = nobs - n_larger
422 table = np.vstack((counts_smaller, counts_larger))
424 #the following should be replaced by chisquare_contingency table
425 expected = np.vstack((counts * 1. / nobs * n_smaller,
426 counts * 1. / nobs * n_larger))
428 if (expected < 5).any():
429 print('Warning: There are cells with less than 5 expected' \
430 'observations. The chisquare distribution might not be a good' \
431 'approximation for the true distribution.')
433 #check ddof
434 return stats.chisquare(table.ravel(), expected.ravel(), ddof=1), table, expected
439def cochrans_q(x):
440 '''Cochran's Q test for identical effect of k treatments
442 Cochran's Q is a k-sample extension of the McNemar test. If there are only
443 two treatments, then Cochran's Q test and McNemar test are equivalent.
445 Test that the probability of success is the same for each treatment.
446 The alternative is that at least two treatments have a different
447 probability of success.
449 Parameters
450 ----------
451 x : array_like, 2d (N,k)
452 data with N cases and k variables
454 Returns
455 -------
456 q_stat : float
457 test statistic
458 pvalue : float
459 pvalue from the chisquare distribution
461 Notes
462 -----
463 In Wikipedia terminology, rows are blocks and columns are treatments.
464 The number of rows N, should be large for the chisquare distribution to be
465 a good approximation.
466 The Null hypothesis of the test is that all treatments have the
467 same effect.
469 References
470 ----------
471 https://en.wikipedia.org/wiki/Cochran_test
472 SAS Manual for NPAR TESTS
474 '''
476 warnings.warn("Deprecated, use stats.cochrans_q instead", DeprecationWarning)
478 x = np.asarray(x)
479 gruni = np.unique(x)
480 N, k = x.shape
481 count_row_success = (x==gruni[-1]).sum(1, float)
482 count_col_success = (x==gruni[-1]).sum(0, float)
483 count_row_ss = count_row_success.sum()
484 count_col_ss = count_col_success.sum()
485 assert count_row_ss == count_col_ss #just a calculation check
488 #this is SAS manual
489 q_stat = (k-1) * (k * np.sum(count_col_success**2) - count_col_ss**2) \
490 / (k * count_row_ss - np.sum(count_row_success**2))
492 #Note: the denominator looks just like k times the variance of the
493 #columns
495 #Wikipedia uses a different, but equivalent expression
496## q_stat = (k-1) * (k * np.sum(count_row_success**2) - count_row_ss**2) \
497## / (k * count_col_ss - np.sum(count_col_success**2))
499 return q_stat, stats.chi2.sf(q_stat, k-1)
501def mcnemar(x, y=None, exact=True, correction=True):
502 '''McNemar test
504 Parameters
505 ----------
506 x, y : array_like
507 two paired data samples. If y is None, then x can be a 2 by 2
508 contingency table. x and y can have more than one dimension, then
509 the results are calculated under the assumption that axis zero
510 contains the observation for the samples.
511 exact : bool
512 If exact is true, then the binomial distribution will be used.
513 If exact is false, then the chisquare distribution will be used, which
514 is the approximation to the distribution of the test statistic for
515 large sample sizes.
516 correction : bool
517 If true, then a continuity correction is used for the chisquare
518 distribution (if exact is false.)
520 Returns
521 -------
522 stat : float or int, array
523 The test statistic is the chisquare statistic if exact is false. If the
524 exact binomial distribution is used, then this contains the min(n1, n2),
525 where n1, n2 are cases that are zero in one sample but one in the other
526 sample.
528 pvalue : float or array
529 p-value of the null hypothesis of equal effects.
531 Notes
532 -----
533 This is a special case of Cochran's Q test. The results when the chisquare
534 distribution is used are identical, except for continuity correction.
536 '''
538 warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)
540 x = np.asarray(x)
541 if y is None and x.shape[0] == x.shape[1]:
542 if x.shape[0] != 2:
543 raise ValueError('table needs to be 2 by 2')
544 n1, n2 = x[1, 0], x[0, 1]
545 else:
546 # I'm not checking here whether x and y are binary,
547 # is not this also paired sign test
548 n1 = np.sum(x < y, 0)
549 n2 = np.sum(x > y, 0)
551 if exact:
552 stat = np.minimum(n1, n2)
553 # binom is symmetric with p=0.5
554 pval = stats.binom.cdf(stat, n1 + n2, 0.5) * 2
555 pval = np.minimum(pval, 1) # limit to 1 if n1==n2
556 else:
557 corr = int(correction) # convert bool to 0 or 1
558 stat = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
559 df = 1
560 pval = stats.chi2.sf(stat, df)
561 return stat, pval
564def symmetry_bowker(table):
565 '''Test for symmetry of a (k, k) square contingency table
567 This is an extension of the McNemar test to test the Null hypothesis
568 that the contingency table is symmetric around the main diagonal, that is
570 n_{i, j} = n_{j, i} for all i, j
572 Parameters
573 ----------
574 table : array_like, 2d, (k, k)
575 a square contingency table that contains the count for k categories
576 in rows and columns.
578 Returns
579 -------
580 statistic : float
581 chisquare test statistic
582 p-value : float
583 p-value of the test statistic based on chisquare distribution
584 df : int
585 degrees of freedom of the chisquare distribution
587 Notes
588 -----
589 Implementation is based on the SAS documentation, R includes it in
590 `mcnemar.test` if the table is not 2 by 2.
592 The pvalue is based on the chisquare distribution which requires that the
593 sample size is not very small to be a good approximation of the true
594 distribution. For 2x2 contingency tables exact distribution can be
595 obtained with `mcnemar`
597 See Also
598 --------
599 mcnemar
602 '''
604 warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)
606 table = np.asarray(table)
607 k, k2 = table.shape
608 if k != k2:
609 raise ValueError('table needs to be square')
611 #low_idx = np.tril_indices(k, -1) # this does not have Fortran order
612 upp_idx = np.triu_indices(k, 1)
614 tril = table.T[upp_idx] # lower triangle in column order
615 triu = table[upp_idx] # upper triangle in row order
617 stat = ((tril - triu)**2 / (tril + triu + 1e-20)).sum()
618 df = k * (k-1) / 2.
619 pval = stats.chi2.sf(stat, df)
621 return stat, pval, df
624if __name__ == '__main__':
626 x1 = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])
628 print(Runs(x1).runs_test())
629 print(runstest_1samp(x1, cutoff='mean'))
630 print(runstest_2samp(np.arange(16,0,-1), groups=x1))
631 print(TotalRunsProb(7,9).cdf(11))
632 print(median_test_ksample(np.random.randn(100), np.random.randint(0,2,100)))
633 print(cochrans_q(np.random.randint(0,2,(100,8))))