Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from collections import namedtuple 

2import numpy as np 

3import warnings 

4from ._continuous_distns import chi2 

5from . import _wilcoxon_data 

6 

7 

# Result container returned by `epps_singleton_2samp`: holds the test
# statistic and the associated p-value.
Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
                                        ['statistic', 'pvalue'])

10 

11 

def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have
        more than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function
        is to be evaluated. It should be positive distinct numbers. The
        default value (0.4, 0.8) is proposed in [1]_. Input must not have
        more than one dimension.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Unlike the widely used Kolmogorov-Smirnov (KS) test, which compares
    empirical distribution functions, the ES test introduced by Epps and
    Singleton in [1]_ compares empirical characteristic functions.  As a
    consequence it does not assume a continuous distribution, and the
    authors of [1]_ conclude it often has higher power than the KS test.
    They recommend the ES test for discrete samples as well as continuous
    samples with at least 25 observations each; `anderson_ksamp` is
    recommended for smaller sample sizes in the continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic, which follows a `chi2` distribution.  If both `x` and `y`
    have fewer than 25 observations, the small-sample correction proposed
    in [1]_ is applied to the test statistic.

    The default values of `t` were determined in [1]_ by considering
    various distributions and picking values that give the test high power
    in general; Table III of [1]_ lists the optimal values for the
    distributions studied there.  In this implementation, `t` is scaled by
    the semi-interquartile range of the pooled sample, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """
    x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)

    # validate the two samples: 1d, length >= 5, all finite
    for name, sample in (('x', x), ('y', y)):
        if sample.ndim > 1:
            raise ValueError('{0} must be 1d, but {0}.ndim equals {1}.'
                             .format(name, sample.ndim))
    nx, ny = len(x), len(y)
    if nx < 5 or ny < 5:
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         '= {} and len(y) = {}.'.format(nx, ny))
    for name, sample in (('x', x), ('y', y)):
        if not np.isfinite(sample).all():
            raise ValueError('{} must not contain nonfinite values.'
                             .format(name))
    n = nx + ny

    # validate the evaluation points: 1d and strictly positive
    if t.ndim > 1:
        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
    if (t <= 0).any():
        raise ValueError('t must contain positive elements only.')

    # rescale t by the semi-interquartile range of the pooled sample, as
    # proposed in [1]; iqr is imported here to avoid a circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    ts = t.reshape(-1, 1) / sigma

    # empirical characteristic function contributions: one row per
    # observation, columns are the cos and sin parts for each scaled t,
    # i.e. shape (n_obs, 2*len(t))
    gx = np.concatenate((np.cos(ts*x), np.sin(ts*x))).T
    gy = np.concatenate((np.cos(ts*y), np.sin(ts*y))).T

    # covariance estimation of the ES test; the test uses the biased
    # covariance estimate
    cov_x = np.cov(gx.T, bias=True)
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*len(t):
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.')  # see p. 183 in [1]_

    # test statistic w; asymptotically chi-squared distributed with df = r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n * g_diff.dot(est_cov_inv.dot(g_diff))

    # small-sample correction of [1], applied when both samples are small
    if max(nx, ny) < 25:
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)

133 

134 

def _get_wilcoxon_distr(n):
    """
    Distribution of counts of the Wilcoxon ranksum statistic r_plus (sum of
    ranks of positive differences) for sample size ``n``.

    Returns an array with the counts/frequencies of all the possible ranks
    r = 0, ..., n*(n+1)/2.  Raises ``ValueError`` if no precomputed table
    exists for ``n``.
    """
    try:
        counts = _wilcoxon_data.COUNTS[n]
    except KeyError:
        raise ValueError("The exact distribution of the Wilcoxon test "
                         "statistic is not implemented for n={}".format(n))
    return np.array(counts, dtype=int)