Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Some functions for working with contingency tables (i.e. cross tabulations). 

2""" 

3 

4 

5from functools import reduce 

6import numpy as np 

7from .stats import power_divergence 

8 

9 

10__all__ = ['margins', 'expected_freq', 'chi2_contingency'] 

11 

12 

def margins(a):
    """Compute the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array whose marginal sums are to be computed.

    Returns
    -------
    margsums : list of ndarrays
        A list of length ``a.ndim``.  Element ``k`` is `a` summed over
        every axis other than ``k``; it keeps the same number of
        dimensions as `a`, but every axis except axis ``k`` has length 1.

    Examples
    --------
    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> from scipy.stats.contingency import margins
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2,3,4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    axes = list(range(a.ndim))
    # apply_over_axes keeps each summed axis as a length-1 dimension, so
    # every marginal broadcasts cleanly against the original array.
    return [np.apply_over_axes(np.sum, a, [j for j in axes if j != k])
            for k in axes]

61 

62 

def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can
        handle a 1-D array, that case is trivial.  Generally `observed`
        is at least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the
        table.  Same shape as `observed`.

    Examples
    --------
    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
    >>> from scipy.stats.contingency import expected_freq
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Convert to float64 up front: an integer table with many dimensions
    # or large values could overflow in the products formed below.
    obs = np.asarray(observed, dtype=np.float64)
    ndim = obs.ndim

    # Accumulate the product of the marginal sums.  apply_over_axes
    # leaves each summed axis in place with length 1, so every marginal
    # broadcasts against the running product.
    expected = np.ones_like(obs)
    for k in range(ndim):
        other_axes = [axis for axis in range(ndim) if axis != k]
        expected = expected * np.apply_over_axes(np.sum, obs, other_axes)

    # Under independence, each cell is the product of its marginals
    # divided by the grand total raised to (ndim - 1).
    return expected / obs.sum() ** (ndim - 1)

108 

109 

def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    Compute the chi-square statistic and p-value for the hypothesis test
    of independence of the observed frequencies in the contingency table
    [1]_ `observed`.  The expected frequencies are computed based on the
    marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1

    Parameters
    ----------
    observed : array_like
        The contingency table of observed frequencies (i.e. numbers of
        occurrences) in each category.  In the two-dimensional case the
        table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates'
        correction for continuity: each observed value is adjusted by
        0.5 towards the corresponding expected value.
    lambda_ : float or str, optional.
        By default the statistic computed is Pearson's chi-squared
        statistic [2]_.  `lambda_` selects a statistic from the
        Cressie-Read power divergence family [3]_ instead; see
        `power_divergence` for details.

    Returns
    -------
    chi2 : float
        The test statistic.
    p : float
        The p-value of the test
    dof : int
        Degrees of freedom
    expected : ndarray, same shape as `observed`
        The expected frequencies, based on the marginal sums of the table.

    See Also
    --------
    contingency.expected_freq
    fisher_exact
    chisquare
    power_divergence

    Notes
    -----
    A common guideline for the validity of this calculation is that the
    test should be used only if the observed and expected frequencies in
    each cell are at least 5.

    The test is only meaningful when `observed` has two or more
    dimensions; applied to a one-dimensional table, `expected` equals
    `observed` and the chi-square statistic is 0.

    Masked arrays are not handled, because the calculation does not make
    sense with missing values.

    Like stats.chisquare, this function computes a chi-square statistic;
    the convenience it provides is deriving the expected frequencies and
    degrees of freedom from the given contingency table.  If these were
    already known, and Yates' correction was not required, one could use
    stats.chisquare.  That is, if one calls::

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    then the following is true::

        (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
                                     ddof=obs.size - 1 - dof)

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> chi2_contingency(obs)
    (2.7777777777777777,
     0.24935220877729619,
     2,
     array([[ 12.,  12.,  16.],
            [ 18.,  18.,  24.]]))

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
    >>> g, p
    (2.7688587616781319, 0.25046668010954165)
    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    zero_cells = (expected == 0)
    if zero_cells.any():
        # Report one position where the expected frequency vanished.
        first_zero = tuple(np.argwhere(zero_cells)[0])
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (first_zero,))

    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case: `observed` has at most one nontrivial
        # dimension, so observed == expected and the statistic is 0.
        return 0.0, 1.0, dof, expected

    if correction and dof == 1:
        # Yates' continuity correction: move each observed count 0.5
        # towards its expected value.
        observed = observed + 0.5 * np.sign(expected - observed)

    chi2, p = power_divergence(observed, expected,
                               ddof=observed.size - 1 - dof, axis=None,
                               lambda_=lambda_)

    return chi2, p, dof, expected