Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/contingency.py : 20%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Some functions for working with contingency tables (i.e. cross tabulations).
2"""
5from functools import reduce
6import numpy as np
7from .stats import power_divergence
10__all__ = ['margins', 'expected_freq', 'chi2_contingency']
13def margins(a):
14 """Return a list of the marginal sums of the array `a`.
16 Parameters
17 ----------
18 a : ndarray
19 The array for which to compute the marginal sums.
21 Returns
22 -------
23 margsums : list of ndarrays
24 A list of length `a.ndim`. `margsums[k]` is the result
25 of summing `a` over all axes except `k`; it has the same
26 number of dimensions as `a`, but the length of each axis
27 except axis `k` will be 1.
29 Examples
30 --------
31 >>> a = np.arange(12).reshape(2, 6)
32 >>> a
33 array([[ 0, 1, 2, 3, 4, 5],
34 [ 6, 7, 8, 9, 10, 11]])
35 >>> from scipy.stats.contingency import margins
36 >>> m0, m1 = margins(a)
37 >>> m0
38 array([[15],
39 [51]])
40 >>> m1
41 array([[ 6, 8, 10, 12, 14, 16]])
43 >>> b = np.arange(24).reshape(2,3,4)
44 >>> m0, m1, m2 = margins(b)
45 >>> m0
46 array([[[ 66]],
47 [[210]]])
48 >>> m1
49 array([[[ 60],
50 [ 92],
51 [124]]])
52 >>> m2
53 array([[[60, 66, 72, 78]]])
54 """
55 margsums = []
56 ranged = list(range(a.ndim))
57 for k in ranged:
58 marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
59 margsums.append(marg)
60 return margsums
63def expected_freq(observed):
64 """
65 Compute the expected frequencies from a contingency table.
67 Given an n-dimensional contingency table of observed frequencies,
68 compute the expected frequencies for the table based on the marginal
69 sums under the assumption that the groups associated with each
70 dimension are independent.
72 Parameters
73 ----------
74 observed : array_like
75 The table of observed frequencies. (While this function can handle
76 a 1-D array, that case is trivial. Generally `observed` is at
77 least 2-D.)
79 Returns
80 -------
81 expected : ndarray of float64
82 The expected frequencies, based on the marginal sums of the table.
83 Same shape as `observed`.
85 Examples
86 --------
87 >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
88 >>> from scipy.stats.contingency import expected_freq
89 >>> expected_freq(observed)
90 array([[ 12., 12., 16.],
91 [ 18., 18., 24.]])
93 """
94 # Typically `observed` is an integer array. If `observed` has a large
95 # number of dimensions or holds large values, some of the following
96 # computations may overflow, so we first switch to floating point.
97 observed = np.asarray(observed, dtype=np.float64)
99 # Create a list of the marginal sums.
100 margsums = margins(observed)
102 # Create the array of expected frequencies. The shapes of the
103 # marginal sums returned by apply_over_axes() are just what we
104 # need for broadcasting in the following product.
105 d = observed.ndim
106 expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
107 return expected
110def chi2_contingency(observed, correction=True, lambda_=None):
111 """Chi-square test of independence of variables in a contingency table.
113 This function computes the chi-square statistic and p-value for the
114 hypothesis test of independence of the observed frequencies in the
115 contingency table [1]_ `observed`. The expected frequencies are computed
116 based on the marginal sums under the assumption of independence; see
117 `scipy.stats.contingency.expected_freq`. The number of degrees of
118 freedom is (expressed using numpy functions and attributes)::
120 dof = observed.size - sum(observed.shape) + observed.ndim - 1
123 Parameters
124 ----------
125 observed : array_like
126 The contingency table. The table contains the observed frequencies
127 (i.e. number of occurrences) in each category. In the two-dimensional
128 case, the table is often described as an "R x C table".
129 correction : bool, optional
130 If True, *and* the degrees of freedom is 1, apply Yates' correction
131 for continuity. The effect of the correction is to adjust each
132 observed value by 0.5 towards the corresponding expected value.
133 lambda_ : float or str, optional.
134 By default, the statistic computed in this test is Pearson's
135 chi-squared statistic [2]_. `lambda_` allows a statistic from the
136 Cressie-Read power divergence family [3]_ to be used instead. See
137 `power_divergence` for details.
139 Returns
140 -------
141 chi2 : float
142 The test statistic.
143 p : float
144 The p-value of the test
145 dof : int
146 Degrees of freedom
147 expected : ndarray, same shape as `observed`
148 The expected frequencies, based on the marginal sums of the table.
150 See Also
151 --------
152 contingency.expected_freq
153 fisher_exact
154 chisquare
155 power_divergence
157 Notes
158 -----
159 An often quoted guideline for the validity of this calculation is that
160 the test should be used only if the observed and expected frequencies
161 in each cell are at least 5.
163 This is a test for the independence of different categories of a
164 population. The test is only meaningful when the dimension of
165 `observed` is two or more. Applying the test to a one-dimensional
166 table will always result in `expected` equal to `observed` and a
167 chi-square statistic equal to 0.
169 This function does not handle masked arrays, because the calculation
170 does not make sense with missing values.
172 Like stats.chisquare, this function computes a chi-square statistic;
173 the convenience this function provides is to figure out the expected
174 frequencies and degrees of freedom from the given contingency table.
175 If these were already known, and if the Yates' correction was not
176 required, one could use stats.chisquare. That is, if one calls::
178 chi2, p, dof, ex = chi2_contingency(obs, correction=False)
180 then the following is true::
182 (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
183 ddof=obs.size - 1 - dof)
185 The `lambda_` argument was added in version 0.13.0 of scipy.
187 References
188 ----------
189 .. [1] "Contingency table",
190 https://en.wikipedia.org/wiki/Contingency_table
191 .. [2] "Pearson's chi-squared test",
192 https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
193 .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
194 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
195 pp. 440-464.
197 Examples
198 --------
199 A two-way example (2 x 3):
201 >>> from scipy.stats import chi2_contingency
202 >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
203 >>> chi2_contingency(obs)
204 (2.7777777777777777,
205 0.24935220877729619,
206 2,
207 array([[ 12., 12., 16.],
208 [ 18., 18., 24.]]))
210 Perform the test using the log-likelihood ratio (i.e. the "G-test")
211 instead of Pearson's chi-squared statistic.
213 >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
214 >>> g, p
215 (2.7688587616781319, 0.25046668010954165)
217 A four-way example (2 x 2 x 2 x 2):
219 >>> obs = np.array(
220 ... [[[[12, 17],
221 ... [11, 16]],
222 ... [[11, 12],
223 ... [15, 16]]],
224 ... [[[23, 15],
225 ... [30, 22]],
226 ... [[14, 17],
227 ... [15, 16]]]])
228 >>> chi2_contingency(obs)
229 (8.7584514426741897,
230 0.64417725029295503,
231 11,
232 array([[[[ 14.15462386, 14.15462386],
233 [ 16.49423111, 16.49423111]],
234 [[ 11.2461395 , 11.2461395 ],
235 [ 13.10500554, 13.10500554]]],
236 [[[ 19.5591166 , 19.5591166 ],
237 [ 22.79202844, 22.79202844]],
238 [[ 15.54012004, 15.54012004],
239 [ 18.10873492, 18.10873492]]]]))
240 """
241 observed = np.asarray(observed)
242 if np.any(observed < 0):
243 raise ValueError("All values in `observed` must be nonnegative.")
244 if observed.size == 0:
245 raise ValueError("No data; `observed` has size 0.")
247 expected = expected_freq(observed)
248 if np.any(expected == 0):
249 # Include one of the positions where expected is zero in
250 # the exception message.
251 zeropos = list(zip(*np.nonzero(expected == 0)))[0]
252 raise ValueError("The internally computed table of expected "
253 "frequencies has a zero element at %s." % (zeropos,))
255 # The degrees of freedom
256 dof = expected.size - sum(expected.shape) + expected.ndim - 1
258 if dof == 0:
259 # Degenerate case; this occurs when `observed` is 1D (or, more
260 # generally, when it has only one nontrivial dimension). In this
261 # case, we also have observed == expected, so chi2 is 0.
262 chi2 = 0.0
263 p = 1.0
264 else:
265 if dof == 1 and correction:
266 # Adjust `observed` according to Yates' correction for continuity.
267 observed = observed + 0.5 * np.sign(expected - observed)
269 chi2, p = power_divergence(observed, expected,
270 ddof=observed.size - 1 - dof, axis=None,
271 lambda_=lambda_)
273 return chi2, p, dof, expected