1"""
2Methods for analyzing two-way contingency tables (i.e. frequency
3tables for observations that are cross-classified with respect to two
4categorical variables).
6The main classes are:
8 * Table : implements methods that can be applied to any two-way
9 contingency table.
11 * SquareTable : implements methods that can be applied to a square
12 two-way contingency table.
14 * Table2x2 : implements methods that can be applied to a 2x2
15 contingency table.
17 * StratifiedTable : implements methods that can be applied to a
18 collection of 2x2 contingency tables.
20Also contains functions for conducting McNemar's test and Cochran's q
21test.
23Note that the inference procedures may depend on how the data were
24sampled. In general the observed units are independent and
25identically distributed.
26"""
28from statsmodels.tools.decorators import cache_readonly
29import numpy as np
30from scipy import stats
31import pandas as pd
32import warnings
33from statsmodels import iolib
34from statsmodels.tools import sm_exceptions


def _make_df_square(table):
    """
    Reindex a pandas DataFrame so that it becomes square, meaning that
    the row and column indices contain the same values, in the same
    order.  The row and column index are extended to achieve this.
    """

    if not isinstance(table, pd.DataFrame):
        return table

    # If the table is not square, make it square
    if not table.index.equals(table.columns):
        ix = list(set(table.index) | set(table.columns))
        ix.sort()
        table = table.reindex(index=ix, columns=ix, fill_value=0)

    # Ensures that the rows and columns are in the same order.
    table = table.reindex(table.columns)

    return table
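

# A minimal usage sketch for _make_df_square (the function name and example
# data below are illustrative, not part of the library):
def _example_make_df_square():
    df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["b", "c"])
    sq = _make_df_square(df)
    # sq has index and columns ["a", "b", "c"]; cells absent from the
    # original table are filled with 0, and rows follow the column order.
    return sq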


class _Bunch(object):

    def __repr__(self):
        return "<bunch containing results, print to see contents>"

    def __str__(self):
        ky = [k for k, _ in self.__dict__.items()]
        ky.sort()
        m = max([len(k) for k in ky])
        tab = []
        f = "{:" + str(m) + "} {}"
        for k in ky:
            tab.append(f.format(k, self.__dict__[k]))
        return "\n".join(tab)


class Table(object):
    """
    A two-way contingency table.

    Parameters
    ----------
    table : array_like
        A contingency table.
    shift_zeros : bool
        If True and any cell count is zero, the zero cells are
        replaced by 0.5 (other cells are unchanged).

    Attributes
    ----------
    table_orig : array_like
        The original table is cached as `table_orig`.

    See Also
    --------
    statsmodels.graphics.mosaicplot.mosaic
    scipy.stats.chi2_contingency

    Notes
    -----
    The inference procedures used here are all based on a sampling
    model in which the units are independent and identically
    distributed, with each unit being classified with respect to two
    categorical variables.

    References
    ----------
    Definitions of residuals:
        https://onlinecourses.science.psu.edu/stat504/node/86
    """

    def __init__(self, table, shift_zeros=True):

        self.table_orig = table
        self.table = np.asarray(table, dtype=np.float64)

        if shift_zeros and (self.table.min() == 0):
            self.table[self.table == 0] = 0.5

    def __str__(self):
        s = ("A %dx%d contingency table with counts:\n" %
             tuple(self.table.shape))
        s += np.array_str(self.table)
        return s

    @classmethod
    def from_data(cls, data, shift_zeros=True):
        """
        Construct a Table object from data.

        Parameters
        ----------
        data : array_like
            The raw data, from which a contingency table is constructed
            using the first two columns.
        shift_zeros : bool
            If True and any cell count is zero, the zero cells are
            replaced by 0.5.

        Returns
        -------
        A Table instance.
        """

        if isinstance(data, pd.DataFrame):
            table = pd.crosstab(data.iloc[:, 0], data.iloc[:, 1])
        else:
            table = pd.crosstab(data[:, 0], data[:, 1])

        return cls(table, shift_zeros)

    def test_nominal_association(self):
        """
        Assess independence for nominal factors.

        Assessment of independence between rows and columns using
        chi^2 testing.  The rows and columns are treated as nominal
        (unordered) categorical variables.

        Returns
        -------
        A bunch containing the following attributes:

        statistic : float
            The chi^2 test statistic.
        df : int
            The degrees of freedom of the reference distribution.
        pvalue : float
            The p-value for the test.
        """

        statistic = np.asarray(self.chi2_contribs).sum()
        df = np.prod(np.asarray(self.table.shape) - 1)
        pvalue = 1 - stats.chi2.cdf(statistic, df)
        b = _Bunch()
        b.statistic = statistic
        b.df = df
        b.pvalue = pvalue
        return b

    def test_ordinal_association(self, row_scores=None, col_scores=None):
        """
        Assess independence between two ordinal variables.

        This is the 'linear by linear' association test, which uses
        weights or scores to target the test to have more power
        against ordered alternatives.

        Parameters
        ----------
        row_scores : array_like
            An array of numeric row scores
        col_scores : array_like
            An array of numeric column scores

        Returns
        -------
        A bunch with the following attributes:

        statistic : float
            The test statistic.
        null_mean : float
            The expected value of the test statistic under the null
            hypothesis.
        null_sd : float
            The standard deviation of the test statistic under the
            null hypothesis.
        zscore : float
            The Z-score for the test statistic.
        pvalue : float
            The p-value for the test.

        Notes
        -----
        The scores define the trend to which the test is most sensitive.

        Using the default row and column scores gives the
        Cochran-Armitage trend test.
        """

        if row_scores is None:
            row_scores = np.arange(self.table.shape[0])

        if col_scores is None:
            col_scores = np.arange(self.table.shape[1])

        if len(row_scores) != self.table.shape[0]:
            msg = ("The length of `row_scores` must match the first " +
                   "dimension of `table`.")
            raise ValueError(msg)

        if len(col_scores) != self.table.shape[1]:
            msg = ("The length of `col_scores` must match the second " +
                   "dimension of `table`.")
            raise ValueError(msg)

        # The test statistic
        statistic = np.dot(row_scores, np.dot(self.table, col_scores))

        # Some needed quantities
        n_obs = self.table.sum()
        rtot = self.table.sum(1)
        um = np.dot(row_scores, rtot)
        u2m = np.dot(row_scores**2, rtot)
        ctot = self.table.sum(0)
        vn = np.dot(col_scores, ctot)
        v2n = np.dot(col_scores**2, ctot)

        # The null mean and variance of the test statistic
        e_stat = um * vn / n_obs
        v_stat = (u2m - um**2 / n_obs) * (v2n - vn**2 / n_obs) / (n_obs - 1)
        sd_stat = np.sqrt(v_stat)

        zscore = (statistic - e_stat) / sd_stat
        pvalue = 2 * stats.norm.cdf(-np.abs(zscore))

        b = _Bunch()
        b.statistic = statistic
        b.null_mean = e_stat
        b.null_sd = sd_stat
        b.zscore = zscore
        b.pvalue = pvalue
        return b
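
    # Worked check of the statistic above (illustrative counts, not from
    # the library): with default scores u = (0, 1) and v = (0, 1, 2) and
    # T = [[10, 5, 2], [3, 8, 9]], the statistic is
    # sum_ij u_i * v_j * T_ij = 1*(0*3 + 1*8 + 2*9) = 26.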

    @cache_readonly
    def marginal_probabilities(self):
        """
        Estimate marginal probability distributions for the rows and columns.

        Returns
        -------
        row : ndarray
            Marginal row probabilities
        col : ndarray
            Marginal column probabilities
        """

        n = self.table.sum()
        row = self.table.sum(1) / n
        col = self.table.sum(0) / n

        if isinstance(self.table_orig, pd.DataFrame):
            row = pd.Series(row, self.table_orig.index)
            col = pd.Series(col, self.table_orig.columns)

        return row, col

    @cache_readonly
    def independence_probabilities(self):
        """
        Returns fitted joint probabilities under independence.

        The returned table is outer(row, column), where row and
        column are the estimated marginal distributions
        of the rows and columns.
        """

        row, col = self.marginal_probabilities
        itab = np.outer(row, col)

        if isinstance(self.table_orig, pd.DataFrame):
            itab = pd.DataFrame(itab, self.table_orig.index,
                                self.table_orig.columns)

        return itab

    @cache_readonly
    def fittedvalues(self):
        """
        Returns fitted cell counts under independence.

        The returned cell counts are estimates under a model
        where the rows and columns of the table are independent.
        """

        probs = self.independence_probabilities
        fit = self.table.sum() * probs
        return fit

    @cache_readonly
    def resid_pearson(self):
        """
        Returns Pearson residuals.

        The Pearson residuals are calculated under a model where
        the rows and columns of the table are independent.
        """

        fit = self.fittedvalues
        resids = (self.table - fit) / np.sqrt(fit)
        return resids

    @cache_readonly
    def standardized_resids(self):
        """
        Returns standardized residuals under independence.
        """

        row, col = self.marginal_probabilities
        sresids = self.resid_pearson / np.sqrt(np.outer(1 - row, 1 - col))
        return sresids

    @cache_readonly
    def chi2_contribs(self):
        """
        Returns the contributions to the chi^2 statistic for independence.

        The returned table contains the contribution of each cell to the chi^2
        test statistic for the null hypothesis that the rows and columns
        are independent.
        """

        return self.resid_pearson**2

    @cache_readonly
    def local_log_oddsratios(self):
        """
        Returns local log odds ratios.

        The local log odds ratios are the log odds ratios
        calculated for contiguous 2x2 sub-tables.
        """

        ta = self.table.copy()
        a = ta[0:-1, 0:-1]
        b = ta[0:-1, 1:]
        c = ta[1:, 0:-1]
        d = ta[1:, 1:]
        tab = np.log(a) + np.log(d) - np.log(b) - np.log(c)
        rslt = np.empty(self.table.shape, np.float64)
        rslt *= np.nan
        rslt[0:-1, 0:-1] = tab

        if isinstance(self.table_orig, pd.DataFrame):
            rslt = pd.DataFrame(rslt, index=self.table_orig.index,
                                columns=self.table_orig.columns)

        return rslt

    @cache_readonly
    def local_oddsratios(self):
        """
        Returns local odds ratios.

        See documentation for local_log_oddsratios.
        """

        return np.exp(self.local_log_oddsratios)

    @cache_readonly
    def cumulative_log_oddsratios(self):
        """
        Returns cumulative log odds ratios.

        The cumulative log odds ratios for a contingency table
        with ordered rows and columns are calculated by collapsing
        all cells to the left/right and above/below a given point,
        to obtain a 2x2 table from which a log odds ratio can be
        calculated.
        """

        ta = self.table.cumsum(0).cumsum(1)

        a = ta[0:-1, 0:-1]
        b = ta[0:-1, -1:] - a
        c = ta[-1:, 0:-1] - a
        d = ta[-1, -1] - (a + b + c)

        tab = np.log(a) + np.log(d) - np.log(b) - np.log(c)
        rslt = np.empty(self.table.shape, np.float64)
        rslt *= np.nan
        rslt[0:-1, 0:-1] = tab

        if isinstance(self.table_orig, pd.DataFrame):
            rslt = pd.DataFrame(rslt, index=self.table_orig.index,
                                columns=self.table_orig.columns)

        return rslt

    @cache_readonly
    def cumulative_oddsratios(self):
        """
        Returns the cumulative odds ratios for a contingency table.

        See documentation for cumulative_log_oddsratios.
        """

        return np.exp(self.cumulative_log_oddsratios)
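

# A minimal usage sketch for Table (the function name and counts below are
# illustrative, not library code):
def _example_table():
    counts = np.asarray([[35, 21, 8],
                         [18, 27, 13]])
    tab = Table(counts)
    rslt = tab.test_nominal_association()  # chi^2 test of independence
    # tab.fittedvalues gives expected counts under independence, and
    # tab.chi2_contribs shows each cell's contribution to the statistic.
    return rslt.statistic, rslt.df, rslt.pvalue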


class SquareTable(Table):
    """
    Methods for analyzing a square contingency table.

    Parameters
    ----------
    table : array_like
        A square contingency table, or DataFrame that is converted
        to a square form.
    shift_zeros : bool
        If True and any cell count is zero, the zero cells are
        replaced by 0.5.

    Notes
    -----
    These methods should only be used when the rows and columns of the
    table have the same categories.  If `table` is provided as a
    Pandas DataFrame, the row and column indices will be extended to
    create a square table, inserting zeros where a row or column is
    missing.  Otherwise the table should be provided in a square form,
    with the (implicit) row and column categories appearing in the
    same order.
    """

    def __init__(self, table, shift_zeros=True):
        table = _make_df_square(table)  # Non-pandas passes through
        k1, k2 = table.shape
        if k1 != k2:
            raise ValueError('table must be square')

        super(SquareTable, self).__init__(table, shift_zeros)

    def symmetry(self, method="bowker"):
        """
        Test for symmetry of a joint distribution.

        This procedure tests the null hypothesis that the joint
        distribution is symmetric around the main diagonal, that is

        .. math::

            p_{i, j} = p_{j, i} \\quad \\text{for all} \\quad i, j

        Returns
        -------
        A bunch with attributes:

        statistic : float
            chisquare test statistic
        pvalue : float
            p-value of the test statistic based on chisquare distribution
        df : int
            degrees of freedom of the chisquare distribution

        Notes
        -----
        The implementation is based on the SAS documentation.  R includes
        it in `mcnemar.test` if the table is not 2 by 2.  However a more
        direct generalization of the McNemar test to larger tables is
        provided by the homogeneity test (SquareTable.homogeneity).

        The p-value is based on the chi-square distribution, which requires
        that the sample size not be very small in order to be a good
        approximation of the true distribution.  For 2x2 contingency tables
        the exact distribution can be obtained with `mcnemar`.

        See Also
        --------
        mcnemar
        homogeneity
        """

        if method.lower() != "bowker":
            raise ValueError("method for symmetry testing must be 'bowker'")

        k = self.table.shape[0]
        upp_idx = np.triu_indices(k, 1)

        tril = self.table.T[upp_idx]  # lower triangle in column order
        triu = self.table[upp_idx]    # upper triangle in row order

        statistic = ((tril - triu)**2 / (tril + triu + 1e-20)).sum()
        df = k * (k - 1) / 2.
        pvalue = stats.chi2.sf(statistic, df)

        b = _Bunch()
        b.statistic = statistic
        b.pvalue = pvalue
        b.df = df

        return b

    def homogeneity(self, method="stuart_maxwell"):
        """
        Compare row and column marginal distributions.

        Parameters
        ----------
        method : str
            Either 'stuart_maxwell' or 'bhapkar', leading to two different
            estimates of the covariance matrix for the estimated
            difference between the row margins and the column margins.

        Returns
        -------
        A bunch with attributes:

        statistic : float
            The chi^2 test statistic
        pvalue : float
            The p-value of the test statistic
        df : int
            The degrees of freedom of the reference distribution

        Notes
        -----
        For a 2x2 table this is equivalent to McNemar's test.  More
        generally the procedure tests the null hypothesis that the
        marginal distribution of the row factor is equal to the
        marginal distribution of the column factor.  For this to be
        meaningful, the two factors must have the same sample space
        (i.e. the same categories).
        """

        if self.table.shape[0] < 1:
            raise ValueError('table is empty')
        elif self.table.shape[0] == 1:
            b = _Bunch()
            b.statistic = 0
            b.pvalue = 1
            b.df = 0
            return b

        method = method.lower()
        if method not in ["bhapkar", "stuart_maxwell"]:
            raise ValueError("method '%s' for homogeneity not known" % method)

        n_obs = self.table.sum()
        pr = self.table.astype(np.float64) / n_obs

        # Compute margins, eliminate last row/column so there is no
        # degeneracy
        row = pr.sum(1)[0:-1]
        col = pr.sum(0)[0:-1]
        pr = pr[0:-1, 0:-1]

        # The estimated difference between row and column margins.
        d = col - row

        # The degrees of freedom of the chi^2 reference distribution.
        df = pr.shape[0]

        if method == "bhapkar":
            vmat = -(pr + pr.T) - np.outer(d, d)
            dv = col + row - 2*np.diag(pr) - d**2
            np.fill_diagonal(vmat, dv)
        elif method == "stuart_maxwell":
            vmat = -(pr + pr.T)
            dv = row + col - 2*np.diag(pr)
            np.fill_diagonal(vmat, dv)

        try:
            statistic = n_obs * np.dot(d, np.linalg.solve(vmat, d))
        except np.linalg.LinAlgError:
            warnings.warn("Unable to invert covariance matrix",
                          sm_exceptions.SingularMatrixWarning)
            b = _Bunch()
            b.statistic = np.nan
            b.pvalue = np.nan
            b.df = df
            return b

        pvalue = 1 - stats.chi2.cdf(statistic, df)

        b = _Bunch()
        b.statistic = statistic
        b.pvalue = pvalue
        b.df = df

        return b

    def summary(self, alpha=0.05, float_format="%.3f"):
        """
        Produce a summary of the analysis.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the interval.
        float_format : str
            Used to format numeric values in the table.
        """

        fmt = float_format

        headers = ["Statistic", "P-value", "DF"]
        stubs = ["Symmetry", "Homogeneity"]
        sy = self.symmetry()
        hm = self.homogeneity()
        data = [[fmt % sy.statistic, fmt % sy.pvalue, '%d' % sy.df],
                [fmt % hm.statistic, fmt % hm.pvalue, '%d' % hm.df]]
        tab = iolib.SimpleTable(data, headers, stubs, data_aligns="r",
                                table_dec_above='')

        return tab
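

# A minimal usage sketch for SquareTable (illustrative counts): symmetry()
# is Bowker's test and homogeneity() is the Stuart-Maxwell test.
def _example_square_table():
    counts = np.asarray([[20, 5, 3],
                         [6, 30, 2],
                         [4, 7, 25]])
    st = SquareTable(counts)
    sym = st.symmetry()      # Bowker's test of symmetry
    hom = st.homogeneity()   # Stuart-Maxwell marginal homogeneity test
    return sym.pvalue, hom.pvalue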


class Table2x2(SquareTable):
    """
    Analyses that can be performed on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like
        A 2x2 contingency table
    shift_zeros : bool
        If True and any cell is zero, the zero cells are replaced
        by 0.5.

    Notes
    -----
    The inference procedures used here are all based on a sampling
    model in which the units are independent and identically
    distributed, with each unit being classified with respect to two
    categorical variables.

    Note that for the risk ratio, the analysis is not symmetric with
    respect to the rows and columns of the contingency table.  The two
    rows define population subgroups, column 0 is the number of
    'events', and column 1 is the number of 'non-events'.
    """

    def __init__(self, table, shift_zeros=True):

        if type(table) is list:
            table = np.asarray(table)

        if (table.ndim != 2) or (table.shape[0] != 2) or (table.shape[1] != 2):
            raise ValueError("Table2x2 takes a 2x2 table as input.")

        super(Table2x2, self).__init__(table, shift_zeros)

    @classmethod
    def from_data(cls, data, shift_zeros=True):
        """
        Construct a Table2x2 object from data.

        Parameters
        ----------
        data : array_like
            The raw data, the first column defines the rows and the
            second column defines the columns.
        shift_zeros : bool
            If True, and if there are any zeros in the contingency
            table, the zero cells are replaced by 0.5.
        """

        if isinstance(data, pd.DataFrame):
            table = pd.crosstab(data.iloc[:, 0], data.iloc[:, 1])
        else:
            table = pd.crosstab(data[:, 0], data[:, 1])
        return cls(table, shift_zeros)

    @cache_readonly
    def log_oddsratio(self):
        """
        Returns the log odds ratio for a 2x2 table.
        """

        f = self.table.flatten()
        return np.dot(np.log(f), np.r_[1, -1, -1, 1])

    @cache_readonly
    def oddsratio(self):
        """
        Returns the odds ratio for a 2x2 table.
        """

        return (self.table[0, 0] * self.table[1, 1] /
                (self.table[0, 1] * self.table[1, 0]))

    @cache_readonly
    def log_oddsratio_se(self):
        """
        Returns the standard error for the log odds ratio.
        """

        return np.sqrt(np.sum(1 / self.table))

    def oddsratio_pvalue(self, null=1):
        """
        P-value for a hypothesis test about the odds ratio.

        Parameters
        ----------
        null : float
            The null value of the odds ratio.
        """

        return self.log_oddsratio_pvalue(np.log(null))

    def log_oddsratio_pvalue(self, null=0):
        """
        P-value for a hypothesis test about the log odds ratio.

        Parameters
        ----------
        null : float
            The null value of the log odds ratio.
        """

        zscore = (self.log_oddsratio - null) / self.log_oddsratio_se
        pvalue = 2 * stats.norm.cdf(-np.abs(zscore))
        return pvalue

    def log_oddsratio_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the log odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """

        f = -stats.norm.ppf(alpha / 2)
        lor = self.log_oddsratio
        se = self.log_oddsratio_se
        lcb = lor - f * se
        ucb = lor + f * se
        return lcb, ucb

    def oddsratio_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """
        lcb, ucb = self.log_oddsratio_confint(alpha, method=method)
        return np.exp(lcb), np.exp(ucb)

    @cache_readonly
    def riskratio(self):
        """
        Returns the risk ratio for a 2x2 table.

        The risk ratio is calculated with respect to the rows.
        """

        p = self.table[:, 0] / self.table.sum(1)
        return p[0] / p[1]

    @cache_readonly
    def log_riskratio(self):
        """
        Returns the log of the risk ratio.
        """

        return np.log(self.riskratio)

    @cache_readonly
    def log_riskratio_se(self):
        """
        Returns the standard error of the log of the risk ratio.
        """

        n = self.table.sum(1)
        p = self.table[:, 0] / n
        va = np.sum((1 - p) / (n*p))
        return np.sqrt(va)

    def riskratio_pvalue(self, null=1):
        """
        p-value for a hypothesis test about the risk ratio.

        Parameters
        ----------
        null : float
            The null value of the risk ratio.
        """

        return self.log_riskratio_pvalue(np.log(null))

    def log_riskratio_pvalue(self, null=0):
        """
        p-value for a hypothesis test about the log risk ratio.

        Parameters
        ----------
        null : float
            The null value of the log risk ratio.
        """

        zscore = (self.log_riskratio - null) / self.log_riskratio_se
        pvalue = 2 * stats.norm.cdf(-np.abs(zscore))
        return pvalue

    def log_riskratio_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the log risk ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """
        f = -stats.norm.ppf(alpha / 2)
        lrr = self.log_riskratio
        se = self.log_riskratio_se
        lcb = lrr - f * se
        ucb = lrr + f * se
        return lcb, ucb

    def riskratio_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the risk ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """
        lcb, ucb = self.log_riskratio_confint(alpha, method=method)
        return np.exp(lcb), np.exp(ucb)

    def summary(self, alpha=0.05, float_format="%.3f", method="normal"):
        """
        Summarizes results for a 2x2 table analysis.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the confidence
            intervals.
        float_format : str
            Used to format the numeric values in the table.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """

        def fmt(x):
            if isinstance(x, str):
                return x
            return float_format % x

        headers = ["Estimate", "SE", "LCB", "UCB", "p-value"]
        stubs = ["Odds ratio", "Log odds ratio", "Risk ratio",
                 "Log risk ratio"]

        lcb1, ucb1 = self.oddsratio_confint(alpha, method)
        lcb2, ucb2 = self.log_oddsratio_confint(alpha, method)
        lcb3, ucb3 = self.riskratio_confint(alpha, method)
        lcb4, ucb4 = self.log_riskratio_confint(alpha, method)
        data = [[fmt(x) for x in [self.oddsratio, "", lcb1, ucb1,
                                  self.oddsratio_pvalue()]],
                [fmt(x) for x in [self.log_oddsratio, self.log_oddsratio_se,
                                  lcb2, ucb2, self.log_oddsratio_pvalue()]],
                [fmt(x) for x in [self.riskratio, "", lcb3, ucb3,
                                  self.riskratio_pvalue()]],
                [fmt(x) for x in [self.log_riskratio, self.log_riskratio_se,
                                  lcb4, ucb4, self.log_riskratio_pvalue()]]]
        tab = iolib.SimpleTable(data, headers, stubs, data_aligns="r",
                                table_dec_above='')
        return tab
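

# A minimal usage sketch for Table2x2 (illustrative counts; rows are the two
# exposure groups and column 0 counts 'events'):
def _example_table2x2():
    t22 = Table2x2(np.asarray([[25, 75],
                               [10, 90]]))
    or_ = t22.oddsratio                     # (25*90) / (75*10) = 3.0
    lcb, ucb = t22.oddsratio_confint(0.05)  # normal-approximation 95% CI
    rr = t22.riskratio                      # (25/100) / (10/100) = 2.5
    return or_, (lcb, ucb), rr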


class StratifiedTable(object):
    """
    Analyses for a collection of 2x2 contingency tables.

    Such a collection may arise by stratifying a single 2x2 table with
    respect to another factor.  This class implements the
    'Cochran-Mantel-Haenszel' and 'Breslow-Day' procedures for
    analyzing collections of 2x2 contingency tables.

    Parameters
    ----------
    tables : list or ndarray
        Either a list containing several 2x2 contingency tables, or
        a 2x2xk ndarray in which each slice along the third axis is a
        2x2 contingency table.
    shift_zeros : bool
        If True, for any stratum whose table contains a zero cell,
        add 0.5 to all four cells of that stratum's table.

    Notes
    -----
    These results are based on a sampling model in which the units are
    independent both within and between strata.
    """

    def __init__(self, tables, shift_zeros=False):

        if isinstance(tables, np.ndarray):
            sp = tables.shape
            if (len(sp) != 3) or (sp[0] != 2) or (sp[1] != 2):
                raise ValueError("If an ndarray, argument must be 2x2xn")
            table = tables
        else:
            if any([np.asarray(x).shape != (2, 2) for x in tables]):
                m = "If `tables` is a list, all of its elements should be 2x2"
                raise ValueError(m)

            # Create a data cube
            table = np.dstack(tables).astype(np.float64)

        if shift_zeros:
            zx = (table == 0).sum(0).sum(0)
            ix = np.flatnonzero(zx > 0)
            if len(ix) > 0:
                table = table.copy()
                table[:, :, ix] += 0.5

        self.table = table

        self._cache = {}

        # Quantities to precompute.  Table entries are [[a, b], [c,
        # d]], 'ad' is 'a * d', 'apb' is 'a + b', 'dma' is 'd - a',
        # etc.
        self._apb = table[0, 0, :] + table[0, 1, :]
        self._apc = table[0, 0, :] + table[1, 0, :]
        self._bpd = table[0, 1, :] + table[1, 1, :]
        self._cpd = table[1, 0, :] + table[1, 1, :]
        self._ad = table[0, 0, :] * table[1, 1, :]
        self._bc = table[0, 1, :] * table[1, 0, :]
        self._apd = table[0, 0, :] + table[1, 1, :]
        self._dma = table[1, 1, :] - table[0, 0, :]
        self._n = table.sum(0).sum(0)

    @classmethod
    def from_data(cls, var1, var2, strata, data):
        """
        Construct a StratifiedTable object from data.

        Parameters
        ----------
        var1 : int or string
            The column index or name of `data` specifying the variable
            defining the rows of the contingency table.  The variable
            must have only two distinct values.
        var2 : int or string
            The column index or name of `data` specifying the variable
            defining the columns of the contingency table.  The variable
            must have only two distinct values.
        strata : int or string
            The column index or name of `data` specifying the variable
            defining the strata.
        data : array_like
            The raw data.  A cross-table for analysis is constructed
            from the first two columns.

        Returns
        -------
        A StratifiedTable instance.
        """

        if not isinstance(data, pd.DataFrame):
            data1 = pd.DataFrame(index=np.arange(data.shape[0]),
                                 columns=[var1, var2, strata])
            data1.loc[:, var1] = data[:, var1]
            data1.loc[:, var2] = data[:, var2]
            data1.loc[:, strata] = data[:, strata]
        else:
            data1 = data[[var1, var2, strata]]

        gb = data1.groupby(strata).groups
        tables = []
        for g in gb:
            ii = gb[g]
            tab = pd.crosstab(data1.loc[ii, var1], data1.loc[ii, var2])
            if (tab.shape != np.r_[2, 2]).any():
                msg = "Invalid table dimensions"
                raise ValueError(msg)
            tables.append(np.asarray(tab))

        return cls(tables)

    def test_null_odds(self, correction=False):
        """
        Test that all tables have odds ratio equal to 1.

        This is the 'Mantel-Haenszel' test.

        Parameters
        ----------
        correction : bool
            If True, use the continuity correction when calculating the
            test statistic.

        Returns
        -------
        A bunch containing the chi^2 test statistic and p-value.
        """

        statistic = np.sum(self.table[0, 0, :] -
                           self._apb * self._apc / self._n)
        statistic = np.abs(statistic)
        if correction:
            statistic -= 0.5
        statistic = statistic**2
        denom = self._apb * self._apc * self._bpd * self._cpd
        denom /= (self._n**2 * (self._n - 1))
        denom = np.sum(denom)
        statistic /= denom

        # df is always 1
        pvalue = 1 - stats.chi2.cdf(statistic, 1)

        b = _Bunch()
        b.statistic = statistic
        b.pvalue = pvalue

        return b
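
    # Note on the computation above: this is the Cochran-Mantel-Haenszel
    # statistic [sum_k (a_k - E(a_k))]^2 / sum_k Var(a_k), where within
    # stratum k, E(a_k) = (a+b)(a+c)/n and
    # Var(a_k) = (a+b)(a+c)(b+d)(c+d) / (n^2 * (n - 1)).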

    @cache_readonly
    def oddsratio_pooled(self):
        """
        The pooled odds ratio.

        The value is an estimate of a common odds ratio across all of the
        stratified tables.
        """

        odds_ratio = np.sum(self._ad / self._n) / np.sum(self._bc / self._n)
        return odds_ratio

    @cache_readonly
    def logodds_pooled(self):
        """
        Returns the logarithm of the pooled odds ratio.

        See oddsratio_pooled for more information.
        """

        return np.log(self.oddsratio_pooled)

    @cache_readonly
    def riskratio_pooled(self):
        """
        Estimate of the pooled risk ratio.
        """

        acd = self.table[0, 0, :] * self._cpd
        cab = self.table[1, 0, :] * self._apb

        rr = np.sum(acd / self._n) / np.sum(cab / self._n)
        return rr

    @cache_readonly
    def risk_pooled(self):
        # Deprecated due to name being misleading
        msg = "'risk_pooled' is deprecated, use 'riskratio_pooled' instead"
        warnings.warn(msg, DeprecationWarning)
        return self.riskratio_pooled

    @cache_readonly
    def logodds_pooled_se(self):
        """
        Estimated standard error of the pooled log odds ratio.

        References
        ----------
        J. Robins, N. Breslow, S. Greenland.  "Estimators of the
        Mantel-Haenszel Variance Consistent in Both Sparse Data and
        Large-Strata Limiting Models."  Biometrics 42, no. 2 (1986): 311-23.
        """

        adns = np.sum(self._ad / self._n)
        bcns = np.sum(self._bc / self._n)
        lor_va = np.sum(self._apd * self._ad / self._n**2) / adns**2
        mid = self._apd * self._bc / self._n**2
        mid += (1 - self._apd / self._n) * self._ad / self._n
        mid = np.sum(mid)
        mid /= (adns * bcns)
        lor_va += mid
        lor_va += np.sum((1 - self._apd / self._n) *
                         self._bc / self._n) / bcns**2
        lor_va /= 2
        lor_se = np.sqrt(lor_va)
        return lor_se

    def logodds_pooled_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the pooled log odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.

        Returns
        -------
        lcb : float
            The lower confidence limit.
        ucb : float
            The upper confidence limit.
        """

        lor = np.log(self.oddsratio_pooled)
        lor_se = self.logodds_pooled_se

        f = -stats.norm.ppf(alpha / 2)

        lcb = lor - f * lor_se
        ucb = lor + f * lor_se

        return lcb, ucb

    def oddsratio_pooled_confint(self, alpha=0.05, method="normal"):
        """
        A confidence interval for the pooled odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.

        Returns
        -------
        lcb : float
            The lower confidence limit.
        ucb : float
            The upper confidence limit.
        """

        lcb, ucb = self.logodds_pooled_confint(alpha, method=method)
        lcb = np.exp(lcb)
        ucb = np.exp(ucb)
        return lcb, ucb

    def test_equal_odds(self, adjust=False):
        """
        Test that all odds ratios are identical.

        This is the 'Breslow-Day' testing procedure.

        Parameters
        ----------
        adjust : bool
            Use the 'Tarone' adjustment to achieve the chi^2
            asymptotic distribution.

        Returns
        -------
        A bunch containing the following attributes:

        statistic : float
            The chi^2 test statistic.
        pvalue : float
            The p-value for the test.
        """

        table = self.table

        r = self.oddsratio_pooled
        a = 1 - r
        b = r * (self._apb + self._apc) + self._dma
        c = -r * self._apb * self._apc

        # Expected value of first cell
        dr = np.sqrt(b**2 - 4*a*c)
        e11 = (-b + dr) / (2*a)

        # Variance of the first cell
        v11 = (1 / e11 + 1 / (self._apc - e11) + 1 / (self._apb - e11) +
               1 / (self._dma + e11))
        v11 = 1 / v11

        statistic = np.sum((table[0, 0, :] - e11)**2 / v11)

        if adjust:
            adj = table[0, 0, :].sum() - e11.sum()
            adj = adj**2
            adj /= np.sum(v11)
            statistic -= adj

        pvalue = 1 - stats.chi2.cdf(statistic, table.shape[2] - 1)

        b = _Bunch()
        b.statistic = statistic
        b.pvalue = pvalue

        return b
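
    # Note on e11 above: within each stratum it is the positive root of
    # (1 - r)*e^2 + (r*(apb + apc) + dma)*e - r*apb*apc = 0, i.e. the
    # expected first-cell count whose implied 2x2 table has odds ratio
    # exactly r while preserving the observed margins.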

    def summary(self, alpha=0.05, float_format="%.3f", method="normal"):
        """
        A summary of all the main results.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence intervals.
        float_format : str
            Used for formatting numeric values in the summary.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        """

        def fmt(x):
            if isinstance(x, str):
                return x
            return float_format % x

        co_lcb, co_ucb = self.oddsratio_pooled_confint(
            alpha=alpha, method=method)
        clo_lcb, clo_ucb = self.logodds_pooled_confint(
            alpha=alpha, method=method)
        headers = ["Estimate", "LCB", "UCB"]
        stubs = ["Pooled odds", "Pooled log odds", "Pooled risk ratio", ""]
        data = [[fmt(x) for x in [self.oddsratio_pooled, co_lcb, co_ucb]],
                [fmt(x) for x in [self.logodds_pooled, clo_lcb, clo_ucb]],
                [fmt(x) for x in [self.riskratio_pooled, "", ""]],
                ['', '', '']]
        tab1 = iolib.SimpleTable(data, headers, stubs, data_aligns="r",
                                 table_dec_above='')

        headers = ["Statistic", "P-value", ""]
        stubs = ["Test of OR=1", "Test constant OR"]
        rslt1 = self.test_null_odds()
        rslt2 = self.test_equal_odds()
        data = [[fmt(x) for x in [rslt1.statistic, rslt1.pvalue, ""]],
                [fmt(x) for x in [rslt2.statistic, rslt2.pvalue, ""]]]
        tab2 = iolib.SimpleTable(data, headers, stubs, data_aligns="r")
        tab1.extend(tab2)

        headers = ["", "", ""]
        stubs = ["Number of tables", "Min n", "Max n", "Avg n", "Total n"]
        ss = self.table.sum(0).sum(0)
        data = [["%d" % self.table.shape[2], '', ''],
                ["%d" % min(ss), '', ''],
                ["%d" % max(ss), '', ''],
                ["%.0f" % np.mean(ss), '', ''],
                ["%d" % sum(ss), '', '']]
        tab3 = iolib.SimpleTable(data, headers, stubs, data_aligns="r")
        tab1.extend(tab3)

        return tab1
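

# A minimal usage sketch for StratifiedTable (illustrative strata):
def _example_stratified_table():
    strata = [np.asarray([[20, 10], [15, 25]]),
              np.asarray([[12, 18], [8, 22]])]
    st = StratifiedTable(strata)
    pooled_or = st.oddsratio_pooled    # Mantel-Haenszel common OR estimate
    null_test = st.test_null_odds()    # CMH test that OR = 1 in all strata
    homog_test = st.test_equal_odds()  # Breslow-Day test of a common OR
    return pooled_or, null_test.pvalue, homog_test.pvalue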


def mcnemar(table, exact=True, correction=True):
    """
    McNemar test of homogeneity.

    Parameters
    ----------
    table : array_like
        A square contingency table.
    exact : bool
        If exact is true, then the binomial distribution will be used.
        If exact is false, then the chisquare distribution will be
        used, which is the approximation to the distribution of the
        test statistic for large sample sizes.
    correction : bool
        If true, then a continuity correction is used for the chisquare
        distribution (if exact is false).

    Returns
    -------
    A bunch with attributes:

    statistic : float or int, array
        The test statistic is the chisquare statistic if exact is
        false.  If the exact binomial distribution is used, then this
        contains min(n1, n2), where n1, n2 are the two off-diagonal
        (discordant) counts.
    pvalue : float or array
        p-value of the null hypothesis of equal marginal distributions.

    Notes
    -----
    This is a special case of Cochran's Q test, and of the homogeneity
    test.  The results when the chisquare distribution is used are
    identical, except for continuity correction.
    """

    table = _make_df_square(table)
    table = np.asarray(table, dtype=np.float64)
    n1, n2 = table[0, 1], table[1, 0]

    if exact:
        statistic = np.minimum(n1, n2)
        # binom is symmetric with p=0.5
        # SciPy 1.7+ requires int arguments
        int_sum = int(n1 + n2)
        if int_sum != (n1 + n2):
            warnings.warn("exact can only be used with tables containing "
                          "integers. This warning will become a ValueError "
                          "after 0.12.", FutureWarning)
        pvalue = stats.binom.cdf(statistic, int_sum, 0.5) * 2
        pvalue = np.minimum(pvalue, 1)  # limit to 1 if n1 == n2
    else:
        corr = int(correction)  # convert bool to 0 or 1
        statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
        df = 1
        pvalue = stats.chi2.sf(statistic, df)

    b = _Bunch()
    b.statistic = statistic
    b.pvalue = pvalue
    return b
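

# A minimal usage sketch for mcnemar (illustrative paired data; only the
# discordant counts, here 15 and 5, drive the test):
def _example_mcnemar():
    paired = np.asarray([[40, 15],
                         [5, 40]])
    exact = mcnemar(paired, exact=True)    # binomial on min(15, 5)
    approx = mcnemar(paired, exact=False)  # chi^2 with continuity correction
    return exact.pvalue, approx.pvalue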


def cochrans_q(x, return_object=True):
    """
    Cochran's Q test for identical binomial proportions.

    Parameters
    ----------
    x : array_like, 2d (N, k)
        data with N cases and k variables
    return_object : bool
        Return values as bunch instead of as individual values.

    Returns
    -------
    Returns a bunch containing the following attributes, or the
    individual values according to the value of `return_object`.

    statistic : float
        test statistic
    pvalue : float
        pvalue from the chisquare distribution

    Notes
    -----
    Cochran's Q is a k-sample extension of the McNemar test.  If there
    are only two groups, then Cochran's Q test and the McNemar test
    are equivalent.

    The procedure tests that the probability of success is the same
    for every group.  The alternative hypothesis is that at least two
    groups have a different probability of success.

    In Wikipedia terminology, rows are blocks and columns are
    treatments.  The number of rows N should be large for the
    chisquare distribution to be a good approximation.

    The null hypothesis of the test is that all treatments have the
    same effect.

    References
    ----------
    https://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS
    """

    x = np.asarray(x, dtype=np.float64)
    gruni = np.unique(x)
    N, k = x.shape
    count_row_success = (x == gruni[-1]).sum(1, float)
    count_col_success = (x == gruni[-1]).sum(0, float)
    count_row_ss = count_row_success.sum()
    count_col_ss = count_col_success.sum()
    assert count_row_ss == count_col_ss  # just a calculation check

    # From the SAS manual
    q_stat = ((k-1) * (k * np.sum(count_col_success**2) - count_col_ss**2)
              / (k * count_row_ss - np.sum(count_row_success**2)))

    # Note: the denominator looks just like k times the variance of
    # the columns

    # Wikipedia uses a different, but equivalent expression
    # q_stat = (k-1) * (k * np.sum(count_row_success**2) - count_row_ss**2)
    #          / (k * count_col_ss - np.sum(count_col_success**2))

    df = k - 1
    pvalue = stats.chi2.sf(q_stat, df)

    if return_object:
        b = _Bunch()
        b.statistic = q_stat
        b.df = df
        b.pvalue = pvalue
        return b

    return q_stat, pvalue, df
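

# A minimal usage sketch for cochrans_q (illustrative 0/1 responses for six
# subjects under three treatments; rows are blocks, columns are treatments):
def _example_cochrans_q():
    x = np.asarray([[1, 1, 0],
                    [1, 0, 0],
                    [1, 1, 1],
                    [0, 0, 0],
                    [1, 1, 0],
                    [1, 0, 0]])
    rslt = cochrans_q(x)
    return rslt.statistic, rslt.df, rslt.pvalue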