Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/contrast.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numpy as np
2from scipy.stats import f as fdist
3from scipy.stats import t as student_t
4from scipy import stats
5from statsmodels.tools.tools import clean0, fullrank
6from statsmodels.stats.multitest import multipletests
9#TODO: should this be public if it's just a container?
10class ContrastResults(object):
11 """
12 Class for results of tests of linear restrictions on coefficients in a model.
14 This class functions mainly as a container for `t_test`, `f_test` and
15 `wald_test` for the parameters of a model.
17 The attributes depend on the statistical test and are either based on the
18 normal, the t, the F or the chisquare distribution.
19 """
21 def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
22 df_num=None, alpha=0.05, **kwds):
24 self.effect = effect # Let it be None for F
25 if F is not None:
26 self.distribution = 'F'
27 self.fvalue = F
28 self.statistic = self.fvalue
29 self.df_denom = df_denom
30 self.df_num = df_num
31 self.dist = fdist
32 self.dist_args = (df_num, df_denom)
33 self.pvalue = fdist.sf(F, df_num, df_denom)
34 elif t is not None:
35 self.distribution = 't'
36 self.tvalue = t
37 self.statistic = t # generic alias
38 self.sd = sd
39 self.df_denom = df_denom
40 self.dist = student_t
41 self.dist_args = (df_denom,)
42 self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2
43 elif 'statistic' in kwds:
44 # TODO: currently targeted to normal distribution, and chi2
45 self.distribution = kwds['distribution']
46 self.statistic = kwds['statistic']
47 self.tvalue = value = kwds['statistic'] # keep alias
48 # TODO: for results instance we decided to use tvalues also for normal
49 self.sd = sd
50 self.dist = getattr(stats, self.distribution)
51 self.dist_args = kwds.get('dist_args', ())
52 if self.distribution == 'chi2':
53 self.pvalue = self.dist.sf(self.statistic, df_denom)
54 self.df_denom = df_denom
55 else:
56 "normal"
57 self.pvalue = np.full_like(value, np.nan)
58 not_nan = ~np.isnan(value)
59 self.pvalue[not_nan] = self.dist.sf(np.abs(value[not_nan])) * 2
61 # cleanup
62 # should we return python scalar?
63 self.pvalue = np.squeeze(self.pvalue)
66 def conf_int(self, alpha=0.05):
67 """
68 Returns the confidence interval of the value, `effect` of the constraint.
70 This is currently only available for t and z tests.
72 Parameters
73 ----------
74 alpha : float, optional
75 The significance level for the confidence interval.
76 ie., The default `alpha` = .05 returns a 95% confidence interval.
78 Returns
79 -------
80 ci : ndarray, (k_constraints, 2)
81 The array has the lower and the upper limit of the confidence
82 interval in the columns.
83 """
84 if self.effect is not None:
85 # confidence intervals
86 q = self.dist.ppf(1 - alpha / 2., *self.dist_args)
87 lower = self.effect - q * self.sd
88 upper = self.effect + q * self.sd
89 return np.column_stack((lower, upper))
90 else:
91 raise NotImplementedError('Confidence Interval not available')
93 def __str__(self):
94 return self.summary().__str__()
96 def __repr__(self):
97 return str(self.__class__) + '\n' + self.__str__()
99 def summary(self, xname=None, alpha=0.05, title=None):
100 """Summarize the Results of the hypothesis test
102 Parameters
103 ----------
104 xname : list[str], optional
105 Default is `c_##` for ## in the number of regressors
106 alpha : float
107 significance level for the confidence intervals. Default is
108 alpha = 0.05 which implies a confidence level of 95%.
109 title : str, optional
110 Title for the params table. If not None, then this replaces the
111 default title
113 Returns
114 -------
115 smry : str or Summary instance
116 This contains a parameter results table in the case of t or z test
117 in the same form as the parameter results table in the model
118 results summary.
119 For F or Wald test, the return is a string.
120 """
121 if self.effect is not None:
122 # TODO: should also add some extra information, e.g. robust cov ?
123 # TODO: can we infer names for constraints, xname in __init__ ?
124 if title is None:
125 title = 'Test for Constraints'
126 elif title == '':
127 # do not add any title,
128 # I think SimpleTable skips on None - check
129 title = None
130 # we have everything for a params table
131 use_t = (self.distribution == 't')
132 yname='constraints' # Not used in params_frame
133 if xname is None:
134 xname = ['c%d' % ii for ii in range(len(self.effect))]
135 from statsmodels.iolib.summary import summary_params
136 pvalues = np.atleast_1d(self.pvalue)
137 summ = summary_params((self, self.effect, self.sd, self.statistic,
138 pvalues, self.conf_int(alpha)),
139 yname=yname, xname=xname, use_t=use_t,
140 title=title, alpha=alpha)
141 return summ
142 elif hasattr(self, 'fvalue'):
143 # TODO: create something nicer for these casee
144 return ('<F test: F=%s, p=%s, df_denom=%.3g, df_num=%.3g>' %
145 (repr(self.fvalue), self.pvalue, self.df_denom,
146 self.df_num))
147 elif self.distribution == 'chi2':
148 return ('<Wald test (%s): statistic=%s, p-value=%s, df_denom=%.3g>' %
149 (self.distribution, self.statistic, self.pvalue,
150 self.df_denom))
151 else:
152 # generic
153 return ('<Wald test: statistic=%s, p-value=%s>' %
154 (self.statistic, self.pvalue))
157 def summary_frame(self, xname=None, alpha=0.05):
158 """Return the parameter table as a pandas DataFrame
160 This is only available for t and normal tests
161 """
162 if self.effect is not None:
163 # we have everything for a params table
164 use_t = (self.distribution == 't')
165 yname='constraints' # Not used in params_frame
166 if xname is None:
167 xname = ['c%d' % ii for ii in range(len(self.effect))]
168 from statsmodels.iolib.summary import summary_params_frame
169 summ = summary_params_frame((self, self.effect, self.sd,
170 self.statistic,self.pvalue,
171 self.conf_int(alpha)), yname=yname,
172 xname=xname, use_t=use_t,
173 alpha=alpha)
174 return summ
175 else:
176 # TODO: create something nicer
177 raise NotImplementedError('only available for t and z')
181class Contrast(object):
182 """
183 This class is used to construct contrast matrices in regression models.
185 They are specified by a (term, design) pair. The term, T, is a linear
186 combination of columns of the design matrix. The matrix attribute of
187 Contrast is a contrast matrix C so that
189 colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
191 where pinv(D) is the generalized inverse of D. Further, the matrix
193 Tnew = dot(C, D)
195 is full rank. The rank attribute is the rank of
197 dot(D, dot(pinv(D), T))
199 In a regression model, the contrast tests that E(dot(Tnew, Y)) = 0
200 for each column of Tnew.
202 Parameters
203 ----------
204 term : array_like
205 design : array_like
207 Attributes
208 ----------
209 contrast_matrix
211 Examples
212 --------
213 >>> import statsmodels.api as sm
214 >>> from statsmodels.stats.contrast import Contrast
215 >>> import numpy as np
216 >>> np.random.seed(54321)
217 >>> X = np.random.standard_normal((40,10))
218 # Get a contrast
219 >>> new_term = np.column_stack((X[:,0], X[:,2]))
220 >>> c = Contrast(new_term, X)
221 >>> test = [[1] + [0]*9, [0]*2 + [1] + [0]*7]
222 >>> np.allclose(c.contrast_matrix, test)
223 True
225 Get another contrast
227 >>> P = np.dot(X, np.linalg.pinv(X))
228 >>> resid = np.identity(40) - P
229 >>> noise = np.dot(resid,np.random.standard_normal((40,5)))
230 >>> new_term2 = np.column_stack((noise,X[:,2]))
231 >>> c2 = Contrast(new_term2, X)
232 >>> print(c2.contrast_matrix)
233 [ -1.26424750e-16 8.59467391e-17 1.56384718e-01 -2.60875560e-17
234 -7.77260726e-17 -8.41929574e-18 -7.36359622e-17 -1.39760860e-16
235 1.82976904e-16 -3.75277947e-18]
237 Get another contrast
239 >>> zero = np.zeros((40,))
240 >>> new_term3 = np.column_stack((zero,X[:,2]))
241 >>> c3 = Contrast(new_term3, X)
242 >>> test2 = [0]*2 + [1] + [0]*7
243 >>> np.allclose(c3.contrast_matrix, test2)
244 True
245 """
246 def _get_matrix(self):
247 """
248 Gets the contrast_matrix property
249 """
250 if not hasattr(self, "_contrast_matrix"):
251 self.compute_matrix()
252 return self._contrast_matrix
254 contrast_matrix = property(_get_matrix)
256 def __init__(self, term, design):
257 self.term = np.asarray(term)
258 self.design = np.asarray(design)
260 def compute_matrix(self):
261 """
262 Construct a contrast matrix C so that
264 colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
266 where pinv(D) is the generalized inverse of D=design.
267 """
269 T = self.term
270 if T.ndim == 1:
271 T = T[:,None]
273 self.T = clean0(T)
274 self.D = self.design
275 self._contrast_matrix = contrastfromcols(self.T, self.D)
276 try:
277 self.rank = self.matrix.shape[1]
278 except:
279 self.rank = 1
281#TODO: fix docstring after usage is settled
282def contrastfromcols(L, D, pseudo=None):
283 """
284 From an n x p design matrix D and a matrix L, tries
285 to determine a p x q contrast matrix C which
286 determines a contrast of full rank, i.e. the
287 n x q matrix
289 dot(transpose(C), pinv(D))
291 is full rank.
293 L must satisfy either L.shape[0] == n or L.shape[1] == p.
295 If L.shape[0] == n, then L is thought of as representing
296 columns in the column space of D.
298 If L.shape[1] == p, then L is thought of as what is known
299 as a contrast matrix. In this case, this function returns an estimable
300 contrast corresponding to the dot(D, L.T)
302 Note that this always produces a meaningful contrast, not always
303 with the intended properties because q is always non-zero unless
304 L is identically 0. That is, it produces a contrast that spans
305 the column space of L (after projection onto the column space of D).
307 Parameters
308 ----------
309 L : array_like
310 D : array_like
311 """
312 L = np.asarray(L)
313 D = np.asarray(D)
315 n, p = D.shape
317 if L.shape[0] != n and L.shape[1] != p:
318 raise ValueError("shape of L and D mismatched")
320 if pseudo is None:
321 pseudo = np.linalg.pinv(D) # D^+ \approx= ((dot(D.T,D))^(-1),D.T)
323 if L.shape[0] == n:
324 C = np.dot(pseudo, L).T
325 else:
326 C = L
327 C = np.dot(pseudo, np.dot(D, C.T)).T
329 Lp = np.dot(D, C.T)
331 if len(Lp.shape) == 1:
332 Lp.shape = (n, 1)
334 if np.linalg.matrix_rank(Lp) != Lp.shape[1]:
335 Lp = fullrank(Lp)
336 C = np.dot(pseudo, Lp).T
338 return np.squeeze(C)
341# TODO: this is currently a minimal version, stub
342class WaldTestResults(object):
343 # for F and chi2 tests of joint hypothesis, mainly for vectorized
345 def __init__(self, statistic, distribution, dist_args, table=None,
346 pvalues=None):
347 self.table = table
349 self.distribution = distribution
350 self.statistic = statistic
351 #self.sd = sd
352 self.dist_args = dist_args
354 # The following is because I do not know which we want
355 if table is not None:
356 self.statistic = table['statistic'].values
357 self.pvalues = table['pvalue'].values
358 self.df_constraints = table['df_constraint'].values
359 if self.distribution == 'F':
360 self.df_denom = table['df_denom'].values
362 else:
363 if self.distribution == 'chi2':
364 self.dist = stats.chi2
365 self.df_constraints = self.dist_args[0] # assumes tuple
366 # using dist_args[0] is a bit dangerous,
367 elif self.distribution == 'F':
368 self.dist = stats.f
369 self.df_constraints, self.df_denom = self.dist_args
371 else:
372 raise ValueError('only F and chi2 are possible distribution')
374 if pvalues is None:
375 self.pvalues = self.dist.sf(np.abs(statistic), *dist_args)
376 else:
377 self.pvalues = pvalues
379 @property
380 def col_names(self):
381 """column names for summary table
382 """
384 pr_test = "P>%s" % self.distribution
385 col_names = [self.distribution, pr_test, 'df constraint']
386 if self.distribution == 'F':
387 col_names.append('df denom')
388 return col_names
390 def summary_frame(self):
391 # needs to be a method for consistency
392 if hasattr(self, '_dframe'):
393 return self._dframe
394 # rename the column nambes, but do not copy data
395 renaming = dict(zip(self.table.columns, self.col_names))
396 self.dframe = self.table.rename(columns=renaming)
397 return self.dframe
400 def __str__(self):
401 return self.summary_frame().to_string()
404 def __repr__(self):
405 return str(self.__class__) + '\n' + self.__str__()
408# t_test for pairwise comparison and automatic contrast/restrictions
411def _get_pairs_labels(k_level, level_names):
412 """helper function for labels for pairwise comparisons
413 """
414 idx_pairs_all = np.triu_indices(k_level, 1)
415 labels = ['%s-%s' % (level_names[name[1]], level_names[name[0]])
416 for name in zip(*idx_pairs_all)]
417 return labels
419def _contrast_pairs(k_params, k_level, idx_start):
420 """create pairwise contrast for reference coding
422 currently not used,
423 using encoding contrast matrix is more general, but requires requires
424 factor information from patsy design_info.
427 Parameters
428 ----------
429 k_params : int
430 number of parameters
431 k_level : int
432 number of levels or categories (including reference case)
433 idx_start : int
434 Index of the first parameter of this factor. The restrictions on the
435 factor are inserted as a block in the full restriction matrix starting
436 at column with index `idx_start`.
438 Returns
439 -------
440 contrasts : ndarray
441 restriction matrix with k_params columns and number of rows equal to
442 the number of restrictions.
443 """
444 k_level_m1 = k_level - 1
445 idx_pairs = np.triu_indices(k_level_m1, 1)
447 k = len(idx_pairs[0])
448 c_pairs = np.zeros((k, k_level_m1))
449 c_pairs[np.arange(k), idx_pairs[0]] = -1
450 c_pairs[np.arange(k), idx_pairs[1]] = 1
451 c_reference = np.eye(k_level_m1)
452 c = np.concatenate((c_reference, c_pairs), axis=0)
453 k_all = c.shape[0]
455 contrasts = np.zeros((k_all, k_params))
456 contrasts[:, idx_start : idx_start + k_level_m1] = c
458 return contrasts
461def t_test_multi(result, contrasts, method='hs', alpha=0.05, ci_method=None,
462 contrast_names=None):
463 """perform t_test and add multiplicity correction to results dataframe
465 Parameters
466 ----------
467 result results instance
468 results of an estimated model
469 contrasts : ndarray
470 restriction matrix for t_test
471 method : str or list of strings
472 method for multiple testing p-value correction, default is'hs'.
473 alpha : float
474 significance level for multiple testing reject decision.
475 ci_method : None
476 not used yet, will be for multiplicity corrected confidence intervals
477 contrast_names : {list[str], None}
478 If contrast_names are provided, then they are used in the index of the
479 returned dataframe, otherwise some generic default names are created.
481 Returns
482 -------
483 res_df : pandas DataFrame
484 The dataframe contains the results of the t_test and additional columns
485 for multiplicity corrected p-values and boolean indicator for whether
486 the Null hypothesis is rejected.
487 """
488 tt = result.t_test(contrasts)
489 res_df = tt.summary_frame(xname=contrast_names)
491 if type(method) is not list:
492 method = [method]
493 for meth in method:
494 mt = multipletests(tt.pvalue, method=meth, alpha=alpha)
495 res_df['pvalue-%s' % meth] = mt[1]
496 res_df['reject-%s' % meth] = mt[0]
497 return res_df
500class MultiCompResult(object):
501 """class to hold return of t_test_pairwise
503 currently just a minimal class to hold attributes.
504 """
505 def __init__(self, **kwargs):
506 self.__dict__.update(kwargs)
509def _embed_constraints(contrasts, k_params, idx_start, index=None):
510 """helper function to expand constraints to a full restriction matrix
512 Parameters
513 ----------
514 contrasts : ndarray
515 restriction matrix for t_test
516 k_params : int
517 number of parameters
518 idx_start : int
519 Index of the first parameter of this factor. The restrictions on the
520 factor are inserted as a block in the full restriction matrix starting
521 at column with index `idx_start`.
522 index : slice or ndarray
523 Column index if constraints do not form a block in the full restriction
524 matrix, i.e. if parameters that are subject to restrictions are not
525 consecutive in the list of parameters.
526 If index is not None, then idx_start is ignored.
528 Returns
529 -------
530 contrasts : ndarray
531 restriction matrix with k_params columns and number of rows equal to
532 the number of restrictions.
533 """
535 k_c, k_p = contrasts.shape
536 c = np.zeros((k_c, k_params))
537 if index is None:
538 c[:, idx_start : idx_start + k_p] = contrasts
539 else:
540 c[:, index] = contrasts
541 return c
544def _constraints_factor(encoding_matrix, comparison='pairwise', k_params=None,
545 idx_start=None):
546 """helper function to create constraints based on encoding matrix
548 Parameters
549 ----------
550 encoding_matrix : ndarray
551 contrast matrix for the encoding of a factor as defined by patsy.
552 The number of rows should be equal to the number of levels or categories
553 of the factor, the number of columns should be equal to the number
554 of parameters for this factor.
555 comparison : str
556 Currently only 'pairwise' is implemented. The restriction matrix
557 can be used for testing the hypothesis that all pairwise differences
558 are zero.
559 k_params : int
560 number of parameters
561 idx_start : int
562 Index of the first parameter of this factor. The restrictions on the
563 factor are inserted as a block in the full restriction matrix starting
564 at column with index `idx_start`.
566 Returns
567 -------
568 contrast : ndarray
569 Contrast or restriction matrix that can be used in hypothesis test
570 of model results. The number of columns is k_params.
571 """
573 cm = encoding_matrix
574 k_level, k_p = cm.shape
576 import statsmodels.sandbox.stats.multicomp as mc
577 if comparison in ['pairwise', 'pw', 'pairs']:
578 c_all = -mc.contrast_allpairs(k_level)
579 else:
580 raise NotImplementedError('currentlyonly pairwise comparison')
582 contrasts = c_all.dot(cm)
583 if k_params is not None:
584 if idx_start is None:
585 raise ValueError("if k_params is not None, then idx_start is "
586 "required")
587 contrasts = _embed_constraints(contrasts, k_params, idx_start)
588 return contrasts
591def t_test_pairwise(result, term_name, method='hs', alpha=0.05,
592 factor_labels=None, ignore=False):
593 """
594 Perform pairwise t_test with multiple testing corrected p-values.
596 This uses the formula design_info encoding contrast matrix and should
597 work for all encodings of a main effect.
599 Parameters
600 ----------
601 result : result instance
602 The results of an estimated model with a categorical main effect.
603 term_name : str
604 name of the term for which pairwise comparisons are computed.
605 Term names for categorical effects are created by patsy and
606 correspond to the main part of the exog names.
607 method : {str, list[str]}
608 multiple testing p-value correction, default is 'hs',
609 see stats.multipletesting
610 alpha : float
611 significance level for multiple testing reject decision.
612 factor_labels : {list[str], None}
613 Labels for the factor levels used for pairwise labels. If not
614 provided, then the labels from the formula design_info are used.
615 ignore : bool
616 Turn off some of the exceptions raised by input checks.
618 Returns
619 -------
620 MultiCompResult
621 The results are stored as attributes, the main attributes are the
622 following two. Other attributes are added for debugging purposes
623 or as background information.
625 - result_frame : pandas DataFrame with t_test results and multiple
626 testing corrected p-values.
627 - contrasts : matrix of constraints of the null hypothesis in the
628 t_test.
630 Notes
631 -----
633 Status: experimental. Currently only checked for treatment coding with
634 and without specified reference level.
636 Currently there are no multiple testing corrected confidence intervals
637 available.
638 """
640 desinfo = result.model.data.design_info
641 term_idx = desinfo.term_names.index(term_name)
642 term = desinfo.terms[term_idx]
643 idx_start = desinfo.term_slices[term].start
644 if not ignore and len(term.factors) > 1:
645 raise ValueError('interaction effects not yet supported')
646 factor = term.factors[0]
647 cat = desinfo.factor_infos[factor].categories
648 if factor_labels is not None:
649 if len(factor_labels) == len(cat):
650 cat = factor_labels
651 else:
652 raise ValueError("factor_labels has the wrong length, should be %d" % len(cat))
655 k_level = len(cat)
656 cm = desinfo.term_codings[term][0].contrast_matrices[factor].matrix
658 k_params = len(result.params)
659 labels = _get_pairs_labels(k_level, cat)
661 import statsmodels.sandbox.stats.multicomp as mc
662 c_all_pairs = -mc.contrast_allpairs(k_level)
663 contrasts_sub = c_all_pairs.dot(cm)
664 contrasts = _embed_constraints(contrasts_sub, k_params, idx_start)
665 res_df = t_test_multi(result, contrasts, method=method, ci_method=None,
666 alpha=alpha, contrast_names=labels)
667 res = MultiCompResult(result_frame=res_df,
668 contrasts=contrasts,
669 term=term,
670 contrast_labels=labels,
671 term_encoding_matrix=cm)
672 return res