Coverage for contextualized/analysis/pvals.py: 98%
51 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-21 13:49 -0400
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-21 13:49 -0400
1"""
2Analysis tools for generating pvalues from bootstrap replicates.
4"""
6from typing import *
8import numpy as np
9import pandas as pd
11from contextualized.analysis.effects import (
12 get_homogeneous_context_effects,
13 get_homogeneous_predictor_effects,
14 get_heterogeneous_predictor_effects,
15)
16from contextualized.easy.wrappers import SKLearnWrapper
def get_possible_pvals(num_bootstraps: int) -> List[float]:
    """
    Get the range of possible p-values based on the number of bootstraps.

    The bounds are 1 / (num_bootstraps + 1) and
    num_bootstraps / (num_bootstraps + 1).

    Args:
        num_bootstraps (int): The number of bootstraps. Not validated here;
            callers are expected to pass a positive count.

    Returns:
        list: The minimum and maximum possible p-values, in that order.
    """
    denominator = num_bootstraps + 1
    min_pval = 1 / denominator
    max_pval = num_bootstraps / denominator
    return [min_pval, max_pval]
def _validate_args(n_bootstraps: int, verbose: bool = False) -> None:
    """
    Check that the test has a sufficient number of bootstraps.

    Args:
        n_bootstraps (int): The number of bootstraps.
        verbose (bool): Whether to print the number of bootstraps and the
            minimum possible p-value.

    Raises:
        ValueError: If the number of bootstraps is less than 2.
    """
    if n_bootstraps < 2:
        raise ValueError(
            "P-values are not well defined without multiple bootstrap samples."
        )
    if not verbose:
        # Nothing to report, so skip computing the attainable p-value range.
        return
    # Only the lower bound is reported; the upper bound is not needed here.
    min_pval, _ = get_possible_pvals(n_bootstraps)
    print(
        "########################################################################################\n"
        f"You are testing a model which contains {n_bootstraps} bootstraps.\n"
        f"The minimum possible p-value is {min_pval}.\n"
        "To allow for lower p-values, increase the model's n_bootstraps.\n"
        "########################################################################################"
    )
def calc_pval_bootstraps_one_sided(estimates, thresh=0, laplace_smoothing=1):
    """
    Calculate a one-sided p-value from bootstrapped estimates.

    The p-value is the smoothed fraction of estimates strictly below ``thresh``:
    ``(laplace_smoothing + count(estimates < thresh))
    / (estimates.shape[0] + laplace_smoothing)``.

    Parameters
    ----------
    estimates : np.ndarray or array-like
        Bootstrapped estimates of the test statistic. The length of the first
        axis is used as the number of bootstraps.
    thresh : float, optional
        Threshold the estimates are compared against (strict ``<``). Default 0.
    laplace_smoothing : int, optional
        Pseudo-count added to both the numerator and the denominator.
        Default 1.

    Returns
    -------
    float
        The smoothed proportion of estimates below ``thresh``.
    """
    # Accept plain sequences as well as arrays; ndarray input is used as-is.
    estimates = np.asarray(estimates)
    n_below = np.sum(estimates < thresh)
    n_bootstraps = estimates.shape[0]
    return (laplace_smoothing + n_below) / (n_bootstraps + laplace_smoothing)
def calc_pval_bootstraps_one_sided_mean(estimates, laplace_smoothing=1):
    """
    Calculate p-values from bootstrapped estimates.
    The p-value is calculated as the proportion of bootstrapped estimates that are:
        less than 0 if the mean of the estimates is positive,
        greater than 0 if the mean of the estimates is negative.

    Parameters
    ----------
    estimates : np.ndarray
        Bootstrapped estimates of the test statistic.
    laplace_smoothing : int, optional
        Pseudo-count forwarded to calc_pval_bootstraps_one_sided. Default 1.

    Returns
    -------
    float
        The value returned by calc_pval_bootstraps_one_sided for the
        sign-aligned estimates with a threshold of 0.

    Notes
    -----
    NOTE(review): when the mean of the estimates is exactly 0, np.sign yields 0,
    every aligned estimate becomes 0, and none is counted as below the threshold.
    The result is then the smallest attainable p-value even though no sign is
    favoured. Confirm whether this edge case should instead be conservative.
    """
    # Flip the estimates so their mean is non-negative; the one-sided test then
    # counts the bootstraps that disagree with the mean's sign.
    direction = np.sign(np.mean(estimates))
    aligned = estimates * direction
    return calc_pval_bootstraps_one_sided(aligned, 0, laplace_smoothing)
def calc_homogeneous_context_effects_pvals(
    model: SKLearnWrapper, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the effects of context directly on the outcome.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the minimum possible p-value.
        **kwargs: Forwarded unchanged to get_homogeneous_context_effects. The
            optional key "laplace_smoothing" (default 1) is also read here and
            passed to the p-value calculation.

    Returns:
        np.ndarray: P-values of shape (n_contexts, n_outcomes) testing whether the
            sign of the direct effect of context on outcomes is consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    # The smoothing constant is identical for every cell, so look it up once.
    laplace_smoothing = kwargs.get("laplace_smoothing", 1)
    _, effects = get_homogeneous_context_effects(model, C, **kwargs)
    # effects.shape: (n_contexts, n_bootstraps, n_context_vals, n_outcomes)
    # Last minus first context value: test whether the sign is consistent.
    diffs = effects[:, :, -1] - effects[:, :, 0]
    # diffs.shape: (n_contexts, n_bootstraps, n_outcomes)
    pvals = np.array(
        [
            [
                calc_pval_bootstraps_one_sided_mean(
                    diffs[i, :, j], laplace_smoothing=laplace_smoothing
                )
                for j in range(diffs.shape[2])  # n_outcomes
            ]
            for i in range(diffs.shape[0])  # n_contexts
        ]
    )
    return pvals
def calc_homogeneous_predictor_effects_pvals(
    model: SKLearnWrapper, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the context-invariant effects of predictors.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the minimum possible p-value.
        **kwargs: Forwarded unchanged to get_homogeneous_predictor_effects. The
            optional key "laplace_smoothing" (default 1) is also read here and
            passed to the p-value calculation.

    Returns:
        np.ndarray: P-values of shape (n_predictors, n_outcomes) testing whether the
            sign of the context-invariant predictor effects are consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    # The smoothing constant is identical for every cell, so look it up once.
    laplace_smoothing = kwargs.get("laplace_smoothing", 1)
    _, effects = get_homogeneous_predictor_effects(model, C, **kwargs)
    # effects.shape: (n_predictors, n_bootstraps, n_outcomes)
    pvals = np.array(
        [
            [
                calc_pval_bootstraps_one_sided_mean(
                    effects[i, :, j], laplace_smoothing=laplace_smoothing
                )
                for j in range(effects.shape[2])  # n_outcomes
            ]
            for i in range(effects.shape[0])  # n_predictors
        ]
    )
    return pvals
def calc_heterogeneous_predictor_effects_pvals(
    model: SKLearnWrapper, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the heterogeneous (context-dependent) effects of predictors.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the minimum possible p-value.
        **kwargs: Forwarded unchanged to get_heterogeneous_predictor_effects. The
            optional key "laplace_smoothing" (default 1) is also read here and
            passed to the p-value calculation.

    Returns:
        np.ndarray: P-values of shape (n_contexts, n_predictors, n_outcomes) testing whether the
            context-varying parameter range is consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    # The smoothing constant is identical for every cell, so look it up once.
    laplace_smoothing = kwargs.get("laplace_smoothing", 1)
    _, effects = get_heterogeneous_predictor_effects(model, C, **kwargs)
    # effects.shape is (n_contexts, n_predictors, n_bootstraps, n_context_vals, n_outcomes)
    # Last minus first context value: test whether the sign is consistent.
    diffs = effects[:, :, :, -1] - effects[:, :, :, 0]
    # diffs.shape is (n_contexts, n_predictors, n_bootstraps, n_outcomes)
    pvals = np.array(
        [
            [
                [
                    calc_pval_bootstraps_one_sided_mean(
                        diffs[i, j, :, k], laplace_smoothing=laplace_smoothing
                    )
                    for k in range(diffs.shape[3])  # n_outcomes
                ]
                for j in range(diffs.shape[1])  # n_predictors
            ]
            for i in range(diffs.shape[0])  # n_contexts
        ]
    )
    return pvals
def test_each_context(
    model_constructor: Type[SKLearnWrapper],
    C: pd.DataFrame,
    X: pd.DataFrame,
    Y: pd.DataFrame,
    verbose: bool = True,
    model_kwargs: Optional[Dict] = None,
    fit_kwargs: Optional[Dict] = None,
) -> pd.DataFrame:
    """
    Test heterogeneous predictor effects attributed to every individual context feature.
    Applies calc_heterogeneous_predictor_effects_pvals to a model learned for a single
    context feature in C, and does this sequentially for every context feature.

    Args:
        model_constructor (SKLearnWrapper): The constructor of the model to be tested,
            currently either ContextualizedRegressor or ContextualizedClassifier.
        C (pd.DataFrame): The context dataframe (n_samples, n_contexts).
        X (pd.DataFrame): The predictor dataframe (n_samples, n_predictors).
        Y (pd.DataFrame): The outcome, target, or label dataframe (n_samples, n_outcomes).
        verbose (bool): Whether to print the minimum possible p-value once, before fitting.
        model_kwargs (dict, optional): Keyword arguments for the model constructor.
            Defaults to {"encoder_type": "linear"}.
        fit_kwargs (dict, optional): Keyword arguments for the model's fit method.
            Must contain the key "n_bootstraps". Defaults to
            {"max_epochs": 3, "learning_rate": 1e-2, "n_bootstraps": 20}.

    Returns:
        pd.DataFrame: A DataFrame with columns "Context", "Predictor", "Target" and "Pvals",
            holding one p-value for each (context, predictor, outcome) combination and
            describing how much the predictor's effect on the outcome varies across the context.

    Raises:
        ValueError: If n_bootstraps in fit_kwargs is less than 2.
        KeyError: If fit_kwargs does not contain "n_bootstraps".
    """
    # Build the defaults per call instead of sharing mutable dicts across calls.
    if model_kwargs is None:
        model_kwargs = {"encoder_type": "linear"}
    if fit_kwargs is None:
        fit_kwargs = {"max_epochs": 3, "learning_rate": 1e-2, "n_bootstraps": 20}

    pvals_dict = {
        "Context": [],
        "Predictor": [],
        "Target": [],
        "Pvals": [],
    }
    # Validate (and optionally report) once up front, before any model is fit.
    _validate_args(fit_kwargs["n_bootstraps"], verbose=verbose)
    for context in C.columns:
        # Double brackets keep the single context feature as a 2-D column.
        context_col = C[[context]].values
        model = model_constructor(**model_kwargs)
        model.fit(context_col, X.values, Y.values, **fit_kwargs)
        # verbose=False: the p-value range was already reported above.
        pvals = calc_heterogeneous_predictor_effects_pvals(
            model, context_col, verbose=False
        )
        for i, predictor in enumerate(X.columns):
            for j, outcome in enumerate(Y.columns):
                pvals_dict["Context"].append(context)
                pvals_dict["Predictor"].append(predictor)
                pvals_dict["Target"].append(outcome)
                # The model saw a single context feature, hence index 0.
                pvals_dict["Pvals"].append(pvals[0, i, j])

    return pd.DataFrame.from_dict(pvals_dict)