Coverage for contextualized/analysis/pvals.py: 98%

51 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-21 13:49 -0400

1""" 

2Analysis tools for generating pvalues from bootstrap replicates. 

3 

4""" 

5 

6from typing import * 

7 

8import numpy as np 

9import pandas as pd 

10 

11from contextualized.analysis.effects import ( 

12 get_homogeneous_context_effects, 

13 get_homogeneous_predictor_effects, 

14 get_heterogeneous_predictor_effects, 

15) 

16from contextualized.easy.wrappers import SKLearnWrapper 

17 

18 

def get_possible_pvals(num_bootstraps: int) -> list:
    """
    Compute the smallest and largest p-values attainable with a bootstrap count.

    Args:
        num_bootstraps (int): The number of bootstraps.

    Returns:
        list: ``[min_pval, max_pval]`` where ``min_pval`` is
            ``1 / (num_bootstraps + 1)`` and ``max_pval`` is
            ``num_bootstraps / (num_bootstraps + 1)``.
    """
    # Both bounds share the same denominator: the bootstrap count plus one.
    denominator = num_bootstraps + 1
    bounds = [1 / denominator, num_bootstraps / denominator]
    return bounds

32 

33 

34def _validate_args(n_bootstraps: int, verbose: bool = False) -> None: 

35 """ 

36 Check that the test has a sufficient number of bootstraps. 

37 

38 Args: 

39 num_bootstraps (int): The number of bootstraps. 

40 

41 Raises: 

42 ValueError: If the number of bootstraps is less than 2. 

43 """ 

44 if n_bootstraps < 2: 

45 raise ValueError( 

46 f"P-values are not well defined without multiple bootstrap samples." 

47 ) 

48 min_pval, max_pval = get_possible_pvals(n_bootstraps) 

49 if verbose: 

50 print( 

51 "########################################################################################\n" 

52 f"You are testing a model which contains {n_bootstraps} bootstraps.\n" 

53 f"The minimum possible p-value is {min_pval}.\n" 

54 f"To allow for lower p-values, increase the model's n_bootstraps.\n" 

55 "########################################################################################" 

56 ) 

57 

58 

def calc_pval_bootstraps_one_sided(estimates, thresh=0, laplace_smoothing=1):
    """
    Calculate a one-sided p-value from bootstrapped estimates.

    The result is the smoothed fraction of estimates strictly below ``thresh``:
    ``(laplace_smoothing + count(estimates < thresh)) / (n + laplace_smoothing)``,
    where ``n`` is the length of the first axis of ``estimates``.

    Parameters
    ----------
    estimates : array-like
        Bootstrapped estimates of the test statistic. Anything accepted by
        ``np.asarray`` (ndarray, list, tuple, ...) with at least one dimension.
    thresh : float, optional
        Estimates strictly less than this value are counted. Defaults to 0.
    laplace_smoothing : int, optional
        Pseudo-count added to both the numerator and the denominator.
        Defaults to 1.

    Returns
    -------
    float
        The smoothed proportion of estimates below ``thresh``.
    """
    # Coerce so plain Python sequences work as well as ndarrays.
    estimates = np.asarray(estimates)
    n_below = np.sum(estimates < thresh)
    return (laplace_smoothing + n_below) / (estimates.shape[0] + laplace_smoothing)

74 

75 

def calc_pval_bootstraps_one_sided_mean(estimates, laplace_smoothing=1):
    """
    Calculate a p-value for whether bootstrapped estimates share the sign of their mean.

    The estimates are multiplied by the sign of their mean, and the one-sided
    p-value against a threshold of 0 is then computed. Equivalently, this is
    the smoothed proportion of estimates that are:
        less than 0 if the mean of the estimates is positive,
        greater than 0 if the mean of the estimates is negative.

    Parameters
    ----------
    estimates : np.ndarray
        Bootstrapped estimates of the test statistic.
    laplace_smoothing : int, optional
        Forwarded to calc_pval_bootstraps_one_sided. Defaults to 1.

    Notes
    -----
    If the mean is exactly zero, ``np.sign`` returns 0 and every (finite)
    flipped estimate becomes 0, so none of them is counted as below 0.
    """
    mean_sign = np.sign(np.mean(estimates))
    # Flip the estimates so that agreeing with the mean always means "positive".
    sign_aligned = estimates * mean_sign
    return calc_pval_bootstraps_one_sided(sign_aligned, 0, laplace_smoothing)

93 

94 

def calc_homogeneous_context_effects_pvals(
    model: SKLearnWrapper, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the effects of context directly on the outcome.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the range of possible p-values.
        **kwargs: Forwarded unchanged to get_homogeneous_context_effects. A
            "laplace_smoothing" entry, if present, is also used as the
            smoothing pseudo-count for each p-value (default 1).

    Returns:
        np.ndarray: P-values of shape (n_contexts, n_outcomes) testing whether the
            sign of the direct effect of context on outcomes is consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    _, effects = get_homogeneous_context_effects(model, C, **kwargs)
    # effects.shape: (n_contexts, n_bootstraps, n_context_vals, n_outcomes)
    # Difference between the last and first context values; its sign is tested.
    diffs = effects[:, :, -1] - effects[:, :, 0]
    smoothing = kwargs.get("laplace_smoothing", 1)
    rows = []
    for context_idx in range(diffs.shape[0]):  # n_contexts
        row = []
        for outcome_idx in range(diffs.shape[2]):  # n_outcomes
            bootstrap_diffs = diffs[context_idx, :, outcome_idx]
            row.append(
                calc_pval_bootstraps_one_sided_mean(
                    bootstrap_diffs, laplace_smoothing=smoothing
                )
            )
        rows.append(np.array(row))
    return np.array(rows)

132 

133 

def calc_homogeneous_predictor_effects_pvals(
    model: SKLearnWrapper, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the context-invariant effects of predictors.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the range of possible p-values.
        **kwargs: Forwarded unchanged to get_homogeneous_predictor_effects. A
            "laplace_smoothing" entry, if present, is also used as the
            smoothing pseudo-count for each p-value (default 1).

    Returns:
        np.ndarray: P-values of shape (n_predictors, n_outcomes) testing whether the
            sign of the context-invariant predictor effects are consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    _, effects = get_homogeneous_predictor_effects(model, C, **kwargs)
    # effects.shape: (n_predictors, n_bootstraps, n_outcomes)
    smoothing = kwargs.get("laplace_smoothing", 1)
    rows = []
    for predictor_idx in range(effects.shape[0]):  # n_predictors
        row = []
        for outcome_idx in range(effects.shape[2]):  # n_outcomes
            bootstrap_effects = effects[predictor_idx, :, outcome_idx]
            row.append(
                calc_pval_bootstraps_one_sided_mean(
                    bootstrap_effects, laplace_smoothing=smoothing
                )
            )
        rows.append(np.array(row))
    return np.array(rows)

170 

171 

def calc_heterogeneous_predictor_effects_pvals(
    model, C: np.ndarray, verbose: bool = True, **kwargs
) -> np.ndarray:
    """
    Calculate p-values for the heterogeneous (context-dependent) effects of predictors.

    Args:
        model (SKLearnWrapper): Model to analyze.
        C (np.ndarray): Contexts to analyze.
        verbose (bool): Whether to print the range of possible p-values.
        **kwargs: Forwarded unchanged to get_heterogeneous_predictor_effects. A
            "laplace_smoothing" entry, if present, is also used as the
            smoothing pseudo-count for each p-value (default 1).

    Returns:
        np.ndarray: P-values of shape (n_contexts, n_predictors, n_outcomes) testing whether the
            context-varying parameter range is consistent across bootstraps.

    Raises:
        ValueError: If the model's n_bootstraps is less than 2.
    """
    _validate_args(model.n_bootstraps, verbose=verbose)
    _, effects = get_heterogeneous_predictor_effects(model, C, **kwargs)
    # effects.shape is (n_contexts, n_predictors, n_bootstraps, n_context_vals, n_outcomes)
    # Difference between the last and first context values; its sign is tested.
    diffs = effects[:, :, :, -1] - effects[:, :, :, 0]
    # diffs.shape is (n_contexts, n_predictors, n_bootstraps, n_outcomes)
    smoothing = kwargs.get("laplace_smoothing", 1)
    per_context = []
    for context_idx in range(diffs.shape[0]):  # n_contexts
        per_predictor = []
        for predictor_idx in range(diffs.shape[1]):  # n_predictors
            per_outcome = []
            for outcome_idx in range(diffs.shape[3]):  # n_outcomes
                bootstrap_diffs = diffs[context_idx, predictor_idx, :, outcome_idx]
                per_outcome.append(
                    calc_pval_bootstraps_one_sided_mean(
                        bootstrap_diffs, laplace_smoothing=smoothing
                    )
                )
            per_predictor.append(np.array(per_outcome))
        per_context.append(np.array(per_predictor))
    return np.array(per_context)

217 

218 

def test_each_context(
    model_constructor: Type[SKLearnWrapper],
    C: pd.DataFrame,
    X: pd.DataFrame,
    Y: pd.DataFrame,
    verbose: bool = True,
    model_kwargs: Dict = {"encoder_type": "linear"},
    fit_kwargs: Dict = {"max_epochs": 3, "learning_rate": 1e-2, "n_bootstraps": 20},
) -> pd.DataFrame:
    """
    Test heterogeneous predictor effects attributed to every individual context feature.
    Fits a fresh model on a single context feature in C, applies
    calc_heterogeneous_predictor_effects_pvals to it, and does this sequentially
    for every context feature.

    Args:
        model_constructor (SKLearnWrapper): The constructor of the model to be tested, currently either ContextualizedRegressor or ContextualizedClassifier.
        C (pd.DataFrame): The context dataframe (n_samples, n_contexts).
        X (pd.DataFrame): The predictor dataframe (n_samples, n_predictors).
        Y (pd.DataFrame): The outcome, target, or label dataframe (n_samples, n_outcomes).
        verbose (bool): Whether to print the range of possible p-values.
        model_kwargs (Dict): Keyword arguments passed to model_constructor.
        fit_kwargs (Dict): Keyword arguments passed to the model's fit method.
            If it contains "n_bootstraps", that value is validated before any
            model is fit; otherwise validation happens on the first fitted model.

    Returns:
        pd.DataFrame: A DataFrame with columns "Context", "Predictor", "Target" and "Pvals", holding one p-value for each (context, predictor, outcome) combination, describing how much the predictor's effect on the outcome varies across the context.

    Raises:
        ValueError: If the number of bootstraps is less than 2.
    """
    # The dict defaults are only read and unpacked with ** here, never mutated,
    # so sharing them across calls is harmless.
    pvals_dict = {
        "Context": [],
        "Predictor": [],
        "Target": [],
        "Pvals": [],
    }
    # Fail fast (and report once) when the bootstrap count is known up front.
    # A caller-supplied fit_kwargs without "n_bootstraps" no longer raises
    # KeyError; the fitted model's own n_bootstraps is validated instead.
    report_on_first_model = verbose
    if "n_bootstraps" in fit_kwargs:
        _validate_args(fit_kwargs["n_bootstraps"], verbose=verbose)
        report_on_first_model = False
    for context in C.columns:
        # Double brackets keep the single context column two-dimensional.
        context_col = C[[context]].values
        model = model_constructor(**model_kwargs)
        model.fit(context_col, X.values, Y.values, **fit_kwargs)
        pvals = calc_heterogeneous_predictor_effects_pvals(
            model, context_col, verbose=report_on_first_model
        )
        report_on_first_model = False
        for i, predictor in enumerate(X.columns):
            for j, outcome in enumerate(Y.columns):
                pvals_dict["Context"].append(context)
                pvals_dict["Predictor"].append(predictor)
                pvals_dict["Target"].append(outcome)
                # Index 0: the model was fit on exactly one context feature.
                pvals_dict["Pvals"].append(pvals[0, i, j])

    return pd.DataFrame.from_dict(pvals_dict)