# spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py

from typing import Tuple, Union, Dict

import numpy as np
from pandas import DataFrame, Series
from scipy import optimize
from scipy import stats as st
from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic

from spotify_confidence.analysis.confidence_utils import power_calculation
from spotify_confidence.analysis.constants import (
    NUMERATOR,
    NUMERATOR_SUM_OF_SQUARES,
    DENOMINATOR,
    INTERVAL_SIZE,
    FINAL_EXPECTED_SAMPLE_SIZE,
    ORDINAL_GROUP_COLUMN,
    POINT_ESTIMATE,
    CI_LOWER,
    CI_UPPER,
    ADJUSTED_LOWER,
    ADJUSTED_UPPER,
    VARIANCE,
    NUMBER_OF_COMPARISONS,
    TWO_SIDED,
    SFX2,
    SFX1,
    STD_ERR,
    PREFERENCE_TEST,
    NULL_HYPOTHESIS,
    DIFFERENCE,
    ALPHA,
    IS_SIGNIFICANT,
    HOLM,
    SPOT_1_HOLM,
    HOMMEL,
    SIMES_HOCHBERG,
    SPOT_1_HOMMEL,
    SPOT_1_SIMES_HOCHBERG,
    NIM,
    ADJUSTED_ALPHA,
)
from spotify_confidence.analysis.frequentist.sequential_bound_solver import bounds


def sequential_bounds(t: np.ndarray, alpha: float, sides: int, state: DataFrame = None):
    return bounds(t, alpha, rho=2, ztrun=8, sides=sides, max_nints=1000, state=state)

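# Thin wrapper fixing the solver parameters used throughout this module; see
# sequential_bound_solver.bounds for the exact meaning of rho, ztrun and
# max_nints. t is the vector of information fractions (sample-size proportions)
# at the analysis times; state is passed through to the solver, presumably to
# resume from a previously computed bound.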

def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    if (df[denominator] == 0).any():
        raise ValueError("Can't compute point estimate: denominator is 0")
    return df[numerator] / df[denominator]

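# Usage sketch (the column names "clicks" and "views" are hypothetical, not
# from this module):
#
#     arg_dict = {NUMERATOR: "clicks", DENOMINATOR: "views"}
#     df[POINT_ESTIMATE] = point_estimate(df, arg_dict)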

def variance(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    numerator_sumsq = arg_dict[NUMERATOR_SUM_OF_SQUARES]
    binary = df[numerator_sumsq] == df[numerator]
    if binary.all():
        # This equals row[POINT_ESTIMATE] * (1 - row[POINT_ESTIMATE]) when the
        # data is binary, and also gives a robust fallback in case it's not.
        variance = df[numerator_sumsq] / df[denominator] - df[POINT_ESTIMATE] ** 2
    else:
        variance = (df[numerator_sumsq] - np.power(df[numerator], 2) / df[denominator]) / (df[denominator] - 1)
    if (variance < 0).any():
        raise ValueError("Computed variance is negative. Please check your inputs.")
    return variance

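# A worked check of the binary branch (our note): for 0/1 observations,
# sum(x^2) == sum(x), so sum(x^2)/n - mean^2 == mean - mean^2, i.e. the
# Bernoulli variance p * (1 - p). The non-binary branch is the unbiased sample
# variance written in sufficient-statistic form:
#     s^2 = (sum(x^2) - sum(x)^2 / n) / (n - 1)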

def std_err(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    denominator = arg_dict[DENOMINATOR]
    return np.sqrt(df[VARIANCE + SFX1] / df[denominator + SFX1] + df[VARIANCE + SFX2] / df[denominator + SFX2])

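# The standard error above is the unpooled two-sample form
# sqrt(var_1 / n_1 + var_2 / n_2), with the SFX1/SFX2 suffixes marking the two
# groups of a comparison.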

def add_point_estimate_ci(df: DataFrame, arg_dict: Dict[str, str]) -> DataFrame:
    denominator = arg_dict[DENOMINATOR]
    interval_size = arg_dict[INTERVAL_SIZE]
    df[CI_LOWER], df[CI_UPPER] = _zconfint_generic(
        mean=df[POINT_ESTIMATE],
        std_mean=np.sqrt(df[VARIANCE] / df[denominator]),
        alpha=1 - interval_size,
        alternative=TWO_SIDED,
    )
    return df


def p_value(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    # arg_dict is part of the shared computer interface but is unused here.
    _, p_values = _zstat_generic(
        value1=df[POINT_ESTIMATE + SFX2],
        value2=df[POINT_ESTIMATE + SFX1],
        std_diff=df[STD_ERR],
        alternative=df[PREFERENCE_TEST].values[0],
        diff=df[NULL_HYPOTHESIS],
    )
    return p_values


def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    return _zconfint_generic(
        mean=df[DIFFERENCE], std_mean=df[STD_ERR], alpha=df[alpha_column], alternative=df[PREFERENCE_TEST].values[0]
    )

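# Note: _zstat_generic and _zconfint_generic are private statsmodels helpers
# (statsmodels.stats.weightstats). Given a difference and its standard error
# they return, respectively, the z statistic with its p-value and the normal
# confidence interval for the chosen alternative.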

def achieved_power(df: DataFrame, mde: float, alpha: float, arg_dict: Dict[str, str]) -> DataFrame:
    denominator = arg_dict[DENOMINATOR]
    v1, v2 = df[VARIANCE + SFX1], df[VARIANCE + SFX2]
    n1, n2 = df[denominator + SFX1], df[denominator + SFX2]

    # Pooled variance, weighting each group's variance by its degrees of freedom.
    var_pooled = ((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)

    return power_calculation(mde, var_pooled, alpha, n1, n2)

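# Power is computed by power_calculation (confidence_utils), not restated here;
# under the usual normal approximation it is roughly
#     Phi(|mde| / sqrt(var_pooled / n1 + var_pooled / n2) - z_{1 - alpha/2})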

def compute_sequential_adjusted_alpha(df: DataFrame, arg_dict: Dict[str, str]):
    denominator = arg_dict[DENOMINATOR]
    final_expected_sample_size_column = arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]
    ordinal_group_column = arg_dict[ORDINAL_GROUP_COLUMN]
    n_comparisons = arg_dict[NUMBER_OF_COMPARISONS]

    def adjusted_alphas_for_group(grp: DataFrame) -> Series:
        return (
            sequential_bounds(
                t=grp["sample_size_proportions"].values,
                alpha=grp[ALPHA].values[0] / n_comparisons,
                sides=2 if (grp[PREFERENCE_TEST] == TWO_SIDED).all() else 1,
            )
            .df.set_index(grp.index)
            .assign(
                **{
                    ADJUSTED_ALPHA: lambda df: df.apply(
                        lambda row: 2 * (1 - st.norm.cdf(row["zb"]))
                        if (grp[PREFERENCE_TEST] == TWO_SIDED).all()
                        else 1 - st.norm.cdf(row["zb"]),
                        axis=1,
                    )
                }
            )
        )[["zb", ADJUSTED_ALPHA]]

    groups_except_ordinal = [column for column in df.index.names if column != ordinal_group_column]
    max_sample_size_by_group = (
        (
            df[["current_total_" + denominator, final_expected_sample_size_column]]
            .groupby(groups_except_ordinal, sort=False)
            .max()
            .max(axis=1)
        )
        if len(groups_except_ordinal) > 0
        else (df[["current_total_" + denominator, final_expected_sample_size_column]].max().max())
    )
    sample_size_proportions = Series(
        data=df.groupby(df.index.names, sort=False)["current_total_" + denominator].first() / max_sample_size_by_group,
        name="sample_size_proportions",
    )

    return Series(
        data=df.groupby(df.index.names, sort=False)[[ALPHA, PREFERENCE_TEST]]
        .first()
        .merge(sample_size_proportions, left_index=True, right_index=True)
        .assign(_sequential_dummy_index_=1)
        .groupby(groups_except_ordinal + ["_sequential_dummy_index_"], sort=False)[
            ["sample_size_proportions", PREFERENCE_TEST, ALPHA]
        ]
        .apply(adjusted_alphas_for_group)[ADJUSTED_ALPHA],
        name=ADJUSTED_ALPHA,
    )

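# How to read the function above (our gloss): "sample_size_proportions" is the
# information fraction t (current sample size over the maximum or expected
# final size) at each ordinal step. sequential_bounds turns those fractions
# into group-sequential z boundaries zb, and each boundary is mapped back to a
# per-look adjusted alpha via the normal tail: 2 * (1 - Phi(zb)) for two-sided
# tests, 1 - Phi(zb) otherwise. The Bonferroni division by n_comparisons is
# applied up front, to alpha.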

def ci_for_multiple_comparison_methods(
    df: DataFrame,
    correction_method: str,
    alpha: float,
    w: float = 1.0,
) -> Tuple[Union[Series, float], Union[Series, float]]:
    if TWO_SIDED in df[PREFERENCE_TEST].values:
        raise ValueError(
            "CIs can only be produced for one-sided tests when multiple-testing "
            "correction methods other than Bonferroni are applied"
        )
    m_scal = len(df)
    num_significant = sum(df[IS_SIGNIFICANT])
    r = m_scal - num_significant

    def _aw(W: float, alpha: float, m_scal: float, r: int):
        return alpha * (1 - (1 - W) * (m_scal - r) / m_scal)

    def _bw(W: float, alpha: float, m_scal: float, r: int):
        return 1 - (1 - alpha) / np.power((1 - (1 - W) * (1 - np.power((1 - alpha), (1 / m_scal)))), (m_scal - r))

    if correction_method in [HOLM, SPOT_1_HOLM]:
        adjusted_alpha_rej_equal_m = 1 - alpha / m_scal
        adjusted_alpha_rej_less_m = 1 - (1 - w) * (alpha / m_scal)
        adjusted_alpha_accept = 1 - _aw(w, alpha, m_scal, r) / r if r != 0 else 0
    elif correction_method in [HOMMEL, SIMES_HOCHBERG, SPOT_1_HOMMEL, SPOT_1_SIMES_HOCHBERG]:
        adjusted_alpha_rej_equal_m = np.power((1 - alpha), (1 / m_scal))
        adjusted_alpha_rej_less_m = 1 - (1 - w) * (1 - np.power((1 - alpha), (1 / m_scal)))
        adjusted_alpha_accept = 1 - _bw(w, alpha, m_scal, r) / r if r != 0 else 0
    else:
        raise ValueError(
            "CIs not supported for correction method. "
            f"Supported methods: {HOMMEL}, {HOLM}, {SIMES_HOCHBERG}, "
            f"{SPOT_1_HOLM}, {SPOT_1_HOMMEL} and {SPOT_1_SIMES_HOCHBERG}"
        )

    def _compute_ci_for_row(row: Series) -> Series:
        if row[IS_SIGNIFICANT] and num_significant == m_scal:
            alpha_adj = adjusted_alpha_rej_equal_m
        elif row[IS_SIGNIFICANT] and num_significant < m_scal:
            alpha_adj = adjusted_alpha_rej_less_m
        else:
            alpha_adj = adjusted_alpha_accept

        ci_sign = -1 if row[PREFERENCE_TEST] == "larger" else 1
        bound1 = row[DIFFERENCE] + ci_sign * st.norm.ppf(alpha_adj) * row[STD_ERR]
        if ci_sign == -1:
            bound2 = max(row[NULL_HYPOTHESIS], bound1)
        else:
            bound2 = min(row[NULL_HYPOTHESIS], bound1)

        bound = bound2 if row[IS_SIGNIFICANT] else bound1

        lower = bound if row[PREFERENCE_TEST] == "larger" else -np.inf
        upper = bound if row[PREFERENCE_TEST] == "smaller" else np.inf

        row[ADJUSTED_LOWER] = lower
        row[ADJUSTED_UPPER] = upper

        return row

    ci_df = df.apply(_compute_ci_for_row, axis=1)[[ADJUSTED_LOWER, ADJUSTED_UPPER]]

    return ci_df[ADJUSTED_LOWER], ci_df[ADJUSTED_UPPER]

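# Note on the branches above (our gloss): the confidence level applied to each
# comparison depends on its rejection status. If all m hypotheses were
# rejected, the tightest level (adjusted_alpha_rej_equal_m) is used; rejected
# hypotheses among a partial rejection get adjusted_alpha_rej_less_m; accepted
# ones get a level derived from _aw/_bw, so that the resulting one-sided bounds
# stay compatible with the stepwise Holm / Hommel / Simes-Hochberg decisions.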

def ci_width(
    z_alpha, binary, non_inferiority, hypothetical_effect, control_avg, control_var, control_count, treatment_count
) -> Union[Series, float]:
    treatment_var = _get_hypothetical_treatment_var(
        binary, non_inferiority, control_avg, control_var, hypothetical_effect
    )
    # _unequal_var_ttest_denom is a private scipy helper; it returns the
    # Welch-Satterthwaite degrees of freedom and the unpooled standard error
    # sqrt(v1 / n1 + v2 / n2). Only the standard error is used here.
    _, std_err = st.stats._unequal_var_ttest_denom(control_var, control_count, treatment_var, treatment_count)
    return 2 * z_alpha * std_err


def powered_effect(
    df: DataFrame,
    z_alpha: float,
    z_power: float,
    binary: bool,
    non_inferiority: bool,
    avg_column: float,
    var_column: float,
) -> Series:
    if binary and not non_inferiority:
        effect = df.apply(
            lambda row: _search_MDE_binary_local_search(
                control_avg=row[avg_column],
                control_var=row[var_column],
                non_inferiority=False,
                kappa=row["kappa"],
                proportion_of_total=row["proportion_of_total"],
                current_number_of_units=row["current_number_of_units"],
                z_alpha=z_alpha,
                z_power=z_power,
            )[0],
            axis=1,
        )
    else:
        treatment_var = _get_hypothetical_treatment_var(
            binary_metric=binary,
            non_inferiority=non_inferiority,
            control_avg=df[avg_column],
            control_var=df[var_column],
            hypothetical_effect=0,
        )
        n2_partial = np.power((z_alpha + z_power), 2) * (df[var_column] / df["kappa"] + treatment_var)
        effect = np.sqrt(
            (1 / (df["current_number_of_units"] * df["proportion_of_total"])) * (n2_partial + df["kappa"] * n2_partial)
        )

    return effect

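# Worked form of the else branch (our reading): with n the units available to
# this comparison, n = current_number_of_units * proportion_of_total, the
# powered effect solves
#     effect^2 = (z_alpha + z_power)^2 * (control_var / kappa + treatment_var) * (1 + kappa) / n
# which is the standard two-sample z-test minimal detectable effect with
# allocation ratio kappa = n1 / n2.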

def required_sample_size(
    binary: Union[Series, bool],
    non_inferiority: Union[Series, bool],
    hypothetical_effect: Union[Series, float],
    control_avg: Union[Series, float],
    control_var: Union[Series, float],
    z_alpha: float = None,
    kappa: float = None,
    proportion_of_total: Union[Series, float] = None,
    z_power: float = None,
) -> Union[Series, float]:
    if kappa is None:
        raise ValueError("kappa is None, must be a positive float")
    if proportion_of_total is None:
        raise ValueError("proportion_of_total is None, must be between 0 and 1")

    treatment_var = np.vectorize(_get_hypothetical_treatment_var)(
        binary, non_inferiority, control_avg, control_var, hypothetical_effect
    )

    n2 = _treatment_group_sample_size(
        z_alpha=z_alpha,
        z_power=z_power,
        hypothetical_effect=hypothetical_effect,
        control_var=control_var,
        treatment_var=treatment_var,
        kappa=kappa,
    )
    required_sample_size = np.ceil((n2 + n2 * kappa) / proportion_of_total)
    return required_sample_size

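# Usage sketch (illustrative numbers, not from the source): sample size for a
# binary metric with a 10% baseline, an absolute MDE of one percentage point,
# equal allocation (kappa=1), all traffic in this comparison, alpha=0.05
# two-sided and 80% power:
#
#     n = required_sample_size(
#         binary=True, non_inferiority=False, hypothetical_effect=0.01,
#         control_avg=0.10, control_var=0.10 * 0.90,
#         z_alpha=st.norm.ppf(1 - 0.05 / 2), kappa=1,
#         proportion_of_total=1.0, z_power=st.norm.ppf(0.80),
#     )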

def _search_MDE_binary_local_search(
    control_avg: float,
    control_var: float,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_alpha: float = None,
    z_power: float = None,
):
    def f(x):
        return _find_current_powered_effect(
            hypothetical_effect=x,
            control_avg=control_avg,
            control_var=control_var,
            binary=True,
            non_inferiority=non_inferiority,
            kappa=kappa,
            proportion_of_total=proportion_of_total,
            current_number_of_units=current_number_of_units,
            z_alpha=z_alpha,
            z_power=z_power,
        )

    max_val = 1 - control_avg
    min_val = min(10e-9, max_val)

    if min_val == max_val:
        # Corner case that crashes the optimizer.
        return min_val, f(min_val)

    max_iter = 100  # max number of iterations before falling back to slow grid search

    # Stop immediately once a solution is "good enough": a threshold of 1 means
    # the approximated number of units (based on the current effect candidate)
    # is off by at most 1.0.
    goodness_threshold = 1.0

    curr_iter = 0
    best_x = None
    best_fun = float("inf")

    bounds_queue = [(min_val, max_val)]

    while curr_iter < max_iter and best_fun > goodness_threshold:
        # Take the next interval from the queue.
        interval = bounds_queue.pop(0)

        # Conduct a bounded local search; a very small tolerance improved
        # performance during tests, hence xatol=10e-14.
        result = optimize.minimize_scalar(
            f, bounds=(interval[0], interval[1]), method="bounded", options={"xatol": 10e-14, "maxiter": 50}
        )

        if result.fun < best_fun:
            best_x = result.x
            best_fun = result.fun

        curr_iter += 1

        # Split the interval and enqueue both halves for further local searches.
        interval_split = (interval[0] + interval[1]) / 2
        bounds_queue.append((interval[0], interval_split))
        bounds_queue.append((interval_split, interval[1]))

    if best_fun <= goodness_threshold:
        return best_x, best_fun
    else:  # check whether grid search finds a better solution
        alt_result_x, alt_result_fun = _search_MDE_binary(
            control_avg,
            control_var,
            non_inferiority,
            kappa,
            proportion_of_total,
            current_number_of_units,
            z_alpha,
            z_power,
            return_cost_val=True,
        )

        return (alt_result_x, alt_result_fun) if alt_result_fun < best_fun else (best_x, best_fun)

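# Search strategy, in brief: breadth-first bisection of the feasible effect
# range [1e-8, 1 - control_avg], with a bounded scalar minimization on each
# subinterval, since the cost function can have several local minima. If no
# candidate brings the approximated unit count within 1.0 of the actual count,
# the dense grid search in _search_MDE_binary is used as a fallback.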

def _search_MDE_binary(
    control_avg: float,
    control_var: float,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_alpha: float = None,
    z_power: float = None,
    return_cost_val=False,
):
    # Two passes: a coarse grid over the feasible range, then a fine grid
    # around the best coarse candidate.
    candidate_effects = np.linspace(10e-9, 1 - control_avg, num=2000)
    for i in range(2):
        test = []
        for effect in candidate_effects:
            test.append(
                _find_current_powered_effect(
                    hypothetical_effect=effect,
                    control_avg=control_avg,
                    control_var=control_var,
                    binary=True,
                    non_inferiority=non_inferiority,
                    kappa=kappa,
                    proportion_of_total=proportion_of_total,
                    current_number_of_units=current_number_of_units,
                    z_alpha=z_alpha,
                    z_power=z_power,
                )
            )

        test = np.array(test)
        index = [idx for idx, element in enumerate(test) if element == test.min()]
        if len(index) != 1:
            index = [index[int(np.ceil(len(index) / 2))]]
        if i == 0:
            if index[0] == 9999:
                # The minimum landed on the last grid point, i.e. no powered
                # effect exists in range. (With num=2000 coarse candidates this
                # sentinel can never fire; it seems to assume the 10000-point
                # fine grid.)
                return np.inf
            lower_effect_bound = 10e-9 if index[0] == 0 else candidate_effects[index[0] - 1]
            candidate_effects = np.linspace(lower_effect_bound, candidate_effects[index[0]], num=10000)

    index = [idx for idx, element in enumerate(test) if element == test.min()]

    return (candidate_effects[index[0]], test[index[0]]) if return_cost_val else candidate_effects[index[0]]


def _treatment_group_sample_size(
    z_alpha: float,
    z_power: float,
    hypothetical_effect: float,
    control_var: float,
    treatment_var: float,
    kappa: float,
) -> float:
    return np.ceil(np.power((z_alpha + z_power) / abs(hypothetical_effect), 2) * (control_var / kappa + treatment_var))

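# This is the standard two-sample z-test sample size for the treatment group,
# with kappa = n1 / n2 the control-to-treatment allocation ratio:
#     n2 = ceil( ((z_alpha + z_power) / |effect|)^2 * (control_var / kappa + treatment_var) )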

def _find_current_powered_effect(
    hypothetical_effect: float,
    control_avg: float,
    control_var: float,
    binary: bool,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_power: float = None,
    z_alpha: float = None,
) -> float:
    treatment_var = _get_hypothetical_treatment_var(
        binary_metric=binary,
        non_inferiority=non_inferiority,
        control_avg=control_avg,
        control_var=control_var,
        hypothetical_effect=hypothetical_effect,
    )
    n2 = _treatment_group_sample_size(
        z_alpha,
        z_power,
        hypothetical_effect,
        control_var,
        treatment_var,
        kappa,
    )

    # Squared distance between the units we actually have and the units this
    # hypothetical effect would require; the MDE searches minimize this cost.
    return np.power(current_number_of_units - ((n2 + n2 * kappa) / proportion_of_total), 2)


def _get_hypothetical_treatment_var(
    binary_metric: bool,
    non_inferiority: bool,
    control_avg: float,
    control_var: float,
    hypothetical_effect: float,
) -> float:
    if binary_metric and not non_inferiority:
        # For binary metrics, the variance can be derived from the average. We
        # do *not* do this for non-inferiority tests, because their basic
        # assumption is that the control and treatment group means are identical.
        return (control_avg + hypothetical_effect) * (1 - (control_avg + hypothetical_effect))
    else:
        return control_var