from typing import Tuple, Union, Dict

import numpy as np
from pandas import DataFrame, Series
from scipy import optimize
from scipy import stats as st
from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic

from spotify_confidence.analysis.confidence_utils import power_calculation
from spotify_confidence.analysis.constants import (
    NUMERATOR,
    NUMERATOR_SUM_OF_SQUARES,
    DENOMINATOR,
    INTERVAL_SIZE,
    FINAL_EXPECTED_SAMPLE_SIZE,
    ORDINAL_GROUP_COLUMN,
    POINT_ESTIMATE,
    CI_LOWER,
    CI_UPPER,
    ADJUSTED_LOWER,
    ADJUSTED_UPPER,
    VARIANCE,
    NUMBER_OF_COMPARISONS,
    TWO_SIDED,
    SFX2,
    SFX1,
    STD_ERR,
    PREFERENCE_TEST,
    NULL_HYPOTHESIS,
    DIFFERENCE,
    ALPHA,
    IS_SIGNIFICANT,
    HOLM,
    SPOT_1_HOLM,
    HOMMEL,
    SIMES_HOCHBERG,
    SPOT_1_HOMMEL,
    SPOT_1_SIMES_HOCHBERG,
    ADJUSTED_ALPHA,
)
from spotify_confidence.analysis.frequentist.sequential_bound_solver import bounds


def sequential_bounds(t: np.ndarray, alpha: float, sides: int, state: DataFrame = None):
    return bounds(t, alpha, rho=2, ztrun=8, sides=sides, max_nints=1000, state=state)
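
# Note: rho=2 appears to select the quadratic member of the rho-family alpha-spending
# function (roughly f(t) = alpha * min(t, 1) ** rho), and ztrun=8 truncates the z-bounds.
# This reading is inferred from the solver's parameter names, not documented here.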


def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    if (df[denominator] == 0).any():
        raise ValueError("Can't compute point estimate: denominator is 0")
    return df[numerator] / df[denominator]
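
# A minimal usage sketch (the column names "clicks" and "views" are hypothetical):
#
#     df = DataFrame({"clicks": [10, 30], "views": [100, 200]})
#     point_estimate(df, {NUMERATOR: "clicks", DENOMINATOR: "views"})
#     # -> Series([0.10, 0.15])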


def variance(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    numerator_sumsq = arg_dict[NUMERATOR_SUM_OF_SQUARES]
    binary = df[numerator_sumsq] == df[numerator]
    if binary.all():
        # For binary data this equals df[POINT_ESTIMATE] * (1 - df[POINT_ESTIMATE]),
        # and it is also a robust fallback in case the data is not binary.
        variance = df[numerator_sumsq] / df[denominator] - df[POINT_ESTIMATE] ** 2
    else:
        variance = (df[numerator_sumsq] - np.power(df[numerator], 2) / df[denominator]) / (df[denominator] - 1)
    if (variance < 0).any():
        raise ValueError("Computed variance is negative. Please check your inputs.")
    return variance
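
# Why the two branches agree for 0/1 data: if every observation x is 0 or 1 then
# x**2 == x, so sum(x**2) / n - p**2 = p - p**2 = p * (1 - p), the Bernoulli variance.
# The second branch is the ordinary unbiased sample variance written in terms of the
# sum of squares: (sum(x**2) - (sum(x))**2 / n) / (n - 1).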


def std_err(df: Series, arg_dict: Dict[str, str]) -> float:
    denominator = arg_dict[DENOMINATOR]
    return np.sqrt(df[VARIANCE + SFX1] / df[denominator + SFX1] + df[VARIANCE + SFX2] / df[denominator + SFX2])
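
# This is the unpooled (Welch-style) standard error of the difference in means,
# sqrt(var_1 / n_1 + var_2 / n_2), using the SFX1/SFX2 column suffixes for the two groups.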


def add_point_estimate_ci(df: Series, arg_dict: Dict[str, str]) -> Series:
    denominator = arg_dict[DENOMINATOR]
    interval_size = arg_dict[INTERVAL_SIZE]
    df[CI_LOWER], df[CI_UPPER] = _zconfint_generic(
        mean=df[POINT_ESTIMATE],
        std_mean=np.sqrt(df[VARIANCE] / df[denominator]),
        alpha=1 - interval_size,
        alternative=TWO_SIDED,
    )
    return df
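
# For example, interval_size=0.95 yields alpha=0.05 and hence a two-sided 95%
# confidence interval around the point estimate.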


def p_value(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    # Note: the preference (test direction) is assumed to be the same for all rows.
    _, pvalue = _zstat_generic(
        value1=df[POINT_ESTIMATE + SFX2],
        value2=df[POINT_ESTIMATE + SFX1],
        std_diff=df[STD_ERR],
        alternative=df[PREFERENCE_TEST].values[0],
        diff=df[NULL_HYPOTHESIS],
    )
    return pvalue
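
# _zstat_generic computes z = (value1 - value2 - diff) / std_diff, so here the
# difference (group 2 minus group 1) is tested against NULL_HYPOTHESIS in the
# direction given by the rows' preference.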


def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    # Note: arg_dict is not used here; it keeps the signature consistent with the
    # other statistic functions.
    return _zconfint_generic(
        mean=df[DIFFERENCE], std_mean=df[STD_ERR], alpha=df[alpha_column], alternative=df[PREFERENCE_TEST].values[0]
    )


def achieved_power(df: DataFrame, mde: float, alpha: float, arg_dict: Dict[str, str]) -> DataFrame:
    denominator = arg_dict[DENOMINATOR]
    v1, v2 = df[VARIANCE + SFX1], df[VARIANCE + SFX2]
    n1, n2 = df[denominator + SFX1], df[denominator + SFX2]

    var_pooled = ((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)

    return power_calculation(mde, var_pooled, alpha, n1, n2)
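
# var_pooled is the classic pooled-variance estimator, weighting each group's
# variance by its degrees of freedom.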


def compute_sequential_adjusted_alpha(df: DataFrame, arg_dict: Dict[str, str]):
    denominator = arg_dict[DENOMINATOR]
    final_expected_sample_size_column = arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]
    ordinal_group_column = arg_dict[ORDINAL_GROUP_COLUMN]
    n_comparisons = arg_dict[NUMBER_OF_COMPARISONS]

    def adjusted_alphas_for_group(grp: DataFrame) -> Series:
        is_two_sided = (grp[PREFERENCE_TEST] == TWO_SIDED).all()
        return (
            sequential_bounds(
                t=grp["sample_size_proportions"].values,
                alpha=grp[ALPHA].values[0] / n_comparisons,
                sides=2 if is_two_sided else 1,
            )
            .df.set_index(grp.index)
            .assign(**{ADJUSTED_ALPHA: lambda d: (2 if is_two_sided else 1) * (1 - st.norm.cdf(d["zb"]))})
        )[["zb", ADJUSTED_ALPHA]]

    groups_except_ordinal = [column for column in df.index.names if column != ordinal_group_column]
    max_sample_size_by_group = (
        (
            df[["current_total_" + denominator, final_expected_sample_size_column]]
            .groupby(groups_except_ordinal, sort=False)
            .max()
            .max(axis=1)
        )
        if len(groups_except_ordinal) > 0
        else (df[["current_total_" + denominator, final_expected_sample_size_column]].max().max())
    )
    sample_size_proportions = Series(
        data=df.groupby(df.index.names, sort=False)["current_total_" + denominator].first() / max_sample_size_by_group,
        name="sample_size_proportions",
    )

    return Series(
        data=df.groupby(df.index.names, sort=False)[[ALPHA, PREFERENCE_TEST]]
        .first()
        .merge(sample_size_proportions, left_index=True, right_index=True)
        .assign(_sequential_dummy_index_=1)
        .groupby(groups_except_ordinal + ["_sequential_dummy_index_"], sort=False)[
            ["sample_size_proportions", PREFERENCE_TEST, ALPHA]
        ]
        .apply(adjusted_alphas_for_group)[ADJUSTED_ALPHA],
        name=ADJUSTED_ALPHA,
    )
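
# How this fits together (a summary inferred from the code above): for every group,
# "sample_size_proportions" is the information fraction t, i.e. the current sample size
# divided by the largest sample size the group is expected to reach. The overall alpha
# is first split across comparisons (alpha / n_comparisons, Bonferroni-style) and then
# spent over the ordinal dimension via the sequential bounds, whose z-bounds ("zb") are
# mapped back to per-time-point adjusted alphas. The "_sequential_dummy_index_" column
# presumably ensures the groupby has at least one key even when groups_except_ordinal
# is empty.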


def ci_for_multiple_comparison_methods(
    df: DataFrame,
    correction_method: str,
    alpha: float,
    w: float = 1.0,
) -> Tuple[Union[Series, float], Union[Series, float]]:
    if (df[PREFERENCE_TEST] == TWO_SIDED).any():
        raise ValueError(
            "CIs can only be produced for one-sided tests when multiple-testing "
            "correction methods other than Bonferroni are applied"
        )
    m_scal = len(df)
    num_significant = df[IS_SIGNIFICANT].sum()
    r = m_scal - num_significant

    def _aw(W: float, alpha: float, m_scal: float, r: int):
        return alpha * (1 - (1 - W) * (m_scal - r) / m_scal)

    def _bw(W: float, alpha: float, m_scal: float, r: int):
        return 1 - (1 - alpha) / np.power((1 - (1 - W) * (1 - np.power((1 - alpha), (1 / m_scal)))), (m_scal - r))

    if correction_method in [HOLM, SPOT_1_HOLM]:
        adjusted_alpha_rej_equal_m = 1 - alpha / m_scal
        adjusted_alpha_rej_less_m = 1 - (1 - w) * (alpha / m_scal)
        adjusted_alpha_accept = 1 - _aw(w, alpha, m_scal, r) / r if r != 0 else 0
    elif correction_method in [HOMMEL, SIMES_HOCHBERG, SPOT_1_HOMMEL, SPOT_1_SIMES_HOCHBERG]:
        adjusted_alpha_rej_equal_m = np.power((1 - alpha), (1 / m_scal))
        adjusted_alpha_rej_less_m = 1 - (1 - w) * (1 - np.power((1 - alpha), (1 / m_scal)))
        adjusted_alpha_accept = 1 - _bw(w, alpha, m_scal, r) / r if r != 0 else 0
    else:
        raise ValueError(
            "CIs not supported for correction method. "
            f"Supported methods: {HOMMEL}, {HOLM}, {SIMES_HOCHBERG}, "
            f"{SPOT_1_HOLM}, {SPOT_1_HOMMEL} and {SPOT_1_SIMES_HOCHBERG}"
        )

    def _compute_ci_for_row(row: Series) -> Series:
        if row[IS_SIGNIFICANT] and num_significant == m_scal:
            alpha_adj = adjusted_alpha_rej_equal_m
        elif row[IS_SIGNIFICANT] and num_significant < m_scal:
            alpha_adj = adjusted_alpha_rej_less_m
        else:
            alpha_adj = adjusted_alpha_accept

        ci_sign = -1 if row[PREFERENCE_TEST] == "larger" else 1
        bound1 = row[DIFFERENCE] + ci_sign * st.norm.ppf(alpha_adj) * row[STD_ERR]
        if ci_sign == -1:
            bound2 = max(row[NULL_HYPOTHESIS], bound1)
        else:
            bound2 = min(row[NULL_HYPOTHESIS], bound1)

        bound = bound2 if row[IS_SIGNIFICANT] else bound1

        lower = bound if row[PREFERENCE_TEST] == "larger" else -np.inf
        upper = bound if row[PREFERENCE_TEST] == "smaller" else np.inf

        row[ADJUSTED_LOWER] = lower
        row[ADJUSTED_UPPER] = upper

        return row

    ci_df = df.apply(_compute_ci_for_row, axis=1)[[ADJUSTED_LOWER, ADJUSTED_UPPER]]

    return ci_df[ADJUSTED_LOWER], ci_df[ADJUSTED_UPPER]
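
# A worked example of the adjusted quantile levels (illustrative numbers): with Holm,
# alpha=0.05 and m_scal=4 comparisons that are all significant, each bound uses
# st.norm.ppf(1 - 0.05 / 4) = st.norm.ppf(0.9875) ~= 2.24 instead of the unadjusted
# st.norm.ppf(0.95) ~= 1.64.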


def ci_width(
    z_alpha, binary, non_inferiority, hypothetical_effect, control_avg, control_var, control_count, treatment_count
) -> Union[Series, float]:
    treatment_var = _get_hypothetical_treatment_var(
        binary, non_inferiority, control_avg, control_var, hypothetical_effect
    )
    # _unequal_var_ttest_denom is a private SciPy helper; it returns the Welch degrees
    # of freedom and the standard error of the difference in means.
    _, standard_error = st.stats._unequal_var_ttest_denom(control_var, control_count, treatment_var, treatment_count)
    return 2 * z_alpha * standard_error
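
# A dependency-free equivalent of the Welch standard error above would be (a sketch,
# not what the library actually calls):
#
#     standard_error = np.sqrt(control_var / control_count + treatment_var / treatment_count)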


def powered_effect(
    df: DataFrame,
    z_alpha: float,
    z_power: float,
    binary: bool,
    non_inferiority: bool,
    avg_column: float,
    var_column: float,
) -> Series:
    if binary and not non_inferiority:
        effect = df.apply(
            lambda row: _search_MDE_binary_local_search(
                control_avg=row[avg_column],
                control_var=row[var_column],
                non_inferiority=False,
                kappa=row["kappa"],
                proportion_of_total=row["proportion_of_total"],
                current_number_of_units=row["current_number_of_units"],
                z_alpha=z_alpha,
                z_power=z_power,
            )[0],
            axis=1,
        )
    else:
        treatment_var = _get_hypothetical_treatment_var(
            binary_metric=binary,
            non_inferiority=non_inferiority,
            control_avg=df[avg_column],
            control_var=df[var_column],
            hypothetical_effect=0,
        )
        n2_partial = np.power((z_alpha + z_power), 2) * (df[var_column] / df["kappa"] + treatment_var)
        effect = np.sqrt(
            (1 / (df["current_number_of_units"] * df["proportion_of_total"])) * (n2_partial + df["kappa"] * n2_partial)
        )

    return effect
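
# The non-binary branch above is the closed-form inversion of
# _treatment_group_sample_size: writing N = current_number_of_units * proportion_of_total
# and using n2 * (1 + kappa) = N, solving for the effect gives
#
#     effect = sqrt((1 + kappa) * (z_alpha + z_power)**2 * (control_var / kappa + treatment_var) / N)
#
# which is exactly (n2_partial + kappa * n2_partial) / N under the square root.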


def required_sample_size(
    binary: Union[Series, bool],
    non_inferiority: Union[Series, bool],
    hypothetical_effect: Union[Series, float],
    control_avg: Union[Series, float],
    control_var: Union[Series, float],
    z_alpha: float = None,
    kappa: float = None,
    proportion_of_total: Union[Series, float] = None,
    z_power: float = None,
) -> Union[Series, float]:
    if kappa is None:
        raise ValueError("kappa is None, must be a positive float")
    if proportion_of_total is None:
        raise ValueError("proportion_of_total is None, must be between 0 and 1")

    treatment_var = np.vectorize(_get_hypothetical_treatment_var)(
        binary, non_inferiority, control_avg, control_var, hypothetical_effect
    )

    n2 = _treatment_group_sample_size(
        z_alpha=z_alpha,
        z_power=z_power,
        hypothetical_effect=hypothetical_effect,
        control_var=control_var,
        treatment_var=treatment_var,
        kappa=kappa,
    )
    return np.ceil((n2 + n2 * kappa) / proportion_of_total)
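
# A minimal usage sketch (illustrative numbers; two-sided alpha=0.05, power=0.80,
# equal allocation):
#
#     required_sample_size(
#         binary=True,
#         non_inferiority=False,
#         hypothetical_effect=0.01,
#         control_avg=0.3,
#         control_var=0.3 * (1 - 0.3),
#         z_alpha=st.norm.ppf(1 - 0.05 / 2),
#         kappa=1.0,
#         proportion_of_total=1.0,
#         z_power=st.norm.ppf(0.80),
#     )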


def _search_MDE_binary_local_search(
    control_avg: float,
    control_var: float,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_alpha: float = None,
    z_power: float = None,
):
    def f(x):
        return _find_current_powered_effect(
            hypothetical_effect=x,
            control_avg=control_avg,
            control_var=control_var,
            binary=True,
            non_inferiority=non_inferiority,
            kappa=kappa,
            proportion_of_total=proportion_of_total,
            current_number_of_units=current_number_of_units,
            z_alpha=z_alpha,
            z_power=z_power,
        )

    max_val = 1 - control_avg
    min_val = min(10e-9, max_val)

    if min_val == max_val:
        # Corner case that crashes the optimizer.
        return min_val, f(min_val)

    max_iter = 100  # Max number of iterations before falling back to the slow grid search.

    # Stop immediately once a solution is found that is "good enough": a threshold of 1
    # means the approximated number of units (based on the current effect candidate) is
    # off by at most 1.0.
    goodness_threshold = 1.0

    curr_iter = 0
    best_x = None
    best_fun = float("inf")

    bounds_queue = [(min_val, max_val)]

    while curr_iter < max_iter and best_fun > goodness_threshold:
        # Take the next interval from the queue.
        interval = bounds_queue.pop(0)

        # Conduct a bounded local search; using a very small tolerance improved
        # performance during tests.
        result = optimize.minimize_scalar(
            f, bounds=(interval[0], interval[1]), method="bounded", options={"xatol": 10e-14, "maxiter": 50}
        )

        if result.fun < best_fun:
            best_x = result.x
            best_fun = result.fun

        curr_iter += 1

        # Bisect the interval and queue both halves for further local searches.
        interval_split = (interval[0] + interval[1]) / 2
        bounds_queue.append((interval[0], interval_split))
        bounds_queue.append((interval_split, interval[1]))

    if best_fun <= goodness_threshold:
        return best_x, best_fun
    else:  # Check whether the grid search finds a better solution.
        alt_result_x, alt_result_fun = _search_MDE_binary(
            control_avg,
            control_var,
            non_inferiority,
            kappa,
            proportion_of_total,
            current_number_of_units,
            z_alpha,
            z_power,
            return_cost_val=True,
        )

        return (alt_result_x, alt_result_fun) if alt_result_fun < best_fun else (best_x, best_fun)
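
# Design note (an interpretation, not from the original authors): because of the ceil()
# in _treatment_group_sample_size, the cost function is piecewise constant, so a single
# bounded Brent search can stall on a flat plateau. Repeatedly bisecting the interval
# and re-running bounded searches on the halves increases the chance of landing in the
# basin that contains the global minimum before falling back to the grid search.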


def _search_MDE_binary(
    control_avg: float,
    control_var: float,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_alpha: float = None,
    z_power: float = None,
    return_cost_val=False,
):
    candidate_effects = np.linspace(10e-9, 1 - control_avg, num=2000)
    for i in range(2):
        test = []
        for effect in candidate_effects:
            test.append(
                _find_current_powered_effect(
                    hypothetical_effect=effect,
                    control_avg=control_avg,
                    control_var=control_var,
                    binary=True,
                    non_inferiority=non_inferiority,
                    kappa=kappa,
                    proportion_of_total=proportion_of_total,
                    current_number_of_units=current_number_of_units,
                    z_alpha=z_alpha,
                    z_power=z_power,
                )
            )

        test = np.array(test)
        index = [idx for idx, element in enumerate(test) if element == test.min()]
        if len(index) != 1:
            # On ties, take the middle of the flat region.
            index = [index[int(np.ceil(len(index) / 2))]]
        if i == 0:
            # No effect within the feasible range is powered: the cost is still
            # decreasing at the grid's upper edge.
            if index[0] == len(candidate_effects) - 1:
                return (np.inf, np.inf) if return_cost_val else np.inf
            # Zoom in: re-run the search on a finer grid around the coarse minimum.
            lower_effect_bound = 10e-9 if index[0] == 0 else candidate_effects[index[0] - 1]
            candidate_effects = np.linspace(lower_effect_bound, candidate_effects[index[0]], num=10000)

    index = [idx for idx, element in enumerate(test) if element == test.min()]

    return (candidate_effects[index[0]], test[index[0]]) if return_cost_val else candidate_effects[index[0]]


def _treatment_group_sample_size(
    z_alpha: float,
    z_power: float,
    hypothetical_effect: float,
    control_var: float,
    treatment_var: float,
    kappa: float,
) -> float:
    return np.ceil(np.power((z_alpha + z_power) / abs(hypothetical_effect), 2) * (control_var / kappa + treatment_var))
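
# This is the standard two-sample z-test sample size for the treatment group: with
# kappa = n1 / n2, the standard error satisfies se**2 = var1 / n1 + var2 / n2
# = (var1 / kappa + var2) / n2, and requiring (z_alpha + z_power) * se <= |effect|
# gives n2 >= ((z_alpha + z_power) / effect)**2 * (var1 / kappa + var2).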


def _find_current_powered_effect(
    hypothetical_effect: float,
    control_avg: float,
    control_var: float,
    binary: bool,
    non_inferiority: bool,
    kappa: float,
    proportion_of_total: float,
    current_number_of_units: float,
    z_power: float = None,
    z_alpha: float = None,
) -> float:
    treatment_var = _get_hypothetical_treatment_var(
        binary_metric=binary,
        non_inferiority=non_inferiority,
        control_avg=control_avg,
        control_var=control_var,
        hypothetical_effect=hypothetical_effect,
    )
    n2 = _treatment_group_sample_size(
        z_alpha,
        z_power,
        hypothetical_effect,
        control_var,
        treatment_var,
        kappa,
    )

    return np.power(current_number_of_units - ((n2 + n2 * kappa) / proportion_of_total), 2)
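
# The return value is the squared gap between the units we currently have and the units
# required to power the candidate effect; the MDE searches above minimize this, so the
# minimizer is the effect the current sample size is just able to detect.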


def _get_hypothetical_treatment_var(
    binary_metric: bool,
    non_inferiority: bool,
    control_avg: float,
    control_var: float,
    hypothetical_effect: float,
) -> float:
    if binary_metric and not non_inferiority:
        # For binary metrics, the variance can be derived from the average. However, we
        # do *not* do this for non-inferiority tests, because their basic assumption is
        # that the means of the control and treatment groups are identical.
        return (control_avg + hypothetical_effect) * (1 - (control_avg + hypothetical_effect))
    else:
        return control_var