# Copyright 2017-2020 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, Iterable, List, Tuple, Dict
from warnings import warn

import numpy as np
from numpy import isnan
from pandas import DataFrame, Series
from scipy import stats as st
from statsmodels.stats.multitest import multipletests

import spotify_confidence.analysis.frequentist.confidence_computers.bootstrap_computer as bootstrap_computer
import spotify_confidence.analysis.frequentist.confidence_computers.chi_squared_computer as chi_squared_computer
import spotify_confidence.analysis.frequentist.confidence_computers.t_test_computer as t_test_computer
import spotify_confidence.analysis.frequentist.confidence_computers.z_test_computer as z_test_computers
import spotify_confidence.analysis.frequentist.confidence_computers.z_test_linreg_computer as z_test_linreg_computer
from spotify_confidence.analysis.abstract_base_classes.confidence_computer_abc import ConfidenceComputerABC
from spotify_confidence.analysis.confidence_utils import (
    get_remaning_groups,
    validate_levels,
    level2str,
    listify,
    validate_and_rename_columns,
    drop_and_rename_columns,
    get_all_categorical_group_columns,
    get_all_group_columns,
    validate_data,
    remove_group_columns,
    groupbyApplyParallel,
    is_non_inferiority,
    reset_named_indices,
)
from spotify_confidence.analysis.constants import (
    NUMERATOR,
    NUMERATOR_SUM_OF_SQUARES,
    DENOMINATOR,
    BOOTSTRAPS,
    INTERVAL_SIZE,
    POINT_ESTIMATE,
    FINAL_EXPECTED_SAMPLE_SIZE,
    ORDINAL_GROUP_COLUMN,
    MDE,
    METHOD,
    CORRECTION_METHOD,
    ABSOLUTE,
    VARIANCE,
    NUMBER_OF_COMPARISONS,
    TREATMENT_WEIGHTS,
    IS_BINARY,
    FEATURE,
    FEATURE_SUMSQ,
    FEATURE_CROSS,
    CI_LOWER,
    CI_UPPER,
    DIFFERENCE,
    P_VALUE,
    SFX1,
    SFX2,
    STD_ERR,
    ALPHA,
    ADJUSTED_ALPHA,
    ADJUSTED_ALPHA_POWER_SAMPLE_SIZE,
    POWER,
    POWERED_EFFECT,
    ADJUSTED_POWER,
    ADJUSTED_P,
    ADJUSTED_LOWER,
    ADJUSTED_UPPER,
    IS_SIGNIFICANT,
    REQUIRED_SAMPLE_SIZE,
    REQUIRED_SAMPLE_SIZE_METRIC,
    OPTIMAL_KAPPA,
    OPTIMAL_WEIGHTS,
    CI_WIDTH,
    NULL_HYPOTHESIS,
    ALTERNATIVE_HYPOTHESIS,
    NIM,
    PREFERENCE,
    PREFERENCE_TEST,
    TWO_SIDED,
    PREFERENCE_DICT,
    BONFERRONI,
    HOLM,
    HOMMEL,
    SIMES_HOCHBERG,
    SIDAK,
    HOLM_SIDAK,
    FDR_BH,
    FDR_BY,
    FDR_TSBH,
    FDR_TSBKY,
    SPOT_1_HOLM,
    SPOT_1_HOMMEL,
    SPOT_1_SIMES_HOCHBERG,
    SPOT_1_SIDAK,
    SPOT_1_HOLM_SIDAK,
    SPOT_1_FDR_BH,
    SPOT_1_FDR_BY,
    SPOT_1_FDR_TSBH,
    SPOT_1_FDR_TSBKY,
    BONFERRONI_ONLY_COUNT_TWOSIDED,
    BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY,
    SPOT_1,
    CORRECTION_METHODS,
    BOOTSTRAP,
    CHI2,
    TTEST,
    ZTEST,
    NIM_TYPE,
    CORRECTION_METHODS_THAT_REQUIRE_METRIC_INFO,
    NIM_COLUMN_DEFAULT,
    PREFERRED_DIRECTION_COLUMN_DEFAULT,
    INCREASE_PREFFERED,
    DECREASE_PREFFERED,
    ZTESTLINREG,
    ORIGINAL_POINT_ESTIMATE,
    ORIGINAL_VARIANCE,
    VARIANCE_REDUCTION,
)

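# Dispatch table mapping each supported statistical method to the computer
# module that implements its point estimates, variances, intervals and tests.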
confidence_computers = {
    CHI2: chi_squared_computer,
    TTEST: t_test_computer,
    ZTEST: z_test_computers,
    BOOTSTRAP: bootstrap_computer,
    ZTESTLINREG: z_test_linreg_computer,
}


class GenericComputer(ConfidenceComputerABC):
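    """Frequentist confidence computer that dispatches all statistical
    computations to the per-method modules in ``confidence_computers``,
    based on the value of ``method_column`` in each row."""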
    def __init__(
        self,
        data_frame: DataFrame,
        numerator_column: Union[str, None],
        numerator_sum_squares_column: Union[str, None],
        denominator_column: Union[str, None],
        categorical_group_columns: Union[str, Iterable],
        ordinal_group_column: Union[str, None],
        interval_size: float,
        correction_method: str,
        method_column: str,
        bootstrap_samples_column: Union[str, None],
        metric_column: Union[str, None],
        treatment_column: Union[str, None],
        power: float,
        point_estimate_column: str,
        var_column: str,
        is_binary_column: str,
        feature_column: Union[str, None],
        feature_sum_squares_column: Union[str, None],
        feature_cross_sum_column: Union[str, None],
    ):

        self._df = data_frame.reset_index(drop=True)
        self._point_estimate_column = point_estimate_column
        self._var_column = var_column
        self._is_binary = is_binary_column
        self._numerator = numerator_column
        self._numerator_sumsq = numerator_sum_squares_column
        if self._numerator is not None and (self._numerator_sumsq is None or self._numerator_sumsq == self._numerator):
            if (data_frame[numerator_column] <= data_frame[denominator_column]).all():
                # Treat as binomial data
                self._numerator_sumsq = self._numerator
            else:
                raise ValueError(
                    f"numerator_sum_squares_column is missing or the same as "
                    f"numerator_column, but since {numerator_column} is not "
                    f"always less than or equal to {denominator_column} it "
                    f"can't be binomial data. Please check your data."
                )

        self._denominator = denominator_column
        self._categorical_group_columns = get_all_categorical_group_columns(
            categorical_group_columns, metric_column, treatment_column
        )
        self._segments = remove_group_columns(self._categorical_group_columns, metric_column)
        self._segments = remove_group_columns(self._segments, treatment_column)
        self._ordinal_group_column = ordinal_group_column
        self._metric_column = metric_column
        self._interval_size = interval_size
        self._power = power
        self._treatment_column = treatment_column
        self._feature = feature_column
        self._feature_ssq = feature_sum_squares_column
        self._feature_cross = feature_cross_sum_column

        if correction_method.lower() not in CORRECTION_METHODS:
            raise ValueError(f"Use one of the correction methods in {CORRECTION_METHODS}")
        self._correction_method = correction_method
        self._method_column = method_column

        self._single_metric = False
        if self._metric_column is not None and data_frame.groupby(self._metric_column, sort=False).ngroups == 1:
            self._single_metric = True

        self._all_group_columns = get_all_group_columns(self._categorical_group_columns, self._ordinal_group_column)
        self._bootstrap_samples_column = bootstrap_samples_column

        columns_that_must_exist = []
        # Note: `in` on a Series checks the index, not the values, so compare
        # against the column values explicitly.
        if (
            (self._df[self._method_column] == CHI2).any()
            or (self._df[self._method_column] == TTEST).any()
            or (self._df[self._method_column] == ZTEST).any()
        ):
            if not self._point_estimate_column or not self._var_column:
                columns_that_must_exist += [self._numerator, self._denominator]
                columns_that_must_exist += [] if self._numerator_sumsq is None else [self._numerator_sumsq]
            else:
                columns_that_must_exist += [self._point_estimate_column, self._var_column]
        if (self._df[self._method_column] == BOOTSTRAP).any():
            columns_that_must_exist += [self._bootstrap_samples_column]
        if (self._df[self._method_column] == ZTESTLINREG).any():
            columns_that_must_exist += [self._feature, self._feature_ssq, self._feature_cross]

        validate_data(self._df, columns_that_must_exist, self._all_group_columns, self._ordinal_group_column)

        self._sufficient = None

    def compute_summary(self, verbose: bool) -> DataFrame:
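        """Return the summary statistics; when verbose is False, keep only the
        group columns, input counts, point estimate and its confidence interval."""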
        return (
            self._sufficient_statistics
            if verbose
            else self._sufficient_statistics[
                self._all_group_columns
                + ([self._metric_column] if self._metric_column is not None and self._single_metric else [])
                + [c for c in [self._numerator, self._denominator] if c is not None]
                + [POINT_ESTIMATE, CI_LOWER, CI_UPPER]
            ]
        )

    @property
    def _sufficient_statistics(self) -> DataFrame:
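        """Sufficient statistics (point estimate, variance and point-estimate
        confidence interval), computed lazily per method/metric group."""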
        if self._sufficient is None:
            arg_dict = {
                NUMERATOR: self._numerator,
                NUMERATOR_SUM_OF_SQUARES: self._numerator_sumsq,
                DENOMINATOR: self._denominator,
                BOOTSTRAPS: self._bootstrap_samples_column,
                INTERVAL_SIZE: self._interval_size,
                FEATURE: self._feature,
                FEATURE_SUMSQ: self._feature_ssq,
                FEATURE_CROSS: self._feature_cross,
            }
            groupby = [col for col in [self._method_column, self._metric_column] if col is not None]
            self._sufficient = (
                self._df.groupby(groupby, sort=False)
                .apply(
                    lambda df: df.assign(
                        **{
                            POINT_ESTIMATE: lambda df: df[self._point_estimate_column]
                            if self._point_estimate_column is not None
                            else confidence_computers[df[self._method_column].values[0]].point_estimate(df, arg_dict)
                        }
                    )
                    .assign(
                        **{
                            VARIANCE: lambda df: df[self._var_column]
                            if self._var_column is not None
                            else confidence_computers[df[self._method_column].values[0]].variance(df, arg_dict)
                        }
                    )
                    .pipe(
                        lambda df: df
                        if self._point_estimate_column is not None
                        else confidence_computers[df[self._method_column].values[0]].add_point_estimate_ci(
                            df, arg_dict
                        )
                    )
                )
                .pipe(reset_named_indices)
            )
        return self._sufficient

    def compute_difference(
        self,
        level_1: Union[str, Iterable],
        level_2: Union[str, Iterable],
        absolute: bool,
        groupby: Union[str, Iterable],
        nims: NIM_TYPE,
        final_expected_sample_size_column: str,
        verbose: bool,
        mde_column: str,
    ) -> DataFrame:
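        """Compute the difference between level_1 and level_2, with level_1
        used as the reference (control) level."""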
        level_columns = get_remaning_groups(self._all_group_columns, groupby)
        difference_df = self._compute_differences(
            level_columns=level_columns,
            levels=[(level_1, level_2)],
            absolute=absolute,
            groupby=groupby,
            level_as_reference=True,
            nims=nims,
            final_expected_sample_size_column=final_expected_sample_size_column,
            mde_column=mde_column,
        )
        return (
            difference_df
            if verbose
            else difference_df[
                listify(groupby)
                + ["level_1", "level_2", "absolute_difference", DIFFERENCE, CI_LOWER, CI_UPPER, P_VALUE]
                + [ADJUSTED_LOWER, ADJUSTED_UPPER, ADJUSTED_P, IS_SIGNIFICANT, POWERED_EFFECT, REQUIRED_SAMPLE_SIZE]
                + ([NIM, NULL_HYPOTHESIS, PREFERENCE] if nims is not None else [])
            ]
        )

    def compute_multiple_difference(
        self,
        level: Union[str, Iterable],
        absolute: bool,
        groupby: Union[str, Iterable],
        level_as_reference: bool,
        nims: NIM_TYPE,
        final_expected_sample_size_column: str,
        verbose: bool,
        mde_column: str,
    ) -> DataFrame:
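        """Compute differences between the given level and every other level;
        level_as_reference controls which side acts as control."""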
        level_columns = get_remaning_groups(self._all_group_columns, groupby)
        other_levels = [
            other
            for other in self._sufficient_statistics.groupby(level_columns, sort=False).groups.keys()
            if other != level
        ]
        levels = [(level, other) for other in other_levels]
        difference_df = self._compute_differences(
            level_columns=level_columns,
            levels=levels,
            absolute=absolute,
            groupby=groupby,
            level_as_reference=level_as_reference,
            nims=nims,
            final_expected_sample_size_column=final_expected_sample_size_column,
            mde_column=mde_column,
        )
        return (
            difference_df
            if verbose
            else difference_df[
                listify(groupby)
                + [
                    "level_1",
                    "level_2",
                    "absolute_difference",
                    DIFFERENCE,
                    CI_LOWER,
                    CI_UPPER,
                    P_VALUE,
                    POWERED_EFFECT,
                    REQUIRED_SAMPLE_SIZE,
                ]
                + [ADJUSTED_LOWER, ADJUSTED_UPPER, ADJUSTED_P, IS_SIGNIFICANT]
                + ([NIM, NULL_HYPOTHESIS, PREFERENCE] if nims is not None else [])
            ]
        )

    def compute_differences(
        self,
        levels: List[Tuple],
        absolute: bool,
        groupby: Union[str, Iterable],
        nims: NIM_TYPE,
        final_expected_sample_size_column: str,
        verbose: bool,
        mde_column: str,
    ) -> DataFrame:
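        """Compute differences for an explicit list of (reference, treatment)
        level tuples."""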
        level_columns = get_remaning_groups(self._all_group_columns, groupby)
        difference_df = self._compute_differences(
            level_columns=level_columns,
            levels=[levels] if isinstance(levels, tuple) else levels,
            absolute=absolute,
            groupby=groupby,
            level_as_reference=True,
            nims=nims,
            final_expected_sample_size_column=final_expected_sample_size_column,
            mde_column=mde_column,
        )
        return (
            difference_df
            if verbose
            else difference_df[
                listify(groupby)
                + ["level_1", "level_2", "absolute_difference", DIFFERENCE, CI_LOWER, CI_UPPER, P_VALUE]
                + [ADJUSTED_LOWER, ADJUSTED_UPPER, ADJUSTED_P, IS_SIGNIFICANT, POWERED_EFFECT, REQUIRED_SAMPLE_SIZE]
                + ([NIM, NULL_HYPOTHESIS, PREFERENCE] if nims is not None else [])
            ]
        )

    def _compute_differences(
        self,
        level_columns: Iterable,
        levels: Union[str, Iterable],
        absolute: bool,
        groupby: Union[str, Iterable],
        level_as_reference: bool,
        nims: NIM_TYPE,
        final_expected_sample_size_column: str,
        mde_column: str,
    ):
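        # Shared implementation behind the public compute_*difference methods:
        # validates the requested levels, stringifies them for joining, and
        # delegates the pairwise statistics to _create_comparison_df.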
        if type(level_as_reference) is not bool:
            raise ValueError(f"level_as_reference must be either True or False, but is {level_as_reference}.")
        groupby = listify(groupby)
        unique_levels = set([l[0] for l in levels] + [l[1] for l in levels])
        validate_levels(self._sufficient_statistics, level_columns, unique_levels)
        str2level = {level2str(lv): lv for lv in unique_levels}
        levels = [
            (level2str(l[0]), level2str(l[1])) if level_as_reference else (level2str(l[1]), level2str(l[0]))
            for l in levels
        ]

        def assign_total_denominator(df, groupby):
            if self._denominator is None:
                return df.assign(**{f"current_total_{self._denominator}": None})

            if len(groupby) == 0:
                return df.assign(
                    **{f"current_total_{self._denominator}": self._sufficient_statistics[self._denominator].sum()}
                )
            else:
                return df.merge(
                    df.groupby(groupby, sort=False)[self._denominator]
                    .sum()
                    .reset_index()
                    .rename(columns={self._denominator: f"current_total_{self._denominator}"})
                )

        return (
            self._sufficient_statistics.assign(
                level=self._sufficient_statistics[level_columns].agg(level2str, axis="columns")
            )
            .pipe(assign_total_denominator, groupby)
            .query(f"level in {[l1 for l1,l2 in levels] + [l2 for l1,l2 in levels]}")
            .pipe(lambda df: df if groupby == [] else df.set_index(groupby))
            .pipe(
                self._create_comparison_df,
                groups_to_compare=levels,
                absolute=absolute,
                nims=nims,
                mde_column=mde_column,
                final_expected_sample_size_column=final_expected_sample_size_column,
            )
            .assign(level_1=lambda df: df["level_1"].map(lambda s: str2level[s]))
            .assign(level_2=lambda df: df["level_2"].map(lambda s: str2level[s]))
            .pipe(lambda df: df.reset_index([name for name in df.index.names if name is not None]))
            .reset_index(drop=True)
            .sort_values(by=groupby + ["level_1", "level_2"])
            .reset_index(drop=True)
        )

    def _create_comparison_df(
        self,
        df: DataFrame,
        groups_to_compare: List[Tuple[str, str]],
        absolute: bool,
        nims: NIM_TYPE,
        mde_column: str,
        final_expected_sample_size_column: str,
    ) -> DataFrame:
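        """Self-join the summary statistics into one row per comparison, attach
        NIMs, MDEs and adjusted power, and compute the comparison statistics in
        parallel per group."""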
        def join(df: DataFrame) -> DataFrame:
            has_index = not all(idx is None for idx in df.index.names)
            if has_index:
                # self-join on index (the index will typically model the date,
                # i.e., rows with the same date are joined)
                return df.merge(df, left_index=True, right_index=True, suffixes=(SFX1, SFX2))
            else:
                # join on dummy column, i.e. conduct a cross join
                return (
                    df.assign(dummy_join_column=1)
                    .merge(right=df.assign(dummy_join_column=1), on="dummy_join_column", suffixes=(SFX1, SFX2))
                    .drop(columns="dummy_join_column")
                )

        comparison_df = (
            df.pipe(add_nim_input_columns_from_tuple_or_dict, nims=nims, mde_column=mde_column)
            .pipe(
                add_nims_and_mdes,
                mde_column=mde_column,
                nim_column=NIM_COLUMN_DEFAULT,
                preferred_direction_column=PREFERRED_DIRECTION_COLUMN_DEFAULT,
            )
            .pipe(join)
            .query(
                "("
                + " or ".join([f"(level_1=='{l1}' and level_2=='{l2}')" for l1, l2 in groups_to_compare])
                + ") and level_1 != level_2"
            )
            .pipe(
                validate_and_rename_columns,
                [NIM, mde_column, PREFERENCE, final_expected_sample_size_column, self._method_column],
            )
            .pipe(
                drop_and_rename_columns,
                [NULL_HYPOTHESIS, ALTERNATIVE_HYPOTHESIS, f"current_total_{self._denominator}"]
                + ([ORIGINAL_POINT_ESTIMATE] if ORIGINAL_POINT_ESTIMATE in df.columns else []),
            )
            .assign(**{PREFERENCE_TEST: lambda df: TWO_SIDED if self._correction_method == SPOT_1 else df[PREFERENCE]})
            .assign(**{POWER: self._power})
            .pipe(self._add_adjusted_power)
        )

        groups_except_ordinal = [
            column
            for column in df.index.names
            if column is not None
            and (column != self._ordinal_group_column or final_expected_sample_size_column is None)
        ]
        n_comparisons = self._get_num_comparisons(
            comparison_df,
            self._correction_method,
            number_of_level_comparisons=comparison_df.groupby(["level_1", "level_2"], sort=False).ngroups,
            groupby=groups_except_ordinal,
        )

        arg_dict = {
            NUMERATOR: self._numerator,
            NUMERATOR_SUM_OF_SQUARES: self._numerator_sumsq,
            DENOMINATOR: self._denominator,
            BOOTSTRAPS: self._bootstrap_samples_column,
            FINAL_EXPECTED_SAMPLE_SIZE: final_expected_sample_size_column,
            ORDINAL_GROUP_COLUMN: self._ordinal_group_column,
            MDE: mde_column,
            METHOD: self._method_column,
            CORRECTION_METHOD: self._correction_method,
            INTERVAL_SIZE: self._interval_size,
            ABSOLUTE: absolute,
            NUMBER_OF_COMPARISONS: n_comparisons,
        }
        comparison_df = groupbyApplyParallel(
            comparison_df.groupby(groups_except_ordinal + [self._method_column], as_index=False, sort=False),
            lambda df: _compute_comparisons(df, arg_dict=arg_dict),
        )
        return comparison_df

    def compute_sample_size(
        self,
        treatment_weights: Iterable,
        mde_column: str,
        nim_column: str,
        preferred_direction_column: str,
        final_expected_sample_size_column: str,
    ) -> DataFrame:
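        """Compute required sample sizes and expected CI widths per group for
        the given treatment weights, NIMs and MDEs."""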
        arg_dict, group_columns, sample_size_df = self._initialise_sample_size_and_power_computation(
            final_expected_sample_size_column, mde_column, nim_column, preferred_direction_column, treatment_weights
        )
        sample_size_df = groupbyApplyParallel(
            sample_size_df.pipe(set_alpha_and_adjust_preference, arg_dict=arg_dict).groupby(
                group_columns + [self._method_column],
                as_index=False,
                sort=False,
            ),
            lambda df: _compute_sample_sizes_and_ci_widths(df, arg_dict=arg_dict),
        )

        return sample_size_df.reset_index()

    def compute_powered_effect(
        self,
        treatment_weights: Iterable,
        mde_column: str,
        nim_column: str,
        preferred_direction_column: str,
        sample_size: float,
    ) -> DataFrame:
        arg_dict, group_columns, powered_effect_df = self._initialise_sample_size_and_power_computation(
            sample_size, mde_column, nim_column, preferred_direction_column, treatment_weights
        )
        powered_effect_df = groupbyApplyParallel(
            powered_effect_df.pipe(set_alpha_and_adjust_preference, arg_dict=arg_dict).groupby(
                group_columns + [self._method_column],
                as_index=False,
                sort=False,
            ),
            lambda df: _compute_powered_effects(df, arg_dict=arg_dict),
        )

        return powered_effect_df.reset_index()

    def _initialise_sample_size_and_power_computation(
        self, final_expected_sample_size_column, mde_column, nim_column, preferred_direction_column, treatment_weights
    ):
        sample_size_df = (
            self._sufficient_statistics.pipe(
                lambda df: df if self._all_group_columns == [] else df.set_index(self._all_group_columns)
            )
            .pipe(
                add_nims_and_mdes,
                mde_column=mde_column,
                nim_column=nim_column,
                preferred_direction_column=preferred_direction_column,
            )
            .assign(**{PREFERENCE_TEST: lambda df: TWO_SIDED if self._correction_method == SPOT_1 else df[PREFERENCE]})
            .assign(**{POWER: self._power})
            .pipe(self._add_adjusted_power)
        )
        group_columns = [column for column in sample_size_df.index.names if column is not None]
        n_comparisons = self._get_num_comparisons(
            sample_size_df,
            self._correction_method,
            number_of_level_comparisons=len(treatment_weights) - 1,
            groupby=group_columns,
        )
        arg_dict = {
            MDE: mde_column,
            METHOD: self._method_column,
            NUMBER_OF_COMPARISONS: n_comparisons,
            TREATMENT_WEIGHTS: treatment_weights,
            INTERVAL_SIZE: self._interval_size,
            CORRECTION_METHOD: self._correction_method,
            IS_BINARY: self._is_binary,
            FINAL_EXPECTED_SAMPLE_SIZE: final_expected_sample_size_column,
        }
        return arg_dict, group_columns, sample_size_df

    def compute_optimal_weights_and_sample_size(
        self, sample_size_df: DataFrame, number_of_groups: int
    ) -> Tuple[Iterable, int]:
        sample_size_df = (
            sample_size_df.reset_index(drop=True)
            .assign(**{OPTIMAL_KAPPA: lambda df: df.apply(_optimal_kappa, is_binary_column=self._is_binary, axis=1)})
            .assign(
                **{
                    OPTIMAL_WEIGHTS: lambda df: df.apply(
                        lambda row: _optimal_weights(row[OPTIMAL_KAPPA], number_of_groups), axis=1
                    )
                }
            )
        )

        group_columns = [column for column in sample_size_df.index.names if column is not None] + [self._method_column]
        arg_dict = {
            METHOD: self._method_column,
            IS_BINARY: self._is_binary,
        }
        return _find_optimal_group_weights_across_rows(sample_size_df, number_of_groups, group_columns, arg_dict)

    def _add_adjusted_power(self, df: DataFrame) -> DataFrame:
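        # For correction methods that use metric metadata, power is adjusted by
        # the number of guardrail metrics (plus one if there is at least one
        # success metric); otherwise the unadjusted power is kept.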
        if self._correction_method in CORRECTION_METHODS_THAT_REQUIRE_METRIC_INFO:
            if self._metric_column is None:
                return df.assign(**{ADJUSTED_POWER: None})
            else:
                number_total_metrics = (
                    1 if self._single_metric else df.groupby(self._metric_column, sort=False).ngroups
                )
                if self._single_metric:
                    if df[df[NIM].isnull()].shape[0] > 0:
                        number_success_metrics = 1
                    else:
                        number_success_metrics = 0
                else:
                    number_success_metrics = df[df[NIM].isnull()].groupby(self._metric_column, sort=False).ngroups

                number_guardrail_metrics = number_total_metrics - number_success_metrics
                power_correction = (
                    number_guardrail_metrics if number_success_metrics == 0 else number_guardrail_metrics + 1
                )
                return df.assign(**{ADJUSTED_POWER: 1 - (1 - df[POWER]) / power_correction})
        else:
            return df.assign(**{ADJUSTED_POWER: df[POWER]})

    def achieved_power(self, level_1, level_2, mde, alpha, groupby):
        groupby = listify(groupby)
        level_columns = get_remaning_groups(self._all_group_columns, groupby)
        arg_dict = {NUMERATOR: self._numerator, DENOMINATOR: self._denominator}
        return (
            self._compute_differences(
                level_columns,
                [(level_1, level_2)],
                True,
                groupby,
                level_as_reference=True,
                nims=None,
                final_expected_sample_size_column=None,
                mde_column=None,
            )  # TODO: Is this right?
            .pipe(lambda df: df if groupby == [] else df.set_index(groupby))
            .assign(
                achieved_power=lambda df: df.apply(
                    lambda row: confidence_computers[row[self._method_column]].achieved_power(
                        row, mde=mde, alpha=alpha, arg_dict=arg_dict
                    ),
                    axis=1,
                )
            )
        )[["level_1", "level_2", "achieved_power"]]

    def _get_num_comparisons(
        self, df: DataFrame, correction_method: str, number_of_level_comparisons: int, groupby: Iterable
    ) -> int:
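        # The number of comparisons to correct for depends on the correction
        # method: plain Bonferroni counts every level comparison in every group,
        # the statsmodels-backed methods correct internally (so 1 is returned
        # here), and the SPOT-1 family only counts success (non-NIM) metrics,
        # multiplied by the number of segments.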
        if correction_method == BONFERRONI:
            return max(
                1,
                number_of_level_comparisons * df.assign(_dummy_=1).groupby(groupby + ["_dummy_"], sort=False).ngroups,
            )
        elif correction_method == BONFERRONI_ONLY_COUNT_TWOSIDED:
            return max(
                number_of_level_comparisons
                * df.query(f'{PREFERENCE_TEST} == "{TWO_SIDED}"')
                .assign(_dummy_=1)
                .groupby(groupby + ["_dummy_"], sort=False)
                .ngroups,
                1,
            )
        elif correction_method in [
            HOLM,
            HOMMEL,
            SIMES_HOCHBERG,
            SIDAK,
            HOLM_SIDAK,
            FDR_BH,
            FDR_BY,
            FDR_TSBH,
            FDR_TSBKY,
        ]:
            return 1
        elif correction_method in [
            BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY,
            SPOT_1,
            SPOT_1_HOLM,
            SPOT_1_HOMMEL,
            SPOT_1_SIMES_HOCHBERG,
            SPOT_1_SIDAK,
            SPOT_1_HOLM_SIDAK,
            SPOT_1_FDR_BH,
            SPOT_1_FDR_BY,
            SPOT_1_FDR_TSBH,
            SPOT_1_FDR_TSBKY,
        ]:
            if self._metric_column is None or self._treatment_column is None:
                return max(
                    1,
                    number_of_level_comparisons
                    * df[df[NIM].isnull()].assign(_dummy_=1).groupby(groupby + ["_dummy_"], sort=False).ngroups,
                )
            else:
                if self._single_metric:
                    if df[df[NIM].isnull()].shape[0] > 0:
                        number_success_metrics = 1
                    else:
                        number_success_metrics = 0
                else:
                    number_success_metrics = df[df[NIM].isnull()].groupby(self._metric_column, sort=False).ngroups

                number_segments = (
                    1
                    if len(self._segments) == 0 or not all(item in df.index.names for item in self._segments)
                    else df.groupby(self._segments, sort=False).ngroups
                )

                return max(1, number_of_level_comparisons * max(1, number_success_metrics) * number_segments)
        else:
            raise ValueError(f"Unsupported correction method: {correction_method}.")


def add_nim_input_columns_from_tuple_or_dict(df, nims: NIM_TYPE, mde_column: str) -> DataFrame:
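    """Normalise nims given as a (value, direction) tuple or as a dict keyed by
    group into the default NIM and preferred-direction columns."""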
    if type(nims) is tuple:
        return df.assign(**{NIM_COLUMN_DEFAULT: nims[0]}).assign(**{PREFERRED_DIRECTION_COLUMN_DEFAULT: nims[1]})
    elif type(nims) is dict:
        nim_values = {key: value[0] for key, value in nims.items()}
        nim_preferences = {key: value[1] for key, value in nims.items()}
        return df.assign(**{NIM_COLUMN_DEFAULT: lambda df: df.index.to_series().map(nim_values)}).assign(
            **{PREFERRED_DIRECTION_COLUMN_DEFAULT: lambda df: df.index.to_series().map(nim_preferences)}
        )
    elif nims is None:
        return df.assign(**{NIM_COLUMN_DEFAULT: None}).assign(
            **{
                PREFERRED_DIRECTION_COLUMN_DEFAULT: None
                if PREFERRED_DIRECTION_COLUMN_DEFAULT not in df or mde_column is None
                else df[PREFERRED_DIRECTION_COLUMN_DEFAULT]
            }
        )
    else:
        return df


def add_nims_and_mdes(df: DataFrame, mde_column: str, nim_column: str, preferred_direction_column: str) -> DataFrame:
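    """Attach NIM, preference, null- and alternative-hypothesis columns, with
    margins signed according to each metric's preferred direction."""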
    def _set_nims_and_mdes(grp: DataFrame) -> DataFrame:
        nim = grp[nim_column].astype(float)
        input_preference = grp[preferred_direction_column].values[0]
        mde = None if mde_column is None else grp[mde_column]

        nim_is_na = nim.isna().all()
        mde_is_na = True if mde is None else mde.isna().all()
        if input_preference is None or (type(input_preference) is float and isnan(input_preference)):
            signed_nim = 0.0 if nim_is_na else nim * grp[POINT_ESTIMATE]
            preference = TWO_SIDED
            signed_mde = None if mde_is_na else mde * grp[POINT_ESTIMATE]
        elif input_preference.lower() == INCREASE_PREFFERED:
            signed_nim = 0.0 if nim_is_na else -nim * grp[POINT_ESTIMATE]
            preference = "larger"
            signed_mde = None if mde_is_na else mde * grp[POINT_ESTIMATE]
        elif input_preference.lower() == DECREASE_PREFFERED:
            signed_nim = 0.0 if nim_is_na else nim * grp[POINT_ESTIMATE]
            preference = "smaller"
            signed_mde = None if mde_is_na else -mde * grp[POINT_ESTIMATE]
        else:
            raise ValueError(f"{input_preference.lower()} not in {[INCREASE_PREFFERED, DECREASE_PREFFERED]}")

        return (
            grp.assign(**{NIM: nim})
            .assign(**{PREFERENCE: preference})
            .assign(**{NULL_HYPOTHESIS: signed_nim})
            .assign(**{ALTERNATIVE_HYPOTHESIS: signed_mde if nim_is_na else 0.0})
        )

    index_names = [name for name in df.index.names if name is not None]
    return (
        df.groupby(
            [nim_column, preferred_direction_column] + listify(mde_column), dropna=False, as_index=False, sort=False
        )
        .apply(_set_nims_and_mdes)
        .pipe(lambda df: df.reset_index(index_names))
        .reset_index(drop=True)
        .pipe(lambda df: df if index_names == [] else df.set_index(index_names))
    )


def _compute_comparisons(df: DataFrame, arg_dict: Dict) -> DataFrame:
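    """Compute difference, standard error, p-value, confidence intervals,
    powered effect and required sample size for one group of comparisons."""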
    return (
        df.assign(**{DIFFERENCE: lambda df: df[POINT_ESTIMATE + SFX2] - df[POINT_ESTIMATE + SFX1]})
        .assign(**{STD_ERR: confidence_computers[df[arg_dict[METHOD]].values[0]].std_err(df, arg_dict)})
        .pipe(_add_p_value_and_ci, arg_dict=arg_dict)
        .pipe(_powered_effect_and_required_sample_size_from_difference_df, arg_dict=arg_dict)
        .pipe(_adjust_if_absolute, absolute=arg_dict[ABSOLUTE])
        .assign(**{PREFERENCE: lambda df: df[PREFERENCE].map(PREFERENCE_DICT)})
        .pipe(_add_variance_reduction_rate, arg_dict=arg_dict)
    )


def _add_variance_reduction_rate(df: DataFrame, arg_dict: Dict) -> DataFrame:
    denominator = arg_dict[DENOMINATOR]
    method_column = arg_dict[METHOD]
    if (df[method_column] == ZTESTLINREG).any():
        variance_no_reduction = (
            df[ORIGINAL_VARIANCE + SFX1] / df[denominator + SFX1]
            + df[ORIGINAL_VARIANCE + SFX2] / df[denominator + SFX2]
        )
        variance_w_reduction = (
            df[VARIANCE + SFX1] / df[denominator + SFX1] + df[VARIANCE + SFX2] / df[denominator + SFX2]
        )
        df = df.assign(**{VARIANCE_REDUCTION: 1 - np.divide(variance_w_reduction, variance_no_reduction)})
    return df


def _add_p_value_and_ci(df: DataFrame, arg_dict: Dict) -> DataFrame:
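    """Add p-values, multiple-comparison adjusted p-values and alphas,
    significance flags and both unadjusted and adjusted confidence intervals."""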
    def _add_adjusted_p_and_is_significant(df: DataFrame, arg_dict: Dict) -> DataFrame:
        n_comparisons = arg_dict[NUMBER_OF_COMPARISONS]
        if arg_dict[FINAL_EXPECTED_SAMPLE_SIZE] is not None:
            if arg_dict[CORRECTION_METHOD] not in [
                BONFERRONI,
                BONFERRONI_ONLY_COUNT_TWOSIDED,
                BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY,
                SPOT_1,
            ]:
                raise ValueError(
                    f"{arg_dict[CORRECTION_METHOD]} not supported for sequential tests. Use one of "
                    f"{BONFERRONI}, {BONFERRONI_ONLY_COUNT_TWOSIDED}, "
                    f"{BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY}, {SPOT_1}"
                )
            adjusted_alpha = _compute_sequential_adjusted_alpha(df, arg_dict[METHOD], arg_dict)
            df = df.merge(adjusted_alpha, left_index=True, right_index=True)
            df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA]
            df[P_VALUE] = None
            df[ADJUSTED_P] = None
        elif arg_dict[CORRECTION_METHOD] in [
            HOLM,
            HOMMEL,
            SIMES_HOCHBERG,
            SIDAK,
            HOLM_SIDAK,
            FDR_BH,
            FDR_BY,
            FDR_TSBH,
            FDR_TSBKY,
            SPOT_1_HOLM,
            SPOT_1_HOMMEL,
            SPOT_1_SIMES_HOCHBERG,
            SPOT_1_SIDAK,
            SPOT_1_HOLM_SIDAK,
            SPOT_1_FDR_BH,
            SPOT_1_FDR_BY,
            SPOT_1_FDR_TSBH,
            SPOT_1_FDR_TSBKY,
        ]:
            if arg_dict[CORRECTION_METHOD].startswith("spot-1-"):
                # Strip the "spot-1-" prefix to get the statsmodels method name.
                correction_method = arg_dict[CORRECTION_METHOD][7:]
            else:
                correction_method = arg_dict[CORRECTION_METHOD]
            df[ADJUSTED_ALPHA] = df[ALPHA] / n_comparisons
            is_significant, adjusted_p, _, _ = multipletests(
                pvals=df[P_VALUE], alpha=1 - arg_dict[INTERVAL_SIZE], method=correction_method
            )
            df[ADJUSTED_P] = adjusted_p
            df[IS_SIGNIFICANT] = is_significant
        elif arg_dict[CORRECTION_METHOD] in [
            BONFERRONI,
            BONFERRONI_ONLY_COUNT_TWOSIDED,
            BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY,
            SPOT_1,
        ]:
            df[ADJUSTED_ALPHA] = df[ALPHA] / n_comparisons
            df[ADJUSTED_P] = df[P_VALUE].map(lambda p: min(p * n_comparisons, 1))
            df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA]
        else:
            raise ValueError(f"Unsupported correction method: {arg_dict[CORRECTION_METHOD]}.")

        return df

    def _compute_sequential_adjusted_alpha(df: DataFrame, method_column: str, arg_dict: Dict) -> Series:
        if all(df[method_column] == ZTEST):
            return confidence_computers[ZTEST].compute_sequential_adjusted_alpha(df, arg_dict)
        else:
            raise NotImplementedError("Sequential testing is only supported for z-tests")

    def _add_ci(df: DataFrame, arg_dict: Dict) -> DataFrame:
        lower, upper = confidence_computers[df[arg_dict[METHOD]].values[0]].ci(df, ALPHA, arg_dict)

        if (
            arg_dict[CORRECTION_METHOD]
            in [
                HOLM,
                HOMMEL,
                SIMES_HOCHBERG,
                SPOT_1_HOLM,
                SPOT_1_HOMMEL,
                SPOT_1_SIMES_HOCHBERG,
            ]
            and all(df[PREFERENCE_TEST] != TWO_SIDED)
        ):
            if all(df[arg_dict[METHOD]] == ZTEST):
                adjusted_lower, adjusted_upper = confidence_computers[ZTEST].ci_for_multiple_comparison_methods(
                    df, arg_dict[CORRECTION_METHOD], alpha=1 - arg_dict[INTERVAL_SIZE]
                )
            else:
                raise NotImplementedError(f"{arg_dict[CORRECTION_METHOD]} is only supported for ZTests")
        elif arg_dict[CORRECTION_METHOD] in [
            BONFERRONI,
            BONFERRONI_ONLY_COUNT_TWOSIDED,
            BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY,
            SPOT_1,
            SPOT_1_HOLM,
            SPOT_1_HOMMEL,
            SPOT_1_SIMES_HOCHBERG,
            SPOT_1_SIDAK,
            SPOT_1_HOLM_SIDAK,
            SPOT_1_FDR_BH,
            SPOT_1_FDR_BY,
            SPOT_1_FDR_TSBH,
            SPOT_1_FDR_TSBKY,
        ]:
            adjusted_lower, adjusted_upper = confidence_computers[df[arg_dict[METHOD]].values[0]].ci(
                df, ADJUSTED_ALPHA, arg_dict
            )
        else:
            warn(f"Confidence intervals not supported for {arg_dict[CORRECTION_METHOD]}")
            adjusted_lower = None
            adjusted_upper = None

        return (
            df.assign(**{CI_LOWER: lower})
            .assign(**{CI_UPPER: upper})
            .assign(**{ADJUSTED_LOWER: adjusted_lower})
            .assign(**{ADJUSTED_UPPER: adjusted_upper})
        )

    return (
        df.pipe(set_alpha_and_adjust_preference, arg_dict=arg_dict)
        .assign(**{P_VALUE: lambda df: df.pipe(_p_value, arg_dict=arg_dict)})
        .pipe(_add_adjusted_p_and_is_significant, arg_dict=arg_dict)
        .pipe(_add_ci, arg_dict=arg_dict)
    )


def set_alpha_and_adjust_preference(df: DataFrame, arg_dict: Dict) -> DataFrame:
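    """Set the per-row alpha (doubled for one-sided tests under SPOT-1) and the
    alpha used for power and sample-size calculations."""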
    alpha_0 = 1 - arg_dict[INTERVAL_SIZE]
    return df.assign(
        **{
            ALPHA: df.apply(
                lambda row: 2 * alpha_0
                if arg_dict[CORRECTION_METHOD] == SPOT_1 and row[PREFERENCE] != TWO_SIDED
                else alpha_0,
                axis=1,
            )
        }
    ).assign(**{ADJUSTED_ALPHA_POWER_SAMPLE_SIZE: lambda df: df[ALPHA] / arg_dict[NUMBER_OF_COMPARISONS]})


def _adjust_if_absolute(df: DataFrame, absolute: bool) -> DataFrame:
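    """Keep absolute differences as-is; otherwise express differences,
    intervals, null hypothesis and powered effect relative to the control
    point estimate."""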
    if absolute:
        return df.assign(absolute_difference=absolute)
    else:
        return (
            df.assign(absolute_difference=absolute)
            .assign(**{DIFFERENCE: df[DIFFERENCE] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{CI_LOWER: df[CI_LOWER] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{CI_UPPER: df[CI_UPPER] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{ADJUSTED_LOWER: df[ADJUSTED_LOWER] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{ADJUSTED_UPPER: df[ADJUSTED_UPPER] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{NULL_HYPOTHESIS: df[NULL_HYPOTHESIS] / df[POINT_ESTIMATE + SFX1]})
            .assign(**{POWERED_EFFECT: df[POWERED_EFFECT] / df[POINT_ESTIMATE + SFX1]})
        )


def _p_value(df: DataFrame, arg_dict: Dict) -> float:
    if df[arg_dict[METHOD]].values[0] == CHI2 and (df[NIM].notna()).any():
        raise ValueError("Non-inferiority margins not supported in ChiSquared. Use StudentsTTest or ZTest instead.")
    return confidence_computers[df[arg_dict[METHOD]].values[0]].p_value(df, arg_dict)


def _powered_effect_and_required_sample_size_from_difference_df(df: DataFrame, arg_dict: Dict) -> DataFrame:
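    """Add powered-effect and required-sample-size columns to a difference
    data frame; only the z-test based methods support these calculations."""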
    if df[arg_dict[METHOD]].values[0] not in [ZTEST, ZTESTLINREG] and arg_dict[MDE] in df:
        raise ValueError("Minimum detectable effects are only supported for ZTest and ZTestLinreg.")
    elif df[arg_dict[METHOD]].values[0] not in [ZTEST, ZTESTLINREG] or (df[ADJUSTED_POWER].isna()).any():
        df[POWERED_EFFECT] = None
        df[REQUIRED_SAMPLE_SIZE] = None
        df[REQUIRED_SAMPLE_SIZE_METRIC] = None
        return df
    else:
        n1, n2 = df[arg_dict[DENOMINATOR] + SFX1], df[arg_dict[DENOMINATOR] + SFX2]
        kappa = n1 / n2
        binary = (df[arg_dict[NUMERATOR_SUM_OF_SQUARES] + SFX1] == df[arg_dict[NUMERATOR] + SFX1]).all()
        proportion_of_total = (n1 + n2) / df[f"current_total_{arg_dict[DENOMINATOR]}"]

        z_alpha = st.norm.ppf(
            1
            - df[ADJUSTED_ALPHA_POWER_SAMPLE_SIZE].values[0] / (2 if df[PREFERENCE_TEST].values[0] == TWO_SIDED else 1)
        )
        z_power = st.norm.ppf(df[ADJUSTED_POWER].values[0])

        # is_non_inferiority handles both None and NaN margins.
        non_inferiority = is_non_inferiority(df[NIM].values[0])

        df[POWERED_EFFECT] = confidence_computers[df[arg_dict[METHOD]].values[0]].powered_effect(
            df=df.assign(kappa=kappa)
            .assign(current_number_of_units=df[f"current_total_{arg_dict[DENOMINATOR]}"])
            .assign(proportion_of_total=proportion_of_total),
            z_alpha=z_alpha,
            z_power=z_power,
            binary=binary,
            non_inferiority=non_inferiority,
            avg_column=POINT_ESTIMATE + SFX1,
            var_column=VARIANCE + SFX1,
        )

        if ALTERNATIVE_HYPOTHESIS in df and NULL_HYPOTHESIS in df and (df[ALTERNATIVE_HYPOTHESIS].notna()).all():
            df[REQUIRED_SAMPLE_SIZE] = confidence_computers[df[arg_dict[METHOD]].values[0]].required_sample_size(
                proportion_of_total=1,
                z_alpha=z_alpha,
                z_power=z_power,
                binary=binary,
                non_inferiority=non_inferiority,
                hypothetical_effect=df[ALTERNATIVE_HYPOTHESIS] - df[NULL_HYPOTHESIS],
                control_avg=df[POINT_ESTIMATE + SFX1],
                control_var=df[VARIANCE + SFX1],
                kappa=kappa,
            )
            df[REQUIRED_SAMPLE_SIZE_METRIC] = confidence_computers[
                df[arg_dict[METHOD]].values[0]
            ].required_sample_size(
                proportion_of_total=proportion_of_total,
                z_alpha=z_alpha,
                z_power=z_power,
                binary=binary,
                non_inferiority=non_inferiority,
                hypothetical_effect=df[ALTERNATIVE_HYPOTHESIS] - df[NULL_HYPOTHESIS],
                control_avg=df[POINT_ESTIMATE + SFX1],
                control_var=df[VARIANCE + SFX1],
                kappa=kappa,
            )
        else:
            df[REQUIRED_SAMPLE_SIZE] = None
            df[REQUIRED_SAMPLE_SIZE_METRIC] = None

        return df


def _compute_sample_sizes_and_ci_widths(df: DataFrame, arg_dict: Dict) -> DataFrame:
    return df.pipe(_sample_size_from_summary_df, arg_dict=arg_dict).pipe(_ci_width, arg_dict=arg_dict)


def _sample_size_from_summary_df(df: DataFrame, arg_dict: Dict) -> DataFrame:
    if df[arg_dict[METHOD]].values[0] != ZTEST or (df[ADJUSTED_POWER].isna()).any():
        # Sample size calculation is only supported for ZTest.
        df[REQUIRED_SAMPLE_SIZE_METRIC] = None
    else:
        all_weights = arg_dict[TREATMENT_WEIGHTS]
        control_weight, treatment_weights = all_weights[0], all_weights[1:]

        binary = df[arg_dict[IS_BINARY]].values[0]
        z_alpha = st.norm.ppf(
            1
            - df[ADJUSTED_ALPHA_POWER_SAMPLE_SIZE].values[0] / (2 if df[PREFERENCE_TEST].values[0] == TWO_SIDED else 1)
        )
        z_power = st.norm.ppf(df[ADJUSTED_POWER].values[0])
        non_inferiority = is_non_inferiority(df[NIM].values[0])

        max_sample_size = 0
        for treatment_weight in treatment_weights:
            kappa = control_weight / treatment_weight
            proportion_of_total = (control_weight + treatment_weight) / sum(all_weights)

            if ALTERNATIVE_HYPOTHESIS in df and NULL_HYPOTHESIS in df and (df[ALTERNATIVE_HYPOTHESIS].notna()).all():
                this_sample_size = confidence_computers[df[arg_dict[METHOD]].values[0]].required_sample_size(
                    proportion_of_total=proportion_of_total,
                    z_alpha=z_alpha,
                    z_power=z_power,
                    binary=binary,
                    non_inferiority=non_inferiority,
                    hypothetical_effect=df[ALTERNATIVE_HYPOTHESIS] - df[NULL_HYPOTHESIS],
                    control_avg=df[POINT_ESTIMATE],
                    control_var=df[VARIANCE],
                    kappa=kappa,
                )
                max_sample_size = max(this_sample_size.max(), max_sample_size)

        df[REQUIRED_SAMPLE_SIZE_METRIC] = None if max_sample_size == 0 else max_sample_size

    return df


def _compute_powered_effects(df: DataFrame, arg_dict: Dict) -> DataFrame:
    return df.pipe(_powered_effect_from_summary_df, arg_dict=arg_dict)


def _powered_effect_from_summary_df(df: DataFrame, arg_dict: Dict) -> DataFrame:
    if df[arg_dict[METHOD]].values[0] != ZTEST or (df[ADJUSTED_POWER].isna()).any():
        # Powered effect calculation is only supported for ZTest.
        df[POWERED_EFFECT] = None
    else:
        all_weights = arg_dict[TREATMENT_WEIGHTS]
        control_weight, treatment_weights = all_weights[0], all_weights[1:]

        current_number_of_units = arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]

        binary = df[arg_dict[IS_BINARY]].values[0]
        z_alpha = st.norm.ppf(
            1
            - df[ADJUSTED_ALPHA_POWER_SAMPLE_SIZE].values[0] / (2 if df[PREFERENCE_TEST].values[0] == TWO_SIDED else 1)
        )
        z_power = st.norm.ppf(df[ADJUSTED_POWER].values[0])
        non_inferiority = is_non_inferiority(df[NIM].values[0])

        max_powered_effect = 0
        for treatment_weight in treatment_weights:
            kappa = control_weight / treatment_weight
            proportion_of_total = (control_weight + treatment_weight) / sum(all_weights)

            this_powered_effect = confidence_computers[df[arg_dict[METHOD]].values[0]].powered_effect(
                df=df.assign(kappa=kappa)
                .assign(current_number_of_units=current_number_of_units)
                .assign(proportion_of_total=proportion_of_total),
                z_alpha=z_alpha,
                z_power=z_power,
                binary=binary,
                non_inferiority=non_inferiority,
                avg_column=POINT_ESTIMATE,
                var_column=VARIANCE,
            )

            max_powered_effect = max(this_powered_effect.max(), max_powered_effect)

        df[POWERED_EFFECT] = None if max_powered_effect == 0 else max_powered_effect

    return df


def _ci_width(df: DataFrame, arg_dict: Dict) -> DataFrame:
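    """Compute the widest CI across treatment arms at the expected sample size,
    or infinity if any arm would be allocated zero units."""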
    expected_sample_size = (
        None if arg_dict[FINAL_EXPECTED_SAMPLE_SIZE] is None else df[arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]].values[0]
    )
    if expected_sample_size is None or np.isnan(expected_sample_size):
        return df.assign(**{CI_WIDTH: None})

    all_weights = arg_dict[TREATMENT_WEIGHTS]
    control_weight, treatment_weights = all_weights[0], all_weights[1:]
    sum_of_weights = sum(all_weights)

    control_count = int((control_weight / sum_of_weights) * expected_sample_size)
    if control_count == 0:
        return df.assign(**{CI_WIDTH: float("inf")})

    else:
        binary = df[arg_dict[IS_BINARY]].values[0]
        z_alpha = st.norm.ppf(
            1
            - df[ADJUSTED_ALPHA_POWER_SAMPLE_SIZE].values[0] / (2 if df[PREFERENCE_TEST].values[0] == TWO_SIDED else 1)
        )

        non_inferiority = is_non_inferiority(df[NIM].values[0])
        max_ci_width = 0
        for treatment_weight in treatment_weights:
            treatment_count = int((treatment_weight / sum_of_weights) * expected_sample_size)
            if treatment_count == 0:
                return df.assign(**{CI_WIDTH: float("inf")})
            else:
                comparison_ci_width = confidence_computers[df[arg_dict[METHOD]].values[0]].ci_width(
                    z_alpha=z_alpha,
                    binary=binary,
                    non_inferiority=non_inferiority,
                    hypothetical_effect=df[ALTERNATIVE_HYPOTHESIS] - df[NULL_HYPOTHESIS],
                    control_avg=df[POINT_ESTIMATE],
                    control_var=df[VARIANCE],
                    control_count=control_count,
                    treatment_count=treatment_count,
                )

                max_ci_width = max(comparison_ci_width.max(), max_ci_width)

        df[CI_WIDTH] = None if max_ci_width == 0 else max_ci_width

    return df


def _optimal_kappa(row: Series, is_binary_column) -> float:
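    """Optimal control/treatment allocation ratio (kappa) for one row: for
    binary superiority metrics this is the square root of the ratio between the
    control variance and the variance under the hypothetical treatment effect,
    otherwise 1."""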
    def _binary_variance(p: float) -> float:
        return p * (1 - p)

    if row[is_binary_column]:
        if is_non_inferiority(row[NIM]):
            return 1.0
        else:
            if row[POINT_ESTIMATE] == 0.0:
                # variance will be 0 as well in this case. This if-branch is
                # important to avoid divide-by-zero problems.
                return 1.0
            else:
                hypothetical_effect = row[ALTERNATIVE_HYPOTHESIS] - row[NULL_HYPOTHESIS]
                return np.sqrt(
                    _binary_variance(row[POINT_ESTIMATE]) / _binary_variance(row[POINT_ESTIMATE] + hypothetical_effect)
                )
    else:
        return 1.0


def _optimal_weights(kappa: float, number_of_groups) -> Iterable:
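    """Expand a control/treatment ratio kappa into normalised group weights:
    the control group gets kappa times the weight of each treatment group."""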
    treatment_weight = 1 / (kappa + number_of_groups - 1)
    control_weight = kappa * treatment_weight
    return [control_weight] + [treatment_weight for _ in range(number_of_groups - 1)]


def _find_optimal_group_weights_across_rows(
    df: DataFrame, group_count: int, group_columns: Iterable, arg_dict: Dict
) -> Tuple[List[float], int]:
    min_kappa = min(df[OPTIMAL_KAPPA])
    max_kappa = max(df[OPTIMAL_KAPPA])

    if min_kappa == max_kappa:
        optimal_weights = df[OPTIMAL_WEIGHTS][0]
        optimal_sample_size = _calculate_optimal_sample_size_given_weights(
            df, optimal_weights, group_columns, arg_dict
        )
        return optimal_weights, optimal_sample_size

    in_between_kappas = np.linspace(min_kappa, max_kappa, 100)
    min_optimal_sample_size = float("inf")
    optimal_weights = []
    for kappa in in_between_kappas:
        weights = _optimal_weights(kappa, group_count)
        optimal_sample_size = _calculate_optimal_sample_size_given_weights(df, weights, group_columns, arg_dict)
        if optimal_sample_size is not None and optimal_sample_size < min_optimal_sample_size:
            min_optimal_sample_size = optimal_sample_size
            optimal_weights = weights
    min_optimal_sample_size = np.nan if min_optimal_sample_size == 0 else min_optimal_sample_size
    return optimal_weights, min_optimal_sample_size


def _calculate_optimal_sample_size_given_weights(
    df: DataFrame, optimal_weights: List[float], group_columns: Iterable, arg_dict: Dict
) -> int:
    arg_dict[TREATMENT_WEIGHTS] = optimal_weights
    sample_size_df = groupbyApplyParallel(
        df.groupby(group_columns, as_index=False, sort=False),
        lambda df: _sample_size_from_summary_df(df, arg_dict=arg_dict),
    )

    if sample_size_df[REQUIRED_SAMPLE_SIZE_METRIC].isna().all():
        return None
    optimal_sample_size = sample_size_df[REQUIRED_SAMPLE_SIZE_METRIC].max()

    return np.ceil(optimal_sample_size) if np.isfinite(optimal_sample_size) else optimal_sample_size