1from typing import Tuple, Union, Dict, Iterable
2
3import numpy as np
4from pandas import DataFrame, Series
5from scipy import optimize
6from scipy import stats as st
7
8from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic
9
10from spotify_confidence.analysis.confidence_utils import power_calculation
11from spotify_confidence.analysis.constants import (
12 NUMERATOR,
13 NUMERATOR_SUM_OF_SQUARES,
14 DENOMINATOR,
15 INTERVAL_SIZE,
16 FINAL_EXPECTED_SAMPLE_SIZE,
17 ORDINAL_GROUP_COLUMN,
18 POINT_ESTIMATE,
19 CI_LOWER,
20 CI_UPPER,
21 ADJUSTED_LOWER,
22 ADJUSTED_UPPER,
23 VARIANCE,
24 NUMBER_OF_COMPARISONS,
25 TWO_SIDED,
26 SFX2,
27 SFX1,
28 STD_ERR,
29 PREFERENCE_TEST,
30 NULL_HYPOTHESIS,
31 DIFFERENCE,
32 ALPHA,
33 IS_SIGNIFICANT,
34 HOLM,
35 SPOT_1_HOLM,
36 HOMMEL,
37 SIMES_HOCHBERG,
38 SPOT_1_HOMMEL,
39 SPOT_1_SIMES_HOCHBERG,
40 NIM,
41 ADJUSTED_ALPHA,
42 NUMBER_OF_COMPARISONS_VALIDATION,
43 ADJUSTED_ALPHA_VALIDATION,
44 PREFERRED_DIRECTION_COLUMN_DEFAULT,
45 INCREASE_PREFFERED,
46 DECREASE_PREFFERED,
47 PREFERENCE_DICT,
48)
49from spotify_confidence.analysis.frequentist.sequential_bound_solver import bounds
50
51
def sequential_bounds(t: np.ndarray, alpha: float, sides: int, state: DataFrame = None):
    """Solve group-sequential z-bounds for the given information fractions.

    Thin wrapper around ``sequential_bound_solver.bounds`` with this module's
    fixed solver settings (rho=2, ztrun=8, max_nints=1000).

    Args:
        t: Information fractions (observed / final expected sample size) per step.
        alpha: Total significance level to spend across the steps.
        sides: 1 for a one-sided test, 2 for two-sided.
        state: Optional solver state carried over from a previous call
            (semantics defined by the solver — see sequential_bound_solver).

    Returns:
        The solver's result object (exposes a ``df`` attribute with a "zb"
        z-bound column, as used by compute_sequential_adjusted_alpha below).
    """
    return bounds(t, alpha, rho=2, ztrun=8, sides=sides, max_nints=1000, state=state)
54
55
def sample_ratio_test(df: DataFrame, arg_dict: dict) -> float:
    """Chi-squared goodness-of-fit test for sample ratio mismatch (SRM).

    Compares each group's observed share of the total denominator count
    against its expected proportion (stored in the numerator column).

    Args:
        df: One row per group. The numerator column holds the group's
            expected proportion and the denominator column its observed count.
        arg_dict: Maps the NUMERATOR / DENOMINATOR constants to the
            corresponding column names in ``df``.

    Returns:
        The p-value of the chi-squared test; small values indicate that the
        observed proportions deviate from the expected ones.

    Note:
        The previous annotation promised ``Tuple[float, DataFrame]`` but the
        function has always returned only the p-value; the annotation is
        corrected to ``float`` (return value unchanged, so callers are safe).
    """
    n_tot = df[arg_dict[DENOMINATOR]].sum()
    expected_proportions = df[arg_dict[NUMERATOR]]
    observed_proportions = df[arg_dict[DENOMINATOR]] / n_tot
    sq_diff = np.power(observed_proportions - expected_proportions, 2)

    # Pearson's chi-squared statistic: n_tot * sum((obs - exp)^2 / exp).
    chi2_stat = n_tot * sq_diff.divide(expected_proportions).sum()
    deg_freedom = df.shape[0] - 1
    # Survival function is numerically more accurate than 1 - cdf in the tail.
    p_value = st.chi2.sf(chi2_stat, deg_freedom)

    return p_value
67
68
def p_value(df: DataFrame, arg_dict: Dict[str, str], validation: bool = True) -> Series:
    """Return NaN: no per-row p-value is defined for the sample-ratio computer.

    The signature mirrors the other confidence computers so this module is
    interchangeable with them; the arguments are intentionally unused.
    """
    return np.nan
71
72
def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    """Return a (NaN, NaN) pair: confidence intervals are undefined here.

    Kept for signature compatibility with the other confidence computers;
    all arguments are intentionally unused.
    """
    undefined = float("nan")
    return undefined, undefined
75
76
def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    """Return NaN: the sample-ratio computer has no point estimate.

    Signature kept for interchangeability with the other computers;
    arguments are intentionally unused.
    """
    return np.nan
79
80
def variance(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    """Return NaN: no variance is defined for the sample-ratio computer.

    Signature kept for interchangeability with the other computers;
    arguments are intentionally unused.
    """
    return np.nan
83
84
def add_point_estimate_ci(df: Series, arg_dict: Dict[str, str]) -> Series:
    """Attach NaN point-estimate CI bounds to the row and return it.

    The sample-ratio computer has no meaningful point-estimate confidence
    interval, so both CI_LOWER and CI_UPPER are filled with NaN. The input
    row is mutated in place and also returned for chaining.
    """
    for bound_column in (CI_LOWER, CI_UPPER):
        df[bound_column] = np.nan
    return df
89
90
def std_err(df: Series, arg_dict: Dict[str, str]) -> float:
    """Return NaN: no standard error is defined for the sample-ratio computer.

    Signature kept for interchangeability with the other computers;
    arguments are intentionally unused.
    """
    return np.nan
93
94
def compute_sequential_adjusted_alpha(df: DataFrame, arg_dict: Dict[str, str], validation: bool) -> Series:
    """Compute sequentially adjusted alpha levels for a group-sequential test.

    For each group (all index levels except the ordinal/time column) this
    derives the fraction of the final expected sample size observed at each
    ordinal step, solves sequential z-bounds for those fractions, and converts
    the bounds back into per-step alpha levels.

    Args:
        df: Indexed by the group columns plus the ordinal column. Must contain
            a "current_total_<denominator>" column, the final-expected-sample-
            size column, and ALPHA / PREFERENCE_TEST columns.
        arg_dict: Maps the DENOMINATOR, FINAL_EXPECTED_SAMPLE_SIZE,
            ORDINAL_GROUP_COLUMN and number-of-comparisons constants to the
            corresponding column names / values.
        validation: When True, use the validation comparison count and force
            one-sided bounds regardless of the test preference.

    Returns:
        Series of adjusted alphas, named ADJUSTED_ALPHA (or
        ADJUSTED_ALPHA_VALIDATION when ``validation`` is True).
    """
    denominator = arg_dict[DENOMINATOR]
    final_expected_sample_size_column = arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]
    ordinal_group_column = arg_dict[ORDINAL_GROUP_COLUMN]
    # Bonferroni-style correction: the alpha fed to the bound solver is divided
    # by the number of comparisons (validation runs use their own count).
    n_comparisons = arg_dict[NUMBER_OF_COMPARISONS if not validation else NUMBER_OF_COMPARISONS_VALIDATION]

    def adjusted_alphas_for_group(grp: DataFrame, validation: bool) -> Series:
        # Solve z-bounds over this group's information fractions, then turn
        # each z-bound ("zb") back into an alpha. Two-sided tests use
        # 2 * (1 - cdf(zb)); one-sided (and all validation runs) use 1 - cdf(zb).
        return (
            sequential_bounds(
                t=grp["sample_size_proportions"].values,
                alpha=grp[ALPHA].values[0] / n_comparisons,
                sides=2 if (grp[PREFERENCE_TEST] == TWO_SIDED).all() and not validation else 1,
            )
            .df.set_index(grp.index)
            .assign(
                **{
                    ADJUSTED_ALPHA: lambda df: df.apply(
                        lambda row: 2 * (1 - st.norm.cdf(row["zb"]))
                        if not validation and (grp[PREFERENCE_TEST] == TWO_SIDED).all()
                        else 1 - st.norm.cdf(row["zb"]),
                        axis=1,
                    )
                }
            )
        )[["zb", ADJUSTED_ALPHA]]

    # All index levels except the ordinal (time) column identify a group.
    groups_except_ordinal = [column for column in df.index.names if column != ordinal_group_column]
    # Per group, the denominator to normalize against: the larger of the
    # largest observed running total and the configured final expected size.
    max_sample_size_by_group = (
        (
            df[["current_total_" + denominator, final_expected_sample_size_column]]
            .groupby(groups_except_ordinal, sort=False)
            .max()
            .max(axis=1)
        )
        if len(groups_except_ordinal) > 0
        else (df[["current_total_" + denominator, final_expected_sample_size_column]].max().max())
    )
    # Information fraction at each ordinal step: observed total / max size.
    sample_size_proportions = Series(
        data=df.groupby(df.index.names, sort=False)["current_total_" + denominator].first() / max_sample_size_by_group,
        name="sample_size_proportions",
    )

    # The dummy index level guarantees a non-empty groupby key even when
    # groups_except_ordinal is empty (single-group data).
    return Series(
        data=df.groupby(df.index.names, sort=False)[[ALPHA, PREFERENCE_TEST]]
        .first()
        .merge(sample_size_proportions, left_index=True, right_index=True)
        .assign(_sequential_dummy_index_=1)
        .groupby(groups_except_ordinal + ["_sequential_dummy_index_"], sort=False)[
            ["sample_size_proportions", PREFERENCE_TEST, ALPHA]
        ]
        .apply(adjusted_alphas_for_group, validation=validation)[ADJUSTED_ALPHA],
        name=ADJUSTED_ALPHA if not validation else ADJUSTED_ALPHA_VALIDATION,
    )