1from typing import Tuple, Dict
2
3import numpy as np
4from pandas import DataFrame, Series
5from statsmodels.stats.proportion import proportion_confint, proportions_chisquare, confint_proportions_2indep
6
7from spotify_confidence.analysis.confidence_utils import power_calculation
8from spotify_confidence.analysis.constants import (
9 NUMERATOR,
10 DENOMINATOR,
11 INTERVAL_SIZE,
12 POINT_ESTIMATE,
13 VARIANCE,
14 CI_LOWER,
15 CI_UPPER,
16 SFX1,
17 SFX2,
18)
19
20
def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute the per-row proportion ``numerator / denominator``.

    Args:
        df: Frame holding the numerator and denominator columns.
        arg_dict: Lookup whose ``NUMERATOR`` and ``DENOMINATOR`` entries name
            those columns in ``df``.

    Returns:
        The element-wise ratio of the numerator column to the denominator
        column (a Series, one value per row of ``df``).

    Raises:
        ValueError: If any value in the denominator column equals zero.
    """
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    # Fail loudly instead of letting a zero denominator yield inf/NaN ratios.
    if (df[denominator] == 0).any():
        raise ValueError("""Can't compute point estimate: denominator is 0""")
    return df[numerator] / df[denominator]
27
28
def variance(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Return ``p * (1 - p)`` for each row's point estimate ``p``.

    Args:
        df: Frame containing the ``POINT_ESTIMATE`` column.
        arg_dict: Not read by this function; accepted so the signature matches
            the other helpers in this module.

    Returns:
        A Series with ``p * (1 - p)`` per row.

    Raises:
        ValueError: If any computed value is negative (i.e. a point estimate
            lies outside the closed interval [0, 1]).
    """
    p = df[POINT_ESTIMATE]
    bernoulli_var = p * (1 - p)
    any_negative = (bernoulli_var < 0).any()
    if any_negative:
        raise ValueError(f"Computed variance is negative: {bernoulli_var}. " "Please check your inputs.")
    return bernoulli_var
34
35
def std_err(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute ``sqrt(var1 / n1 + var2 / n2)`` for each row of ``df``.

    Args:
        df: Frame with the ``VARIANCE`` column and the denominator column, each
            present once per group suffix (``SFX1`` and ``SFX2``).
        arg_dict: Lookup whose ``DENOMINATOR`` entry names the (unsuffixed)
            denominator column.

    Returns:
        The square root of the sum of the two groups' variance-over-denominator
        terms, one value per row.
    """
    nobs_col = arg_dict[DENOMINATOR]
    first_term = df[VARIANCE + SFX1] / df[nobs_col + SFX1]
    second_term = df[VARIANCE + SFX2] / df[nobs_col + SFX2]
    return np.sqrt(first_term + second_term)
39
40
def add_point_estimate_ci(df: DataFrame, arg_dict: Dict[str, str]) -> DataFrame:
    """Add confidence-interval bounds for each row's proportion to ``df``.

    The bounds come from statsmodels' ``proportion_confint``; no ``method``
    argument is passed, so that function's default method applies.

    Args:
        df: Frame holding the numerator and denominator columns. It is
            modified in place: the ``CI_LOWER`` and ``CI_UPPER`` columns are
            written (or overwritten).
        arg_dict: Lookup whose ``NUMERATOR`` and ``DENOMINATOR`` entries name
            the count columns and whose ``INTERVAL_SIZE`` entry is the
            interval's coverage. That entry is used arithmetically
            (``1 - interval_size``), so it must be numeric even though the
            annotation declares string values.

    Returns:
        The same ``df`` object, now carrying the two bound columns.
    """
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    interval_size = arg_dict[INTERVAL_SIZE]
    # proportion_confint takes the significance level, not the coverage.
    df[CI_LOWER], df[CI_UPPER] = proportion_confint(
        count=df[numerator],
        nobs=df[denominator],
        alpha=1 - interval_size,
    )
    return df
51
52
def p_value(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Run statsmodels' ``proportions_chisquare`` on every row of ``df``.

    Each row supplies two groups: counts are read from the numerator columns
    and observation totals from the denominator columns, each suffixed with
    ``SFX1`` and ``SFX2``.

    Args:
        df: Frame with the suffixed numerator and denominator columns.
        arg_dict: Lookup whose ``NUMERATOR`` and ``DENOMINATOR`` entries name
            the (unsuffixed) columns.

    Returns:
        The result of applying the test row by row: the second element of
        ``proportions_chisquare``'s return value (the p-value) for each row.
    """
    num_key, den_key = arg_dict[NUMERATOR], arg_dict[DENOMINATOR]
    count_cols = [num_key + SFX1, num_key + SFX2]
    nobs_cols = [den_key + SFX1, den_key + SFX2]

    def _row_test(row):
        # Only the p-value is kept; the statistic and tables are discarded.
        _statistic, pval, _tables = proportions_chisquare(
            count=[row[col] for col in count_cols],
            nobs=[row[col] for col in nobs_cols],
        )
        return pval

    return df.apply(_row_test, axis=1)
65
66
def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    """Confidence interval for the difference between the two groups' proportions.

    Delegates to statsmodels' ``confint_proportions_2indep`` with
    ``compare="diff"`` and ``method="wald"``.

    Args:
        df: Frame with the suffixed numerator/denominator columns and the
            per-row significance level.
        alpha_column: Name of the column in ``df`` holding the significance
            level passed as ``alpha``.
        arg_dict: Lookup whose ``NUMERATOR`` and ``DENOMINATOR`` entries name
            the (unsuffixed) columns.

    Returns:
        Whatever ``confint_proportions_2indep`` returns for these inputs
        (annotated as a pair of Series: lower and upper bounds).
    """
    num_key, den_key = arg_dict[NUMERATOR], arg_dict[DENOMINATOR]

    # The SFX2 group is deliberately passed as sample 1 and the SFX1 group as
    # sample 2, so the difference is taken as (group 2) - (group 1).
    successes_2, trials_2 = df[num_key + SFX2], df[den_key + SFX2]
    successes_1, trials_1 = df[num_key + SFX1], df[den_key + SFX1]
    significance = df[alpha_column]

    return confint_proportions_2indep(
        count1=successes_2,
        nobs1=trials_2,
        count2=successes_1,
        nobs2=trials_1,
        alpha=significance,
        compare="diff",
        method="wald",
    )
79
80
def achieved_power(df: DataFrame, mde: float, alpha: float, arg_dict: Dict[str, str]) -> DataFrame:
    """Evaluate ``power_calculation`` using the pooled-proportion variance.

    The pooled proportion is the sum of both groups' numerators divided by the
    sum of both groups' denominators; the variance handed on is
    ``pooled * (1 - pooled)``.

    Args:
        df: Frame with the suffixed numerator and denominator columns.
        mde: Forwarded unchanged as the first argument of ``power_calculation``.
        alpha: Forwarded unchanged as the third argument of
            ``power_calculation``.
        arg_dict: Lookup whose ``NUMERATOR`` and ``DENOMINATOR`` entries name
            the (unsuffixed) columns.

    Returns:
        The value produced by ``power_calculation``.
        NOTE(review): annotated as DataFrame; confirm against that helper.
    """
    num_key, den_key = arg_dict[NUMERATOR], arg_dict[DENOMINATOR]

    total_successes = df[num_key + SFX1] + df[num_key + SFX2]
    nobs_1, nobs_2 = df[den_key + SFX1], df[den_key + SFX2]

    pooled = total_successes / (nobs_1 + nobs_2)
    pooled_variance = pooled * (1 - pooled)

    return power_calculation(mde, pooled_variance, alpha, nobs_1, nobs_2)