1from typing import Tuple, Dict
2
3import numpy as np
4from pandas import DataFrame, Series
5from statsmodels.stats.weightstats import _tconfint_generic, _tstat_generic
6
7from spotify_confidence.analysis.confidence_utils import power_calculation
8from spotify_confidence.analysis.constants import (
9 NUMERATOR,
10 NUMERATOR_SUM_OF_SQUARES,
11 DENOMINATOR,
12 INTERVAL_SIZE,
13 POINT_ESTIMATE,
14 CI_LOWER,
15 CI_UPPER,
16 VARIANCE,
17 TWO_SIDED,
18 SFX1,
19 SFX2,
20 STD_ERR,
21 PREFERENCE_TEST,
22 NULL_HYPOTHESIS,
23 DIFFERENCE,
24)
25
26
def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute the per-row point estimate as numerator divided by denominator.

    Args:
        df: Frame containing the numerator and denominator columns.
        arg_dict: Maps NUMERATOR and DENOMINATOR to the column names in ``df``.

    Returns:
        A Series with one ratio per row of ``df`` (the element-wise quotient
        of the two columns).

    Raises:
        ValueError: If any row has a denominator equal to 0.
    """
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    # Fail loudly instead of letting pandas produce inf/NaN ratios.
    if (df[denominator] == 0).any():
        raise ValueError("Can't compute point estimate: denominator is 0")
    return df[numerator] / df[denominator]
33
34
def variance(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute the per-row variance from the sufficient statistics in ``df``.

    Two formulas are used, chosen for the whole frame at once:

    * If the sum of squares equals the sum on *every* row, the data is
      treated as binary and the variance is ``sumsq / n - point_estimate ** 2``.
    * Otherwise the sample variance ``(sumsq - sum ** 2 / n) / (n - 1)`` is used.

    Args:
        df: Frame containing the numerator, numerator-sum-of-squares and
            denominator columns. The POINT_ESTIMATE column is also read when
            the binary formula applies.
        arg_dict: Maps NUMERATOR, DENOMINATOR and NUMERATOR_SUM_OF_SQUARES to
            the column names in ``df``.

    Returns:
        A Series with one variance per row of ``df``.

    Raises:
        ValueError: If any computed variance is negative.
    """
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    numerator_sumsq = arg_dict[NUMERATOR_SUM_OF_SQUARES]
    is_binary = df[numerator_sumsq] == df[numerator]
    if is_binary.all():
        # With sumsq == sum this reduces to p * (1 - p), assuming the
        # POINT_ESTIMATE column holds numerator / denominator.
        result = df[numerator_sumsq] / df[denominator] - df[POINT_ESTIMATE] ** 2
    else:
        # Unbiased sample variance from sum and sum of squares.
        # NOTE: a denominator of 1 makes the divisor zero; that is not guarded here.
        result = (df[numerator_sumsq] - np.power(df[numerator], 2) / df[denominator]) / (df[denominator] - 1)
    if (result < 0).any():
        raise ValueError("Computed variance is negative. Please check your inputs.")
    return result
49
50
def std_err(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute the per-row standard error from two groups' variances and sizes.

    For each row this is ``sqrt(var_1 / n_1 + var_2 / n_2)``, where the
    variance and denominator columns are looked up with the SFX1 and SFX2
    suffixes.

    Args:
        df: Frame containing the suffixed variance and denominator columns.
        arg_dict: Maps DENOMINATOR to the unsuffixed denominator column name.

    Returns:
        A Series with one standard error per row of ``df``.
    """
    n_col = arg_dict[DENOMINATOR]
    # Squared standard error contributed by each group.
    se_sq_first = df[VARIANCE + SFX1] / df[n_col + SFX1]
    se_sq_second = df[VARIANCE + SFX2] / df[n_col + SFX2]
    return np.sqrt(se_sq_first + se_sq_second)
54
55
def add_point_estimate_ci(df: DataFrame, arg_dict: Dict[str, str]) -> DataFrame:
    """Add a two-sided t-based confidence interval around the point estimate.

    The bounds are written to the CI_LOWER and CI_UPPER columns of ``df``.
    ``df`` is modified in place and the same object is returned.

    Args:
        df: Frame containing the POINT_ESTIMATE, VARIANCE and denominator columns.
        arg_dict: Maps DENOMINATOR to the denominator column name and
            INTERVAL_SIZE to the interval size. The interval size is used
            arithmetically (``1 - interval_size``), so it must be numeric.

    Returns:
        ``df`` with the CI_LOWER and CI_UPPER columns set.
    """
    denominator = arg_dict[DENOMINATOR]
    interval_size = arg_dict[INTERVAL_SIZE]
    df[CI_LOWER], df[CI_UPPER] = _tconfint_generic(
        mean=df[POINT_ESTIMATE],
        # Standard error of the mean: sqrt(variance / n).
        std_mean=np.sqrt(df[VARIANCE] / df[denominator]),
        dof=df[denominator] - 1,
        # The helper takes the significance level, not the coverage.
        alpha=1 - interval_size,
        alternative=TWO_SIDED,
    )
    return df
67
68
def _dof(row: Series, arg_dict: Dict[str, str]) -> float:
    """Compute Welch–Satterthwaite degrees of freedom for two groups.

    Evaluates ``(a + b) ** 2 / (a ** 2 / (n1 - 1) + b ** 2 / (n2 - 1))`` with
    ``a = v1 / n1`` and ``b = v2 / n2``. The variances and sizes are read from
    the SFX1 / SFX2 suffixed columns. The arithmetic is element-wise, so the
    function also works on a whole frame, which is how ``p_value`` and ``ci``
    call it.

    Args:
        row: Row (or frame) containing the suffixed variance and denominator values.
        arg_dict: Maps DENOMINATOR to the unsuffixed denominator column name.

    Returns:
        The degrees of freedom (one value per row when given a frame).
    """
    n_col = arg_dict[DENOMINATOR]
    var_first, var_second = row[VARIANCE + SFX1], row[VARIANCE + SFX2]
    size_first, size_second = row[n_col + SFX1], row[n_col + SFX2]

    # Each group's variance of the mean.
    term_first = var_first / size_first
    term_second = var_second / size_second

    top = (term_first + term_second) ** 2
    bottom = term_first ** 2 / (size_first - 1) + term_second ** 2 / (size_second - 1)
    return top / bottom
74
75
def p_value(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    """Compute t-test p-values for the difference between the two groups.

    The SFX2 point estimate is passed as ``value1`` and the SFX1 point
    estimate as ``value2`` to statsmodels' private ``_tstat_generic`` helper.
    The NULL_HYPOTHESIS column is passed as the hypothesised difference,
    STD_ERR as the standard error, and the degrees of freedom come from ``_dof``.

    Note:
        The test alternative is taken from the *first* row of the
        PREFERENCE_TEST column and applied to every row of ``df``.

    Args:
        df: Frame containing the suffixed POINT_ESTIMATE, VARIANCE and
            denominator columns, plus STD_ERR, PREFERENCE_TEST and
            NULL_HYPOTHESIS.
        arg_dict: Maps DENOMINATOR to the unsuffixed denominator column name.

    Returns:
        The p-values returned by ``_tstat_generic`` (one per row).
    """
    # Only the p-values are needed; the test statistic is discarded.
    _, p_values = _tstat_generic(
        value1=df[POINT_ESTIMATE + SFX2],
        value2=df[POINT_ESTIMATE + SFX1],
        std_diff=df[STD_ERR],
        dof=_dof(df, arg_dict),
        alternative=df[PREFERENCE_TEST].values[0],
        diff=df[NULL_HYPOTHESIS],
    )
    return p_values
86
87
def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    """Compute t-based confidence bounds around the DIFFERENCE column.

    Uses STD_ERR as the standard error, degrees of freedom from ``_dof`` and a
    per-row significance level read from ``alpha_column``. The sidedness is
    taken from the first row of PREFERENCE_TEST and applied to all rows.

    Args:
        df: Frame containing DIFFERENCE, STD_ERR, PREFERENCE_TEST, the alpha
            column, and the suffixed variance/denominator columns.
        alpha_column: Name of the column in ``df`` holding the alpha values.
        arg_dict: Maps DENOMINATOR to the unsuffixed denominator column name.

    Returns:
        The (lower, upper) pair returned by ``_tconfint_generic``.
    """
    center = df[DIFFERENCE]
    spread = df[STD_ERR]
    degrees_of_freedom = _dof(df, arg_dict)
    significance = df[alpha_column]
    sidedness = df[PREFERENCE_TEST].values[0]
    return _tconfint_generic(
        mean=center,
        std_mean=spread,
        dof=degrees_of_freedom,
        alpha=significance,
        alternative=sidedness,
    )
96
97
def achieved_power(df: DataFrame, mde: float, alpha: float, arg_dict: Dict[str, str]) -> DataFrame:
    """Compute achieved power from the pooled variance of the two groups.

    The pooled variance is ``((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)``,
    built from the SFX1 / SFX2 suffixed variance and denominator columns. It is
    passed to ``power_calculation`` together with ``mde``, ``alpha`` and both
    group sizes.

    Args:
        df: Frame containing the suffixed variance and denominator columns.
        mde: Forwarded unchanged to ``power_calculation``.
        alpha: Forwarded unchanged to ``power_calculation``.
        arg_dict: Maps DENOMINATOR to the unsuffixed denominator column name.

    Returns:
        Whatever ``power_calculation`` returns for these inputs.
    """
    var_first, var_second = df[VARIANCE + SFX1], df[VARIANCE + SFX2]
    n_col = arg_dict[DENOMINATOR]
    size_first, size_second = df[n_col + SFX1], df[n_col + SFX2]

    # Weight each variance by its own degrees of freedom.
    weighted_sum = (size_first - 1) * var_first + (size_second - 1) * var_second
    pooled_dof = size_first + size_second - 2
    pooled_variance = weighted_sum / pooled_dof

    return power_calculation(mde, pooled_variance, alpha, size_first, size_second)