from functools import reduce
from typing import Union, Dict

import numpy as np
from pandas import DataFrame, Series

from spotify_confidence.analysis.confidence_utils import unlist, dfmatmul
from spotify_confidence.analysis.constants import (
    REGRESSION_PARAM,
    FEATURE,
    FEATURE_SUMSQ,
    FEATURE_CROSS,
    NUMERATOR,
    DENOMINATOR,
    ORIGINAL_POINT_ESTIMATE,
    ORIGINAL_VARIANCE,
)
from spotify_confidence.analysis.frequentist.confidence_computers import z_test_computer


def estimate_slope(df: DataFrame, arg_dict: Dict) -> DataFrame:
    if arg_dict[FEATURE] not in df:
        return df

    def col_sum(column):
        return reduce(lambda total, value: total + value, column)

    def dimension(x):
        return x.shape[0] if isinstance(x, np.ndarray) and x.size > 1 else 1

    k = df[arg_dict[FEATURE_SUMSQ]].apply(dimension).iloc[0]

    # Assemble the normal equations X'X b = X'y from pre-aggregated sums.
    # Index 0 is the intercept; indices 1..k are the features.
    XX0 = np.zeros((k + 1, k + 1))
    XX0[1:, 1:] = col_sum(df[arg_dict[FEATURE_SUMSQ]])
    XX0[0, 0] = col_sum(df[arg_dict[DENOMINATOR]])
    XX0[0, 1:] = col_sum(df[arg_dict[FEATURE]])
    XX0[1:, 0] = col_sum(df[arg_dict[FEATURE]])

    Xy0 = np.zeros((k + 1, 1))
    Xy0[0, 0] = col_sum(df[arg_dict[NUMERATOR]])
    Xy0[1:] = np.atleast_2d(col_sum(df[arg_dict[FEATURE_CROSS]])).reshape(-1, 1)

    # Solve for the coefficients directly; this is numerically more stable
    # than forming the explicit inverse of X'X.
    b = np.linalg.solve(XX0, Xy0)
    out = b[1:]  # drop the intercept, keeping only the slopes
    if out.size == 1:
        out = out.item()

    # Broadcast the pooled slope estimate to every row.
    outseries = Series(index=df.index, dtype=df[arg_dict[FEATURE]].dtype)
    df[REGRESSION_PARAM] = outseries.apply(lambda x: out)
    return df


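# A minimal illustration (not part of the library API; the helper name and
# values are made up) of what estimate_slope computes: for a single feature,
# the normal equations above collapse to the familiar closed form
# slope = cov(x, y) / var(x).
def _example_normal_equations():  # pragma: no cover - illustration only
    # Toy aggregated statistics: n observations with sums of x, y, x*x, x*y.
    n, sum_x, sum_y, sum_xx, sum_xy = 100.0, 50.0, 200.0, 35.0, 110.0
    XX = np.array([[n, sum_x], [sum_x, sum_xx]])  # X'X with intercept column
    Xy = np.array([[sum_y], [sum_xy]])  # X'y
    b = np.linalg.solve(XX, Xy)  # [intercept, slope]
    # Closed form for the slope of a one-feature regression.
    slope = (sum_xy / n - (sum_x / n) * (sum_y / n)) / (sum_xx / n - (sum_x / n) ** 2)
    assert np.isclose(b[1, 0], slope)
    return b[1, 0]

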
def point_estimate(df: DataFrame, arg_dict: Dict) -> Series:
    df = estimate_slope(df, arg_dict)
    point_estimate = df[arg_dict[NUMERATOR]] / df[arg_dict[DENOMINATOR]]

    if REGRESSION_PARAM in df:

        def lin_reg_point_estimate_delta(row: Series, arg_dict: Dict) -> Series:
            return dfmatmul(row[REGRESSION_PARAM], row[arg_dict[FEATURE]], outer=False)

        # Regression-adjusted estimate: subtract b'x_bar from the raw mean y/n.
        return (
            point_estimate
            - df.apply(lin_reg_point_estimate_delta, arg_dict=arg_dict, axis=1) / df[arg_dict[DENOMINATOR]]
        )

    return point_estimate


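# A minimal sketch (not part of the library API; data and helper name are
# made up) of the regression adjustment performed by point_estimate: the
# adjusted mean is mean(y) - b * mean(x), which removes variance explained
# by the feature while group differences stay (asymptotically) unbiased.
def _example_regression_adjustment():  # pragma: no cover - illustration only
    rng = np.random.default_rng(0)
    x = rng.normal(size=1000)  # pre-exposure covariate
    y = 2.0 * x + rng.normal(size=1000)  # metric correlated with the covariate
    b = (np.mean(x * y) - x.mean() * y.mean()) / np.var(x)  # slope cov/var
    adjusted = y.mean() - b * x.mean()
    # The residual y - b*x has lower variance than y itself.
    assert np.var(y - b * x) <= np.var(y)
    return adjusted

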
def lin_reg_variance_delta(row, arg_dict):
    y = row[arg_dict[NUMERATOR]]
    n = row[arg_dict[DENOMINATOR]]

    XX = unlist(row[arg_dict[FEATURE_SUMSQ]])
    X = unlist(row[arg_dict[FEATURE]])
    Xy = unlist(row[arg_dict[FEATURE_CROSS]])

    # Sample moments from the accumulated sums: var(x) and cov(x, y).
    sample_var = XX / n - dfmatmul(X / n, X / n)
    sample_cov = Xy / n - dfmatmul(X / n, y / n)
    b = np.atleast_2d(row[REGRESSION_PARAM])
    # Correction terms of var(y - b'x): the quadratic term b' var(x) b
    # and the cross term -2 b' cov(x, y).
    variance2 = np.matmul(np.transpose(b), np.matmul(sample_var, b)).item()
    variance3 = -2 * np.matmul(np.transpose(b), sample_cov).item()

    return variance2 + variance3


def variance(df: DataFrame, arg_dict: Dict) -> Series:
    variance1 = z_test_computer.variance(df, arg_dict)

    # When features are present, add the (typically negative) regression
    # correction to the plain z-test variance.
    if arg_dict[FEATURE] in df:
        return variance1 + df.apply(lin_reg_variance_delta, arg_dict=arg_dict, axis=1)
    else:
        return variance1


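# A minimal numeric sketch (not part of the library API; numbers and helper
# name are made up) of the correction computed by lin_reg_variance_delta,
# which follows from var(y - b'x) = var(y) + b' var(x) b - 2 b' cov(x, y):
# the returned delta (the last two terms) is negative whenever the features
# are predictive of the metric.
def _example_variance_delta():  # pragma: no cover - illustration only
    var_x = np.array([[1.0]])  # feature variance (single feature)
    cov_xy = np.array([[0.8]])  # covariance between feature and metric
    b = np.array([[0.8]])  # slope, here cov_xy / var_x
    delta = (b.T @ var_x @ b - 2 * b.T @ cov_xy).item()
    assert np.isclose(delta, -0.64)  # variance shrinks by cov**2 / var
    return delta

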
def add_point_estimate_ci(df: DataFrame, arg_dict: Dict) -> DataFrame:
    # Keep the unadjusted z-test estimate and variance alongside the adjusted
    # ones; the confidence interval itself is delegated to the z-test computer.
    df = df.assign(**{ORIGINAL_POINT_ESTIMATE: z_test_computer.point_estimate(df, arg_dict)}).assign(
        **{ORIGINAL_VARIANCE: z_test_computer.variance(df, arg_dict)}
    )

    return z_test_computer.add_point_estimate_ci(df, arg_dict)


# The remaining computations are delegated to the standard z-test computer.
def std_err(df: DataFrame, arg_dict: Dict) -> DataFrame:
    return z_test_computer.std_err(df, arg_dict)


def p_value(df: DataFrame, arg_dict: Dict) -> DataFrame:
    return z_test_computer.p_value(df, arg_dict)


def ci(df: DataFrame, alpha_column: str, arg_dict: Dict) -> DataFrame:
    return z_test_computer.ci(df, alpha_column, arg_dict)


def powered_effect(
    df: DataFrame,
    z_alpha: float,
    z_power: float,
    binary: bool,
    non_inferiority: bool,
    avg_column: float,
    var_column: float,
) -> Series:
    return z_test_computer.powered_effect(df, z_alpha, z_power, binary, non_inferiority, avg_column, var_column)


def required_sample_size(
    binary: Union[Series, bool],
    non_inferiority: Union[Series, bool],
    hypothetical_effect: Union[Series, float],
    control_avg: Union[Series, float],
    control_var: Union[Series, float],
    z_alpha: float = None,
    kappa: float = None,
    proportion_of_total: Union[Series, float] = None,
    z_power: float = None,
) -> Union[Series, float]:
    return z_test_computer.required_sample_size(
        binary,
        non_inferiority,
        hypothetical_effect,
        control_avg,
        control_var,
        z_alpha,
        kappa,
        proportion_of_total,
        z_power,
    )