Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/frequentist/confidence_computers/srm_test_computer.py: 82%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

45 statements  

1from typing import Tuple, Union, Dict, Iterable 

2 

3import numpy as np 

4from pandas import DataFrame, Series 

5from scipy import optimize 

6from scipy import stats as st 

7 

8from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic 

9 

10from spotify_confidence.analysis.confidence_utils import power_calculation 

11from spotify_confidence.analysis.constants import ( 

12 NUMERATOR, 

13 NUMERATOR_SUM_OF_SQUARES, 

14 DENOMINATOR, 

15 INTERVAL_SIZE, 

16 FINAL_EXPECTED_SAMPLE_SIZE, 

17 ORDINAL_GROUP_COLUMN, 

18 POINT_ESTIMATE, 

19 CI_LOWER, 

20 CI_UPPER, 

21 ADJUSTED_LOWER, 

22 ADJUSTED_UPPER, 

23 VARIANCE, 

24 NUMBER_OF_COMPARISONS, 

25 TWO_SIDED, 

26 SFX2, 

27 SFX1, 

28 STD_ERR, 

29 PREFERENCE_TEST, 

30 NULL_HYPOTHESIS, 

31 DIFFERENCE, 

32 ALPHA, 

33 IS_SIGNIFICANT, 

34 HOLM, 

35 SPOT_1_HOLM, 

36 HOMMEL, 

37 SIMES_HOCHBERG, 

38 SPOT_1_HOMMEL, 

39 SPOT_1_SIMES_HOCHBERG, 

40 NIM, 

41 ADJUSTED_ALPHA, 

42 NUMBER_OF_COMPARISONS_VALIDATION, 

43 ADJUSTED_ALPHA_VALIDATION, 

44 PREFERRED_DIRECTION_COLUMN_DEFAULT, 

45 INCREASE_PREFFERED, 

46 DECREASE_PREFFERED, 

47 PREFERENCE_DICT, 

48) 

49from spotify_confidence.analysis.frequentist.sequential_bound_solver import bounds 

50 

51 

def sequential_bounds(t: np.ndarray, alpha: float, sides: int, state: DataFrame = None):
    """Compute group-sequential z-bounds for the given information fractions.

    Thin wrapper around ``sequential_bound_solver.bounds`` that pins the
    spending-function parameters used throughout this module.

    Args:
        t: Information fractions (cumulative sample-size proportions) at each
            interim analysis. (Annotation fixed: ``np.array`` is a factory
            function, not a type; ``np.ndarray`` is the correct annotation.)
        alpha: Total significance level to be spent across the analyses.
        sides: 1 for a one-sided test, 2 for a two-sided test.
        state: Optional solver state from a previous call, allowing the
            computation to resume incrementally.

    Returns:
        The solver result object; it exposes a ``df`` attribute whose ``zb``
        column holds the sequential z-bounds (see usage in
        ``compute_sequential_adjusted_alpha``).
    """
    # rho=2 spending function, z truncated at 8, up to 1000 integration intervals.
    return bounds(t, alpha, rho=2, ztrun=8, sides=sides, max_nints=1000, state=state)

54 

55 

def sample_ratio_test(df: DataFrame, arg_dict: dict) -> float:
    """Chi-squared goodness-of-fit test for sample ratio mismatch (SRM).

    Compares the observed split of units across groups against the expected
    proportions using Pearson's chi-squared statistic with ``len(df) - 1``
    degrees of freedom.

    Args:
        df: One row per group. The column named by ``arg_dict[NUMERATOR]``
            holds the expected proportion for the group; the column named by
            ``arg_dict[DENOMINATOR]`` holds the observed count.
        arg_dict: Maps the NUMERATOR/DENOMINATOR constants to column names.

    Returns:
        The p-value of the test; small values indicate a sample ratio
        mismatch. (Annotation fixed: the original declared
        ``Tuple[float, DataFrame]`` but only the p-value is returned.)
    """
    total_count = df[arg_dict[DENOMINATOR]].sum()
    expected_proportions = df[arg_dict[NUMERATOR]]
    observed_proportions = df[arg_dict[DENOMINATOR]] / total_count

    # Pearson statistic on proportions: n * sum((obs - exp)^2 / exp), which is
    # algebraically identical to sum((O - E)^2 / E) on raw counts with
    # E = n * expected proportion.
    squared_diff = np.power(observed_proportions - expected_proportions, 2)
    chi2_stat = total_count * squared_diff.divide(expected_proportions).sum()
    deg_freedom = df.shape[0] - 1

    # sf(x) is the numerically stable form of 1 - cdf(x) for small tail mass.
    return st.chi2.sf(chi2_stat, deg_freedom)

67 

68 

def p_value(df: DataFrame, arg_dict: Dict[str, str], validation: bool = True) -> Series:
    """Per-comparison p-values are undefined for the SRM test; always NaN.

    NOTE(review): presumably a required member of the shared computer
    interface — the SRM result is produced by ``sample_ratio_test`` instead;
    confirm against the generic caller.
    """
    return np.nan

71 

72 

def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    """Confidence intervals are undefined for the SRM test; returns (NaN, NaN)."""
    return np.nan, np.nan

75 

76 

def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    """A point estimate is undefined for the SRM test; always NaN."""
    return np.nan

79 

80 

def variance(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    """Variance is undefined for the SRM test; always NaN."""
    return np.nan

83 

84 

def add_point_estimate_ci(df: Series, arg_dict: Dict[str, str]) -> Series:
    """Attach NaN CI bounds in place; the SRM test has no point-estimate CI.

    Mutates and returns *df* so the generic pipeline sees the expected
    CI_LOWER / CI_UPPER entries.
    """
    for bound_column in (CI_LOWER, CI_UPPER):
        df[bound_column] = np.nan
    return df

89 

90 

def std_err(df: Series, arg_dict: Dict[str, str]) -> float:
    """A standard error is undefined for the SRM test; always NaN."""
    return np.nan

93 

94 

def compute_sequential_adjusted_alpha(df: DataFrame, arg_dict: Dict[str, str], validation: bool) -> Series:
    """Compute per-row sequentially adjusted alpha levels via group-sequential bounds.

    For each group of comparisons (all index levels except the ordinal/time
    column), converts cumulative sample sizes into information fractions,
    solves the sequential z-bounds for them, and translates each bound back
    into an adjusted alpha via the normal CDF.

    Args:
        df: Comparison rows indexed by group columns plus the ordinal column;
            must contain ``"current_total_" + denominator``, the final
            expected sample size column, ALPHA and PREFERENCE_TEST columns.
        arg_dict: Maps the module constants to the actual column names / values.
        validation: If True, use the validation comparison count and force
            one-sided bounds.

    Returns:
        A Series of adjusted alphas, named ADJUSTED_ALPHA (or
        ADJUSTED_ALPHA_VALIDATION when validating), indexed like the grouped
        input.
    """
    denominator = arg_dict[DENOMINATOR]
    final_expected_sample_size_column = arg_dict[FINAL_EXPECTED_SAMPLE_SIZE]
    ordinal_group_column = arg_dict[ORDINAL_GROUP_COLUMN]
    # Validation runs use their own comparison count for the multiplicity correction.
    n_comparisons = arg_dict[NUMBER_OF_COMPARISONS if not validation else NUMBER_OF_COMPARISONS_VALIDATION]

    def adjusted_alphas_for_group(grp: DataFrame, validation: bool) -> Series:
        # Solve sequential bounds for this group's information fractions, then
        # convert each z-bound ("zb") into an alpha level. Two-sided only when
        # every row in the group is two-sided AND we are not validating.
        # Alpha is Bonferroni-divided by the number of comparisons up front.
        return (
            sequential_bounds(
                t=grp["sample_size_proportions"].values,
                alpha=grp[ALPHA].values[0] / n_comparisons,
                sides=2 if (grp[PREFERENCE_TEST] == TWO_SIDED).all() and not validation else 1,
            )
            .df.set_index(grp.index)
            .assign(
                **{
                    # NB: the lambda parameter `df` shadows the outer DataFrame;
                    # it is the solver's bounds frame here.
                    ADJUSTED_ALPHA: lambda df: df.apply(
                        lambda row: 2 * (1 - st.norm.cdf(row["zb"]))
                        if not validation and (grp[PREFERENCE_TEST] == TWO_SIDED).all()
                        else 1 - st.norm.cdf(row["zb"]),
                        axis=1,
                    )
                }
            )
        )[["zb", ADJUSTED_ALPHA]]

    # All index levels except the ordinal (time) column identify a comparison group.
    groups_except_ordinal = [column for column in df.index.names if column != ordinal_group_column]
    # Denominator for the information fraction: the larger of the observed total
    # and the final expected sample size, per group (scalar if there are no
    # non-ordinal group levels).
    max_sample_size_by_group = (
        (
            df[["current_total_" + denominator, final_expected_sample_size_column]]
            .groupby(groups_except_ordinal, sort=False)
            .max()
            .max(axis=1)
        )
        if len(groups_except_ordinal) > 0
        else (df[["current_total_" + denominator, final_expected_sample_size_column]].max().max())
    )
    # Information fraction per row: current cumulative total / max sample size.
    sample_size_proportions = Series(
        data=df.groupby(df.index.names, sort=False)["current_total_" + denominator].first() / max_sample_size_by_group,
        name="sample_size_proportions",
    )

    # The dummy index level guarantees at least one grouping key even when
    # groups_except_ordinal is empty, so groupby(...).apply always works.
    return Series(
        data=df.groupby(df.index.names, sort=False)[[ALPHA, PREFERENCE_TEST]]
        .first()
        .merge(sample_size_proportions, left_index=True, right_index=True)
        .assign(_sequential_dummy_index_=1)
        .groupby(groups_except_ordinal + ["_sequential_dummy_index_"], sort=False)[
            ["sample_size_proportions", PREFERENCE_TEST, ALPHA]
        ]
        .apply(adjusted_alphas_for_group, validation=validation)[ADJUSTED_ALPHA],
        name=ADJUSTED_ALPHA if not validation else ADJUSTED_ALPHA_VALIDATION,
    )