Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py: 41%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

68 statements  

1from functools import reduce 

2from typing import Tuple, Union, Dict 

3 

4import numpy as np 

5from pandas import DataFrame, Series 

6 

7from spotify_confidence.analysis.confidence_utils import unlist, dfmatmul 

8from spotify_confidence.analysis.constants import ( 

9 REGRESSION_PARAM, 

10 FEATURE, 

11 FEATURE_SUMSQ, 

12 FEATURE_CROSS, 

13 NUMERATOR, 

14 DENOMINATOR, 

15 ORIGINAL_POINT_ESTIMATE, 

16 ORIGINAL_VARIANCE, 

17) 

18from spotify_confidence.analysis.frequentist.confidence_computers import z_test_computer 

19 

20 

def estimate_slope(df, arg_dict: Dict) -> DataFrame:
    """Estimate OLS regression coefficients from pooled sufficient statistics
    and attach them to ``df`` as the ``REGRESSION_PARAM`` column.

    Builds the normal equations ``X'X b = X'y`` (with an intercept in
    position 0) from the pre-aggregated feature sums, sums of squares and
    cross products summed over all rows, solves for ``b``, and broadcasts
    the slope part (intercept excluded) to every row.

    Args:
        df: frame holding the aggregated sufficient-statistic columns whose
            names are given by ``arg_dict``.
        arg_dict: maps the constant keys (FEATURE, FEATURE_SUMSQ,
            FEATURE_CROSS, NUMERATOR, DENOMINATOR) to column names in ``df``.

    Returns:
        ``df`` unchanged when no feature column is present, otherwise ``df``
        with a ``REGRESSION_PARAM`` column added (scalar slope when there is
        a single feature, ndarray otherwise).
    """
    if arg_dict[FEATURE] not in df:
        return df

    def col_sum(column):
        # Element-wise sum over rows of a column that may hold scalars or
        # numpy arrays. NOTE: the original lambda shadowed the parameter
        # name (`lambda x, y: ...` inside `col_sum(x)`); renamed for clarity.
        return reduce(lambda acc, value: acc + value, column)

    def dimension(x):
        # Number of features: length of the stored array, or 1 for scalars.
        return x.shape[0] if isinstance(x, np.ndarray) and x.size > 1 else 1

    k = df[arg_dict[FEATURE_SUMSQ]].apply(dimension).iloc[0]

    # Normal-equation matrix X'X; row/column 0 corresponds to the intercept.
    XX0 = np.zeros((k + 1, k + 1))
    XX0[1:, 1:] = col_sum(df[arg_dict[FEATURE_SUMSQ]])
    XX0[0, 0] = col_sum(df[arg_dict[DENOMINATOR]])
    XX0[0, 1:] = col_sum(df[arg_dict[FEATURE]])
    XX0[1:, 0] = col_sum(df[arg_dict[FEATURE]])

    # Right-hand side X'y.
    Xy0 = np.zeros((k + 1, 1))
    Xy0[0] = col_sum(df[arg_dict[NUMERATOR]])
    Xy0[1:] = np.atleast_2d(col_sum(df[arg_dict[FEATURE_CROSS]])).reshape(-1, 1)

    # Solve X'X b = X'y directly; numerically preferable to forming the
    # explicit inverse (the original used np.matmul(np.linalg.inv(XX0), Xy0)).
    b = np.linalg.solve(XX0, Xy0)
    out = b[1:]
    if out.size == 1:
        out = out.item()

    # Broadcast the same coefficient(s) to every row of the frame.
    outseries = Series(index=df.index, dtype=df[arg_dict[FEATURE]].dtype)
    df[REGRESSION_PARAM] = outseries.apply(lambda _: out)
    return df

55 

56 

def point_estimate(df: DataFrame, arg_dict: Dict) -> Series:
    """Per-row point estimate numerator/denominator, adjusted by the
    linear-regression term when feature columns are present.

    NOTE: the original annotations (``df: Series``, ``-> float``) were wrong;
    the function operates on a DataFrame and returns a Series, so they are
    corrected here without changing behavior.

    Args:
        df: frame holding numerator/denominator (and optionally feature)
            columns named by ``arg_dict``.
        arg_dict: maps the constant keys to column names in ``df``.

    Returns:
        Series of point estimates, regression-adjusted when
        ``estimate_slope`` attached a ``REGRESSION_PARAM`` column.
    """
    df = estimate_slope(df, arg_dict)
    point_estimate = df[arg_dict[NUMERATOR]] / df[arg_dict[DENOMINATOR]]

    # No feature data -> plain ratio estimate.
    if REGRESSION_PARAM not in df:
        return point_estimate

    def lin_reg_point_estimate_delta(row: Series, arg_dict: Dict) -> Series:
        # Inner product b'x of the regression coefficients with the row's
        # feature vector (outer=False selects the inner product).
        return dfmatmul(row[REGRESSION_PARAM], row[arg_dict[FEATURE]], outer=False)

    delta = df.apply(lin_reg_point_estimate_delta, arg_dict=arg_dict, axis=1)
    return point_estimate - delta / df[arg_dict[DENOMINATOR]]

72 

73 

def lin_reg_variance_delta(row, arg_dict):
    """Per-row variance adjustment ``b'Vb - 2 b'c`` from the regression term,
    where ``V`` is the sample variance of the features and ``c`` the sample
    covariance between the features and the numerator."""
    numerator = row[arg_dict[NUMERATOR]]
    denominator = row[arg_dict[DENOMINATOR]]

    feature_sumsq = unlist(row[arg_dict[FEATURE_SUMSQ]])
    feature_sum = unlist(row[arg_dict[FEATURE]])
    feature_cross = unlist(row[arg_dict[FEATURE_CROSS]])

    # Sample moments of the features for this row.
    mean_feature = feature_sum / denominator
    sample_var = feature_sumsq / denominator - dfmatmul(mean_feature, mean_feature)
    sample_cov = feature_cross / denominator - dfmatmul(mean_feature, numerator / denominator)

    coef = np.atleast_2d(row[REGRESSION_PARAM])
    quadratic_term = np.matmul(np.transpose(coef), np.matmul(sample_var, coef)).item()
    cross_term = -2 * np.matmul(np.transpose(coef), sample_cov).item()

    return quadratic_term + cross_term

89 

90 

def variance(df: DataFrame, arg_dict) -> Series:
    """Plain z-test variance, plus the linear-regression delta when the
    feature column is available in ``df``."""
    base_variance = z_test_computer.variance(df, arg_dict)
    if arg_dict[FEATURE] not in df:
        return base_variance
    return base_variance + df.apply(lin_reg_variance_delta, arg_dict=arg_dict, axis=1)

98 

99 

def add_point_estimate_ci(df: DataFrame, arg_dict: Dict) -> DataFrame:
    """Snapshot the unadjusted z-test point estimate and variance into the
    ORIGINAL_* columns, then delegate CI construction to the z-test computer."""
    # Two sequential assigns, preserving the original order: the variance is
    # computed on the frame that already carries ORIGINAL_POINT_ESTIMATE.
    df = df.assign(**{ORIGINAL_POINT_ESTIMATE: z_test_computer.point_estimate(df, arg_dict)})
    df = df.assign(**{ORIGINAL_VARIANCE: z_test_computer.variance(df, arg_dict)})
    return z_test_computer.add_point_estimate_ci(df, arg_dict)

106 

107 

def std_err(df: DataFrame, arg_dict: Dict) -> DataFrame:
    """Standard error: delegated unchanged to the plain z-test computer."""
    return z_test_computer.std_err(df, arg_dict)

110 

111 

def p_value(df: DataFrame, arg_dict: Dict) -> DataFrame:
    """P-value: delegated unchanged to the plain z-test computer."""
    return z_test_computer.p_value(df, arg_dict)

114 

115 

def ci(df: DataFrame, alpha_column: str, arg_dict: Dict) -> DataFrame:
    """Confidence interval: delegated unchanged to the plain z-test computer."""
    return z_test_computer.ci(df, alpha_column, arg_dict)

118 

119 

def powered_effect(
    df: DataFrame,
    z_alpha: float,
    z_power: float,
    binary: bool,
    non_inferiority: bool,
    avg_column: float,
    var_column: float,
) -> Series:
    """Powered effect: delegated unchanged to the plain z-test computer."""
    return z_test_computer.powered_effect(df, z_alpha, z_power, binary, non_inferiority, avg_column, var_column)

130 

131 

def required_sample_size(
    binary: Union[Series, bool],
    non_inferiority: Union[Series, bool],
    hypothetical_effect: Union[Series, float],
    control_avg: Union[Series, float],
    control_var: Union[Series, float],
    z_alpha: float = None,
    kappa: float = None,
    proportion_of_total: Union[Series, float] = None,
    z_power: float = None,
) -> Union[Series, float]:
    """Required sample size: delegated unchanged to the plain z-test
    computer, forwarding every argument in the same positional order."""
    args = (
        binary,
        non_inferiority,
        hypothetical_effect,
        control_avg,
        control_var,
        z_alpha,
        kappa,
        proportion_of_total,
        z_power,
    )
    return z_test_computer.required_sample_size(*args)