Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/frequentist/confidence_computers/t_test_computer.py: 30%


47 statements  

from typing import Tuple, Dict

import numpy as np
from pandas import DataFrame, Series
from statsmodels.stats.weightstats import _tconfint_generic, _tstat_generic

from spotify_confidence.analysis.confidence_utils import power_calculation
from spotify_confidence.analysis.constants import (
    NUMERATOR,
    NUMERATOR_SUM_OF_SQUARES,
    DENOMINATOR,
    INTERVAL_SIZE,
    POINT_ESTIMATE,
    CI_LOWER,
    CI_UPPER,
    VARIANCE,
    TWO_SIDED,
    SFX1,
    SFX2,
    STD_ERR,
    PREFERENCE_TEST,
    NULL_HYPOTHESIS,
    DIFFERENCE,
)

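# Usage sketch for point_estimate below (hypothetical column names, not taken
# from the module): arg_dict maps the NUMERATOR and DENOMINATOR keys to column
# names, e.g. {NUMERATOR: "clicks", DENOMINATOR: "impressions"}, so the function
# returns df["clicks"] / df["impressions"] row by row.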

def point_estimate(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    if (df[denominator] == 0).any():
        raise ValueError("Can't compute point estimate: denominator is 0")
    return df[numerator] / df[denominator]


def variance(df: DataFrame, arg_dict: Dict[str, str]) -> float:
    numerator = arg_dict[NUMERATOR]
    denominator = arg_dict[DENOMINATOR]
    numerator_sumsq = arg_dict[NUMERATOR_SUM_OF_SQUARES]
    binary = df[numerator_sumsq] == df[numerator]
    if binary.all():
        # This equals df[POINT_ESTIMATE] * (1 - df[POINT_ESTIMATE]) when the data is binary,
        # and also gives a robust fallback in case it's not
        variance = df[numerator_sumsq] / df[denominator] - df[POINT_ESTIMATE] ** 2
    else:
        # Standard sample variance computed from the sum of squares:
        # (sum(x^2) - (sum(x))^2 / n) / (n - 1)
        variance = (df[numerator_sumsq] - np.power(df[numerator], 2) / df[denominator]) / (df[denominator] - 1)
    if (variance < 0).any():
        raise ValueError("Computed variance is negative. Please check your inputs.")
    return variance


def std_err(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    # Unpooled standard error of the difference between the two group means:
    # sqrt(v1 / n1 + v2 / n2)
    denominator = arg_dict[DENOMINATOR]
    return np.sqrt(df[VARIANCE + SFX1] / df[denominator + SFX1] + df[VARIANCE + SFX2] / df[denominator + SFX2])


def add_point_estimate_ci(df: DataFrame, arg_dict: Dict[str, str]) -> Series:
    denominator = arg_dict[DENOMINATOR]
    interval_size = arg_dict[INTERVAL_SIZE]
    df[CI_LOWER], df[CI_UPPER] = _tconfint_generic(
        mean=df[POINT_ESTIMATE],
        std_mean=np.sqrt(df[VARIANCE] / df[denominator]),
        dof=df[denominator] - 1,
        alpha=1 - interval_size,
        alternative=TWO_SIDED,
    )
    return df


def _dof(row: Series, arg_dict: Dict[str, str]) -> float:
    denominator = arg_dict[DENOMINATOR]
    v1, v2 = row[VARIANCE + SFX1], row[VARIANCE + SFX2]
    n1, n2 = row[denominator + SFX1], row[denominator + SFX2]
    # Welch–Satterthwaite approximation of the effective degrees of freedom
    # for a two-sample t-test with unequal variances.
    return (v1 / n1 + v2 / n2) ** 2 / ((v1 / n1) ** 2 / (n1 - 1) + (v2 / n2) ** 2 / (n2 - 1))

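# Worked check for _dof above (hypothetical numbers): with v1 = 0.09, v2 = 0.11,
# n1 = 100, n2 = 200 the expression evaluates to
#     (0.09/100 + 0.11/200) ** 2 / ((0.09/100) ** 2 / 99 + (0.11/200) ** 2 / 199)
# which is about 217 effective degrees of freedom.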

def p_value(df: Series, arg_dict: Dict[str, str]) -> Series:
    _, p_value = _tstat_generic(
        value1=df[POINT_ESTIMATE + SFX2],
        value2=df[POINT_ESTIMATE + SFX1],
        std_diff=df[STD_ERR],
        dof=_dof(df, arg_dict),
        alternative=df[PREFERENCE_TEST].values[0],
        diff=df[NULL_HYPOTHESIS],
    )
    return p_value


def ci(df: DataFrame, alpha_column: str, arg_dict: Dict[str, str]) -> Tuple[Series, Series]:
    return _tconfint_generic(
        mean=df[DIFFERENCE],
        std_mean=df[STD_ERR],
        dof=_dof(df, arg_dict),
        alpha=df[alpha_column],
        alternative=df[PREFERENCE_TEST].values[0],
    )


def achieved_power(df: DataFrame, mde: float, alpha: float, arg_dict: Dict[str, str]) -> DataFrame:
    v1, v2 = df[VARIANCE + SFX1], df[VARIANCE + SFX2]
    d1, d2 = arg_dict[DENOMINATOR] + SFX1, arg_dict[DENOMINATOR] + SFX2
    n1, n2 = df[d1], df[d2]

    var_pooled = ((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)

    return power_calculation(mde, var_pooled, alpha, n1, n2)
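
# Minimal driver sketch (hypothetical frame and column names, not from the
# original module): builds a toy binary-metric frame, computes the per-row
# point estimate and variance, and prints the result. For 0/1 raw data the
# sum of squares equals the sum, so the same column serves as both.
if __name__ == "__main__":
    example_df = DataFrame({"clicks": [10.0, 25.0], "impressions": [100.0, 200.0]})
    example_arg_dict = {
        NUMERATOR: "clicks",
        NUMERATOR_SUM_OF_SQUARES: "clicks",  # binary data: sum of squares == sum
        DENOMINATOR: "impressions",
    }
    example_df[POINT_ESTIMATE] = point_estimate(example_df, example_arg_dict)
    # For binary data this reduces to p * (1 - p) per row.
    print(variance(example_df, example_arg_dict))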