Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/frequentist/sample_ratio_test.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

37 statements  

1# Copyright 2017-2020 Spotify AB 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15import numpy as np 

16from pandas import DataFrame 

17from scipy.stats import chi2 

18from typing import Dict, Tuple, Iterable 

19 

20 

def sample_ratio_test(
    df: DataFrame, all_group_columns: Iterable, denominator: str, expected_proportions: Dict
) -> Tuple[float, DataFrame]:
    """Goodness of fit test of observed vs. expected group frequencies.

    Tests whether the observed proportion of total users in each group
    are likely to come from the sampling distribution using a Pearson's
    chi-squared test:
    https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

    Args:
        df (pandas.DataFrame): Data holding the grouping column(s) and the
            ``denominator`` column.
        all_group_columns: Column name(s) passed to ``df.groupby`` to form
            the groups being compared.
        denominator (str): Name of the column holding the counts. A group's
            observed count is the sum of this column over all of the
            group's rows.
        expected_proportions (dict): Expected proportion of observations in
            each group with group-keys as keys and proportions as values.

    Returns:
        float: p-value based on the null hypothesis that observed
            proportions are drawn from the sampling distribution.
        pandas.DataFrame: per-group sums of the columns of ``df`` (which
            includes the denominator column), plus the columns:
            - observed_proportion: Observed share in the group.
            - expected_proportion: Expected share in the group.
            - difference: observed - expected shares.

    Raises:
        TypeError: If ``expected_proportions`` is not a dict.
        ValueError: If the proportions do not sum to one, are not all
            positive, or their keys do not match the groups found in ``df``.
        KeyError: If a key of ``expected_proportions`` cannot be looked up
            as a group.
    """
    if not isinstance(expected_proportions, dict):
        raise TypeError(
            "`expected_proportions` must be a dict with " "groupings as keys and expected proportions " "as values"
        )
    elif not np.allclose(sum(expected_proportions.values()), 1.0):
        raise ValueError("proportions must sum to one")
    elif not (np.array(list(expected_proportions.values())) > 0).all():
        raise ValueError("proportions must all be positive")

    # Group once and reuse the result for validation, aggregation and lookup.
    grouped_data = df.groupby(all_group_columns, sort=False)
    if set(grouped_data.groups.keys()) != set(expected_proportions.keys()):
        raise ValueError(f"`expected_proportion` keys must match groupings in the " f"order {all_group_columns}")

    n_tot = df[denominator].sum()

    sr_df = grouped_data.sum()
    sr_df["observed_proportion"] = np.zeros(len(sr_df))
    sr_df["expected_proportion"] = np.zeros(len(sr_df))
    sr_df["difference"] = np.zeros(len(sr_df))

    # Accumulates sum((observed - expected)^2 / expected) over the groups.
    a = 0
    for grouping, expected_proportion in expected_proportions.items():
        try:
            # Sum over every row of the group so the group count is
            # consistent with n_tot and with the summed values in sr_df;
            # taking only the first row under-counts multi-row groups.
            n_group = grouped_data.get_group(grouping)[denominator].sum()
        except KeyError as e:
            raise KeyError(f"{e} is not a valid group") from e

        actual_proportion = n_group / n_tot
        diff = actual_proportion - expected_proportion
        sq_diff = np.power(diff, 2)
        a += sq_diff / expected_proportion

        sr_df.loc[grouping, "observed_proportion"] = actual_proportion
        sr_df.loc[grouping, "expected_proportion"] = expected_proportion
        sr_df.loc[grouping, "difference"] = diff

    chi2_stat = n_tot * a
    deg_freedom = len(grouped_data) - 1
    # The survival function keeps precision for very small p-values, where
    # 1 - cdf would round to exactly 0.
    p_value = chi2.sf(chi2_stat, deg_freedom)

    return p_value, sr_df