1# Copyright 2017-2020 Spotify AB
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import numpy as np
16from pandas import DataFrame
17from scipy.stats import chi2
18from typing import Dict, Tuple, Iterable
19
20
def sample_ratio_test(
    df: DataFrame, all_group_columns: Iterable, denominator: str, expected_proportions: Dict
) -> Tuple[float, DataFrame]:
    """Goodness of fit test of observed vs. expected group frequencies.

    Tests whether the observed proportion of total users in each group
    is likely to come from the sampling distribution, using Pearson's
    chi-squared test:
    https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

    Args:
        df (pandas.DataFrame): Data holding the grouping column(s) and the
            ``denominator`` column.
        all_group_columns: Column name(s) passed to ``df.groupby`` to
            define the groups.
        denominator (str): Name of the column with the observation counts.
            Counts are summed per group, so a group may span several rows.
        expected_proportions (dict): Expected proportion of observations in
            each group with group-keys as keys and proportions as values.

    Returns:
        float: p-value based on the null hypothesis that observed
            proportions are drawn from the sampling distribution.
        pandas.DataFrame indexed by group with the following columns:
            - per-group sums of the original columns (including the
              denominator column).
            - observed_proportion: Observed share in the group.
            - expected_proportion: Expected share in the group.
            - difference: observed - expected shares.

    Raises:
        TypeError: If ``expected_proportions`` is not a dict.
        ValueError: If the proportions do not sum to one, are not all
            positive, or their keys do not match the groups found in ``df``.
        KeyError: If a key of ``expected_proportions`` cannot be looked up
            as a group.
    """

    if not isinstance(expected_proportions, dict):
        raise TypeError(
            "`expected_proportions` must be a dict with " "groupings as keys and expected proportions " "as values"
        )
    elif not np.allclose(sum(expected_proportions.values()), 1.0):
        raise ValueError("proportions must sum to one")
    elif not (np.array(list(expected_proportions.values())) > 0).all():
        raise ValueError("proportions must all be positive")

    # Group once and reuse the result for validation, aggregation and lookup.
    grouped_data = df.groupby(all_group_columns, sort=False)
    if set(grouped_data.groups.keys()) != set(expected_proportions.keys()):
        raise ValueError(f"`expected_proportion` keys must match groupings in the " f"order {all_group_columns}")

    n_tot = df[denominator].sum()

    sr_df = grouped_data.sum()
    sr_df["observed_proportion"] = np.zeros(len(sr_df))
    sr_df["expected_proportion"] = np.zeros(len(sr_df))
    sr_df["difference"] = np.zeros(len(sr_df))

    # Accumulates sum((observed - expected)^2 / expected) over all groups.
    scaled_sq_diff_sum = 0
    for grouping, expected_proportion in expected_proportions.items():
        try:
            # Sum every row of the group so the numerator is consistent with
            # n_tot (a sum over all rows) and with the aggregated sr_df.
            n_group = grouped_data.get_group(grouping)[denominator].sum()
        except KeyError as e:
            raise KeyError(f"{e} is not a valid group") from e

        actual_proportion = n_group / n_tot
        diff = actual_proportion - expected_proportion
        scaled_sq_diff_sum += np.power(diff, 2) / expected_proportion

        sr_df.loc[grouping, "observed_proportion"] = actual_proportion
        sr_df.loc[grouping, "expected_proportion"] = expected_proportion
        sr_df.loc[grouping, "difference"] = diff

    chi2_stat = n_tot * scaled_sq_diff_sum
    deg_freedom = len(grouped_data) - 1
    # The survival function keeps precision for tiny p-values, where
    # 1 - cdf would round to exactly zero.
    p_value = chi2.sf(chi2_stat, deg_freedom)

    return p_value, sr_df