1# Copyright 2017-2020 Spotify AB
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15from collections import OrderedDict
16
17import chartify
18import numpy as np
19import pandas as pd
20from scipy.stats import beta
21
22from spotify_confidence.analysis.bayesian.bayesian_base import (
23 BaseTest,
24 randomization_warning_decorator,
25 axis_format_precision,
26)
27
28
class BetaBinomial(BaseTest):
    """Bayesian Beta-Binomial model for rates (numerator / denominator).

    Each group's rate gets a Beta posterior obtained by adding the observed
    successes (numerator) and failures (denominator - numerator) to the prior
    alpha and beta. Comparisons between groups are estimated by Monte Carlo
    sampling from those posteriors.
    """

    def __init__(
        self,
        data_frame,
        numerator_column,
        denominator_column,
        categorical_group_columns=None,
        ordinal_group_column=None,
        prior_alpha_column=None,
        prior_beta_column=None,
        interval_size=0.95,
    ):
        """
        Bayesian BetaBinomial model.

        See: https://en.wikipedia.org/wiki/Beta-binomial_distribution

        data_frame (pd.DataFrame): DataFrame
        numerator_column (str): Column name for numerator column.
        denominator_column (str): Column name for denominator column.
        categorical_group_columns (str or list): Column names
            for categorical groupings.
        ordinal_group_column (str): Column name for ordinal
            grouping (e.g. numeric or date values).
        prior_alpha_column (str): Column name to use for prior alpha.
        prior_beta_column (str): Column name to use for prior beta.
            Both prior columns must be given for them to take effect;
            otherwise a Beta(0.5, 0.5) prior is used for every group.
        interval_size (float): Size of credible intervals. Default 0.95
        """
        super().__init__(
            data_frame,
            categorical_group_columns,
            ordinal_group_column,
            numerator_column,
            denominator_column,
            interval_size,
        )

        # Number of posterior draws used for every Monte Carlo estimate.
        self._monte_carlo_sample_size = 500000

        # Initialize priors.
        if prior_alpha_column is None or prior_beta_column is None:
            self._prior_alpha_column = None
            self._prior_beta_column = None
            self._alpha_prior, self._beta_prior = (0.5, 0.5)
        else:
            # Remember the column names so the prior can be looked up for the
            # specific row/group being evaluated. The full columns are kept on
            # the instance as a fallback and for backward compatibility.
            self._prior_alpha_column = prior_alpha_column
            self._prior_beta_column = prior_beta_column
            self._alpha_prior = data_frame[prior_alpha_column]
            self._beta_prior = data_frame[prior_beta_column]

    def _prior_parameters(self, data):
        """Return the (alpha, beta) prior applying to a row or a group.

        Args:
            data (pd.Series or pd.DataFrame): A single row, or the rows of
                one group.

        Returns:
            tuple: prior_alpha, prior_beta. When prior columns were configured
            and are present in `data`, the values are read from `data` (first
            row for a DataFrame). Otherwise the instance-level priors are
            returned unchanged.
        """
        alpha_column = getattr(self, "_prior_alpha_column", None)
        beta_column = getattr(self, "_prior_beta_column", None)
        if alpha_column is None or beta_column is None:
            return self._alpha_prior, self._beta_prior
        # `in` checks the columns of a DataFrame and the index of a Series.
        if alpha_column not in data or beta_column not in data:
            return self._alpha_prior, self._beta_prior

        prior_alpha, prior_beta = data[alpha_column], data[beta_column]
        if isinstance(data, pd.DataFrame):
            # Same convention as the counts: the first row represents the group.
            prior_alpha, prior_beta = prior_alpha.values[0], prior_beta.values[0]
        return prior_alpha, prior_beta

    def _credible_interval(self, samples):
        """Return the equal-tailed (lower, upper) credible interval bounds of
        the given posterior samples, sized by `self._interval_size`."""
        lower_quantile = (1.0 - self._interval_size) / 2
        sample_series = pd.Series(samples)
        return (
            sample_series.quantile(lower_quantile),
            sample_series.quantile(lower_quantile + self._interval_size),
        )

    def _format_difference_axes(self, ch, values, absolute):
        """Apply the labels and tick format shared by all difference charts.

        Args:
            ch: Chart to style.
            values (pd.Series): Plotted difference values; their extremes
                determine the x-axis tick precision.
            absolute (bool): Whether the plotted differences are absolute.
        """
        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute))
        ch.axes.set_yaxis_label("Probability Density")
        ch.axes.hide_yaxis()
        axis_format = axis_format_precision(values.max() * 10, values.min() * 10, absolute)
        ch.axes.set_xaxis_tick_format(axis_format)

    def _interval(self, row):
        """Credible interval of the posterior rate for a single row.

        Args:
            row: Mapping-like row holding the numerator and denominator
                columns (and the prior columns, if configured).

        Returns:
            tuple: lower and upper bound of the interval.
        """
        prior_alpha, prior_beta = self._prior_parameters(row)
        successes = row[self._numerator_column]
        failures = row[self._denominator_column] - successes
        return beta.interval(self._interval_size, successes + prior_alpha, failures + prior_beta)

    def _posterior_parameters(self, group_df):
        """Calculate parameters of posterior distribution.

        The counts are read from the first row of `group_df`.

        Returns:
            tuple of floats: posterior_alpha, posterior_beta"""
        prior_alpha, prior_beta = self._prior_parameters(group_df)
        numerator = group_df[self._numerator_column].values[0]
        denominator = group_df[self._denominator_column].values[0]
        posterior_alpha = numerator + prior_alpha
        posterior_beta = denominator - numerator + prior_beta
        return posterior_alpha, posterior_beta

    def _beta_pdf(self, group_df):
        """Beta pdf of the group's posterior on a 1000-point grid.

        The grid spans the central 99.8% of the posterior mass.

        Returns:
            pd.DataFrame: columns "x" (rate) and "y" (density).
        """
        posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
        epsilon = 0.001
        lower_range = beta.isf(1.0 - epsilon, posterior_alpha, posterior_beta)
        upper_range = beta.isf(epsilon, posterior_alpha, posterior_beta)
        x_range = np.linspace(lower_range, upper_range, 1000)
        # Evaluate the density on the whole grid in one vectorised call.
        density = beta.pdf(x_range, posterior_alpha, posterior_beta)
        return pd.DataFrame({"x": x_range, "y": density})

    def _sample_posterior(self, group_df, posterior_sample_size=None):
        """Monte Carlo sampling of the group's posterior distribution.
        Used to calculate the posterior distribution of
        the difference in Beta RVs.

        Samples are drawn with NumPy's global random state, so seed it
        (``np.random.seed``) beforehand to make the posteriors deterministic.

        Arguments:
            - group_df (pd.DataFrame): Rows of the group to sample for.
            - posterior_sample_size (int): Number of posterior
            samples (affects precision). Defaults to the instance-wide
            Monte Carlo sample size.

        Returns:
            np.ndarray: The posterior samples.
        """
        if posterior_sample_size is None:
            posterior_sample_size = self._monte_carlo_sample_size
        posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
        return np.random.beta(posterior_alpha, posterior_beta, size=posterior_sample_size)

    def _categorical_summary_plot(self, level_name, level_df, remaining_groups, groupby):
        """Chart the posterior densities of every group in `level_df`.

        Args:
            level_name: Value of the `groupby` dimension being plotted
                (used in the subtitle).
            level_df (pd.DataFrame): Data for this level.
            remaining_groups: Columns identifying the groups to draw; falls
                back to `groupby` when empty.
            groupby: Name of the grouped dimension, if any.

        Returns:
            The chart with one density area and one mean callout per group.
        """
        if not remaining_groups:
            remaining_groups = groupby
        grouped_df = level_df.groupby(remaining_groups)

        # Collect the per-group densities and concatenate once.
        group_distributions = []
        for group_name, group_df in grouped_df:
            beta_dist = self._beta_pdf(group_df)
            beta_dist["group"] = str(group_name)
            group_distributions.append(beta_dist)
        distributions = pd.concat(group_distributions, axis=0) if group_distributions else pd.DataFrame()

        # Filter out the long tails of the distributions
        peak_density = distributions.groupby("x")["y"].max()
        visible_xs = peak_density[peak_density > 0.01].index
        distributions = distributions[distributions["x"].isin(visible_xs)]

        # Remove legend if only one color
        color_column = "group" if len(grouped_df) > 1 else None

        ch = chartify.Chart()
        ch.plot.area(
            distributions,
            "x",
            "y",
            color_column=color_column,
            stacked=False,
            color_order=[str(key) for key in grouped_df.groups.keys()],
        )
        rate_label = "{} / {}".format(self._numerator_column, self._denominator_column)
        ch.set_title("Estimate of {}".format(rate_label))

        if groupby:
            ch.set_subtitle("{}: {}".format(groupby, level_name))
        else:
            ch.set_subtitle("")
        ch.axes.set_xaxis_label(rate_label)
        ch.axes.set_yaxis_label("Probability Density")
        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        axis_format = axis_format_precision(distributions["x"].min(), distributions["x"].max(), absolute=True)
        ch.axes.set_xaxis_tick_format(axis_format)

        # Restart the palette so the callout colors line up with the areas.
        ch.style.color_palette.reset_palette_order()

        # Plot callouts for the means
        for group_name, group_df in grouped_df:
            posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
            posterior_mean = posterior_alpha / (posterior_alpha + posterior_beta)
            density = beta.pdf(posterior_mean, posterior_alpha, posterior_beta)
            ch.callout.line(
                posterior_mean,
                orientation="height",
                line_color=ch.style.color_palette.next_color(),
                line_dash="dashed",
            )
            ch.callout.text("{0:.1f}%".format(posterior_mean * 100), posterior_mean, density)

        ch.axes.hide_yaxis()
        if color_column:
            ch.set_legend_location("outside_bottom")
        return ch

    def _difference_posteriors(self, data, level_1, level_2, absolute=True):
        """Sample the posterior of the difference between two levels.

        Args:
            data: Grouped data exposing `get_group` for both levels.
            level_1: Reference level.
            level_2: Level compared against the reference.
            absolute (bool): If True return level_2 - level_1, otherwise
                the relative change level_2 / level_1 - 1.

        Returns:
            np.ndarray: Samples of the difference.
        """
        # Level 1 is sampled first; the draw order matters for seeded runs.
        posterior_1 = self._sample_posterior(data.get_group(level_1))
        posterior_2 = self._sample_posterior(data.get_group(level_2))

        if absolute:
            return posterior_2 - posterior_1
        return posterior_2 / posterior_1 - 1.0

    def _differences(self, difference_posterior, level_1, level_2, absolute):
        """Summarise samples of (level_2 vs level_1) in a one-row DataFrame.

        Args:
            difference_posterior (np.ndarray): Samples of the difference.
            level_1: Reference level.
            level_2: Compared level.
            absolute (bool): Whether the samples are absolute differences.

        Returns:
            pd.DataFrame: One row with the posterior mean, credible interval,
            P(level_2 > level_1) and the potential loss/gain of each level.
        """
        credible_interval = self._credible_interval(difference_posterior)

        # Probability that posterior is greater
        # than zero (count occurrences in the MC sample)
        p_gt_zero = (difference_posterior > 0).mean()

        sample_count = len(difference_posterior)
        gains = difference_posterior[difference_posterior > 0]
        losses = difference_posterior[difference_posterior < 0]

        # Expected loss/gain average over ALL samples; samples on the other
        # side of zero contribute nothing. An empty selection sums to 0.0, so
        # no special-casing is needed when one side has no samples.
        expected_loss_v2 = losses.sum() / sample_count
        expected_gain_v2 = gains.sum() / sample_count

        # From level 1's point of view the sign of the difference flips.
        expected_loss_v1 = (gains * -1.0).sum() / sample_count
        expected_gain_v1 = (losses * -1.0).sum() / sample_count

        return pd.DataFrame(
            OrderedDict(
                [
                    ("level_1", str(level_1)),
                    ("level_2", str(level_2)),
                    ("absolute_difference", absolute),
                    ("difference", difference_posterior.mean()),
                    # At least one list-valued entry is needed so pandas can
                    # build a one-row frame from otherwise scalar values.
                    ("ci_lower", [credible_interval[0]]),
                    ("ci_upper", [credible_interval[1]]),
                    ("P(level_2 > level_1)", p_gt_zero),
                    ("level_1 potential loss", expected_loss_v1),
                    ("level_1 potential gain", expected_gain_v1),
                    ("level_2 potential loss", expected_loss_v2),
                    ("level_2 potential gain", expected_gain_v2),
                ]
            )
        )

    def _difference(self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute):
        """Difference summary for one level of the grouped dimension, with
        the group-by columns added to the result."""
        difference_df, _ = self._difference_and_difference_posterior(
            level_df, remaining_groups, level_2, level_1, absolute
        )

        self._add_group_by_columns(difference_df, groupby, level_name)

        return difference_df

    def _difference_and_difference_posterior(self, level_df, remaining_groups, level_2, level_1, absolute):
        """Validate both levels, then return the difference summary together
        with the underlying samples.

        Note the parameter order: `level_2` comes before `level_1`.

        Returns:
            tuple: (difference_df, difference_posterior)
        """
        self._validate_levels(level_df, remaining_groups, level_1)
        self._validate_levels(level_df, remaining_groups, level_2)
        # difference is posterior_2 - posterior_1
        difference_posterior = self._difference_posteriors(
            level_df.groupby(remaining_groups), level_1, level_2, absolute
        )
        difference_df = self._differences(difference_posterior, level_1, level_2, absolute)
        return difference_df, difference_posterior

    @randomization_warning_decorator
    def difference(self, level_1, level_2, absolute=True, groupby=None):
        """Return DataFrame with summary statistics of the difference between
        level 1 and level 2.

        Args:
            level_1 (str, tuple of str): Name of first level.
            level_2 (str, tuple of str): Name of second level.
            absolute (bool): If True then return the
                absolute difference (level2 - level1)
                otherwise return the relative difference (level2 / level1 - 1)
            groupby (str): Name of column.
                If specified, will return the difference for each level
                of the grouped dimension.

        Returns:
            Pandas DataFrame with the following columns:
            - level_1: Name of level 1.
            - level_2: Name of level 2.
            - absolute_difference: True if absolute.
                Absolute: level2 - level1
                Relative: level2 / level1 - 1
            - difference: Best estimate of the difference between level 2 and 1.
                Posterior mean of the difference between level 1 and level 2.
                https://en.wikipedia.org/wiki/Bayes_estimator
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P(level_2 > level_1): Probability that the level 2 > level 1.
            - level_1 potential loss: The expected loss if we
                switch to level 1, but level 2 is actually better.
            - level_1 potential gain: The expected gain if we
                switch to level 1, and level 1 is actually better.
            - level_2 potential loss: The expected loss if we
                switch to level 2, but level 1 is actually better.
            - level_2 potential gain: The expected gain if we
                switch to level 2, and level 2 is actually better.
        """
        return self._iterate_groupby_to_dataframe(
            self._difference, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute
        )

    @randomization_warning_decorator
    def _categorical_difference_plot(self, level_1, level_2, absolute, groupby):
        """Chart grid of the difference posterior between two levels, one
        chart per level of the grouped dimension."""
        return self._iterate_groupby_to_chartgrid(
            self._categorical_difference_plot_, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute
        )

    def _categorical_difference_plot_(
        self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute
    ):
        """Density chart of the sampled difference between two levels, with
        callouts at zero and at the posterior mean."""
        difference_df, difference_posterior = self._difference_and_difference_posterior(
            level_df, remaining_groups, level_2, level_1, absolute
        )

        posterior_mean = difference_df["difference"].iloc[0]

        # Take the difference posterior and create a chart
        df = pd.DataFrame({"values": difference_posterior})

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values")
        ch.set_title("Change from {} to {}".format(level_1, level_2))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed")

        # Plot callout for the mean
        ch.callout.line(
            posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0], line_dash="dashed"
        )
        ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        self._format_difference_axes(ch, df["values"], absolute)

        return ch

    def _multiple_difference_joint_dataframe(self, *args, **kwargs):
        """Summary DataFrame part of `_multiple_difference_joint_base`."""
        return self._multiple_difference_joint_base(*args, **kwargs)[0]

    def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups, groupby, level, absolute):
        """Compare `level` against the best of all other levels.

        Every level's posterior is sampled; for each draw the given level is
        compared with the maximum over the remaining levels.

        Returns:
            tuple: (difference_df, difference_posterior) where difference_df
            is a one-row summary and difference_posterior holds the samples
            of (level vs. best other level).

        Raises:
            ValueError: If there is no other level to compare against.
        """
        grouped_df = level_df.groupby(remaining_groups)
        grouped_df_keys = tuple(grouped_df.groups.keys())

        self._validate_levels(level_df, remaining_groups, level)

        # One row of samples per level, in group-key order.
        posterior_matrix = np.vstack([self._sample_posterior(grouped_df.get_group(key)) for key in grouped_df_keys])

        var_indx = grouped_df_keys.index(level)
        other_indx = [i for i, key in enumerate(grouped_df_keys) if key != level]
        if not other_indx:
            # Without this the max over zero rows below fails with an opaque
            # zero-size reduction ValueError.
            raise ValueError("Cannot compare level {} to other levels: no other levels found.".format(level))

        # Per draw: is the level at least as large as every level (itself included)?
        best_arr = (posterior_matrix[var_indx, :] >= posterior_matrix).all(axis=0)
        p_ge_all = best_arr.mean()

        end_value = posterior_matrix[var_indx]
        start_value = posterior_matrix[other_indx].max(axis=0)

        if absolute:
            difference_posterior = end_value - start_value
        else:
            difference_posterior = end_value / start_value - 1

        # E(level - best other level | level != best); the guard avoids
        # taking the mean of an empty selection.
        expected_loss = difference_posterior[~best_arr].mean() if (~best_arr).any() else 0

        # E(level - best other level | level = best)
        expected_gain = difference_posterior[best_arr].mean() if best_arr.any() else 0

        ci_lower, ci_upper = self._credible_interval(difference_posterior)

        difference_df = pd.DataFrame(
            OrderedDict(
                [
                    ("level", [str(level)]),
                    ("absolute_difference", absolute),
                    ("difference", difference_posterior.mean()),
                    ("ci_lower", ci_lower),
                    ("ci_upper", ci_upper),
                    ("P({} >= all)".format(level), p_ge_all),
                    ("{} potential loss".format(level), expected_loss),
                    ("{} potential gain".format(level), expected_gain),
                ]
            )
        )
        self._add_group_by_columns(difference_df, groupby, level_name)

        return (difference_df, difference_posterior)

    @randomization_warning_decorator
    def multiple_difference_joint(self, level, absolute=True, groupby=None):
        """Calculate the joint probability that the given level is greater
        than all other levels in the test.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return an interval for each level
                of the grouped dimension.

        Returns:
            Pandas DataFrame with the following columns:
            - level: Name of level
            - absolute_difference: True if absolute.
                Absolute: level - best other level
                Relative: level / best other level - 1
            - difference: Difference between the level and the best performing
                among the other levels.
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P(<level> >= all): Probability that the level is at least as
                large as all other levels.
            - <level> potential loss: The expected loss if we
                switch to level, but some other level is actually better.
            - <level> potential gain: The expected gain if we
                switch to level, and it is actually the best.
        """
        return self._iterate_groupby_to_dataframe(
            self._multiple_difference_joint_dataframe, groupby=groupby, level=level, absolute=absolute
        )

    def _multiple_difference_joint_plot(self, level_name, level_df, remaining_groups, groupby, level, absolute):
        """Density chart of the sampled difference between `level` and the
        best of the other levels, with callouts at zero and at the mean."""
        # The level is validated inside _multiple_difference_joint_base.
        difference_df, difference_posterior = self._multiple_difference_joint_base(
            level_name, level_df, remaining_groups, groupby, level, absolute
        )

        posterior_mean = difference_df.loc[:, "difference"].values[0]

        # Take the difference posterior and create a chart
        df = pd.DataFrame({"values": difference_posterior})

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values")
        ch.set_title("Comparison to {}".format(level))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black")

        # Plot callout for the mean
        ch.callout.line(posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0])
        ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        self._format_difference_axes(ch, df["values"], absolute)
        return ch

    @randomization_warning_decorator
    def multiple_difference_joint_plot(self, level, absolute=True, groupby=None):
        """Plot the posterior of the difference between the given level and
        the best performing of all other levels in the test.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then plot the absolute difference
                otherwise plot the relative difference.
            groupby (str): Name of column.
                If specified, will return a chart for each level
                of the grouped dimension.

        Returns:
            GroupedChart object.
        """
        return self._iterate_groupby_to_chartgrid(
            self._multiple_difference_joint_plot, groupby=groupby, level=level, absolute=absolute
        )

    def _multiple_difference(
        self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference
    ):
        """Yield one pairwise difference DataFrame per level other than
        `level`.

        When `level_as_reference` is True the given level is level_1 (the
        reference); otherwise it is level_2.
        """
        grouped_df = level_df.groupby(remaining_groups)
        other_keys = [key for key in grouped_df.groups.keys() if key != level]

        for key in other_keys:
            # Switch the subtraction order as specified.
            if level_as_reference:
                start_value, end_value = level, key
            else:
                start_value, end_value = key, level

            yield self._difference(
                level_name, level_df, remaining_groups, groupby, start_value, end_value, absolute=absolute
            )

    @randomization_warning_decorator
    def multiple_difference(self, level, absolute=True, groupby=None, level_as_reference=False):
        """Pairwise comparison of the given level to all others.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return an interval for each level
                of the grouped dimension.
            level_as_reference (bool): If True, the given level is the reference
                value for the change. (level1)

        Returns:
            Pandas DataFrame with the following columns:
            - groupby (If groupby is not None): Grouped dimension
            - level_1: Name of level 1.
            - level_2: Name of level 2.
            - absolute_difference: True if absolute.
                Absolute: level2 - level1
                Relative: level2 / level1 - 1
            - difference: Best estimate of the difference between level 2 and 1.
                Posterior mean of the difference between level 1 and level 2.
                https://en.wikipedia.org/wiki/Bayes_estimator
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P(level_2 > level_1): Probability that the level 2 > level 1.
            - level_1 potential loss: The expected loss if we
                switch to level 1, but level 2 is actually better.
            - level_1 potential gain: The expected gain if we
                switch to level 1, and level 1 is actually better.
            - level_2 potential loss: The expected loss if we
                switch to level 2, but level 1 is actually better.
            - level_2 potential gain: The expected gain if we
                switch to level 2, and level 2 is actually better.
        """
        results_df = self._iterate_groupby_to_dataframe(
            self._multiple_difference,
            groupby=groupby,
            level=level,
            absolute=absolute,
            level_as_reference=level_as_reference,
        )

        return results_df.reset_index(drop=True)

    def _categorical_multiple_difference_chart(
        self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference
    ):
        """Density chart of the sampled differences between `level` and each
        of the other levels, one colored density per other level."""
        grouped_df = level_df.groupby(remaining_groups)
        grouped_df_keys = tuple(grouped_df.groups.keys())

        self._validate_levels(level_df, remaining_groups, level)

        # One row of samples per level, in group-key order.
        posterior_matrix = np.vstack([self._sample_posterior(grouped_df.get_group(key)) for key in grouped_df_keys])

        var_indx = grouped_df_keys.index(level)
        other_indx = [i for i, key in enumerate(grouped_df_keys) if key != level]

        # The level's row broadcasts against the full matrix, giving one row
        # of differences per level.
        if level_as_reference:
            start_value, end_value = posterior_matrix[var_indx], posterior_matrix
        else:
            start_value, end_value = posterior_matrix, posterior_matrix[var_indx]

        if absolute:
            difference_posterior = end_value - start_value
        else:
            difference_posterior = end_value / start_value - 1

        # Long-format frame of the differences against every other level.
        group_frames = [
            pd.DataFrame({"values": difference_posterior[group], "group": str(grouped_df_keys[group])})
            for group in other_indx
        ]
        df = pd.concat(group_frames, axis=0) if group_frames else pd.DataFrame()

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values", color_column="group")

        title_change_label = "from" if level_as_reference else "to"
        ch.set_title("Change {} {}".format(title_change_label, level))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed")
        # Restart the palette so the callout colors line up with the densities.
        ch.style.color_palette.reset_palette_order()

        for group in other_indx:
            posterior_mean = difference_posterior[group].mean()
            # Plot callout for the mean
            ch.callout.line(
                posterior_mean,
                orientation="height",
                line_color=ch.style.color_palette.next_color(),
                line_dash="dashed",
            )
            ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        self._format_difference_axes(ch, df["values"], absolute)

        return ch

    @randomization_warning_decorator
    def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_as_reference):
        """Plot the pairwise comparison of the given level to all others.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then plot the absolute difference
                otherwise plot the relative difference.
            groupby (str): Name of column.
                If specified, will return a chart for each level
                of the grouped dimension.
            level_as_reference (bool): If True, the given level is the reference
                value for the change. (level1)

        Returns:
            GroupedChart object.
        """
        return self._iterate_groupby_to_chartgrid(
            self._categorical_multiple_difference_chart,
            groupby=groupby,
            level=level,
            absolute=absolute,
            level_as_reference=level_as_reference,
        )
728
729# class GammaPoisson(PoissonResponse):
730# pass
731
732
733# class DirichetMultinomial(MultinomialResponse):
734# def __init__(self,
735# data_frame,
736# group_columns,
737# category_column,
738# value_column,
739# prior_value_column=None):
740
741# super().__init__(data_frame, group_columns, category_column,
742# value_column)
743
744
745# class Gaussian(GaussianResponse):
746# def __init__(self,
747# data_frame,
748# groupings,
749# mean_col,
750# std_col,
751# n_col,
752# time_grouping=None,
753# prior_columns=None):
754# self.prior_lambda_column = prior_lambda_column
755# super(BaseGaussianResponse, self).__init__(
756# data_frame, groups, mean_col, std_col, n_col, time_grouping)
757# raise (NotImplementedError)
758
759
760# class DirichetCategorical(CategoricalResponse):
761# pass