Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/bayesian/bayesian_models.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

244 statements  

1# Copyright 2017-2020 Spotify AB 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15from collections import OrderedDict 

16 

17import chartify 

18import numpy as np 

19import pandas as pd 

20from scipy.stats import beta 

21 

22from spotify_confidence.analysis.bayesian.bayesian_base import ( 

23 BaseTest, 

24 randomization_warning_decorator, 

25 axis_format_precision, 

26) 

27 

28 

class BetaBinomial(BaseTest):
    """Bayesian Beta-Binomial model for a numerator / denominator metric.

    Every group's rate gets a Beta posterior built from its numerator,
    its denominator and a Beta prior. Differences between groups are
    estimated by drawing Monte Carlo samples from those posteriors.
    """

    def __init__(
        self,
        data_frame,
        numerator_column,
        denominator_column,
        categorical_group_columns=None,
        ordinal_group_column=None,
        prior_alpha_column=None,
        prior_beta_column=None,
        interval_size=0.95,
    ):
        """
        Bayesian BetaBinomial model.

        See: https://en.wikipedia.org/wiki/Beta-binomial_distribution

        data_frame (pd.DataFrame): DataFrame
        numerator_column (str): Column name for numerator column.
        denominator_column (str): Column name for denominator column.
        categorical_group_columns (str or list): Column names
            for categorical groupings.
        ordinal_group_column (str): Column name for ordinal
            grouping (e.g. numeric or date values).
        prior_alpha_column (str): Column name to use for prior alpha.
        prior_beta_column (str): Column name to use for prior beta.
            Both prior columns must be given for either to be used;
            otherwise a Beta(0.5, 0.5) prior is applied to every group.
        interval_size (float): Size of credible intervals. Default 0.95
        """
        super().__init__(
            data_frame,
            categorical_group_columns,
            ordinal_group_column,
            numerator_column,
            denominator_column,
            interval_size,
        )

        # Number of draws taken from each posterior when sampling.
        self._monte_carlo_sample_size = 500000

        # Initialize priors.
        if prior_alpha_column is None or prior_beta_column is None:
            self._prior_alpha_column = None
            self._prior_beta_column = None
            self._alpha_prior, self._beta_prior = (0.5, 0.5)
        else:
            # Keep the column names so the prior can be read from the very
            # row / group being evaluated instead of from the whole column.
            self._prior_alpha_column = prior_alpha_column
            self._prior_beta_column = prior_beta_column
            self._alpha_prior = data_frame[prior_alpha_column]
            self._beta_prior = data_frame[prior_beta_column]

    def _prior_columns_for(self, record):
        """Return the prior column names usable with ``record``.

        Args:
            record: Row or DataFrame that is about to be evaluated.

        Returns:
            tuple of (alpha_column, beta_column) when both prior columns were
            configured and are present in ``record``; otherwise None, in which
            case callers use ``self._alpha_prior`` / ``self._beta_prior``.
        """
        # getattr keeps instances that never ran this __init__ working.
        columns = (
            getattr(self, "_prior_alpha_column", None),
            getattr(self, "_prior_beta_column", None),
        )
        if any(column is None or column not in record for column in columns):
            return None
        return columns

    def _credible_interval(self, samples):
        """Return the central ``self._interval_size`` credible interval.

        Args:
            samples: Posterior samples.

        Returns:
            tuple of floats: lower bound, upper bound.
        """
        lower_quantile = (1.0 - self._interval_size) / 2
        sample_series = pd.Series(samples)
        return (
            sample_series.quantile(lower_quantile),
            sample_series.quantile(lower_quantile + self._interval_size),
        )

    def _interval(self, row):
        """Credible interval of the posterior rate for ``row``.

        Args:
            row: Record holding the numerator and denominator columns.

        Returns:
            Lower and upper bound as returned by ``scipy.stats.beta.interval``.
        """
        prior_columns = self._prior_columns_for(row)
        if prior_columns is None:
            alpha_prior, beta_prior = self._alpha_prior, self._beta_prior
        else:
            alpha_prior, beta_prior = row[prior_columns[0]], row[prior_columns[1]]

        interval = beta.interval(
            self._interval_size,
            row[self._numerator_column] + alpha_prior,
            row[self._denominator_column] - row[self._numerator_column] + beta_prior,
        )
        return interval

    def _posterior_parameters(self, group_df):
        """Calculate parameters of posterior distribution.

        Only the first row of ``group_df`` is used.

        Returns:
            tuple of floats: posterior_alpha, posterior_beta"""
        numerator = group_df[self._numerator_column].values[0]
        denominator = group_df[self._denominator_column].values[0]

        prior_columns = self._prior_columns_for(group_df)
        if prior_columns is None:
            alpha_prior, beta_prior = self._alpha_prior, self._beta_prior
        else:
            # Read the prior from the same row the counts come from.
            alpha_prior = group_df[prior_columns[0]].values[0]
            beta_prior = group_df[prior_columns[1]].values[0]

        posterior_alpha = numerator + alpha_prior
        posterior_beta = denominator - numerator + beta_prior
        return posterior_alpha, posterior_beta

    def _beta_pdf(self, group_df):
        """Beta pdf of the posterior for the given dataframe.

        Returns:
            pd.DataFrame with columns "x" (1000 evenly spaced points between
            the 0.1% and 99.9% quantiles of the posterior) and "y" (density).
        """
        posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
        epsilon = 0.001
        lower_range = beta.isf(1.0 - epsilon, posterior_alpha, posterior_beta)
        upper_range = beta.isf(epsilon, posterior_alpha, posterior_beta)
        x_range = np.linspace(lower_range, upper_range, 1000)
        densities = beta.pdf(x_range, posterior_alpha, posterior_beta)
        return pd.DataFrame({"x": x_range, "y": densities})

    def _sample_posterior(self, group_df, posterior_sample_size=None):
        """Monte Carlo sampling of the posterior distribution.
        Used to calculate the posterior distribution of
        the difference in Beta RVs.

        Samples are drawn with ``np.random.beta``, i.e. from numpy's
        global random state.

        Arguments:
            - group_df (pd.DataFrame): Data of a single group.
            - posterior_sample_size (int): Number of posterior
            samples (affects precision). Defaults to
            ``self._monte_carlo_sample_size``.
        """
        if posterior_sample_size is None:
            posterior_sample_size = self._monte_carlo_sample_size
        posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
        return np.random.beta(posterior_alpha, posterior_beta, size=posterior_sample_size)

    def _categorical_summary_plot(self, level_name, level_df, remaining_groups, groupby):
        """Area chart of the posterior density of every group in ``level_df``.

        Args:
            level_name: Value of the ``groupby`` dimension being plotted;
                only used in the subtitle.
            level_df (pd.DataFrame): Data to plot.
            remaining_groups: Columns identifying a group; ``groupby`` is
                used instead when this is empty.
            groupby: Name of the grouped dimension, if any.

        Returns:
            chartify.Chart
        """
        if not remaining_groups:
            remaining_groups = groupby
        grouped_df = level_df.groupby(remaining_groups)

        # Collect the per-group densities and concatenate once.
        density_frames = []
        for group_name, group_df in grouped_df:
            beta_dist = self._beta_pdf(group_df)
            beta_dist["group"] = str(group_name)
            density_frames.append(beta_dist)
        distributions = pd.concat(density_frames, axis=0) if density_frames else pd.DataFrame()

        # Filter out the long tails of the distributions
        peak_density = distributions.groupby("x")["y"].max().reset_index()
        visible_xs = peak_density.loc[peak_density["y"] > 0.01, "x"]
        distributions = distributions[distributions["x"].isin(visible_xs)]

        # Remove legend if only one color
        color_column = "group" if len(grouped_df) > 1 else None

        ch = chartify.Chart()
        ch.plot.area(
            distributions,
            "x",
            "y",
            color_column=color_column,
            stacked=False,
            color_order=[str(x) for x in list(grouped_df.groups.keys())],
        )
        ch.set_title("Estimate of {} / {}".format(self._numerator_column, self._denominator_column))

        if groupby:
            ch.set_subtitle("{}: {}".format(groupby, level_name))
        else:
            ch.set_subtitle("")
        ch.axes.set_xaxis_label("{} / {}".format(self._numerator_column, self._denominator_column))
        ch.axes.set_yaxis_label("Probability Density")
        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        axis_format = axis_format_precision(distributions["x"].min(), distributions["x"].max(), absolute=True)
        ch.axes.set_xaxis_tick_format(axis_format)

        # Restart the palette so the callouts reuse the colors of the areas.
        ch.style.color_palette.reset_palette_order()

        # Plot callouts for the means
        for group_name, group_df in grouped_df:
            posterior_alpha, posterior_beta = self._posterior_parameters(group_df)
            posterior_mean = posterior_alpha / (posterior_alpha + posterior_beta)
            density = beta.pdf(posterior_mean, posterior_alpha, posterior_beta)
            ch.callout.line(
                posterior_mean,
                orientation="height",
                line_color=ch.style.color_palette.next_color(),
                line_dash="dashed",
            )
            ch.callout.text("{0:.1f}%".format(posterior_mean * 100), posterior_mean, density)

        ch.axes.hide_yaxis()
        if color_column:
            ch.set_legend_location("outside_bottom")
        return ch

    def _difference_posteriors(self, data, level_1, level_2, absolute=True):
        """Sample the posterior of the difference between two levels.

        Args:
            data: Grouped data exposing ``get_group``.
            level_1: Key of the reference group.
            level_2: Key of the group compared against the reference.
            absolute (bool): If True return ``level_2 - level_1``,
                otherwise ``level_2 / level_1 - 1``.

        Returns:
            Array of posterior samples of the difference.
        """
        posterior_1 = self._sample_posterior(data.get_group(level_1))
        posterior_2 = self._sample_posterior(data.get_group(level_2))

        if absolute:
            return posterior_2 - posterior_1
        return posterior_2 / posterior_1 - 1.0

    def _differences(self, difference_posterior, level_1, level_2, absolute):
        """Summarise posterior samples of ``level_2 - level_1``.

        Args:
            difference_posterior: Posterior samples of the difference.
            level_1: Name of the first level.
            level_2: Name of the second level.
            absolute (bool): Whether the samples are absolute differences.

        Returns:
            Single-row pd.DataFrame with the columns documented in
            ``difference``.
        """
        # Credible interval of size self._interval_size for the posterior
        credible_interval = self._credible_interval(difference_posterior)

        # Probability that posterior is greater
        # than zero (count occurrences in the MC sample)
        p_gt_zero = (difference_posterior > 0).mean()

        sample_count = len(difference_posterior)
        # Samples where level 2 beats level 1, and where it loses.
        wins_level_2 = difference_posterior[difference_posterior > 0]
        wins_level_1 = difference_posterior[difference_posterior < 0]

        # Sums are divided by the full sample size, so each figure is
        # weighted by how often that outcome occurs. An empty selection
        # sums to 0, so no special-casing is needed.
        expected_loss_v2 = wins_level_1.sum() / sample_count
        expected_gain_v2 = wins_level_2.sum() / sample_count
        # Level 1 sees the same samples with the sign flipped.
        expected_loss_v1 = (wins_level_2 * -1.0).sum() / sample_count
        expected_gain_v1 = (wins_level_1 * -1.0).sum() / sample_count

        return pd.DataFrame(
            OrderedDict(
                [
                    ("level_1", str(level_1)),
                    ("level_2", str(level_2)),
                    ("absolute_difference", absolute),
                    ("difference", difference_posterior.mean()),
                    ("ci_lower", [credible_interval[0]]),
                    ("ci_upper", [credible_interval[1]]),
                    ("P(level_2 > level_1)", p_gt_zero),
                    ("level_1 potential loss", expected_loss_v1),
                    ("level_1 potential gain", expected_gain_v1),
                    ("level_2 potential loss", expected_loss_v2),
                    ("level_2 potential gain", expected_gain_v2),
                ]
            )
        )

    def _difference(self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute):
        """Difference summary between two levels for one ``groupby`` level.

        Returns:
            pd.DataFrame produced by ``_differences``, passed through the
            inherited ``_add_group_by_columns`` helper.
        """
        difference_df, _ = self._difference_and_difference_posterior(
            level_df, remaining_groups, level_2, level_1, absolute
        )

        self._add_group_by_columns(difference_df, groupby, level_name)

        return difference_df

    def _difference_and_difference_posterior(self, level_df, remaining_groups, level_2, level_1, absolute):
        """Validate both levels, then sample and summarise their difference.

        Note the parameter order: ``level_2`` comes before ``level_1``.

        Returns:
            tuple: (summary pd.DataFrame, posterior samples of the difference)
        """
        self._validate_levels(level_df, remaining_groups, level_1)
        self._validate_levels(level_df, remaining_groups, level_2)
        # difference is posterior_2 - posterior_1
        difference_posterior = self._difference_posteriors(
            level_df.groupby(remaining_groups), level_1, level_2, absolute
        )
        difference_df = self._differences(difference_posterior, level_1, level_2, absolute)
        return difference_df, difference_posterior

    @randomization_warning_decorator
    def difference(self, level_1, level_2, absolute=True, groupby=None):
        """Return DataFrame with summary statistics of the difference between
        level 1 and level 2.

        Args:
            level_1 (str, tuple of str): Name of first level.
            level_2 (str, tuple of str): Name of second level.
            absolute (bool): If True then return the
                absolute difference (level2 - level1)
                otherwise return the relative difference (level2 / level1 - 1)
            groupby (str): Name of column.
                If specified, will return the difference for each level
                of the grouped dimension.

        Returns:
            Pandas DataFrame with the following columns:
            - level_1: Name of level 1.
            - level_2: Name of level 2.
            - absolute_difference: True if absolute.
                Absolute: level2 - level1
                Relative: level2 / level1 - 1
            - difference: Best estimate of the difference between level 2 and 1.
                Posterior mean of the difference between level 1 and level 2.
                https://en.wikipedia.org/wiki/Bayes_estimator
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P(level_2 > level_1): Probability that the level 2 > level 1.
            - level_1 potential loss: The expected loss if we
                switch to level 1, but level 2 is actually better.
            - level_1 potential gain: The expected gain if we
                switch to level 1, and level 1 is actually better.
            - level_2 potential loss: The expected loss if we
                switch to level 2, but level 1 is actually better.
            - level_2 potential gain: The expected gain if we
                switch to level 2, and level 2 is actually better.
        """
        return self._iterate_groupby_to_dataframe(
            self._difference, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute
        )

    @randomization_warning_decorator
    def _categorical_difference_plot(self, level_1, level_2, absolute, groupby):
        """Chart grid of the difference posterior between two levels.

        Builds one ``_categorical_difference_plot_`` chart per ``groupby``
        level through the inherited ``_iterate_groupby_to_chartgrid`` helper.
        """
        return self._iterate_groupby_to_chartgrid(
            self._categorical_difference_plot_, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute
        )

    def _categorical_difference_plot_(
        self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute
    ):
        """KDE chart of the posterior of the difference between two levels.

        Returns:
            chartify.Chart
        """
        difference_df, difference_posterior = self._difference_and_difference_posterior(
            level_df, remaining_groups, level_2, level_1, absolute
        )

        # The summary has exactly one row.
        posterior_mean = difference_df["difference"].iloc[0]

        # Take the difference posterior and create a chart
        df = pd.DataFrame({"values": difference_posterior})

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values")
        ch.set_title("Change from {} to {}".format(level_1, level_2))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed")

        # Plot callout for the mean
        ch.callout.line(
            posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0], line_dash="dashed"
        )
        ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute))
        ch.axes.set_yaxis_label("Probability Density")
        ch.axes.hide_yaxis()
        axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute)
        ch.axes.set_xaxis_tick_format(axis_format)

        return ch

    def _multiple_difference_joint_dataframe(self, *args, **kwargs):
        """Return only the summary DataFrame of ``_multiple_difference_joint_base``."""
        return self._multiple_difference_joint_base(*args, **kwargs)[0]

    def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups, groupby, level, absolute):
        """Compare ``level`` against the best of all other levels.

        Args:
            level_name: Value of the ``groupby`` dimension being processed.
            level_df (pd.DataFrame): Data to compare.
            remaining_groups: Columns identifying a level.
            groupby: Name of the grouped dimension, if any.
            level: Key of the level to compare against all others.
            absolute (bool): If True use ``level - best other``,
                otherwise ``level / best other - 1``.

        Returns:
            tuple: (single-row summary pd.DataFrame, posterior samples of
            the difference)

        Raises:
            ValueError: If there is no other level to compare against.
        """
        grouped_df = level_df.groupby(remaining_groups)
        grouped_df_keys = tuple(grouped_df.groups.keys())

        self._validate_levels(level_df, remaining_groups, level)

        var_indx = grouped_df_keys.index(level)
        other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level]
        if not other_indx:
            # Otherwise the max over an empty selection below fails with an
            # opaque numpy reduction error.
            raise ValueError("Need at least one level other than {} to compare against.".format(level))

        # One row of posterior samples per level, in key order.
        posterior_matrix = np.vstack([self._sample_posterior(grouped_df.get_group(key)) for key in grouped_df_keys])

        # Per draw: is the sample of `level` at least as large as every level's?
        ge_bool_matrix = posterior_matrix[var_indx, :] >= posterior_matrix[:, :]
        best_arr = ge_bool_matrix.all(axis=0)
        p_ge_all = best_arr.mean()

        end_value = posterior_matrix[var_indx]
        # Per draw, the best performing of the other levels.
        start_value = posterior_matrix[other_indx].max(axis=0)

        if absolute:
            difference_posterior = end_value - start_value
        else:
            difference_posterior = end_value / start_value - 1

        # E(level - best other level | level != best)
        if not (~best_arr).sum():
            expected_loss = 0
        else:
            expected_loss = difference_posterior[~best_arr].mean()

        # E(level - best other level | level = best)
        if not best_arr.sum():
            expected_gain = 0
        else:
            expected_gain = difference_posterior[best_arr].mean()

        expectation = difference_posterior.mean()
        ci_l_expectation, ci_u_expectation = self._credible_interval(difference_posterior)

        difference_df = pd.DataFrame(
            OrderedDict(
                [
                    ("level", [str(level)]),
                    ("absolute_difference", absolute),
                    ("difference", expectation),
                    ("ci_lower", ci_l_expectation),
                    ("ci_upper", ci_u_expectation),
                    ("P({} >= all)".format(level), p_ge_all),
                    ("{} potential loss".format(level), expected_loss),
                    ("{} potential gain".format(level), expected_gain),
                ]
            )
        )
        self._add_group_by_columns(difference_df, groupby, level_name)

        return (difference_df, difference_posterior)

    @randomization_warning_decorator
    def multiple_difference_joint(self, level, absolute=True, groupby=None):
        """Calculate the joint probability that the given level is greater
        than all other levels in the test.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return an interval for each level
                of the grouped dimension.

        Returns:
            Pandas DataFrame with the following columns:
            - level: Name of level
            - absolute_difference: True if absolute.
                Absolute: level - best other level
                Relative: level / best other level - 1
            - difference: Difference between the level and the best performing
                among the other levels.
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P({level} >= all): Probability that the level >= all other levels.
            - {level} potential loss: The expected difference given that
                some other level is actually better.
            - {level} potential gain: The expected difference given that
                the level is actually the best.
        """
        return self._iterate_groupby_to_dataframe(
            self._multiple_difference_joint_dataframe, groupby=groupby, level=level, absolute=absolute
        )

    def _multiple_difference_joint_plot(self, level_name, level_df, remaining_groups, groupby, level, absolute):
        """KDE chart of the difference between ``level`` and the best other level.

        Returns:
            chartify.Chart
        """
        # _multiple_difference_joint_base validates `level` itself.
        difference_df, difference_posterior = self._multiple_difference_joint_base(
            level_name, level_df, remaining_groups, groupby, level, absolute
        )

        posterior_mean = difference_df.loc[:, "difference"].values[0]

        # Take the difference posterior and create a chart
        df = pd.DataFrame({"values": difference_posterior})

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values")
        ch.set_title("Comparison to {}".format(level))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black")

        # Plot callout for the mean
        ch.callout.line(posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0])
        ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute))
        ch.axes.set_yaxis_label("Probability Density")
        ch.axes.hide_yaxis()

        axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute)
        ch.axes.set_xaxis_tick_format(axis_format)
        return ch

    @randomization_warning_decorator
    def multiple_difference_joint_plot(self, level, absolute=True, groupby=None):
        """Plot the posterior of the difference between the given level and
        the best of all other levels in the test.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return a chart for each level
                of the grouped dimension.

        Returns:
            GroupedChart object.
        """
        chart_grid = self._iterate_groupby_to_chartgrid(
            self._multiple_difference_joint_plot, groupby=groupby, level=level, absolute=absolute
        )

        return chart_grid

    def _multiple_difference(
        self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference
    ):
        """Yield the pairwise difference summary of ``level`` vs. every other level.

        Args:
            level_as_reference (bool): If True ``level`` is level 1 (the
                reference) of each comparison, otherwise it is level 2.

        Yields:
            pd.DataFrame: One ``_difference`` result per other level.
        """
        grouped_df = level_df.groupby(remaining_groups)
        other_keys = [key for key in grouped_df.groups.keys() if key != level]

        for key in other_keys:
            # Switch the subtraction order as specified.
            if level_as_reference:
                start_value, end_value = level, key
            else:
                start_value, end_value = key, level

            yield self._difference(
                level_name, level_df, remaining_groups, groupby, start_value, end_value, absolute=absolute
            )

    @randomization_warning_decorator
    def multiple_difference(self, level, absolute=True, groupby=None, level_as_reference=False):
        """Pairwise comparison of the given level to all others.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return an interval for each level
                of the grouped dimension.
            level_as_reference (bool): If True, the given level is the reference
                value for the change. (level1)

        Returns:
            Pandas DataFrame with the following columns:
            - groupby (If groupby is not None): Grouped dimension
            - level_1: Name of level 1.
            - level_2: Name of level 2.
            - absolute_difference: True if absolute.
                Absolute: level2 - level1
                Relative: level2 / level1 - 1
            - difference: Best estimate of the difference between level 2 and 1.
                Posterior mean of the difference between level 1 and level 2.
                https://en.wikipedia.org/wiki/Bayes_estimator
            - ci_lower: Lower credible interval bound of the difference.
            - ci_upper: Upper credible interval bound of the difference.
            - P(level_2 > level_1): Probability that the level 2 > level 1.
            - level_1 potential loss: The expected loss if we
                switch to level 1, but level 2 is actually better.
            - level_1 potential gain: The expected gain if we
                switch to level 1, and level 1 is actually better.
            - level_2 potential loss: The expected loss if we
                switch to level 2, but level 1 is actually better.
            - level_2 potential gain: The expected gain if we
                switch to level 2, and level 2 is actually better.
        """
        results_df = self._iterate_groupby_to_dataframe(
            self._multiple_difference,
            groupby=groupby,
            level=level,
            absolute=absolute,
            level_as_reference=level_as_reference,
        )

        return results_df.reset_index(drop=True)

    def _categorical_multiple_difference_chart(
        self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference
    ):
        """KDE chart of the difference between ``level`` and each other level.

        Returns:
            chartify.Chart
        """
        grouped_df = level_df.groupby(remaining_groups)
        grouped_df_keys = tuple(grouped_df.groups.keys())

        self._validate_levels(level_df, remaining_groups, level)

        # One row of posterior samples per level, in key order.
        posterior_matrix = np.vstack([self._sample_posterior(grouped_df.get_group(key)) for key in grouped_df_keys])

        var_indx = grouped_df_keys.index(level)
        other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level]

        # The row of `level` broadcasts against the full matrix, so every
        # row of the result is the difference for one level.
        start_value = posterior_matrix[var_indx]
        end_value = posterior_matrix
        if not level_as_reference:
            start_value, end_value = end_value, start_value

        if absolute:
            difference_posterior = end_value - start_value
        else:
            difference_posterior = end_value / start_value - 1

        # Collect the per-level samples and concatenate once.
        sample_frames = [
            pd.DataFrame({"values": difference_posterior[group], "group": str(grouped_df_keys[group])})
            for group in other_indx
        ]
        df = pd.concat(sample_frames, axis=0) if sample_frames else pd.DataFrame()

        ch = chartify.Chart(y_axis_type="density", x_axis_type="linear")
        ch.plot.kde(df, "values", color_column="group")

        title_change_label = "from" if level_as_reference else "to"
        ch.set_title("Change {} {}".format(title_change_label, level))

        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)

        # Line at no difference
        ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed")
        # Restart the palette so the callouts reuse the colors of the curves.
        ch.style.color_palette.reset_palette_order()

        for group in other_indx:
            posterior_mean = difference_posterior[group].mean()
            # Plot callout for the mean
            ch.callout.line(
                posterior_mean,
                orientation="height",
                line_color=ch.style.color_palette.next_color(),
                line_dash="dashed",
            )
            ch.callout.text("Expected change: {0:.2f}%".format(posterior_mean * 100), posterior_mean, 0, angle=90)

        ch.set_source_label("")
        ch.axes.set_yaxis_range(0)
        ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute))
        ch.axes.set_yaxis_label("Probability Density")
        ch.axes.hide_yaxis()
        axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute)
        ch.axes.set_xaxis_tick_format(axis_format)

        return ch

    @randomization_warning_decorator
    def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_as_reference):
        """Pairwise comparison of the given level to all others.

        Args:
            level (str, tuple of str): Name of level.
            absolute (bool): If True then return the absolute difference
                otherwise return the relative difference.
            groupby (str): Name of column.
                If specified, will return a chart for each level
                of the grouped dimension.
            level_as_reference (bool): If True, the given level is the reference
                value for the change. (level1)

        Returns:
            GroupedChart object.
        """
        chart_grid = self._iterate_groupby_to_chartgrid(
            self._categorical_multiple_difference_chart,
            groupby=groupby,
            level=level,
            absolute=absolute,
            level_as_reference=level_as_reference,
        )

        return chart_grid

727 

728 

729# class GammaPoisson(PoissonResponse): 

730# pass 

731 

732 

733# class DirichletMultinomial(MultinomialResponse): 

734# def __init__(self, 

735# data_frame, 

736# group_columns, 

737# category_column, 

738# value_column, 

739# prior_value_column=None): 

740 

741# super().__init__(data_frame, group_columns, category_column, 

742# value_column) 

743 

744 

745# class Gaussian(GaussianResponse): 

746# def __init__(self, 

747# data_frame, 

748# groupings, 

749# mean_col, 

750# std_col, 

751# n_col, 

752# time_grouping=None, 

753# prior_columns=None): 

754# self.prior_lambda_column = prior_lambda_column 

755# super(BaseGaussianResponse, self).__init__( 

756# data_frame, groups, mean_col, std_col, n_col, time_grouping) 

757# raise (NotImplementedError) 

758 

759 

760# class DirichletCategorical(CategoricalResponse): 

761# pass