Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/graphics/boxplots.py : 9%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Variations on boxplots."""
3# Author: Ralf Gommers
4# Based on code by Flavio Coelho and Teemu Ikonen.
6import numpy as np
7from scipy.stats import gaussian_kde
9from . import utils
12__all__ = ['violinplot', 'beanplot']
def violinplot(data, ax=None, labels=None, positions=None, side='both',
               show_boxplot=True, plot_opts=None):
    """
    Make a violin plot of each dataset in the `data` sequence.

    A violin plot is a boxplot combined with a kernel density estimate of the
    probability density function per point.

    Parameters
    ----------
    data : sequence[array_like]
        Data arrays, one array per value in `positions`.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    labels : list[str], optional
        Tick labels for the horizontal axis.  If not given, integers
        ``1..len(data)`` are used.
    positions : array_like, optional
        Position array, used as the horizontal axis of the plot.  If not given,
        spacing of the violins will be equidistant.
    side : {'both', 'left', 'right'}, optional
        How to plot the violin.  Default is 'both'.  The 'left', 'right'
        options can be used to create asymmetric violin plots.
    show_boxplot : bool, optional
        Whether or not to show normal box plots on top of the violins.
        Default is True.
    plot_opts : dict, optional
        A dictionary with plotting options.  Any of the following can be
        provided, if not present in `plot_opts` the defaults will be used::

          - 'violin_fc', MPL color.  Fill color for violins.  Default is
            '#66c2a5'.
          - 'violin_ec', MPL color.  Edge color for violins.  Default is 'k'.
          - 'violin_lw', scalar.  Edge linewidth for violins.  Default is 1.
          - 'violin_alpha', float.  Transparency of violins.  Default is 0.5.
          - 'cutoff', bool.  If True, limit violin range to data range.
            Default is False.
          - 'cutoff_val', scalar.  How far the violins extend beyond the
            data range when `cutoff` is False.  Default is 1.5 (standard
            deviations).
          - 'cutoff_type', {'std', 'abs'}.  Whether `cutoff_val` is absolute,
            or in standard deviations.  Default is 'std'.
          - 'violin_width' : float.  Relative width of violins.  Max available
            space is 1, default is 0.8.
          - 'label_fontsize', MPL fontsize.  Adjusts fontsize only if given.
          - 'label_rotation', scalar.  Adjusts label rotation only if given.
            Specify in degrees.
          - 'bw_factor', Adjusts the scipy gaussian_kde kernel.  default: None.
            Options for scalar or callable.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `data` contains no observations at all (an empty sequence, or
        only empty arrays).

    See Also
    --------
    beanplot : Bean plot, builds on `violinplot`.
    matplotlib.pyplot.boxplot : Standard boxplot.

    Notes
    -----
    The appearance of violins can be customized with `plot_opts`.  If
    customization of boxplot elements is required, set `show_boxplot` to False
    and plot it on top of the violins by calling the Matplotlib `boxplot`
    function directly.  For example::

        violinplot(data, ax=ax, show_boxplot=False)
        ax.boxplot(data, sym='cv', whis=2.5)

    It can happen that the axis labels or tick labels fall outside the plot
    area, especially with rotated labels on the horizontal axis.  With
    Matplotlib 1.1 or higher, this can easily be fixed by calling
    ``fig.tight_layout()``.  With older Matplotlib one has to use ``plt.rc``
    or ``plt.rcParams`` to fix this, for example::

        plt.rc('figure.subplot', bottom=0.25)
        violinplot(data, ax=ax)

    References
    ----------
    J.L. Hintze and R.D. Nelson, "Violin Plots: A Box Plot-Density Trace
    Synergism", The American Statistician, Vol. 52, pp.181-84, 1998.

    Examples
    --------
    We use the American National Election Survey 1996 dataset, which has Party
    Identification of respondents as independent variable and (among other
    data) age as dependent variable.

    >>> data = sm.datasets.anes96.load_pandas()
    >>> party_ID = np.arange(7)
    >>> labels = ["Strong Democrat", "Weak Democrat", "Independent-Democrat",
    ...           "Independent-Independent", "Independent-Republican",
    ...           "Weak Republican", "Strong Republican"]

    Group age by party ID, and create a violin plot with it:

    >>> plt.rcParams['figure.subplot.bottom'] = 0.23  # keep labels visible
    >>> age = [data.exog['age'][data.endog == id] for id in party_ID]
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> sm.graphics.violinplot(age, ax=ax, labels=labels,
    ...                        plot_opts={'cutoff_val':5, 'cutoff_type':'abs',
    ...                                   'label_fontsize':'small',
    ...                                   'label_rotation':30})
    >>> ax.set_xlabel("Party identification of respondent.")
    >>> ax.set_ylabel("Age")
    >>> plt.show()

    .. plot:: plots/graphics_boxplot_violinplot.py
    """
    plot_opts = {} if plot_opts is None else plot_opts

    # Convert before validating so that one-shot iterables are not exhausted
    # by the emptiness check, and so that an empty sequence gets the same
    # clear error as a sequence of empty arrays.
    data = [np.asarray(arr) for arr in data]
    if not any(arr.size for arr in data):
        msg = "No Data to make Violin: Try again!"
        raise ValueError(msg)

    fig, ax = utils.create_mpl_ax(ax)

    if positions is None:
        positions = np.arange(len(data)) + 1

    # Determine available horizontal space for each individual violin.
    pos_span = np.max(positions) - np.min(positions)
    width = np.min([0.15 * np.max([pos_span, 1.]),
                    plot_opts.get('violin_width', 0.8) / 2.])

    # Plot violins.
    for pos_data, pos in zip(data, positions):
        _single_violin(ax, pos, pos_data, width, side, plot_opts)

    if show_boxplot:
        ax.boxplot(data, notch=1, positions=positions, vert=1)

    # Set ticks and tick labels of horizontal axis.
    _set_ticks_labels(ax, data, labels, positions, plot_opts)

    return fig
156def _single_violin(ax, pos, pos_data, width, side, plot_opts):
157 """"""
158 bw_factor = plot_opts.get('bw_factor', None)
160 def _violin_range(pos_data, plot_opts):
161 """Return array with correct range, with which violins can be plotted."""
162 cutoff = plot_opts.get('cutoff', False)
163 cutoff_type = plot_opts.get('cutoff_type', 'std')
164 cutoff_val = plot_opts.get('cutoff_val', 1.5)
166 s = 0.0
167 if not cutoff:
168 if cutoff_type == 'std':
169 s = cutoff_val * np.std(pos_data)
170 else:
171 s = cutoff_val
173 x_lower = kde.dataset.min() - s
174 x_upper = kde.dataset.max() + s
175 return np.linspace(x_lower, x_upper, 100)
177 pos_data = np.asarray(pos_data)
178 # Kernel density estimate for data at this position.
179 kde = gaussian_kde(pos_data, bw_method=bw_factor)
181 # Create violin for pos, scaled to the available space.
182 xvals = _violin_range(pos_data, plot_opts)
183 violin = kde.evaluate(xvals)
184 violin = width * violin / violin.max()
186 if side == 'both':
187 envelope_l, envelope_r = (-violin + pos, violin + pos)
188 elif side == 'right':
189 envelope_l, envelope_r = (pos, violin + pos)
190 elif side == 'left':
191 envelope_l, envelope_r = (-violin + pos, pos)
192 else:
193 msg = "`side` parameter should be one of {'left', 'right', 'both'}."
194 raise ValueError(msg)
196 # Draw the violin.
197 ax.fill_betweenx(xvals, envelope_l, envelope_r,
198 facecolor=plot_opts.get('violin_fc', '#66c2a5'),
199 edgecolor=plot_opts.get('violin_ec', 'k'),
200 lw=plot_opts.get('violin_lw', 1),
201 alpha=plot_opts.get('violin_alpha', 0.5))
203 return xvals, violin
206def _set_ticks_labels(ax, data, labels, positions, plot_opts):
207 """Set ticks and labels on horizontal axis."""
209 # Set xticks and limits.
210 ax.set_xlim([np.min(positions) - 0.5, np.max(positions) + 0.5])
211 ax.set_xticks(positions)
213 label_fontsize = plot_opts.get('label_fontsize')
214 label_rotation = plot_opts.get('label_rotation')
215 if label_fontsize or label_rotation:
216 from matplotlib.artist import setp
218 if labels is not None:
219 if not len(labels) == len(data):
220 msg = "Length of `labels` should equal length of `data`."
221 raise ValueError(msg)
223 xticknames = ax.set_xticklabels(labels)
224 if label_fontsize:
225 setp(xticknames, fontsize=label_fontsize)
227 if label_rotation:
228 setp(xticknames, rotation=label_rotation)
230 return
def beanplot(data, ax=None, labels=None, positions=None, side='both',
             jitter=False, plot_opts=None):
    """
    Bean plot of each dataset in a sequence.

    A bean plot is a combination of a `violinplot` (kernel density estimate of
    the probability density function per point) with a line-scatter plot of all
    individual data points.

    Parameters
    ----------
    data : sequence[array_like]
        Data arrays, one array per value in `positions`.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    labels : list[str], optional
        Tick labels for the horizontal axis.  If not given, integers
        ``1..len(data)`` are used.
    positions : array_like, optional
        Position array, used as the horizontal axis of the plot.  If not given,
        spacing of the violins will be equidistant.
    side : {'both', 'left', 'right'}, optional
        How to plot the violin.  Default is 'both'.  The 'left', 'right'
        options can be used to create asymmetric violin plots.
    jitter : bool, optional
        If True, jitter markers within violin instead of plotting regular lines
        around the center.  This can be useful if the data is very dense.
    plot_opts : dict, optional
        A dictionary with plotting options.  All the options for `violinplot`
        can be specified, they will simply be passed to `violinplot`.  Options
        specific to `beanplot` are:

          - 'violin_width' : float.  Relative width of violins.  Max available
            space is 1, default is 0.8.
          - 'bean_color', MPL color.  Color of bean plot lines.  Default is 'k'.
            Also used for jitter marker edge color if `jitter` is True.
          - 'bean_size', scalar.  Line length as a fraction of maximum length.
            Default is 0.5.
          - 'bean_lw', scalar.  Linewidth, default is 0.5.
          - 'bean_show_mean', bool.  If True (default), show mean as a line.
          - 'bean_show_median', bool.  If True (default), show median as a
            marker.
          - 'bean_mean_color', MPL color.  Color of mean line.  Default is 'b'.
          - 'bean_mean_lw', scalar.  Linewidth of mean line, default is 2.
          - 'bean_mean_size', scalar.  Line length as a fraction of maximum
            length.  Default is 0.5.
          - 'bean_median_color', MPL color.  Color of median marker.  Default
            is 'r'.
          - 'bean_median_marker', MPL marker.  Marker type, default is '+'.
          - 'jitter_marker', MPL marker.  Marker type for ``jitter=True``.
            Default is 'o'.
          - 'jitter_marker_size', int.  Marker size.  Default is 4.
          - 'jitter_fc', MPL color.  Jitter marker face color.  Default is
            'none' (unfilled markers).
          - 'bean_legend_text', str.  If given, add a legend with given text.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `data` contains no observations at all (an empty sequence, or
        only empty arrays).

    See Also
    --------
    violinplot : Violin plot, also used internally in `beanplot`.
    matplotlib.pyplot.boxplot : Standard boxplot.

    References
    ----------
    P. Kampstra, "Beanplot: A Boxplot Alternative for Visual Comparison of
    Distributions", J. Stat. Soft., Vol. 28, pp. 1-9, 2008.

    Examples
    --------
    We use the American National Election Survey 1996 dataset, which has Party
    Identification of respondents as independent variable and (among other
    data) age as dependent variable.

    >>> data = sm.datasets.anes96.load_pandas()
    >>> party_ID = np.arange(7)
    >>> labels = ["Strong Democrat", "Weak Democrat", "Independent-Democrat",
    ...           "Independent-Independent", "Independent-Republican",
    ...           "Weak Republican", "Strong Republican"]

    Group age by party ID, and create a bean plot with it:

    >>> plt.rcParams['figure.subplot.bottom'] = 0.23  # keep labels visible
    >>> age = [data.exog['age'][data.endog == id] for id in party_ID]
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> sm.graphics.beanplot(age, ax=ax, labels=labels,
    ...                      plot_opts={'cutoff_val':5, 'cutoff_type':'abs',
    ...                                 'label_fontsize':'small',
    ...                                 'label_rotation':30})
    >>> ax.set_xlabel("Party identification of respondent.")
    >>> ax.set_ylabel("Age")
    >>> plt.show()

    .. plot:: plots/graphics_boxplot_beanplot.py
    """
    # A None default avoids sharing one dict object between calls.
    plot_opts = {} if plot_opts is None else plot_opts

    # Fail early, before any figure is created, when there is nothing to
    # draw; otherwise the failure surfaces later as an obscure numpy/scipy
    # error.
    data = [np.asarray(arr) for arr in data]
    if not any(arr.size for arr in data):
        raise ValueError("No data to make bean plot.")

    fig, ax = utils.create_mpl_ax(ax)

    if positions is None:
        positions = np.arange(len(data)) + 1

    # Determine available horizontal space for each individual violin.  The
    # same cap applies to the violin, the bean lines and the mean line.
    pos_span = np.max(positions) - np.min(positions)
    max_half_width = 0.15 * np.max([pos_span, 1.])
    violin_width = np.min([max_half_width,
                           plot_opts.get('violin_width', 0.8) / 2.])
    bean_width = np.min([max_half_width,
                         plot_opts.get('bean_size', 0.5) / 2.])
    bean_mean_width = np.min([max_half_width,
                              plot_opts.get('bean_mean_size', 0.5) / 2.])

    legend_txt = plot_opts.get('bean_legend_text', None)
    for pos_data, pos in zip(data, positions):
        # Draw violins.
        xvals, violin = _single_violin(ax, pos, pos_data, violin_width,
                                       side, plot_opts)

        if jitter:
            # Draw data points at random coordinates within violin envelope.
            jitter_coord = pos + _jitter_envelope(pos_data, xvals, violin,
                                                  side)
            ax.plot(jitter_coord, pos_data, ls='',
                    marker=plot_opts.get('jitter_marker', 'o'),
                    ms=plot_opts.get('jitter_marker_size', 4),
                    mec=plot_opts.get('bean_color', 'k'),
                    mew=1, mfc=plot_opts.get('jitter_fc', 'none'),
                    label=legend_txt)
        else:
            # Draw bean lines.
            ax.hlines(pos_data, pos - bean_width, pos + bean_width,
                      lw=plot_opts.get('bean_lw', 0.5),
                      color=plot_opts.get('bean_color', 'k'),
                      label=legend_txt)

        # Show legend if required.
        if legend_txt is not None:
            _show_legend(ax)
            legend_txt = None  # ensure we get one entry per call to beanplot

        # Draw mean line.
        if plot_opts.get('bean_show_mean', True):
            ax.hlines(np.mean(pos_data),
                      pos - bean_mean_width, pos + bean_mean_width,
                      lw=plot_opts.get('bean_mean_lw', 2.),
                      color=plot_opts.get('bean_mean_color', 'b'))

        # Draw median marker.
        if plot_opts.get('bean_show_median', True):
            ax.plot(pos, np.median(pos_data),
                    marker=plot_opts.get('bean_median_marker', '+'),
                    color=plot_opts.get('bean_median_color', 'r'))

    # Set ticks and tick labels of horizontal axis.
    _set_ticks_labels(ax, data, labels, positions, plot_opts)

    return fig
392def _jitter_envelope(pos_data, xvals, violin, side):
393 """Determine envelope for jitter markers."""
394 if side == 'both':
395 low, high = (-1., 1.)
396 elif side == 'right':
397 low, high = (0, 1.)
398 elif side == 'left':
399 low, high = (-1., 0)
400 else:
401 raise ValueError("`side` input incorrect: %s" % side)
403 jitter_envelope = np.interp(pos_data, xvals, violin)
404 jitter_coord = jitter_envelope * np.random.uniform(low=low, high=high,
405 size=pos_data.size)
407 return jitter_coord
def _show_legend(ax):
    """
    Add a compact legend to `ax`.

    The legend is created through ``ax.legend`` with a shadowed, rounded
    frame; its text is then set to a small font and its line handles to a
    linewidth of 1.

    Parameters
    ----------
    ax : AxesSubplot
        Axes whose labelled artists should appear in the legend.
    """
    # Imported lazily so matplotlib is only required when a legend is drawn.
    from matplotlib.artist import setp

    leg = ax.legend(loc=1, shadow=True, fancybox=True, labelspacing=0.2,
                    borderpad=0.15)
    setp(leg.get_texts(), fontsize='small')
    setp(leg.get_lines(), linewidth=1)