Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/graphics/regressionplots.py : 10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1'''Partial Regression plot and residual plots to find misspecification
4Author: Josef Perktold
5License: BSD-3
6Created: 2011-01-23
8update
92011-06-05 : start to convert example to usable functions
102011-10-27 : docstrings
12'''
13from statsmodels.compat.python import lrange, lzip
14from statsmodels.compat.pandas import Appender
16import numpy as np
17import pandas as pd
18from patsy import dmatrix
20from statsmodels.regression.linear_model import OLS, GLS, WLS
21from statsmodels.genmod.generalized_linear_model import GLM
22from statsmodels.genmod.generalized_estimating_equations import GEE
23from statsmodels.sandbox.regression.predstd import wls_prediction_std
24from statsmodels.graphics import utils
25from statsmodels.nonparametric.smoothers_lowess import lowess
26from statsmodels.tools.tools import maybe_unwrap_results
28from ._regressionplots_doc import (
29 _plot_added_variable_doc,
30 _plot_partial_residuals_doc,
31 _plot_ceres_residuals_doc,
32 _plot_influence_doc,
33 _plot_leverage_resid2_doc)
35__all__ = ['plot_fit', 'plot_regress_exog', 'plot_partregress', 'plot_ccpr',
36 'plot_regress_exog', 'plot_partregress_grid', 'plot_ccpr_grid',
37 'add_lowess', 'abline_plot', 'influence_plot',
38 'plot_leverage_resid2', 'added_variable_resids',
39 'partial_resids', 'ceres_resids', 'plot_added_variable',
40 'plot_partial_residuals', 'plot_ceres_residuals']
42#TODO: consider moving to influence module
43def _high_leverage(results):
44 #TODO: replace 1 with k_constant
45 return 2. * (results.df_model + 1)/results.nobs
def add_lowess(ax, lines_idx=0, frac=.2, **lowess_kwargs):
    """
    Add Lowess line to a plot.

    Parameters
    ----------
    ax : AxesSubplot
        The Axes to which to add the plot
    lines_idx : int
        This is the line on the existing plot to which you want to add
        a smoothed lowess line.
    frac : float
        The fraction of the points to use when doing the lowess fit.
    lowess_kwargs
        Additional keyword arguments are passed to lowess.

    Returns
    -------
    Figure
        The figure that holds the instance.
    """
    # NOTE(review): this reads matplotlib's private Line2D attributes _x/_y
    # (the processed data arrays) rather than the public get_xdata/get_ydata
    # accessors -- presumably to get the data after unit conversion; confirm
    # before switching to the public API.
    y0 = ax.get_lines()[lines_idx]._y
    x0 = ax.get_lines()[lines_idx]._x
    # lowess returns a sorted (x, smoothed y) array; plot it as a red line.
    lres = lowess(y0, x0, frac=frac, **lowess_kwargs)
    ax.plot(lres[:, 0], lres[:, 1], 'r', lw=1.5)
    return ax.figure
def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs):
    """
    Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values
    compared to fitted values.

    Parameters
    ----------
    results : Results
        A result instance with resid, model.endog and model.exog as
        attributes.
    exog_idx : {int, str}
        Name or index of regressor in exog matrix.
    y_true : array_like, optional
        If this is not None, then the array is added to the plot.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    vlines : bool, optional
        If this not True, then the uncertainty of the fit is not
        plotted.
    **kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    # Resolve the regressor name/index before unwrapping the results wrapper.
    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    # Sort all series by the chosen regressor so the plot reads left-to-right.
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    # Fitted values as red diamonds; **kwargs lets the caller restyle them.
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    if vlines is True:
        # Vertical bars showing the prediction interval for each observation.
        _, iv_l, iv_u = wls_prediction_std(results)
        ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1,
                  color='k', alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        A result instance with resid, model.endog and model.exog as attributes.
    exog_idx : int or str
        Name or index of regressor in exog matrix.
    fig : Figure, optional
        If given, this figure is simply returned. Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        The value of `fig` if provided. Otherwise a new instance.

    Examples
    --------
    Load the Statewide Crime data set and build a model with regressors
    including the rate of high school graduation (hs_grad), population in urban
    areas (urban), households below poverty line (poverty), and single person
    households (single).  Outcome variable is the murder rate (murder).

    Build a 2 by 2 figure based on poverty showing fitted versus actual murder
    rate, residuals versus the poverty rate, partial regression plot of poverty,
    and CCPR plot for poverty rate.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_regress_exog.py
    """
    fig = utils.create_mpl_fig(fig)

    # Resolve the regressor name/index before unwrapping the results wrapper.
    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    # Panel 1 (top-left): observed and fitted values against the regressor,
    # with prediction-interval bars.
    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted',
            alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    # Panel 2 (top-right): residuals against the regressor.
    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    # Panel 3 (bottom-left): partial regression plot against all other
    # regressors (boolean mask drops the chosen column).
    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1, name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others, obs_labels=False, ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    # Panel 4 (bottom-right): component-plus-residual (CCPR) plot.
    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig
def _partial_regression(endog, exog_i, exog_others):
    """Partial regression.

    Regress ``endog`` on ``exog_i`` conditional on ``exog_others``.

    Uses OLS.

    Parameters
    ----------
    endog : array_like
    exog_i : array_like
    exog_others : array_like

    Returns
    -------
    res1c : OLS results instance

    (res1a, res1b) : tuple of OLS results instances
        results from regression of endog on exog_others and of exog_i on
        exog_others
    """
    #FIXME: This function does not appear to be used.
    # Residualize both the response and the focus regressor on the
    # remaining regressors, then regress residuals on residuals.
    res_endog = OLS(endog, exog_others).fit()
    res_exog_i = OLS(exog_i, exog_others).fit()
    res_partial = OLS(res_endog.resid, res_exog_i.resid).fit()
    return res_partial, (res_endog, res_exog_i)
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs=None, obs_labels=True, label_kwargs=None,
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
        The endogenous or response variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_i : {ndarray, str}
        The exogenous, explanatory variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_others : {ndarray, list[str]}
        Any other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use a arbitrary
        translations as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : {DataFrame, dict}
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict, optional
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.  Defaults to an empty dict.
    obs_labels : {bool, array_like}
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict, optional
        Keyword arguments that control annotate for the observation labels.
        Defaults to an empty dict.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Notes
    -----
    The slope of the fitted line is the that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate(murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty) , and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    # BUG FIX: title_kwargs/label_kwargs previously defaulted to mutable {}
    # and label_kwargs was updated in place below, so state leaked across
    # calls (and into a caller-supplied dict).  Use None sentinels and copy
    # before mutating.
    title_kwargs = {} if title_kwargs is None else title_kwargs
    label_kwargs = {} if label_kwargs is None else dict(label_kwargs)

    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    # Detect the degenerate case of "no other regressors to partial out".
    rhs_is_empty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        rhs_is_empty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        rhs_is_empty = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    if rhs_is_empty:
        endog = np.asarray(endog)
        exog_i = np.asarray(exog_i)
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0]
        # BUG FIX: the labeling and ret_coords code below used to reference
        # res_xaxis/res_yaxis, which are undefined on this branch (NameError).
        # Record the plotted coordinates explicitly instead.
        x_coords, y_coords = endog, exog_i
        res_xaxis = None
    else:
        # Residualize both variables on RHS, then plot residual vs residual.
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()
        x_coords, y_coords = xaxis_resid, yaxis_resid

    # The fitted slope equals the coefficient of exog_i in the full model.
    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        elif res_xaxis is not None:
            obs_labels = res_xaxis.model.data.row_labels
        else:
            obs_labels = None
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(x_coords, y_coords),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (x_coords, y_coords)
    else:
        return fig
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : Results instance
        A regression model results instance.
    exog_idx : {None, list[int], list[str]}
        The indices or column names of the exog used in the plot, default is
        all.
    grid : {None, tuple[int]}
        If grid is given, then it is used for the arrangement of the subplots.
        The format of grid is (nrows, ncols). If grid is None, then ncol is
        one, if there are only 2 subplots, and the number of columns is two
        otherwise.
    fig : Figure, optional
        If given, this figure is simply returned. Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        If `fig` is None, the created figure. Otherwise `fig` itself.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all other
    explanatory variables in exog.

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm

    Examples
    --------
    Using the state crime dataset separately plot the effect of the each
    variable on the on the outcome, murder rate while accounting for the effect
    of all other variables in the model visualized with a grid of partial
    regression plots.

    >>> from statsmodels.graphics.regressionplots import plot_partregress_grid
    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> plot_partregress_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress_grid.py
    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # TODO: maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    # this function does not make sense if k_vars=1

    nrows = (len(exog_idx) + 1) // 2
    ncols = 1 if nrows == len(exog_idx) else 2
    if grid is not None:
        nrows, ncols = grid
    # BUG FIX: title_kwargs was previously only assigned inside the
    # `ncols > 1` branch but used unconditionally below, raising NameError
    # for single-column layouts (e.g. one regressor).
    title_kwargs = {}
    if ncols > 1:
        # Shrink the titles so side-by-side subplots do not overlap.
        title_kwargs = {"fontdict": {"fontsize": 'small'}}

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        # All columns except idx are partialled out of y and exog[:, idx].
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y, pandas.Series(exog[:, idx],
                                          name=other_names[idx]),
                         exog_others, ax=ax, title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")
    fig.tight_layout()
    fig.subplots_adjust(top=.95)

    return fig
def plot_ccpr(results, exog_idx, ax=None):
    """
    Plot CCPR against one regressor.

    Generates a component and component-plus-residual (CCPR) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : AxesSubplot, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and rate
    of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    # Resolve the regressor name/index before unwrapping the results wrapper.
    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    # Component B_i * X_i; plot component-plus-residual against X_i.
    x1beta = x1*results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    # Fit a straight line through the component to show where the fitted
    # relationship lies.
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def plot_ccpr_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Generate CCPR plots against a set of regressors, plot in a grid.

    Generates a grid of component and component-plus-residual (CCPR) plots.

    Parameters
    ----------
    results : result instance
        A results instance with exog and params.
    exog_idx : None or list of int
        The indices or column names of the exog used in the plot.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the subplots.
        If grid is None, then ncol is one, if there are only 2 subplots, and
        the number of columns is two otherwise.
    fig : Figure, optional
        If given, this figure is simply returned. Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr : Creates CCPR plot for a single regressor.

    Notes
    -----
    Partial residual plots are formed as::

        Res + Betahat(i)*Xi versus Xi

    and CCPR adds::

        Betahat(i)*Xi versus Xi

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset separately plot the effect of the each
    variable on the on the outcome, murder rate while accounting for the effect
    of all other variables in the model.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 8))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr_grid.py
    """
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    if grid is not None:
        nrows, ncols = grid
    else:
        # Default layout: two columns for more than two plots, otherwise one.
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx)/2.))
            ncols = 2
        else:
            nrows = len(exog_idx)
            ncols = 1

    # Skip a constant column (zero variance) and shift later subplot
    # positions back by one so no grid cell is left empty.
    seen_constant = 0
    for i, idx in enumerate(exog_idx):
        if results.model.exog[:, idx].var() == 0:
            seen_constant = 1
            continue

        ax = fig.add_subplot(nrows, ncols, i+1-seen_constant)
        fig = plot_ccpr(results, exog_idx=idx, ax=ax)
        ax.set_title("")

    fig.suptitle("Component-Component Plus Residual Plot", fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.95)
    return fig
def abline_plot(intercept=None, slope=None, horiz=None, vert=None,
                model_results=None, ax=None, **kwargs):
    """
    Plot a line given an intercept and slope.

    Parameters
    ----------
    intercept : float
        The intercept of the line.
    slope : float
        The slope of the line.
    horiz : float or array_like
        Data for horizontal lines on the y-axis.
    vert : array_like
        Data for verterical lines on the x-axis.
    model_results : statsmodels results instance
        Any object that has a two-value `params` attribute. Assumed that it
        is (intercept, slope).
    ax : axes, optional
        Matplotlib axes instance.
    **kwargs
        Options passed to matplotlib.pyplot.plt.

    Returns
    -------
    Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    >>> np.random.seed(12345)
    >>> X = sm.add_constant(np.random.normal(0, 20, size=30))
    >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30)
    >>> mod = sm.OLS(y,X).fit()
    >>> fig = sm.graphics.abline_plot(model_results=mod)
    >>> ax = fig.axes[0]
    >>> ax.scatter(X[:,1], y)
    >>> ax.margins(.1)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot:: plots/graphics_regression_abline.py
    """
    if ax is not None:  # get axis limits first thing, do not change these
        x = ax.get_xlim()
    else:
        x = None

    fig, ax = utils.create_mpl_ax(ax)

    if model_results:
        intercept, slope = model_results.params
        if x is None:
            # Span the range of the model's (assumed single) regressor.
            x = [model_results.model.exog[:, 1].min(),
                 model_results.model.exog[:, 1].max()]
    else:
        if not (intercept is not None and slope is not None):
            # BUG FIX: the message previously read "intercepty".
            raise ValueError("specify slope and intercept or model_results")
        if x is None:
            x = ax.get_xlim()

    # Endpoints of the line at the current x-limits.
    data_y = [x[0]*slope+intercept, x[1]*slope+intercept]
    ax.set_xlim(x)
    #ax.set_ylim(y)

    from matplotlib.lines import Line2D

    class ABLine2D(Line2D):
        """Line2D that re-spans the axes whenever the x/y limits change."""

        def __init__(self, *args, **kwargs):
            super(ABLine2D, self).__init__(*args, **kwargs)
            self.id_xlim_callback = None
            self.id_ylim_callback = None

        def remove(self):
            # Disconnect the limit callbacks so a removed line is no longer
            # updated on limit changes.
            ax = self.axes
            if self.id_xlim_callback:
                ax.callbacks.disconnect(self.id_xlim_callback)
            if self.id_ylim_callback:
                ax.callbacks.disconnect(self.id_ylim_callback)
            super(ABLine2D, self).remove()

        def update_datalim(self, ax):
            # Recompute the two endpoints from the current x-limits so the
            # line always spans the full width of the axes.
            ax.set_autoscale_on(False)
            children = ax.get_children()
            ablines = [child for child in children if child is self]
            abline = ablines[0]
            x = ax.get_xlim()
            y = [x[0] * slope + intercept, x[1] * slope + intercept]
            abline.set_data(x, y)
            ax.figure.canvas.draw()

    # TODO: how to intercept something like a margins call and adjust?
    line = ABLine2D(x, data_y, **kwargs)
    ax.add_line(line)
    line.id_xlim_callback = ax.callbacks.connect('xlim_changed', line.update_datalim)
    line.id_ylim_callback = ax.callbacks.connect('ylim_changed', line.update_datalim)

    # BUG FIX: this previously called the nonexistent methods ax.hline and
    # ax.vline (AttributeError whenever horiz/vert was given) and used bare
    # truthiness tests, which silently dropped 0 and raise on ndarrays.
    if horiz is not None:
        for h in np.atleast_1d(horiz):
            ax.axhline(h)
    if vert is not None:
        for v in np.atleast_1d(vert):
            ax.axvline(v)
    return fig
@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results: object\n"
                        "        Results for a fitted regression model.\n"
                        "    influence: instance\n"
                        "        The instance of Influence for model."}))
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    # NOTE(review): **kwargs is accepted but never used in this function --
    # it is not forwarded to ax.scatter; confirm whether that is intended.
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    # Marker area is driven by the chosen influence criterion; only the
    # prefixes 'coo' (Cooks distance) and 'dff' (DFFITS) are recognized.
    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    # Rescale criterion values into marker areas in [8**2, size**2].
    # NOTE(review): if all criterion values are equal, old_range is 0 and the
    # division below produces inf/nan sizes -- confirm desired behavior.
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    # Flag points with unusually large residuals (two-sided t cutoff) or
    # leverage above the rule-of-thumb threshold; only those get labels.
    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    # Offset each label by half the marker radius so it sits above the point.
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results : Results\n"
                        "    Results for a fitted regression model."}))
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    # Compute the influence measures, then delegate to the private worker.
    return _influence_plot(results, results.get_influence(),
                           external=external, alpha=alpha,
                           criterion=criterion, size=size,
                           plot_alpha=plot_alpha, ax=ax, **kwargs)
@Appender(_plot_leverage_resid2_doc.format({
    'extra_params_doc': "results: object\n"
                        "    Results for a fitted regression model\n"
                        "influence: instance\n"
                        "    instance of Influence for model"}))
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                          **kwargs):

    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    # Standardize residuals so the cutoff below can use the normal quantile.
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    # Label only the observations with high leverage or large residuals.
    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
@Appender(_plot_leverage_resid2_doc.format({
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"}))
def plot_leverage_resid2(results, alpha=.05, ax=None, **kwargs):
    # Compute the influence measures, then delegate to the private worker.
    influence = results.get_influence()
    return _plot_leverage_resid2(results, influence, alpha=alpha, ax=ax,
                                 **kwargs)
@Appender(_plot_added_variable_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_added_variable(results, focus_exog, resid_type=None,
                        use_glm_weights=True, fit_kwargs=None, ax=None):
    # Scatter the residualized response against the residualized focus
    # regressor (added-variable / partial regression plot).
    fig, ax = utils.create_mpl_ax(ax)
    model = results.model

    resid_pair = added_variable_resids(
        results, focus_exog, resid_type=resid_type,
        use_glm_weights=use_glm_weights, fit_kwargs=fit_kwargs)
    endog_resid, focus_exog_resid = resid_pair

    ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6)
    ax.set_title('Added variable plot', fontsize='large')

    # Label the x axis whether focus_exog was given by name or by index.
    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel(model.endog_names + " residuals", size=15)
    return fig
@Appender(_plot_partial_residuals_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_partial_residuals(results, focus_exog, ax=None):
    # Docstring attached via the Appender decorator above.
    model = results.model

    # Resolve the focus variable to a (name, column index) pair.
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    presid = partial_resids(results, focus_exog)
    xvals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(xvals, presid, 'o', alpha=0.6)

    ax.set_title('Partial residuals plot', fontsize='large')

    xname = (focus_exog if isinstance(focus_exog, str)
             else model.exog_names[focus_exog])
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
@Appender(_plot_ceres_residuals_doc % {
    'extra_params_doc': "results : Results\n"
                        "    Results instance of a fitted regression "
                        "model."})
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None,
                         ax=None):
    # Resolve the focus variable to its name and exog column index.
    model = results.model
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # CERES residuals for the focus predictor.
    ceres = ceres_resids(results, focus_exog, frac=frac,
                         cond_means=cond_means)
    xvals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(xvals, ceres, 'o', alpha=0.6)

    ax.set_title('CERES residuals plot', fontsize='large')
    ax.set_xlabel(focus_exog, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means.  Not used if `cond_means` is provided.
    cond_means : array_like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog.  If
        this is an empty nx0 array, the conditional means are
        treated as being zero.  If None, the conditional means are
        estimated.

    Returns
    -------
    An array containing the CERES residuals.

    Raises
    ------
    ValueError
        If the model is not a GLM, GEE, or OLS model.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.

    Currently only supports GLM, GEE, and OLS models.
    """

    model = results.model

    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         model.__class__.__name__)

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # Indices of non-focus columns
    ix_nf = range(len(results.params))
    ix_nf = list(ix_nf)
    ix_nf.pop(focus_col)
    nnf = len(ix_nf)

    # Estimate the conditional means if not provided.
    if cond_means is None:

        # Below we calculate E[x | focus] where x is each column other
        # than the focus column.  We do not want the intercept when we do
        # this so we remove it here.
        # Center the non-focus columns, then keep only the left singular
        # vectors with non-negligible singular values; this yields a
        # full-rank orthonormal basis for the smoother to act on (any
        # constant/intercept column is removed by the centering).
        pexog = model.exog[:, ix_nf]
        pexog -= pexog.mean(0)
        u, s, vt = np.linalg.svd(pexog, 0)
        ii = np.flatnonzero(s > 1e-6)
        pexog = u[:, ii]

        fcol = model.exog[:, focus_col]
        cond_means = np.empty((len(fcol), pexog.shape[1]))
        for j in range(pexog.shape[1]):

            # Get the fitted values for column i given the other
            # columns (skip the intercept).
            y0 = pexog[:, j]

            # return_sorted=False keeps the smoothed values aligned with
            # the original observation order.
            cf = lowess(y0, fcol, frac=frac, return_sorted=False)

            cond_means[:, j] = cf

    # Augmented design: non-focus columns plus their conditional means.
    new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        # Transfer the residual to the linear-predictor scale via the
        # derivative of the link function.
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if new_exog.shape[1] > nnf:
        # Add back the fitted contribution of the conditional-mean columns.
        presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:])

    return presid
def partial_resids(results, focus_exog):
    """
    Returns partial residuals for a fitted model with respect to a
    'focus predictor'.

    Parameters
    ----------
    results : results instance
        A fitted regression model.
    focus_exog : {int, str}
        The column index of model.exog, or the name of a variable,
        with respect to which the partial residuals are calculated.

    Returns
    -------
    An array of partial residuals.

    Raises
    ------
    ValueError
        If the model is not a GLM, GEE, OLS, GLS, or WLS model.

    References
    ----------
    RD Cook and R Croos-Dabrera (1998). Partial residual plots in
    generalized linear models. Journal of the American Statistical
    Association, 93:442.
    """

    # TODO: could be a method of results
    # TODO: see Cook et al (1998) for a more general definition

    # The calculation follows equation (8) from Cook's paper.
    model = results.model
    resid = model.endog - results.predict()

    if isinstance(model, (GLM, GEE)):
        # Map response-scale residuals to the linear-predictor scale via
        # the derivative of the link function.
        resid *= model.family.link.deriv(results.fittedvalues)
    elif isinstance(model, (OLS, GLS, WLS)):
        pass  # No need to do anything
    else:
        raise ValueError("Partial residuals for '%s' not implemented."
                         % type(model))

    # Accept either a variable name or a column index for the focus
    # predictor (isinstance, not `type(...) is str`, per idiom).
    if isinstance(focus_exog, str):
        focus_col = model.exog_names.index(focus_exog)
    else:
        focus_col = focus_exog

    focus_val = results.params[focus_col] * model.exog[:, focus_col]

    return focus_val + resid
def added_variable_resids(results, focus_exog, resid_type=None,
                          use_glm_weights=True, fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : {int, str}
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : str
        The type of residuals to use for the dependent variable.  If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE.  If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM.  If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array_like
        The residuals for the original exog
    focus_exog_resid : array_like
        The residuals for the focus predictor

    Raises
    ------
    ValueError
        If the model is not a GEE, GLM, or OLS model, if the reduced
        fit does not converge, or if `resid_type` does not name an
        available residual attribute.

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError("model type %s not supported for added variable residuals" %
                         model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    # Resolve the focus variable to a (name, column index) pair.
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    # Build the reduced design with the focus column removed; reuse the
    # full fit's parameters (minus the focus one) as starting values.
    ii = range(exog.shape[1])
    ii = list(ii)
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    # Rebuild the same model class on the reduced design, preserving the
    # original construction keywords (family, weights, etc.).
    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)
    # NOTE(review): `converged` and `start_params` are features of
    # iteratively fitted models (GLM/GEE); confirm OLS fits accept
    # `start_params` and expose `converged` on this code path.
    if not new_result.converged:
        raise ValueError("fit did not converge when calculating added variable residuals")

    # EAFP: translate a missing residual attribute into a clear error.
    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    # The focus predictor is always residualized by linear regression;
    # for GLM/GEE optionally weighted by the IRLS working weights.
    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid