Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/multivariate/multivariate_ols.py : 13%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3"""General linear model
5author: Yichuan Liu
6"""
7import numpy as np
8from numpy.linalg import eigvals, inv, solve, matrix_rank, pinv, svd
9from scipy import stats
10import pandas as pd
11from patsy import DesignInfo
13from statsmodels.compat.pandas import Substitution
14from statsmodels.base.model import Model
15from statsmodels.iolib import summary2
16__docformat__ = 'restructuredtext en'
18_hypotheses_doc = \
19"""hypotheses : list[tuple]
20 Hypothesis `L*B*M = C` to be tested where B is the parameters in
21 regression Y = X*B. Each element is a tuple of length 2, 3, or 4:
23 * (name, contrast_L)
24 * (name, contrast_L, transform_M)
25 * (name, contrast_L, transform_M, constant_C)
27 containing a string `name`, the contrast matrix L, the transform
28 matrix M (for transforming dependent variables), and right-hand side
29 constant matrix constant_C, respectively.
31 contrast_L : 2D array or an array of strings
32 Left-hand side contrast matrix for hypotheses testing.
33 If 2D array, each row is an hypotheses and each column is an
34 independent variable. At least 1 row
35 (1 by k_exog, the number of independent variables) is required.
36 If an array of strings, it will be passed to
37 patsy.DesignInfo().linear_constraint.
39 transform_M : 2D array or an array of strings or None, optional
40 Left hand side transform matrix.
41 If `None` or left out, it is set to a k_endog by k_endog
42 identity matrix (i.e. do not transform y matrix).
43 If an array of strings, it will be passed to
44 patsy.DesignInfo().linear_constraint.
46 constant_C : 2D array or None, optional
47 Right-hand side constant matrix.
48 if `None` or left out it is set to a matrix of zeros
49 Must has the same number of rows as contrast_L and the same
50 number of columns as transform_M
52 If `hypotheses` is None: 1) the effect of each independent variable
53 on the dependent variables will be tested. Or 2) if model is created
54 using a formula, `hypotheses` will be created according to
55 `design_info`. 1) and 2) is equivalent if no additional variables
56 are created by the formula (e.g. dummy variables for categorical
57 variables and interaction terms)
58"""
61def _multivariate_ols_fit(endog, exog, method='svd', tolerance=1e-8):
62 """
63 Solve multivariate linear model y = x * params
64 where y is dependent variables, x is independent variables
66 Parameters
67 ----------
68 endog : array_like
69 each column is a dependent variable
70 exog : array_like
71 each column is a independent variable
72 method : str
73 'svd' - Singular value decomposition
74 'pinv' - Moore-Penrose pseudoinverse
75 tolerance : float, a small positive number
76 Tolerance for eigenvalue. Values smaller than tolerance is considered
77 zero.
78 Returns
79 -------
80 a tuple of matrices or values necessary for hypotheses testing
82 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
83 Notes
84 -----
85 Status: experimental and incomplete
86 """
87 y = endog
88 x = exog
89 nobs, k_endog = y.shape
90 nobs1, k_exog= x.shape
91 if nobs != nobs1:
92 raise ValueError('x(n=%d) and y(n=%d) should have the same number of '
93 'rows!' % (nobs1, nobs))
95 # Calculate the matrices necessary for hypotheses testing
96 df_resid = nobs - k_exog
97 if method == 'pinv':
98 # Regression coefficients matrix
99 pinv_x = pinv(x)
100 params = pinv_x.dot(y)
102 # inverse of x'x
103 inv_cov = pinv_x.dot(pinv_x.T)
104 if matrix_rank(inv_cov,tol=tolerance) < k_exog:
105 raise ValueError('Covariance of x singular!')
107 # Sums of squares and cross-products of residuals
108 # Y'Y - (X * params)'B * params
109 t = x.dot(params)
110 sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
111 return (params, df_resid, inv_cov, sscpr)
112 elif method == 'svd':
113 u, s, v = svd(x, 0)
114 if (s > tolerance).sum() < len(s):
115 raise ValueError('Covariance of x singular!')
116 invs = 1. / s
118 params = v.T.dot(np.diag(invs)).dot(u.T).dot(y)
119 inv_cov = v.T.dot(np.diag(np.power(invs, 2))).dot(v)
120 t = np.diag(s).dot(v).dot(params)
121 sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
122 return (params, df_resid, inv_cov, sscpr)
123 else:
124 raise ValueError('%s is not a supported method!' % method)
127def multivariate_stats(eigenvals,
128 r_err_sscp,
129 r_contrast, df_resid, tolerance=1e-8):
130 """
131 For multivariate linear model Y = X * B
132 Testing hypotheses
133 L*B*M = 0
134 where L is contrast matrix, B is the parameters of the
135 multivariate linear model and M is dependent variable transform matrix.
136 T = L*inv(X'X)*L'
137 H = M'B'L'*inv(T)*LBM
138 E = M'(Y'Y - B'X'XB)M
140 Parameters
141 ----------
142 eigenvals : ndarray
143 The eigenvalues of inv(E + H)*H
144 r_err_sscp : int
145 Rank of E + H
146 r_contrast : int
147 Rank of T matrix
148 df_resid : int
149 Residual degree of freedom (n_samples minus n_variables of X)
150 tolerance : float
151 smaller than which eigenvalue is considered 0
153 Returns
154 -------
155 A DataFrame
157 References
158 ----------
159 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
160 """
161 v = df_resid
162 p = r_err_sscp
163 q = r_contrast
164 s = np.min([p, q])
165 ind = eigenvals > tolerance
166 n_e = ind.sum()
167 eigv2 = eigenvals[ind]
168 eigv1 = np.array([i / (1 - i) for i in eigv2])
169 m = (np.abs(p - q) - 1) / 2
170 n = (v - p - 1) / 2
172 cols = ['Value', 'Num DF', 'Den DF', 'F Value', 'Pr > F']
173 index = ["Wilks' lambda", "Pillai's trace",
174 "Hotelling-Lawley trace", "Roy's greatest root"]
175 results = pd.DataFrame(columns=cols,
176 index=index)
178 def fn(x):
179 return np.real([x])[0]
181 results.loc["Wilks' lambda", 'Value'] = fn(np.prod(1 - eigv2))
183 results.loc["Pillai's trace", 'Value'] = fn(eigv2.sum())
185 results.loc["Hotelling-Lawley trace", 'Value'] = fn(eigv1.sum())
187 results.loc["Roy's greatest root", 'Value'] = fn(eigv1.max())
189 r = v - (p - q + 1)/2
190 u = (p*q - 2) / 4
191 df1 = p * q
192 if p*p + q*q - 5 > 0:
193 t = np.sqrt((p*p*q*q - 4) / (p*p + q*q - 5))
194 else:
195 t = 1
196 df2 = r*t - 2*u
197 lmd = results.loc["Wilks' lambda", 'Value']
198 lmd = np.power(lmd, 1 / t)
199 F = (1 - lmd) / lmd * df2 / df1
200 results.loc["Wilks' lambda", 'Num DF'] = df1
201 results.loc["Wilks' lambda", 'Den DF'] = df2
202 results.loc["Wilks' lambda", 'F Value'] = F
203 pval = stats.f.sf(F, df1, df2)
204 results.loc["Wilks' lambda", 'Pr > F'] = pval
206 V = results.loc["Pillai's trace", 'Value']
207 df1 = s * (2*m + s + 1)
208 df2 = s * (2*n + s + 1)
209 F = df2 / df1 * V / (s - V)
210 results.loc["Pillai's trace", 'Num DF'] = df1
211 results.loc["Pillai's trace", 'Den DF'] = df2
212 results.loc["Pillai's trace", 'F Value'] = F
213 pval = stats.f.sf(F, df1, df2)
214 results.loc["Pillai's trace", 'Pr > F'] = pval
216 U = results.loc["Hotelling-Lawley trace", 'Value']
217 if n > 0:
218 b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
219 df1 = p * q
220 df2 = 4 + (p*q + 2) / (b - 1)
221 c = (df2 - 2) / 2 / n
222 F = df2 / df1 * U / c
223 else:
224 df1 = s * (2*m + s + 1)
225 df2 = s * (s*n + 1)
226 F = df2 / df1 / s * U
227 results.loc["Hotelling-Lawley trace", 'Num DF'] = df1
228 results.loc["Hotelling-Lawley trace", 'Den DF'] = df2
229 results.loc["Hotelling-Lawley trace", 'F Value'] = F
230 pval = stats.f.sf(F, df1, df2)
231 results.loc["Hotelling-Lawley trace", 'Pr > F'] = pval
233 sigma = results.loc["Roy's greatest root", 'Value']
234 r = np.max([p, q])
235 df1 = r
236 df2 = v - r + q
237 F = df2 / df1 * sigma
238 results.loc["Roy's greatest root", 'Num DF'] = df1
239 results.loc["Roy's greatest root", 'Den DF'] = df2
240 results.loc["Roy's greatest root", 'F Value'] = F
241 pval = stats.f.sf(F, df1, df2)
242 results.loc["Roy's greatest root", 'Pr > F'] = pval
243 return results
246def _multivariate_ols_test(hypotheses, fit_results, exog_names,
247 endog_names):
248 def fn(L, M, C):
249 # .. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
250 params, df_resid, inv_cov, sscpr = fit_results
251 # t1 = (L * params)M
252 t1 = L.dot(params).dot(M) - C
253 # H = t1'L(X'X)^L't1
254 t2 = L.dot(inv_cov).dot(L.T)
255 q = matrix_rank(t2)
256 H = t1.T.dot(inv(t2)).dot(t1)
258 # E = M'(Y'Y - B'(X'X)B)M
259 E = M.T.dot(sscpr).dot(M)
260 return E, H, q, df_resid
262 return _multivariate_test(hypotheses, exog_names, endog_names, fn)
265@Substitution(hypotheses_doc=_hypotheses_doc)
266def _multivariate_test(hypotheses, exog_names, endog_names, fn):
267 """
268 Multivariate linear model hypotheses testing
270 For y = x * params, where y are the dependent variables and x are the
271 independent variables, testing L * params * M = 0 where L is the contrast
272 matrix for hypotheses testing and M is the transformation matrix for
273 transforming the dependent variables in y.
275 Algorithm:
276 T = L*inv(X'X)*L'
277 H = M'B'L'*inv(T)*LBM
278 E = M'(Y'Y - B'X'XB)M
279 And then finding the eigenvalues of inv(H + E)*H
281 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
283 Parameters
284 ----------
285 %(hypotheses_doc)s
286 k_xvar : int
287 The number of independent variables
288 k_yvar : int
289 The number of dependent variables
290 fn : function
291 a function fn(contrast_L, transform_M) that returns E, H, q, df_resid
292 where q is the rank of T matrix
294 Returns
295 -------
296 results : MANOVAResults
297 """
299 k_xvar = len(exog_names)
300 k_yvar = len(endog_names)
301 results = {}
302 for hypo in hypotheses:
303 if len(hypo) ==2:
304 name, L = hypo
305 M = None
306 C = None
307 elif len(hypo) == 3:
308 name, L, M = hypo
309 C = None
310 elif len(hypo) == 4:
311 name, L, M, C = hypo
312 else:
313 raise ValueError('hypotheses must be a tuple of length 2, 3 or 4.'
314 ' len(hypotheses)=%d' % len(hypo))
315 if any(isinstance(j, str) for j in L):
316 L = DesignInfo(exog_names).linear_constraint(L).coefs
317 else:
318 if not isinstance(L, np.ndarray) or len(L.shape) != 2:
319 raise ValueError('Contrast matrix L must be a 2-d array!')
320 if L.shape[1] != k_xvar:
321 raise ValueError('Contrast matrix L should have the same '
322 'number of columns as exog! %d != %d' %
323 (L.shape[1], k_xvar))
324 if M is None:
325 M = np.eye(k_yvar)
326 elif any(isinstance(j, str) for j in M):
327 M = DesignInfo(endog_names).linear_constraint(M).coefs.T
328 else:
329 if M is not None:
330 if not isinstance(M, np.ndarray) or len(M.shape) != 2:
331 raise ValueError('Transform matrix M must be a 2-d array!')
332 if M.shape[0] != k_yvar:
333 raise ValueError('Transform matrix M should have the same '
334 'number of rows as the number of columns '
335 'of endog! %d != %d' %
336 (M.shape[0], k_yvar))
337 if C is None:
338 C = np.zeros([L.shape[0], M.shape[1]])
339 elif not isinstance(C, np.ndarray):
340 raise ValueError('Constant matrix C must be a 2-d array!')
342 if C.shape[0] != L.shape[0]:
343 raise ValueError('contrast L and constant C must have the same '
344 'number of rows! %d!=%d'
345 % (L.shape[0], C.shape[0]))
346 if C.shape[1] != M.shape[1]:
347 raise ValueError('transform M and constant C must have the same '
348 'number of columns! %d!=%d'
349 % (M.shape[1], C.shape[1]))
350 E, H, q, df_resid = fn(L, M, C)
351 EH = np.add(E, H)
352 p = matrix_rank(EH)
354 # eigenvalues of inv(E + H)H
355 eigv2 = np.sort(eigvals(solve(EH, H)))
356 stat_table = multivariate_stats(eigv2, p, q, df_resid)
358 results[name] = {'stat':stat_table, 'contrast_L':L,
359 'transform_M':M, 'constant_C':C}
360 return results
363class _MultivariateOLS(Model):
364 """
365 Multivariate linear model via least squares
368 Parameters
369 ----------
370 endog : array_like
371 Dependent variables. A nobs x k_endog array where nobs is
372 the number of observations and k_endog is the number of dependent
373 variables
374 exog : array_like
375 Independent variables. A nobs x k_exog array where nobs is the
376 number of observations and k_exog is the number of independent
377 variables. An intercept is not included by default and should be added
378 by the user (models specified using a formula include an intercept by
379 default)
381 Attributes
382 ----------
383 endog : ndarray
384 See Parameters.
385 exog : ndarray
386 See Parameters.
387 """
388 _formula_max_endog = None
390 def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
391 if len(endog.shape) == 1 or endog.shape[1] == 1:
392 raise ValueError('There must be more than one dependent variable'
393 ' to fit multivariate OLS!')
394 super(_MultivariateOLS, self).__init__(endog, exog, missing=missing,
395 hasconst=hasconst, **kwargs)
397 def fit(self, method='svd'):
398 self._fittedmod = _multivariate_ols_fit(
399 self.endog, self.exog, method=method)
400 return _MultivariateOLSResults(self)
403class _MultivariateOLSResults(object):
404 """
405 _MultivariateOLS results class
406 """
407 def __init__(self, fitted_mv_ols):
408 if (hasattr(fitted_mv_ols, 'data') and
409 hasattr(fitted_mv_ols.data, 'design_info')):
410 self.design_info = fitted_mv_ols.data.design_info
411 else:
412 self.design_info = None
413 self.exog_names = fitted_mv_ols.exog_names
414 self.endog_names = fitted_mv_ols.endog_names
415 self._fittedmod = fitted_mv_ols._fittedmod
417 def __str__(self):
418 return self.summary().__str__()
420 @Substitution(hypotheses_doc=_hypotheses_doc)
421 def mv_test(self, hypotheses=None):
422 """
423 Linear hypotheses testing
425 Parameters
426 ----------
427 %(hypotheses_doc)s
429 Returns
430 -------
431 results: _MultivariateOLSResults
433 Notes
434 -----
435 Tests hypotheses of the form
437 L * params * M = C
439 where `params` is the regression coefficient matrix for the
440 linear model y = x * params, `L` is the contrast matrix, `M` is the
441 dependent variable transform matrix and C is the constant matrix.
442 """
443 k_xvar = len(self.exog_names)
444 if hypotheses is None:
445 if self.design_info is not None:
446 terms = self.design_info.term_name_slices
447 hypotheses = []
448 for key in terms:
449 L_contrast = np.eye(k_xvar)[terms[key], :]
450 hypotheses.append([key, L_contrast, None])
451 else:
452 hypotheses = []
453 for i in range(k_xvar):
454 name = 'x%d' % (i)
455 L = np.zeros([1, k_xvar])
456 L[i] = 1
457 hypotheses.append([name, L, None])
459 results = _multivariate_ols_test(hypotheses, self._fittedmod,
460 self.exog_names, self.endog_names)
462 return MultivariateTestResults(results,
463 self.endog_names,
464 self.exog_names)
466 def summary(self):
467 raise NotImplementedError
470class MultivariateTestResults(object):
471 """ Multivariate test results class
472 Returned by `mv_test` method of `_MultivariateOLSResults` class
474 Attributes
475 ----------
476 results : dict
477 For hypothesis name `key`:
478 results[key]['stat'] contains the multivariate test results
479 results[key]['contrast_L'] contains the contrast_L matrix
480 results[key]['transform_M'] contains the transform_M matrix
481 results[key]['constant_C'] contains the constant_C matrix
482 endog_names : str
483 exog_names : str
484 summary_frame : multiindex dataframe
485 Returns results as a multiindex dataframe
486 """
487 def __init__(self, mv_test_df, endog_names, exog_names):
488 self.results = mv_test_df
489 self.endog_names = endog_names
490 self.exog_names = exog_names
492 def __str__(self):
493 return self.summary().__str__()
495 def __getitem__(self, item):
496 return self.results[item]
498 @property
499 def summary_frame(self):
500 """
501 Return results as a multiindex dataframe
502 """
503 df = []
504 for key in self.results:
505 tmp = self.results[key]['stat'].copy()
506 tmp.loc[:, 'Effect'] = key
507 df.append(tmp.reset_index())
508 df = pd.concat(df, axis=0)
509 df = df.set_index(['Effect', 'index'])
510 df.index.set_names(['Effect', 'Statistic'], inplace=True)
511 return df
513 def summary(self, show_contrast_L=False, show_transform_M=False,
514 show_constant_C=False):
515 """
517 Parameters
518 ----------
519 contrast_L : True or False
520 Whether to show contrast_L matrix
521 transform_M : True or False
522 Whether to show transform_M matrix
523 """
524 summ = summary2.Summary()
525 summ.add_title('Multivariate linear model')
526 for key in self.results:
527 summ.add_dict({'':''})
528 df = self.results[key]['stat'].copy()
529 df = df.reset_index()
530 c = df.columns.values
531 c[0] = key
532 df.columns = c
533 df.index = ['', '', '', '']
534 summ.add_df(df)
535 if show_contrast_L:
536 summ.add_dict({key:' contrast L='})
537 df = pd.DataFrame(self.results[key]['contrast_L'],
538 columns=self.exog_names)
539 summ.add_df(df)
540 if show_transform_M:
541 summ.add_dict({key:' transform M='})
542 df = pd.DataFrame(self.results[key]['transform_M'],
543 index=self.endog_names)
544 summ.add_df(df)
545 if show_constant_C:
546 summ.add_dict({key:' constant C='})
547 df = pd.DataFrame(self.results[key]['constant_C'])
548 summ.add_df(df)
549 return summ