Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3"""Canonical correlation analysis 

4 

5author: Yichuan Liu 

6""" 

7import numpy as np 

8from numpy.linalg import svd 

9import scipy 

10import pandas as pd 

11 

12from statsmodels.base.model import Model 

13from statsmodels.iolib import summary2 

14from .multivariate_ols import multivariate_stats 

15 

16 

17class CanCorr(Model): 

18 """ 

19 Canonical correlation analysis using singular value decomposition 

20 

21 For matrices exog=x and endog=y, find projections x_cancoef and y_cancoef 

22 such that: 

23 

24 x1 = x * x_cancoef, x1' * x1 is identity matrix 

25 y1 = y * y_cancoef, y1' * y1 is identity matrix 

26 

27 and the correlation between x1 and y1 is maximized. 

28 

29 Attributes 

30 ---------- 

31 endog : ndarray 

32 See Parameters. 

33 exog : ndarray 

34 See Parameters. 

35 cancorr : ndarray 

36 The canonical correlation values 

37 y_cancoeff: ndarray 

38 The canonical coefficients for endog 

39 x_cancoeff: ndarray 

40 The canonical coefficients for exog 

41 

42 References 

43 ---------- 

44 .. [*] http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf 

45 .. [*] http://www.csun.edu/~ata20315/psy524/docs/Psy524%20Lecture%208%20CC.pdf 

46 .. [*] http://www.mathematica-journal.com/2014/06/canonical-correlation-analysis/ 

47 """ # noqa:E501 

48 def __init__(self, endog, exog, tolerance=1e-8, missing='none', hasconst=None, **kwargs): 

49 super(CanCorr, self).__init__(endog, exog, missing=missing, 

50 hasconst=hasconst, **kwargs) 

51 self._fit(tolerance) 

52 

53 def _fit(self, tolerance=1e-8): 

54 """Fit the model 

55 

56 A ValueError is raised if there are singular values smaller than the 

57 tolerance. The treatment of singular arrays might change in future. 

58 

59 Parameters 

60 ---------- 

61 tolerance : float 

62 eigenvalue tolerance, values smaller than which is considered 0 

63 """ 

64 nobs, k_yvar = self.endog.shape 

65 nobs, k_xvar = self.exog.shape 

66 k = np.min([k_yvar, k_xvar]) 

67 

68 x = np.array(self.exog) 

69 x = x - x.mean(0) 

70 y = np.array(self.endog) 

71 y = y - y.mean(0) 

72 

73 ux, sx, vx = svd(x, 0) 

74 # vx_ds = vx.T divided by sx 

75 vx_ds = vx.T 

76 mask = sx > tolerance 

77 if mask.sum() < len(mask): 

78 raise ValueError('exog is collinear.') 

79 vx_ds[:, mask] /= sx[mask] 

80 uy, sy, vy = svd(y, 0) 

81 # vy_ds = vy.T divided by sy 

82 vy_ds = vy.T 

83 mask = sy > tolerance 

84 if mask.sum() < len(mask): 

85 raise ValueError('endog is collinear.') 

86 vy_ds[:, mask] /= sy[mask] 

87 u, s, v = svd(ux.T.dot(uy), 0) 

88 

89 # Correct any roundoff 

90 self.cancorr = np.array([max(0, min(s[i], 1)) for i in range(len(s))]) 

91 

92 self.x_cancoef = vx_ds.dot(u[:, :k]) 

93 self.y_cancoef = vy_ds.dot(v.T[:, :k]) 

94 

95 def corr_test(self): 

96 """Approximate F test 

97 Perform multivariate statistical tests of the hypothesis that 

98 there is no canonical correlation between endog and exog. 

99 For each canonical correlation, testing its significance based on 

100 Wilks' lambda. 

101 

102 Returns 

103 ------- 

104 CanCorrTestResults instance 

105 """ 

106 nobs, k_yvar = self.endog.shape 

107 nobs, k_xvar = self.exog.shape 

108 eigenvals = np.power(self.cancorr, 2) 

109 stats = pd.DataFrame(columns=['Canonical Correlation', "Wilks' lambda", 

110 'Num DF','Den DF', 'F Value','Pr > F'], 

111 index=list(range(len(eigenvals) - 1, -1, -1))) 

112 prod = 1 

113 for i in range(len(eigenvals) - 1, -1, -1): 

114 prod *= 1 - eigenvals[i] 

115 p = k_yvar - i 

116 q = k_xvar - i 

117 r = (nobs - k_yvar - 1) - (p - q + 1) / 2 

118 u = (p * q - 2) / 4 

119 df1 = p * q 

120 if p ** 2 + q ** 2 - 5 > 0: 

121 t = np.sqrt(((p * q) ** 2 - 4) / (p ** 2 + q ** 2 - 5)) 

122 else: 

123 t = 1 

124 df2 = r * t - 2 * u 

125 lmd = np.power(prod, 1 / t) 

126 F = (1 - lmd) / lmd * df2 / df1 

127 stats.loc[i, 'Canonical Correlation'] = self.cancorr[i] 

128 stats.loc[i, "Wilks' lambda"] = prod 

129 stats.loc[i, 'Num DF'] = df1 

130 stats.loc[i, 'Den DF'] = df2 

131 stats.loc[i, 'F Value'] = F 

132 pval = scipy.stats.f.sf(F, df1, df2) 

133 stats.loc[i, 'Pr > F'] = pval 

134 ''' 

135 # Wilk's Chi square test of each canonical correlation 

136 df = (p - i + 1) * (q - i + 1) 

137 chi2 = a * np.log(prod) 

138 pval = stats.chi2.sf(chi2, df) 

139 stats.loc[i, 'Canonical correlation'] = self.cancorr[i] 

140 stats.loc[i, 'Chi-square'] = chi2 

141 stats.loc[i, 'DF'] = df 

142 stats.loc[i, 'Pr > ChiSq'] = pval 

143 ''' 

144 ind = stats.index.values[::-1] 

145 stats = stats.loc[ind, :] 

146 

147 # Multivariate tests (remember x has mean removed) 

148 stats_mv = multivariate_stats(eigenvals, 

149 k_yvar, k_xvar, nobs - k_xvar - 1) 

150 return CanCorrTestResults(stats, stats_mv) 

151 

152 

153class CanCorrTestResults(object): 

154 """ 

155 Canonical correlation results class 

156 

157 Attributes 

158 ---------- 

159 stats : DataFrame 

160 Contain statistical tests results for each canonical correlation 

161 stats_mv : DataFrame 

162 Contain the multivariate statistical tests results 

163 """ 

164 def __init__(self, stats, stats_mv): 

165 self.stats = stats 

166 self.stats_mv = stats_mv 

167 

168 def __str__(self): 

169 return self.summary().__str__() 

170 

171 def summary(self): 

172 summ = summary2.Summary() 

173 summ.add_title('Cancorr results') 

174 summ.add_df(self.stats) 

175 summ.add_dict({'': ''}) 

176 summ.add_dict({'Multivariate Statistics and F Approximations': ''}) 

177 summ.add_df(self.stats_mv) 

178 return summ