Coverage for ParTIpy/optim.py: 12%

78 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-16 10:22 +0100

1""" 

2Optimize the archetypal analysis objective by block coordiante descent. 

3 

4a) Regularized Nonnegative Least Squares 

5- Paper: A. Cutler and L. Breiman, “Archetypal analysis,” Technometrics, vol. 36, no. 4, pp. 338–347, 1994, doi: 10.1080/00401706.1994.10485840. 

6 

7 

8b) Projected Gradients (PCHA) 

9- Paper: M. Mørup and L. K. Hansen, “Archetypal analysis for machine learning and data mining,” Neurocomputing, vol. 80, pp. 54–63, Mar. 2012, doi: 10.1016/j.neucom.2011.06.033. 

10 

11 

12c) Adapted Frank-Wolfe algorithm 

13- Paper: C. Bauckhage, K. Kersting, F. Hoppe, and C. Thurau, “Archetypal analysis as an autoencoder,” presented at the Workshop “New Challenges in Neural Computation” (NC2) 2015, 2015. Accessed: Feb. 10, 2025. [Online]. Available: https://publica.fraunhofer.de/handle/publica/393337 

14 

15 

16Code adapted from 

17a) https://github.com/nichohelmut/football_results/blob/master/clustering/clustering.py 

18b) https://github.com/atmguille/archetypal-analysis (by Guillermo García Cobo) 

19""" 

20 

21import numpy as np 

22import scipy.optimize 

23 

24from .const import LAMBDA 

25 

26 

27def _compute_A_regularized_nnls( 

28 X: np.ndarray, 

29 Z: np.ndarray, 

30 A: np.ndarray = None, 

31 derivative_max_iter=None, 

32) -> np.ndarray: 

33 # huge_constant is added as a new column to account for w norm constraint 

34 X_padded = np.hstack([X, (LAMBDA * np.ones(X.shape[0]))[:, None]]) 

35 Zt_padded = np.vstack([Z.T, LAMBDA * np.ones(Z.shape[0])]) 

36 

37 # Use non-negative least squares to solve the optimization problem 

38 A = np.array([scipy.optimize.nnls(A=Zt_padded, b=X_padded[n, :])[0] for n in range(X.shape[0])]) 

39 return A 

40 

41 

42def _compute_B_regularized_nnls( 

43 X: np.ndarray, 

44 A: np.ndarray, 

45 B: np.ndarray = None, 

46 derivative_max_iter=None, 

47) -> np.ndarray: 

48 Z = np.linalg.lstsq(a=A, b=X, rcond=None)[0] 

49 Z_padded = np.hstack([Z, (LAMBDA * np.ones(Z.shape[0]))[:, None]]) 

50 Xt_padded = np.vstack([X.T, LAMBDA * np.ones(X.shape[0])]) 

51 B = np.array([scipy.optimize.nnls(A=Xt_padded, b=Z_padded[k, :])[0] for k in range(Z.shape[0])]) 

52 return B 

53 

54 

55# @njit(cache=True) 

56def _compute_A_projected_gradients( 

57 X: np.ndarray, 

58 Z: np.ndarray, 

59 A: np.ndarray, 

60 derivative_max_iter: int = 10, 

61 muA: float = 1.0, 

62) -> np.ndarray: 

63 """Updates the A matrix given the data matrix X and the archetypes Z. 

64 

65 A is the matrix that provides the best convex approximation of X by Z. 

66 

67 Parameters 

68 ---------- 

69 X : numpy 2d-array 

70 Data matrix with shape (n_samples, n_features). 

71 

72 Z : numpy 2d-array 

73 Archetypes matrix with shape (n_archetypes, n_features). 

74 

75 A : numpy 2d-array 

76 A matrix with shape (n_samples, n_archetypes). 

77 

78 derivative_max_iter: int 

79 Maximum number of steps for optimization 

80 

81 Returns 

82 ------- 

83 A : numpy 2d-array 

84 Updated A matrix with shape (n_samples, n_archetypes). 

85 """ 

86 rel_tol = 1e-6 

87 prev_RSS = np.linalg.norm(X - A @ Z) ** 2 

88 for _ in range(derivative_max_iter): 

89 # brackets are VERY important to save time 

90 # [G] ~ N x K 

91 G = 2.0 * (A @ (Z @ Z.T) - X @ Z.T) 

92 G = G - np.sum(A * G, axis=1)[:, None] # chain rule of projection, check broadcasting 

93 

94 prev_A = A 

95 # NOTE: original implementation has a while True 

96 for _ in range(derivative_max_iter * 100): 

97 A = (prev_A - muA * G).clip(0) 

98 A = A / (np.sum(A, axis=1)[:, None] + np.finfo(np.float64).eps) # Avoid division by zero 

99 RSS = np.linalg.norm(X - A @ Z) ** 2 

100 if RSS <= (prev_RSS * (1 + rel_tol)): 

101 muA *= 1.2 

102 break 

103 else: 

104 muA /= 2.0 

105 return A 

106 

107 

108# @njit(cache=True) 

109def _compute_B_projected_gradients( 

110 X: np.ndarray, 

111 A: np.ndarray, 

112 B: np.ndarray, 

113 derivative_max_iter: int = 10, 

114 muB: float = 1.0, 

115) -> np.ndarray: 

116 """Updates the B matrix given the data matrix X and the A matrix. 

117 

118 Parameters 

119 ---------- 

120 X : numpy 2d-array 

121 Data matrix with shape (n_samples, n_features). 

122 

123 A : numpy 2d-array 

124 A matrix with shape (n_samples, n_archetypes). 

125 

126 B : numpy 2d-array 

127 B matrix with shape (n_archetypes, n_samples). 

128 

129 derivative_max_iter: int 

130 Maximum number of steps for optimization 

131 

132 Returns 

133 ------- 

134 B : numpy 2d-array 

135 Updated B matrix with shape (n_archetypes, n_samples). 

136 """ 

137 rel_tol = 1e-6 

138 prev_RSS = np.linalg.norm(X - A @ (B @ X)) ** 2 

139 for _ in range(derivative_max_iter): 

140 # brackets are VERY important to save time 

141 # [G] ~ K x N 

142 G = 2.0 * (((A.T @ A) @ (B @ X) @ X.T) - ((A.T @ X) @ X.T)) 

143 G = G - np.sum(B * G, axis=1)[:, None] # chain rule of projection, check broadcasting 

144 

145 prev_B = B 

146 # NOTE: original implementation has a while True 

147 for _ in range(derivative_max_iter * 100): 

148 B = (prev_B - muB * G).clip(0) 

149 B = B / (np.sum(B, axis=1)[:, None] + np.finfo(np.float64).eps) # Avoid division by zero 

150 RSS = np.linalg.norm(X - A @ (B @ X)) ** 2 

151 if RSS <= (prev_RSS * (1 + rel_tol)): 

152 muB *= 1.2 

153 break 

154 else: 

155 muB /= 2.0 

156 return B 

157 

158 

159# @njit(cache=True) 

160def _compute_A_frank_wolfe( 

161 X: np.ndarray, 

162 Z: np.ndarray, 

163 # A: np.ndarray = None, 

164 A: np.ndarray, 

165 derivative_max_iter: int = 10, 

166) -> np.ndarray: 

167 n_samples, n_archetypes = X.shape[0], Z.shape[0] 

168 # TODO: why do we initalize A here all new? 

169 # A = np.zeros((n_samples, n_archetypes)) 

170 # # TODO: why do we set here the first column of A to 1.0? 

171 # # is this just a simplest way to create a matrix A that satisfies the constraints? 

172 # A[:, 0] = 1.0 

173 e = np.zeros(A.shape) 

174 for _t in range(derivative_max_iter): 

175 # Define the objective function and its gradient 

176 # def f(A_flat): 

177 # A_mat = A_flat.reshape((n_samples, n_archetypes)) 

178 # return np.linalg.norm(X - A_mat @ Z) ** 2 

179 

180 def grad_f(A_flat): 

181 A_mat = A_flat.reshape((n_samples, n_archetypes)) 

182 return 2.0 * (A_mat @ (Z @ Z.T) - X @ Z.T).flatten() 

183 

184 # Compute the gradient at the current iterate 

185 G = grad_f(A.flatten()) 

186 

187 # For each sample, get the archetype column with the most negative gradient 

188 argmins = np.argmin(G.reshape((n_samples, n_archetypes)), axis=1) 

189 

190 # Set the indicator matrix e 

191 e[range(n_samples), argmins] = 1.0 

192 

193 # Compute the search direction 

194 # d = (e - A).flatten() 

195 

196 # Perform line search using scipy.optimize.line_search 

197 # gamma = line_search(f, grad_f, A.flatten(), d, maxiter=100)[0] 

198 # if gamma is None or gamma < 1e-6: 

199 # gamma = 1e-6 

200 gamma = 1e-3 

201 

202 # Update A 

203 A += gamma * (e - A) 

204 

205 # Reset e 

206 e[range(n_samples), argmins] = 0.0 

207 

208 assert np.allclose(np.sum(A, axis=1), 1.0), "A is not a stochastic matrix" 

209 assert np.all(A >= 0), "A has negative elements" 

210 return A 

211 

212 

213# @njit(cache=True) 

214def _compute_B_frank_wolfe( 

215 X: np.ndarray, 

216 A: np.ndarray, 

217 # B=None, 

218 B: np.ndarray, 

219 derivative_max_iter: int = 10, 

220) -> np.ndarray: 

221 n_samples, n_archetypes = X.shape[0], A.shape[1] 

222 # B = np.zeros((n_archetypes, n_samples)) 

223 # # TODO: why do we set here the first column of B to 1.0? 

224 # # is this just a simplest way to create a matrix B that satisfies the constraints? 

225 # B[:, 0] = 1.0 

226 e = np.zeros(B.shape) 

227 for _t in range(derivative_max_iter): 

228 # Define the objective function and its gradient 

229 # def f(B_flat): 

230 # B_mat = B_flat.reshape((n_archetypes, n_samples)) 

231 # return np.linalg.norm(X - A @ (B_mat @ X)) ** 2 

232 

233 def grad_f(B_flat): 

234 B_mat = B_flat.reshape((n_archetypes, n_samples)) 

235 return 2.0 * ((A.T @ A) @ (B_mat @ X) @ X.T - (A.T @ X) @ X.T).flatten() 

236 

237 # Compute the gradient at the current iterate 

238 G = grad_f(B.flatten()) 

239 

240 # For each archetype, get the sample column with the most negative gradient 

241 argmins = np.argmin(G.reshape((n_archetypes, n_samples)), axis=1) 

242 

243 # Set the indicator matrix e 

244 e[range(n_archetypes), argmins] = 1.0 

245 

246 # Compute the search direction 

247 # d = (e - B).flatten() 

248 

249 # Perform line search using scipy.optimize.line_search 

250 # gamma = line_search(f, grad_f, B.flatten(), d, maxiter=100)[0] 

251 # if gamma is None or gamma < 1e-6: 

252 # gamma = 1e-6 

253 gamma = 1e-3 

254 

255 # Update B 

256 B += gamma * (e - B) 

257 

258 # Reset e 

259 e[range(n_archetypes), argmins] = 0.0 

260 

261 assert np.allclose(np.sum(B, axis=1), 1.0), "B is not a stochastic matrix" 

262 assert np.all(B >= 0), "B has negative elements" 

263 return B