Coverage for ParTIpy/optim.py: 12%
78 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-16 10:22 +0100
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-16 10:22 +0100
1"""
2Optimize the archetypal analysis objective by block coordiante descent.
4a) Regularized Nonnegative Least Squares
5- Paper: A. Cutler and L. Breiman, “Archetypal analysis,” Technometrics, vol. 36, no. 4, pp. 338–347, 1994, doi: 10.1080/00401706.1994.10485840.
8b) Projected Gradients (PCHA)
9- Paper: M. Mørup and L. K. Hansen, “Archetypal analysis for machine learning and data mining,” Neurocomputing, vol. 80, pp. 54–63, Mar. 2012, doi: 10.1016/j.neucom.2011.06.033.
12c) Adapted Frank-Wolfe algorithm
13- Paper: C. Bauckhage, K. Kersting, F. Hoppe, and C. Thurau, “Archetypal analysis as an autoencoder,” presented at the Workshop “New Challenges in Neural Computation” (NC2) 2015, 2015. Accessed: Feb. 10, 2025. [Online]. Available: https://publica.fraunhofer.de/handle/publica/393337
16Code adapted from
17a) https://github.com/nichohelmut/football_results/blob/master/clustering/clustering.py
18b) https://github.com/atmguille/archetypal-analysis (by Guillermo García Cobo)
19"""
21import numpy as np
22import scipy.optimize
24from .const import LAMBDA
27def _compute_A_regularized_nnls(
28 X: np.ndarray,
29 Z: np.ndarray,
30 A: np.ndarray = None,
31 derivative_max_iter=None,
32) -> np.ndarray:
33 # huge_constant is added as a new column to account for w norm constraint
34 X_padded = np.hstack([X, (LAMBDA * np.ones(X.shape[0]))[:, None]])
35 Zt_padded = np.vstack([Z.T, LAMBDA * np.ones(Z.shape[0])])
37 # Use non-negative least squares to solve the optimization problem
38 A = np.array([scipy.optimize.nnls(A=Zt_padded, b=X_padded[n, :])[0] for n in range(X.shape[0])])
39 return A
42def _compute_B_regularized_nnls(
43 X: np.ndarray,
44 A: np.ndarray,
45 B: np.ndarray = None,
46 derivative_max_iter=None,
47) -> np.ndarray:
48 Z = np.linalg.lstsq(a=A, b=X, rcond=None)[0]
49 Z_padded = np.hstack([Z, (LAMBDA * np.ones(Z.shape[0]))[:, None]])
50 Xt_padded = np.vstack([X.T, LAMBDA * np.ones(X.shape[0])])
51 B = np.array([scipy.optimize.nnls(A=Xt_padded, b=Z_padded[k, :])[0] for k in range(Z.shape[0])])
52 return B
55# @njit(cache=True)
56def _compute_A_projected_gradients(
57 X: np.ndarray,
58 Z: np.ndarray,
59 A: np.ndarray,
60 derivative_max_iter: int = 10,
61 muA: float = 1.0,
62) -> np.ndarray:
63 """Updates the A matrix given the data matrix X and the archetypes Z.
65 A is the matrix that provides the best convex approximation of X by Z.
67 Parameters
68 ----------
69 X : numpy 2d-array
70 Data matrix with shape (n_samples, n_features).
72 Z : numpy 2d-array
73 Archetypes matrix with shape (n_archetypes, n_features).
75 A : numpy 2d-array
76 A matrix with shape (n_samples, n_archetypes).
78 derivative_max_iter: int
79 Maximum number of steps for optimization
81 Returns
82 -------
83 A : numpy 2d-array
84 Updated A matrix with shape (n_samples, n_archetypes).
85 """
86 rel_tol = 1e-6
87 prev_RSS = np.linalg.norm(X - A @ Z) ** 2
88 for _ in range(derivative_max_iter):
89 # brackets are VERY important to save time
90 # [G] ~ N x K
91 G = 2.0 * (A @ (Z @ Z.T) - X @ Z.T)
92 G = G - np.sum(A * G, axis=1)[:, None] # chain rule of projection, check broadcasting
94 prev_A = A
95 # NOTE: original implementation has a while True
96 for _ in range(derivative_max_iter * 100):
97 A = (prev_A - muA * G).clip(0)
98 A = A / (np.sum(A, axis=1)[:, None] + np.finfo(np.float64).eps) # Avoid division by zero
99 RSS = np.linalg.norm(X - A @ Z) ** 2
100 if RSS <= (prev_RSS * (1 + rel_tol)):
101 muA *= 1.2
102 break
103 else:
104 muA /= 2.0
105 return A
108# @njit(cache=True)
109def _compute_B_projected_gradients(
110 X: np.ndarray,
111 A: np.ndarray,
112 B: np.ndarray,
113 derivative_max_iter: int = 10,
114 muB: float = 1.0,
115) -> np.ndarray:
116 """Updates the B matrix given the data matrix X and the A matrix.
118 Parameters
119 ----------
120 X : numpy 2d-array
121 Data matrix with shape (n_samples, n_features).
123 A : numpy 2d-array
124 A matrix with shape (n_samples, n_archetypes).
126 B : numpy 2d-array
127 B matrix with shape (n_archetypes, n_samples).
129 derivative_max_iter: int
130 Maximum number of steps for optimization
132 Returns
133 -------
134 B : numpy 2d-array
135 Updated B matrix with shape (n_archetypes, n_samples).
136 """
137 rel_tol = 1e-6
138 prev_RSS = np.linalg.norm(X - A @ (B @ X)) ** 2
139 for _ in range(derivative_max_iter):
140 # brackets are VERY important to save time
141 # [G] ~ K x N
142 G = 2.0 * (((A.T @ A) @ (B @ X) @ X.T) - ((A.T @ X) @ X.T))
143 G = G - np.sum(B * G, axis=1)[:, None] # chain rule of projection, check broadcasting
145 prev_B = B
146 # NOTE: original implementation has a while True
147 for _ in range(derivative_max_iter * 100):
148 B = (prev_B - muB * G).clip(0)
149 B = B / (np.sum(B, axis=1)[:, None] + np.finfo(np.float64).eps) # Avoid division by zero
150 RSS = np.linalg.norm(X - A @ (B @ X)) ** 2
151 if RSS <= (prev_RSS * (1 + rel_tol)):
152 muB *= 1.2
153 break
154 else:
155 muB /= 2.0
156 return B
159# @njit(cache=True)
160def _compute_A_frank_wolfe(
161 X: np.ndarray,
162 Z: np.ndarray,
163 # A: np.ndarray = None,
164 A: np.ndarray,
165 derivative_max_iter: int = 10,
166) -> np.ndarray:
167 n_samples, n_archetypes = X.shape[0], Z.shape[0]
168 # TODO: why do we initalize A here all new?
169 # A = np.zeros((n_samples, n_archetypes))
170 # # TODO: why do we set here the first column of A to 1.0?
171 # # is this just a simplest way to create a matrix A that satisfies the constraints?
172 # A[:, 0] = 1.0
173 e = np.zeros(A.shape)
174 for _t in range(derivative_max_iter):
175 # Define the objective function and its gradient
176 # def f(A_flat):
177 # A_mat = A_flat.reshape((n_samples, n_archetypes))
178 # return np.linalg.norm(X - A_mat @ Z) ** 2
180 def grad_f(A_flat):
181 A_mat = A_flat.reshape((n_samples, n_archetypes))
182 return 2.0 * (A_mat @ (Z @ Z.T) - X @ Z.T).flatten()
184 # Compute the gradient at the current iterate
185 G = grad_f(A.flatten())
187 # For each sample, get the archetype column with the most negative gradient
188 argmins = np.argmin(G.reshape((n_samples, n_archetypes)), axis=1)
190 # Set the indicator matrix e
191 e[range(n_samples), argmins] = 1.0
193 # Compute the search direction
194 # d = (e - A).flatten()
196 # Perform line search using scipy.optimize.line_search
197 # gamma = line_search(f, grad_f, A.flatten(), d, maxiter=100)[0]
198 # if gamma is None or gamma < 1e-6:
199 # gamma = 1e-6
200 gamma = 1e-3
202 # Update A
203 A += gamma * (e - A)
205 # Reset e
206 e[range(n_samples), argmins] = 0.0
208 assert np.allclose(np.sum(A, axis=1), 1.0), "A is not a stochastic matrix"
209 assert np.all(A >= 0), "A has negative elements"
210 return A
213# @njit(cache=True)
214def _compute_B_frank_wolfe(
215 X: np.ndarray,
216 A: np.ndarray,
217 # B=None,
218 B: np.ndarray,
219 derivative_max_iter: int = 10,
220) -> np.ndarray:
221 n_samples, n_archetypes = X.shape[0], A.shape[1]
222 # B = np.zeros((n_archetypes, n_samples))
223 # # TODO: why do we set here the first column of B to 1.0?
224 # # is this just a simplest way to create a matrix B that satisfies the constraints?
225 # B[:, 0] = 1.0
226 e = np.zeros(B.shape)
227 for _t in range(derivative_max_iter):
228 # Define the objective function and its gradient
229 # def f(B_flat):
230 # B_mat = B_flat.reshape((n_archetypes, n_samples))
231 # return np.linalg.norm(X - A @ (B_mat @ X)) ** 2
233 def grad_f(B_flat):
234 B_mat = B_flat.reshape((n_archetypes, n_samples))
235 return 2.0 * ((A.T @ A) @ (B_mat @ X) @ X.T - (A.T @ X) @ X.T).flatten()
237 # Compute the gradient at the current iterate
238 G = grad_f(B.flatten())
240 # For each archetype, get the sample column with the most negative gradient
241 argmins = np.argmin(G.reshape((n_archetypes, n_samples)), axis=1)
243 # Set the indicator matrix e
244 e[range(n_archetypes), argmins] = 1.0
246 # Compute the search direction
247 # d = (e - B).flatten()
249 # Perform line search using scipy.optimize.line_search
250 # gamma = line_search(f, grad_f, B.flatten(), d, maxiter=100)[0]
251 # if gamma is None or gamma < 1e-6:
252 # gamma = 1e-6
253 gamma = 1e-3
255 # Update B
256 B += gamma * (e - B)
258 # Reset e
259 e[range(n_archetypes), argmins] = 0.0
261 assert np.allclose(np.sum(B, axis=1), 1.0), "B is not a stochastic matrix"
262 assert np.all(B >= 0), "B has negative elements"
263 return B