Coverage for src/driada/dim_reduction/embedding.py: 79.70%
271 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
2from scipy.sparse.linalg import eigs
3import umap.umap_ as umap
4from pydiffmap import diffusion_map as dm
5from scipy.sparse.csgraph import shortest_path
7from sklearn.decomposition import PCA
8from sklearn.manifold import spectral_embedding, Isomap, LocallyLinearEmbedding, TSNE
9# from sklearn.cluster.spectral import discretize
11# warnings.filterwarnings("ignore")
13from .dr_base import *
14from .graph import ProximityGraph
15try:
16 from .neural import *
17except ImportError:
18 # torch is optional dependency
19 pass
20from .mvu import *
22from ..network.matrix_utils import get_inv_sqrt_diag_matrix
24def norm_cross_corr(a, b):
25 a = (a - np.mean(a)) / (np.std(a) * len(a))
26 b = (b - np.mean(b)) / (np.std(b))
27 c = np.correlate(a, b, 'full')
28 return c
31def remove_outliers(data, thr_percentile):
32 thr1 = np.mean(data) + thr_percentile * np.std(data)
33 thr2 = np.mean(data) - thr_percentile * np.std(data)
34 good_points = np.where((data < thr1) & (data > thr2))[0]
36 return good_points, data[good_points]
39class Embedding:
40 '''
41 Low-dimensional representation of data
42 '''
44 def __init__(self, init_data, init_distmat, labels, params, g=None):
45 if g is not None:
46 if isinstance(g, ProximityGraph):
47 self.graph = g
48 else:
49 raise Exception('Wrong graph type!')
51 self.all_params = e_param_filter(params)
52 for key in params:
53 setattr(self, key, params[key])
55 self.init_data = init_data
56 self.init_distmat = init_distmat
58 if self.e_method.is_linear:
59 self.transformation_matrix = None
61 self.labels = labels
62 self.coords = None
64 try:
65 self.nclasses = len(set(self.labels))
66 except:
67 self.nclasses = np.unique(self.labels)
69 if self.e_method.nn_based:
70 self.nnmodel = None
72 def build(self, kwargs=None):
73 if kwargs is None:
74 kwargs = dict()
75 fn = getattr(self, 'create_' + self.e_method_name + '_embedding_')
77 if self.e_method.requires_graph:
78 # TODO: move connectivity check to graph
79 if not self.graph.is_connected():
80 raise Exception('Graph is not connected!')
82 fn(**kwargs)
84 def create_pca_embedding_(self, verbose=True):
85 if verbose:
86 print('Calculating PCA embedding...')
88 pca = PCA(n_components=self.dim)
89 self.coords = pca.fit_transform(self.init_data.T).T
90 self.reducer_ = pca
91 # print(pca.explained_variance_ratio_)
93 def create_isomap_embedding_(self):
94 A = self.graph.adj
95 map = Isomap(n_components=self.dim, n_neighbors=self.graph.nn,
96 metric='precomputed')
97 # self.coords = sp.csr_matrix(map.fit_transform(self.graph.data.A.T).T)
98 spmatrix = shortest_path(A.todense(), method='D', directed=False)
99 self.coords = map.fit_transform(spmatrix).T
100 self.reducer_ = map
102 def create_mds_embedding_(self):
103 """Create MDS (Multi-Dimensional Scaling) embedding."""
104 from sklearn.manifold import MDS
106 # MDS typically uses a distance matrix
107 if hasattr(self, 'init_distmat') and self.init_distmat is not None:
108 # Use provided distance matrix
109 mds = MDS(n_components=self.dim, dissimilarity='precomputed', random_state=42)
110 self.coords = mds.fit_transform(self.init_distmat).T
111 else:
112 # Compute from data
113 mds = MDS(n_components=self.dim, random_state=42)
114 self.coords = mds.fit_transform(self.init_data.T).T
116 self.reducer_ = mds
119 def create_mvu_embedding_(self):
120 mvu = MaximumVarianceUnfolding(equation="berkley", solver=cp.SCS, solver_tol=1e-2,
121 eig_tol=1.0e-10, solver_iters=2500,
122 warm_start=False, seed=None)
124 self.coords = mvu.fit_transform(self.graph.data.T, self.dim,
125 self.graph.nn, dropout_rate=0)
126 self.reducer_ = mvu
128 def create_lle_embedding_(self):
129 lle = LocallyLinearEmbedding(n_components=self.dim, n_neighbors=self.graph.nn)
130 self.coords = lle.fit_transform(self.graph.data.T).T
131 self.reducer_ = lle
133 def create_hlle_embedding_(self):
134 hlle = LocallyLinearEmbedding(n_components=self.dim,
135 n_neighbors=self.graph.nn,
136 method='hessian')
137 self.coords = hlle.fit_transform(self.graph.data.T).T
138 self.reducer_ = hlle
140 def create_le_embedding_(self):
141 A = self.graph.adj
142 dim = self.dim
143 n = self.graph.n
145 DH = get_inv_sqrt_diag_matrix(A)
146 P = self.graph.get_matrix('trans')
148 start_v = np.ones(n)
149 # LR mode is much more stable, this is why we use P matrix largest eigenvalues
150 eigvals, eigvecs = eigs(P, k=dim + 1, which='LR', v0=start_v, maxiter=n * 1000)
151 # eigvals, vecs = eigs(nL, k = dim2 + 1, which = 'SM')
153 eigvals = np.asarray([np.round(np.real(x), 6) for x in eigvals])
155 if np.count_nonzero(eigvals == 1.0) > 1:
156 raise Exception('Graph is not connected, LE will result in errors!')
157 else:
158 vecs = np.real(eigvecs.T[1:])
159 vec_norms = np.array([np.real(sum([x * x for x in v])) for v in vecs])
160 vecs = vecs / vec_norms[:, np.newaxis]
161 vecs = vecs.dot(DH.toarray())
162 self.coords = vecs
164 def create_auto_le_embedding_(self):
165 A = self.graph.adj
166 dim = self.dim
168 A = A.asfptype()
169 # Convert to numpy array instead of matrix to avoid sklearn compatibility issues
170 vecs = spectral_embedding(np.asarray(A.todense()), n_components=dim, eigen_solver=None,
171 random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True).T
173 self.coords = vecs
175 def create_dmaps_embedding_(self):
176 raise Exception('not so easy to implement properly (https://sci-hub.se/10.1016/j.acha.2015.01.001)')
178 def create_auto_dmaps_embedding_(self):
179 dim = self.dim
180 nn = self.graph.nn
181 metric = self.graph.metric
182 metric_args = self.graph.metric_args
183 alpha = self.dm_alpha if hasattr(self, 'dm_alpha') else 1
185 mydmap = dm.DiffusionMap.from_sklearn(n_evecs=dim,
186 k=nn,
187 epsilon='bgh',
188 metric=metric,
189 metric_params=metric_args,
190 alpha=alpha)
192 dmap = mydmap.fit_transform(self.init_data.T)
194 self.coords = dmap.T
195 self.reducer_ = dmap
197 def create_tsne_embedding_(self):
198 model = TSNE(n_components=self.dim, verbose=1)
199 self.coords = model.fit_transform(self.init_data.T).T
200 self.reducer_ = model
202 def create_umap_embedding_(self):
203 min_dist = self.min_dist
204 reducer = umap.UMAP(n_neighbors=self.graph.nn, n_components=self.dim,
205 min_dist=min_dist)
207 self.coords = reducer.fit_transform(self.graph.data.T).T
208 self.reducer_ = reducer
210 def create_ae_embedding_(self, continue_learning=0, epochs=50, lr=1e-3, seed=42, batch_size=32,
211 enc_kwargs=None, dec_kwargs=None,
212 feature_dropout=0.2, train_size=0.8, inter_dim=100,
213 verbose=True,
214 add_corr_loss=False, corr_hyperweight=0,
215 add_mi_loss=False, mi_hyperweight=0, minimize_mi_data=None,
216 log_every=1,
217 device=None):
219 # ---------------------------------------------------------------------------
221 torch.manual_seed(seed)
222 torch.backends.cudnn.benchmark = False
223 torch.backends.cudnn.deterministic = True
225 # TODO: add train_test_split
226 train_dataset = NeuroDataset(self.init_data[:, :int(train_size * self.init_data.shape[1])])
227 test_dataset = NeuroDataset(self.init_data[:, int(train_size * self.init_data.shape[1]):])
228 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
229 test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
231 # ---------------------------------------------------------------------------
232 if device is None:
233 # use gpu if available
234 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
235 if verbose:
236 print('device:', device)
238 if not continue_learning:
239 # create a model from `AE` autoencoder class
240 # load it to the specified device, either gpu or cpu
241 # Ensure kwargs are dictionaries, not None
242 if enc_kwargs is None:
243 enc_kwargs = {}
244 if dec_kwargs is None:
245 dec_kwargs = {}
246 model = AE(orig_dim=self.init_data.shape[0], inter_dim=inter_dim, code_dim=self.dim,
247 enc_kwargs=enc_kwargs, dec_kwargs=dec_kwargs, device=device)
249 model = model.to(device)
250 else:
251 model = self.nnmodel
253 # create an optimizer object
254 optimizer = optim.Adam(model.parameters(), lr=lr)
256 # mean-squared error loss
257 criterion = nn.MSELoss()
259 def correlation_loss(data):
260 #print('corr')
261 corr = torch.corrcoef(data)
262 #print(corr)
263 nv = corr.shape[0]
264 closs = torch.abs((torch.sum(torch.abs(corr)) - 1*nv)/(nv**2 - nv)) # average pairwise correlation amplitude
265 #print(closs)
266 return closs
268 def data_orthogonality_loss(data, ortdata):
270 # punishes big amplitude correlation coefficients between all variables from data and ortdata.
271 # ortdata is supposed fixed
272 # temporal workaround instead of MINE MI estimation
274 # print('ortho')
275 #print(ortdata)
276 n1, n2 = data.shape[0], ortdata.shape[1]
277 fulldata = torch.cat((data, ortdata), dim=0)
278 corr = torch.corrcoef(fulldata)
279 #print(corr)
280 #print(corr[n1:, :n1])
281 nvar = n1*n2
282 closs = torch.abs(
283 (torch.sum(torch.abs(corr))) / nvar) # average pairwise correlation amplitude
284 #print(closs)
285 #print()
286 return closs
288 # ---------------------------------------------------------------------------
289 f_dropout = nn.Dropout(feature_dropout)
291 best_test_epoch = -1
292 best_test_loss = 1e10
293 best_test_model = None
295 for epoch in range(epochs):
296 loss = 0
297 for batch_features, _, indices in train_loader:
298 batch_features = batch_features.to(device)
299 # reset the gradients back to zero
300 # PyTorch accumulates gradients on subsequent backward passes
301 optimizer.zero_grad()
303 # compute reconstructions
304 noisy_batch_features = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features
305 outputs = model(noisy_batch_features.float())
306 code = model.encoder(noisy_batch_features.float()).T
308 '''
309 # ==================== MINE experiment ========================
311 from torch_mist.estimators import mine
312 from torch_mist.utils.train import train_mi_estimator
313 from torch_mist.utils import evaluate_mi
315 minimize_mi_data = torch.tensor(minimize_mi_data[:, _]).float().to(device)
316 print(minimize_mi_data.shape, code.shape)
318 estimator = mine(
319 x_dim=minimize_mi_data.shape[0],
320 y_dim=code.shape[0],
321 hidden_dims=[64, 32],
322 )
325 # Train it on the given samples
326 train_log = train_mi_estimator(
327 estimator=estimator,
328 train_data=(minimize_mi_data.T, code.T),
329 batch_size=batch_size,
330 max_iterations=1000,
331 device=device,
332 fast_train=True,
333 max_epochs=10
334 )
336 # Evaluate the estimator on the entirety of the data
337 estimated_mi = evaluate_mi(
338 estimator=estimator,
339 data=(minimize_mi_data.T.to(device), code.T.to(device)),
340 batch_size=batch_size
341 )
343 print(f"Mutual information estimated value: {estimated_mi} nats")
344 # ==================== MINE experiment ========================
345 '''
346 # compute training reconstruction loss
347 train_loss = criterion(outputs, batch_features.float())
349 if add_mi_loss:
350 ortdata = torch.tensor(minimize_mi_data[:, indices]).float().to(device)
351 train_loss += mi_hyperweight * data_orthogonality_loss(code, ortdata)
353 if add_corr_loss:
354 train_loss += corr_hyperweight * correlation_loss(code)
356 # compute accumulated gradients
357 train_loss.backward()
359 # perform parameter update based on current gradients
360 optimizer.step()
362 # add the mini-batch training loss to epoch loss
363 loss += train_loss.item()
365 # compute the epoch training loss
366 loss = loss / len(train_loader)
368 # display the epoch training loss
369 if (epoch + 1) % log_every == 0:
371 # compute loss on test part
372 tloss = 0
373 for batch_features, _, indices in test_loader:
374 batch_features = batch_features.to(device)
375 # compute reconstructions
376 noisy_batch_features = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features
377 outputs = model(noisy_batch_features.float())
379 # compute test reconstruction loss
380 test_loss = criterion(outputs, batch_features.float())
382 if add_mi_loss:
383 ortdata = torch.tensor(minimize_mi_data[:, indices]).float().to(device)
384 train_loss += mi_hyperweight * data_orthogonality_loss(code, ortdata)
386 if add_corr_loss:
387 code = model.encoder(noisy_batch_features.float()).T
388 train_loss += corr_hyperweight * correlation_loss(code)
390 tloss += test_loss.item()
392 # compute the epoch test loss
393 tloss = tloss / len(test_loader)
394 if tloss < best_test_loss:
395 best_test_loss = tloss
396 best_test_epoch = epoch + 1
397 best_test_model = model
398 if verbose:
399 print(f"epoch : {epoch + 1}/{epochs}, train loss = {loss:.8f}, test loss = {tloss:.8f}")
401 if verbose:
402 if best_test_epoch != epochs + 1:
403 print(f'best model: epoch {best_test_epoch}')
405 self.nnmodel = best_test_model
406 input_ = torch.tensor(self.init_data.T).float().to(device)
407 self.coords = model.get_code_embedding(input_)
408 self.nn_loss = tloss
410 # -------------------------------------
412 def create_vae_embedding_(self, continue_learning=0, epochs=50, lr=1e-3, seed=42, batch_size=32,
413 enc_kwargs=None, dec_kwargs=None, feature_dropout=0.2, kld_weight=1,
414 train_size=0.8, inter_dim=128, verbose=True, log_every=10, **kwargs):
416 # TODO: add best model mechanism as above
417 # ---------------------------------------------------------------------------
418 torch.manual_seed(seed)
419 torch.backends.cudnn.benchmark = False
420 torch.backends.cudnn.deterministic = True
422 # TODO: add train_test_split
423 # TODO: move out data loading for autoencoders
424 train_dataset = NeuroDataset(self.init_data[:, :int(train_size * self.init_data.shape[1])])
425 test_dataset = NeuroDataset(self.init_data[:, int(train_size * self.init_data.shape[1]):])
426 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
427 test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
429 # ---------------------------------------------------------------------------
430 # use gpu if available
431 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
433 if not continue_learning:
434 # create a model from `VAE` autoencoder class
435 # load it to the specified device, either gpu or cpu
436 model = VAE(orig_dim=len(self.init_data), inter_dim=inter_dim, code_dim=self.dim,
437 enc_kwargs=enc_kwargs, dec_kwargs=dec_kwargs, device=device)
438 model = model.to(device)
439 else:
440 model = self.nnmodel
442 # create an optimizer object
443 # Adam optimizer with learning rate lr
444 optimizer = optim.Adam(model.parameters(), lr=lr)
446 # BCE error loss
447 #criterion = nn.BCELoss(reduction='sum')
448 criterion = nn.MSELoss()
450 # ---------------------------------------------------------------------------
451 f_dropout = nn.Dropout(feature_dropout)
453 for epoch in range(epochs):
454 loss = 0
455 loss1 = 0
456 loss2 = 0
457 for batch_features, _, _ in train_loader: # NeuroDataset returns 3 values
458 batch_features = batch_features.to(device)
459 # reset the gradients back to zero
460 # PyTorch accumulates gradients on subsequent backward passes
461 optimizer.zero_grad()
463 # compute reconstructions
464 data = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features
465 data = data.to(device).float() # Ensure float32
466 reconstruction, mu, logvar = model(data)
468 # compute training reconstruction loss
469 mse_loss = criterion(reconstruction, data)
470 kld_loss = -0.5 * torch.sum(
471 1 + logvar - mu.pow(2) - logvar.exp()) # * train_dataset.__len__()/batch_size
472 train_loss = mse_loss + kld_weight*kld_loss
474 # compute accumulated gradients
475 train_loss.backward()
477 # perform parameter update based on current gradients
478 optimizer.step()
480 # add the mini-batch training loss to epoch loss
481 loss += train_loss.item()
482 loss1 += mse_loss.item()
483 loss2 += kld_loss.item()
485 # compute the epoch training loss
486 loss = loss / len(train_loader)
487 loss1 = loss1 / len(train_loader)
488 loss2 = loss2 / len(train_loader)
490 # display the epoch training loss
491 # display the epoch training loss
492 if (epoch + 1) % log_every == 0:
493 # compute loss on test part
494 tloss = 0
495 for batch_features, _, _ in test_loader: # NeuroDataset returns 3 values
496 data = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features
497 data = data.to(device).float() # Ensure float32
498 reconstruction, mu, logvar = model(data)
500 # compute training reconstruction loss
501 mse_loss = criterion(reconstruction, data)
502 kld_loss = -0.5 * torch.sum(
503 1 + logvar - mu.pow(2) - logvar.exp()) # * train_dataset.__len__()/batch_size
504 test_loss = mse_loss + kld_weight * kld_loss
505 tloss += test_loss.item()
507 # compute the epoch training loss
508 tloss = tloss / len(test_loader)
509 if verbose:
510 print(f"epoch : {epoch + 1}/{epochs}, train loss = {loss:.8f}, test loss = {tloss:.8f}")
512 self.nnmodel = model
513 input_ = torch.tensor(self.init_data.T).float().to(device)
514 self.coords = model.get_code_embedding(input_)
516 # -------------------------------------
518 def continue_learning(self, add_epochs, kwargs={}):
519 if self.all_params['e_method_name'] not in ['ae', 'vae']:
520 raise Exception('This is not a DL-based method!')
522 fn = getattr(self, 'create_' + self.all_params['e_method_name'] + '_embedding_')
523 fn(continue_learning=1, epochs=add_epochs, kwargs=kwargs)