Coverage for src/driada/dim_reduction/embedding.py: 79.70%

271 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-25 15:40 +0300

1 

2from scipy.sparse.linalg import eigs 

3import umap.umap_ as umap 

4from pydiffmap import diffusion_map as dm 

5from scipy.sparse.csgraph import shortest_path 

6 

7from sklearn.decomposition import PCA 

8from sklearn.manifold import spectral_embedding, Isomap, LocallyLinearEmbedding, TSNE 

9# from sklearn.cluster.spectral import discretize 

10 

11# warnings.filterwarnings("ignore") 

12 

13from .dr_base import * 

14from .graph import ProximityGraph 

15try: 

16 from .neural import * 

17except ImportError: 

18 # torch is optional dependency 

19 pass 

20from .mvu import * 

21 

22from ..network.matrix_utils import get_inv_sqrt_diag_matrix 

23 

24def norm_cross_corr(a, b): 

25 a = (a - np.mean(a)) / (np.std(a) * len(a)) 

26 b = (b - np.mean(b)) / (np.std(b)) 

27 c = np.correlate(a, b, 'full') 

28 return c 

29 

30 

31def remove_outliers(data, thr_percentile): 

32 thr1 = np.mean(data) + thr_percentile * np.std(data) 

33 thr2 = np.mean(data) - thr_percentile * np.std(data) 

34 good_points = np.where((data < thr1) & (data > thr2))[0] 

35 

36 return good_points, data[good_points] 

37 

38 

39class Embedding: 

40 ''' 

41 Low-dimensional representation of data 

42 ''' 

43 

44 def __init__(self, init_data, init_distmat, labels, params, g=None): 

45 if g is not None: 

46 if isinstance(g, ProximityGraph): 

47 self.graph = g 

48 else: 

49 raise Exception('Wrong graph type!') 

50 

51 self.all_params = e_param_filter(params) 

52 for key in params: 

53 setattr(self, key, params[key]) 

54 

55 self.init_data = init_data 

56 self.init_distmat = init_distmat 

57 

58 if self.e_method.is_linear: 

59 self.transformation_matrix = None 

60 

61 self.labels = labels 

62 self.coords = None 

63 

64 try: 

65 self.nclasses = len(set(self.labels)) 

66 except: 

67 self.nclasses = np.unique(self.labels) 

68 

69 if self.e_method.nn_based: 

70 self.nnmodel = None 

71 

72 def build(self, kwargs=None): 

73 if kwargs is None: 

74 kwargs = dict() 

75 fn = getattr(self, 'create_' + self.e_method_name + '_embedding_') 

76 

77 if self.e_method.requires_graph: 

78 # TODO: move connectivity check to graph 

79 if not self.graph.is_connected(): 

80 raise Exception('Graph is not connected!') 

81 

82 fn(**kwargs) 

83 

84 def create_pca_embedding_(self, verbose=True): 

85 if verbose: 

86 print('Calculating PCA embedding...') 

87 

88 pca = PCA(n_components=self.dim) 

89 self.coords = pca.fit_transform(self.init_data.T).T 

90 self.reducer_ = pca 

91 # print(pca.explained_variance_ratio_) 

92 

93 def create_isomap_embedding_(self): 

94 A = self.graph.adj 

95 map = Isomap(n_components=self.dim, n_neighbors=self.graph.nn, 

96 metric='precomputed') 

97 # self.coords = sp.csr_matrix(map.fit_transform(self.graph.data.A.T).T) 

98 spmatrix = shortest_path(A.todense(), method='D', directed=False) 

99 self.coords = map.fit_transform(spmatrix).T 

100 self.reducer_ = map 

101 

102 def create_mds_embedding_(self): 

103 """Create MDS (Multi-Dimensional Scaling) embedding.""" 

104 from sklearn.manifold import MDS 

105 

106 # MDS typically uses a distance matrix 

107 if hasattr(self, 'init_distmat') and self.init_distmat is not None: 

108 # Use provided distance matrix 

109 mds = MDS(n_components=self.dim, dissimilarity='precomputed', random_state=42) 

110 self.coords = mds.fit_transform(self.init_distmat).T 

111 else: 

112 # Compute from data 

113 mds = MDS(n_components=self.dim, random_state=42) 

114 self.coords = mds.fit_transform(self.init_data.T).T 

115 

116 self.reducer_ = mds 

117 

118 

119 def create_mvu_embedding_(self): 

120 mvu = MaximumVarianceUnfolding(equation="berkley", solver=cp.SCS, solver_tol=1e-2, 

121 eig_tol=1.0e-10, solver_iters=2500, 

122 warm_start=False, seed=None) 

123 

124 self.coords = mvu.fit_transform(self.graph.data.T, self.dim, 

125 self.graph.nn, dropout_rate=0) 

126 self.reducer_ = mvu 

127 

128 def create_lle_embedding_(self): 

129 lle = LocallyLinearEmbedding(n_components=self.dim, n_neighbors=self.graph.nn) 

130 self.coords = lle.fit_transform(self.graph.data.T).T 

131 self.reducer_ = lle 

132 

133 def create_hlle_embedding_(self): 

134 hlle = LocallyLinearEmbedding(n_components=self.dim, 

135 n_neighbors=self.graph.nn, 

136 method='hessian') 

137 self.coords = hlle.fit_transform(self.graph.data.T).T 

138 self.reducer_ = hlle 

139 

140 def create_le_embedding_(self): 

141 A = self.graph.adj 

142 dim = self.dim 

143 n = self.graph.n 

144 

145 DH = get_inv_sqrt_diag_matrix(A) 

146 P = self.graph.get_matrix('trans') 

147 

148 start_v = np.ones(n) 

149 # LR mode is much more stable, this is why we use P matrix largest eigenvalues 

150 eigvals, eigvecs = eigs(P, k=dim + 1, which='LR', v0=start_v, maxiter=n * 1000) 

151 # eigvals, vecs = eigs(nL, k = dim2 + 1, which = 'SM') 

152 

153 eigvals = np.asarray([np.round(np.real(x), 6) for x in eigvals]) 

154 

155 if np.count_nonzero(eigvals == 1.0) > 1: 

156 raise Exception('Graph is not connected, LE will result in errors!') 

157 else: 

158 vecs = np.real(eigvecs.T[1:]) 

159 vec_norms = np.array([np.real(sum([x * x for x in v])) for v in vecs]) 

160 vecs = vecs / vec_norms[:, np.newaxis] 

161 vecs = vecs.dot(DH.toarray()) 

162 self.coords = vecs 

163 

164 def create_auto_le_embedding_(self): 

165 A = self.graph.adj 

166 dim = self.dim 

167 

168 A = A.asfptype() 

169 # Convert to numpy array instead of matrix to avoid sklearn compatibility issues 

170 vecs = spectral_embedding(np.asarray(A.todense()), n_components=dim, eigen_solver=None, 

171 random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True).T 

172 

173 self.coords = vecs 

174 

175 def create_dmaps_embedding_(self): 

176 raise Exception('not so easy to implement properly (https://sci-hub.se/10.1016/j.acha.2015.01.001)') 

177 

178 def create_auto_dmaps_embedding_(self): 

179 dim = self.dim 

180 nn = self.graph.nn 

181 metric = self.graph.metric 

182 metric_args = self.graph.metric_args 

183 alpha = self.dm_alpha if hasattr(self, 'dm_alpha') else 1 

184 

185 mydmap = dm.DiffusionMap.from_sklearn(n_evecs=dim, 

186 k=nn, 

187 epsilon='bgh', 

188 metric=metric, 

189 metric_params=metric_args, 

190 alpha=alpha) 

191 

192 dmap = mydmap.fit_transform(self.init_data.T) 

193 

194 self.coords = dmap.T 

195 self.reducer_ = dmap 

196 

197 def create_tsne_embedding_(self): 

198 model = TSNE(n_components=self.dim, verbose=1) 

199 self.coords = model.fit_transform(self.init_data.T).T 

200 self.reducer_ = model 

201 

202 def create_umap_embedding_(self): 

203 min_dist = self.min_dist 

204 reducer = umap.UMAP(n_neighbors=self.graph.nn, n_components=self.dim, 

205 min_dist=min_dist) 

206 

207 self.coords = reducer.fit_transform(self.graph.data.T).T 

208 self.reducer_ = reducer 

209 

210 def create_ae_embedding_(self, continue_learning=0, epochs=50, lr=1e-3, seed=42, batch_size=32, 

211 enc_kwargs=None, dec_kwargs=None, 

212 feature_dropout=0.2, train_size=0.8, inter_dim=100, 

213 verbose=True, 

214 add_corr_loss=False, corr_hyperweight=0, 

215 add_mi_loss=False, mi_hyperweight=0, minimize_mi_data=None, 

216 log_every=1, 

217 device=None): 

218 

219 # --------------------------------------------------------------------------- 

220 

221 torch.manual_seed(seed) 

222 torch.backends.cudnn.benchmark = False 

223 torch.backends.cudnn.deterministic = True 

224 

225 # TODO: add train_test_split 

226 train_dataset = NeuroDataset(self.init_data[:, :int(train_size * self.init_data.shape[1])]) 

227 test_dataset = NeuroDataset(self.init_data[:, int(train_size * self.init_data.shape[1]):]) 

228 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 

229 test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 

230 

231 # --------------------------------------------------------------------------- 

232 if device is None: 

233 # use gpu if available 

234 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

235 if verbose: 

236 print('device:', device) 

237 

238 if not continue_learning: 

239 # create a model from `AE` autoencoder class 

240 # load it to the specified device, either gpu or cpu 

241 # Ensure kwargs are dictionaries, not None 

242 if enc_kwargs is None: 

243 enc_kwargs = {} 

244 if dec_kwargs is None: 

245 dec_kwargs = {} 

246 model = AE(orig_dim=self.init_data.shape[0], inter_dim=inter_dim, code_dim=self.dim, 

247 enc_kwargs=enc_kwargs, dec_kwargs=dec_kwargs, device=device) 

248 

249 model = model.to(device) 

250 else: 

251 model = self.nnmodel 

252 

253 # create an optimizer object 

254 optimizer = optim.Adam(model.parameters(), lr=lr) 

255 

256 # mean-squared error loss 

257 criterion = nn.MSELoss() 

258 

259 def correlation_loss(data): 

260 #print('corr') 

261 corr = torch.corrcoef(data) 

262 #print(corr) 

263 nv = corr.shape[0] 

264 closs = torch.abs((torch.sum(torch.abs(corr)) - 1*nv)/(nv**2 - nv)) # average pairwise correlation amplitude 

265 #print(closs) 

266 return closs 

267 

268 def data_orthogonality_loss(data, ortdata): 

269 

270 # punishes big amplitude correlation coefficients between all variables from data and ortdata. 

271 # ortdata is supposed fixed 

272 # temporal workaround instead of MINE MI estimation 

273 

274 # print('ortho') 

275 #print(ortdata) 

276 n1, n2 = data.shape[0], ortdata.shape[1] 

277 fulldata = torch.cat((data, ortdata), dim=0) 

278 corr = torch.corrcoef(fulldata) 

279 #print(corr) 

280 #print(corr[n1:, :n1]) 

281 nvar = n1*n2 

282 closs = torch.abs( 

283 (torch.sum(torch.abs(corr))) / nvar) # average pairwise correlation amplitude 

284 #print(closs) 

285 #print() 

286 return closs 

287 

288 # --------------------------------------------------------------------------- 

289 f_dropout = nn.Dropout(feature_dropout) 

290 

291 best_test_epoch = -1 

292 best_test_loss = 1e10 

293 best_test_model = None 

294 

295 for epoch in range(epochs): 

296 loss = 0 

297 for batch_features, _, indices in train_loader: 

298 batch_features = batch_features.to(device) 

299 # reset the gradients back to zero 

300 # PyTorch accumulates gradients on subsequent backward passes 

301 optimizer.zero_grad() 

302 

303 # compute reconstructions 

304 noisy_batch_features = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features 

305 outputs = model(noisy_batch_features.float()) 

306 code = model.encoder(noisy_batch_features.float()).T 

307 

308 ''' 

309 # ==================== MINE experiment ======================== 

310 

311 from torch_mist.estimators import mine 

312 from torch_mist.utils.train import train_mi_estimator 

313 from torch_mist.utils import evaluate_mi 

314 

315 minimize_mi_data = torch.tensor(minimize_mi_data[:, _]).float().to(device) 

316 print(minimize_mi_data.shape, code.shape) 

317 

318 estimator = mine( 

319 x_dim=minimize_mi_data.shape[0], 

320 y_dim=code.shape[0], 

321 hidden_dims=[64, 32], 

322 ) 

323 

324 

325 # Train it on the given samples 

326 train_log = train_mi_estimator( 

327 estimator=estimator, 

328 train_data=(minimize_mi_data.T, code.T), 

329 batch_size=batch_size, 

330 max_iterations=1000, 

331 device=device, 

332 fast_train=True, 

333 max_epochs=10 

334 ) 

335 

336 # Evaluate the estimator on the entirety of the data 

337 estimated_mi = evaluate_mi( 

338 estimator=estimator, 

339 data=(minimize_mi_data.T.to(device), code.T.to(device)), 

340 batch_size=batch_size 

341 ) 

342 

343 print(f"Mutual information estimated value: {estimated_mi} nats") 

344 # ==================== MINE experiment ======================== 

345 ''' 

346 # compute training reconstruction loss 

347 train_loss = criterion(outputs, batch_features.float()) 

348 

349 if add_mi_loss: 

350 ortdata = torch.tensor(minimize_mi_data[:, indices]).float().to(device) 

351 train_loss += mi_hyperweight * data_orthogonality_loss(code, ortdata) 

352 

353 if add_corr_loss: 

354 train_loss += corr_hyperweight * correlation_loss(code) 

355 

356 # compute accumulated gradients 

357 train_loss.backward() 

358 

359 # perform parameter update based on current gradients 

360 optimizer.step() 

361 

362 # add the mini-batch training loss to epoch loss 

363 loss += train_loss.item() 

364 

365 # compute the epoch training loss 

366 loss = loss / len(train_loader) 

367 

368 # display the epoch training loss 

369 if (epoch + 1) % log_every == 0: 

370 

371 # compute loss on test part 

372 tloss = 0 

373 for batch_features, _, indices in test_loader: 

374 batch_features = batch_features.to(device) 

375 # compute reconstructions 

376 noisy_batch_features = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features 

377 outputs = model(noisy_batch_features.float()) 

378 

379 # compute test reconstruction loss 

380 test_loss = criterion(outputs, batch_features.float()) 

381 

382 if add_mi_loss: 

383 ortdata = torch.tensor(minimize_mi_data[:, indices]).float().to(device) 

384 train_loss += mi_hyperweight * data_orthogonality_loss(code, ortdata) 

385 

386 if add_corr_loss: 

387 code = model.encoder(noisy_batch_features.float()).T 

388 train_loss += corr_hyperweight * correlation_loss(code) 

389 

390 tloss += test_loss.item() 

391 

392 # compute the epoch test loss 

393 tloss = tloss / len(test_loader) 

394 if tloss < best_test_loss: 

395 best_test_loss = tloss 

396 best_test_epoch = epoch + 1 

397 best_test_model = model 

398 if verbose: 

399 print(f"epoch : {epoch + 1}/{epochs}, train loss = {loss:.8f}, test loss = {tloss:.8f}") 

400 

401 if verbose: 

402 if best_test_epoch != epochs + 1: 

403 print(f'best model: epoch {best_test_epoch}') 

404 

405 self.nnmodel = best_test_model 

406 input_ = torch.tensor(self.init_data.T).float().to(device) 

407 self.coords = model.get_code_embedding(input_) 

408 self.nn_loss = tloss 

409 

410 # ------------------------------------- 

411 

412 def create_vae_embedding_(self, continue_learning=0, epochs=50, lr=1e-3, seed=42, batch_size=32, 

413 enc_kwargs=None, dec_kwargs=None, feature_dropout=0.2, kld_weight=1, 

414 train_size=0.8, inter_dim=128, verbose=True, log_every=10, **kwargs): 

415 

416 # TODO: add best model mechanism as above 

417 # --------------------------------------------------------------------------- 

418 torch.manual_seed(seed) 

419 torch.backends.cudnn.benchmark = False 

420 torch.backends.cudnn.deterministic = True 

421 

422 # TODO: add train_test_split 

423 # TODO: move out data loading for autoencoders 

424 train_dataset = NeuroDataset(self.init_data[:, :int(train_size * self.init_data.shape[1])]) 

425 test_dataset = NeuroDataset(self.init_data[:, int(train_size * self.init_data.shape[1]):]) 

426 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 

427 test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 

428 

429 # --------------------------------------------------------------------------- 

430 # use gpu if available 

431 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

432 

433 if not continue_learning: 

434 # create a model from `VAE` autoencoder class 

435 # load it to the specified device, either gpu or cpu 

436 model = VAE(orig_dim=len(self.init_data), inter_dim=inter_dim, code_dim=self.dim, 

437 enc_kwargs=enc_kwargs, dec_kwargs=dec_kwargs, device=device) 

438 model = model.to(device) 

439 else: 

440 model = self.nnmodel 

441 

442 # create an optimizer object 

443 # Adam optimizer with learning rate lr 

444 optimizer = optim.Adam(model.parameters(), lr=lr) 

445 

446 # BCE error loss 

447 #criterion = nn.BCELoss(reduction='sum') 

448 criterion = nn.MSELoss() 

449 

450 # --------------------------------------------------------------------------- 

451 f_dropout = nn.Dropout(feature_dropout) 

452 

453 for epoch in range(epochs): 

454 loss = 0 

455 loss1 = 0 

456 loss2 = 0 

457 for batch_features, _, _ in train_loader: # NeuroDataset returns 3 values 

458 batch_features = batch_features.to(device) 

459 # reset the gradients back to zero 

460 # PyTorch accumulates gradients on subsequent backward passes 

461 optimizer.zero_grad() 

462 

463 # compute reconstructions 

464 data = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features 

465 data = data.to(device).float() # Ensure float32 

466 reconstruction, mu, logvar = model(data) 

467 

468 # compute training reconstruction loss 

469 mse_loss = criterion(reconstruction, data) 

470 kld_loss = -0.5 * torch.sum( 

471 1 + logvar - mu.pow(2) - logvar.exp()) # * train_dataset.__len__()/batch_size 

472 train_loss = mse_loss + kld_weight*kld_loss 

473 

474 # compute accumulated gradients 

475 train_loss.backward() 

476 

477 # perform parameter update based on current gradients 

478 optimizer.step() 

479 

480 # add the mini-batch training loss to epoch loss 

481 loss += train_loss.item() 

482 loss1 += mse_loss.item() 

483 loss2 += kld_loss.item() 

484 

485 # compute the epoch training loss 

486 loss = loss / len(train_loader) 

487 loss1 = loss1 / len(train_loader) 

488 loss2 = loss2 / len(train_loader) 

489 

490 # display the epoch training loss 

491 # display the epoch training loss 

492 if (epoch + 1) % log_every == 0: 

493 # compute loss on test part 

494 tloss = 0 

495 for batch_features, _, _ in test_loader: # NeuroDataset returns 3 values 

496 data = f_dropout(torch.ones(batch_features.shape).to(device)) * batch_features 

497 data = data.to(device).float() # Ensure float32 

498 reconstruction, mu, logvar = model(data) 

499 

500 # compute training reconstruction loss 

501 mse_loss = criterion(reconstruction, data) 

502 kld_loss = -0.5 * torch.sum( 

503 1 + logvar - mu.pow(2) - logvar.exp()) # * train_dataset.__len__()/batch_size 

504 test_loss = mse_loss + kld_weight * kld_loss 

505 tloss += test_loss.item() 

506 

507 # compute the epoch training loss 

508 tloss = tloss / len(test_loader) 

509 if verbose: 

510 print(f"epoch : {epoch + 1}/{epochs}, train loss = {loss:.8f}, test loss = {tloss:.8f}") 

511 

512 self.nnmodel = model 

513 input_ = torch.tensor(self.init_data.T).float().to(device) 

514 self.coords = model.get_code_embedding(input_) 

515 

516 # ------------------------------------- 

517 

518 def continue_learning(self, add_epochs, kwargs={}): 

519 if self.all_params['e_method_name'] not in ['ae', 'vae']: 

520 raise Exception('This is not a DL-based method!') 

521 

522 fn = getattr(self, 'create_' + self.all_params['e_method_name'] + '_embedding_') 

523 fn(continue_learning=1, epochs=add_epochs, kwargs=kwargs) 

524