Coverage for src/driada/dim_reduction/graph.py: 49.43%

174 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-25 15:40 +0300

1# @title Graph class { form-width: "200px" } 

2 

3import pynndescent 

4import scipy.sparse as sp 

5import numpy as np 

6import matplotlib.pyplot as plt 

7import numpy.random as npr 

8 

9from scipy.optimize import curve_fit 

10 

11from sklearn.neighbors import kneighbors_graph 

12from sklearn.utils.validation import check_symmetric 

13from scipy.sparse.csgraph import shortest_path 

14from umap.umap_ import fuzzy_simplicial_set 

15from .dr_base import * 

16 

17from ..network.net_base import Network 

18 

19 

20class ProximityGraph(Network): 

21 ''' 

22 Graph built on data points which represents the underlying manifold 

23 ''' 

24 

25 def __init__(self, d, m_params, g_params, create_nx_graph=False): 

26 self.all_metric_params = m_param_filter(m_params) 

27 self.metric = m_params['metric_name'] 

28 self.metric_args = {key: self.all_metric_params[key] for key in self.all_metric_params.keys() 

29 if key not in ['metric_name', 'sigma']} 

30 

31 all_params = g_param_filter(g_params) 

32 for key in all_params: 

33 setattr(self, key, g_params[key]) 

34 

35 self.data = d 

36 

37 self.construct_adjacency() 

38 # TODO: add graph_preprocessing to changeable graph params 

39 super(ProximityGraph, self).__init__(adj=self.adj, 

40 preprocessing='giant_cc', 

41 create_nx_graph=create_nx_graph, 

42 directed=False, 

43 weighted=all_params['weighted']) 

44 

45 node_mapping = self._init_to_final_node_mapping 

46 original_n = self.data.shape[1] # Data is (features, samples) 

47 lost_nodes = set(range(original_n)) - set(list(node_mapping.keys())) 

48 if len(lost_nodes) > 0: 

49 print(f'{len(lost_nodes)} nodes lost after giant component creation!') 

50 

51 if len(lost_nodes) >= self.max_deleted_nodes * original_n: 

52 raise Exception(f'more than {self.max_deleted_nodes * 100} % of nodes discarded during gc creation!') 

53 else: 

54 self.lost_nodes = lost_nodes 

55 connected = [i for i in range(self.n) if i not in self.lost_nodes] 

56 self.bin_adj = self.bin_adj[connected, :].tocsc()[:, connected].tocsr() 

57 self.neigh_distmat = self.neigh_distmat[connected, :].tocsc()[:, connected].tocsr() 

58 

59 self._checkpoint() 

60 

61 #arr = np.array(range(self.n)).reshape(-1, 1) 

62 #self.timediff = cdist(arr, arr, 'cityblock') 

63 #self.norm_timediff = self.timediff / (self.n / 3) 

64 

65 def distances_to_affinities(self): 

66 if self.neigh_distmat is None: 

67 raise Exception('distances between nearest neighbors not available') 

68 

69 if not self.weighted: 

70 raise Exception('no need to construct affinities for binary graph weights') 

71 

72 if self.dist_to_aff == 'hk': 

73 sigma = self.all_metric_params['sigma'] 

74 self.adj = self.neigh_distmat.copy() 

75 sqdist_matrix = self.neigh_distmat.multiply(self.neigh_distmat) 

76 mean_sqdist = sqdist_matrix.sum() / sqdist_matrix.nnz 

77 self.adj.data = np.exp(-sqdist_matrix.data / (1.0 * sigma * mean_sqdist)) 

78 # Ensure symmetry after transformation 

79 self.adj = (self.adj + self.adj.T) / 2.0 

80 

81 def construct_adjacency(self): 

82 construct_fn = getattr(self, 'create_' + self.g_method_name + '_graph_') 

83 construct_fn() 

84 

85 def _checkpoint(self): 

86 if self.adj is not None: 

87 if not sp.issparse(self.adj): 

88 # check for sparsity violation 

89 raise Exception('Adjacency matrix is not sparse!') 

90 self.adj = check_symmetric(self.adj, raise_exception=True) 

91 self.bin_adj = check_symmetric(self.bin_adj, raise_exception=True) 

92 self.neigh_distmat = check_symmetric(self.neigh_distmat, raise_exception=True) 

93 print('Adjacency symmetry confirmed') 

94 

95 else: 

96 raise Exception('Adjacency matrix is not constructed, checkpoint routines are unavailable') 

97 

98 def create_umap_graph_(self): 

99 RAND = np.random.RandomState(42) 

100 adj, _, _, dists = fuzzy_simplicial_set(self.data.T, self.nn, metric=self.metric, 

101 metric_kwds=self.metric_args, 

102 random_state=RAND, 

103 return_dists=True) 

104 

105 self.adj = sp.csr_matrix(adj) 

106 #print('WARNING: distmat is not yet implemented in this branch') 

107 

108 def create_knn_graph_(self): 

109 if self.metric in named_distances: 

110 curr_metric = self.metric 

111 else: 

112 curr_metric = globals()[self.metric] 

113 

114 N = self.data.shape[1] 

115 index = pynndescent.NNDescent(self.data.T, 

116 metric=curr_metric, 

117 metric_kwds=self.metric_args, 

118 n_neighbors=self.nn + 1, 

119 diversify_prob=1.0, 

120 pruning_degree_multiplier=1.5) 

121 

122 neighs, dists = index.neighbor_graph 

123 

124 neigh_cols = neighs[:, 1:].flatten() 

125 dist_vals = dists[:, 1:].flatten() 

126 neigh_rows = np.repeat(np.arange(N), self.nn) 

127 

128 all_neigh_cols = np.concatenate((neigh_cols, neigh_rows)) 

129 all_neigh_rows = np.concatenate((neigh_rows, neigh_cols)) 

130 all_dist_vals = np.concatenate((dist_vals, dist_vals)) 

131 

132 self.bin_adj = sp.csr_matrix((np.array([True for _ in range(2 * N * self.nn)]), 

133 (all_neigh_rows, all_neigh_cols))) 

134 

135 self.neigh_distmat = sp.csr_matrix((all_dist_vals, (all_neigh_rows, all_neigh_cols))) 

136 

137 if self.weighted: 

138 self.distances_to_affinities() 

139 else: 

140 self.adj = self.bin_adj.copy() 

141 

142 def create_auto_knn_graph_(self): 

143 A = kneighbors_graph(self.data.T, self.nn, mode='connectivity', include_self=False) 

144 A.setdiag(0) 

145 A = A.astype(bool, casting='unsafe', copy=True) 

146 A = A + A.T 

147 self.adj = A 

148 #print('WARNING: distmat is not yet implemented in this branch') 

149 

150 def create_eps_graph_(self): 

151 """Create epsilon-ball graph where edges connect points within distance eps.""" 

152 from sklearn.neighbors import radius_neighbors_graph 

153 

154 # Use radius_neighbors_graph to create epsilon-ball graph 

155 # eps is the maximum distance between points 

156 # mode='connectivity' gives binary adjacency matrix 

157 # include_self=False excludes self-loops 

158 A = radius_neighbors_graph(self.data.T, self.eps, mode='connectivity', 

159 metric=self.metric, metric_params=self.metric_args, 

160 include_self=False) 

161 

162 # Ensure symmetry 

163 A = A + A.T 

164 A = A.astype(bool, casting='unsafe', copy=True) 

165 

166 # Check if graph is too sparse or too dense 

167 nnz_ratio = A.nnz / (self.data.shape[1] * (self.data.shape[1] - 1)) 

168 if nnz_ratio < self.eps_min: 

169 raise ValueError(f'Epsilon graph too sparse (density={nnz_ratio:.4f} < eps_min={self.eps_min}). ' 

170 f'Consider increasing eps parameter.') 

171 if nnz_ratio > 0.5: 

172 print(f'WARNING: Epsilon graph is dense (density={nnz_ratio:.4f}). ' 

173 f'Consider decreasing eps parameter.') 

174 

175 self.adj = A 

176 self.bin_adj = A.copy() 

177 

178 # For weighted graphs, compute distance matrix 

179 if self.weighted: 

180 # Get distance matrix for connected pairs 

181 D = radius_neighbors_graph(self.data.T, self.eps, mode='distance', 

182 metric=self.metric, metric_params=self.metric_args, 

183 include_self=False) 

184 D = D + D.T 

185 self.neigh_distmat = D 

186 self.distances_to_affinities() 

187 else: 

188 # For unweighted graphs, create sparse zero matrix for distances 

189 self.neigh_distmat = sp.csr_matrix(A.shape) 

190 

191 def calculate_indim(self, mode, factor=2): 

192 

193 def normalizing_const(dim): 

194 if dim == 0: 

195 return 1 

196 if dim == 1: 

197 return 2 

198 elif dim == 2: 

199 return np.pi / 2 

200 else: 

201 return (dim - 1.0) / dim * normalizing_const(dim - 2) 

202 

203 def func(x, a): 

204 # C = normalizing_const(a) 

205 return a * (np.log10(np.sin(1.0 * x * np.pi / 2))) # + np.log10(C) 

206 # return a*(np.log10(np.sin(1.0*x/xmax*np.pi/2))) - np.log10(valmax) 

207 

208 def func2(x, a): 

209 return -a / 2. * (x - max(x)) ** 2 

210 

211 if self.g_method not in ['w_knn', 'w_auto_knn']: 

212 raise Exception('Distance matrix construction missed!') 

213 

214 print('Calculating graph internal dimension...') 

215 

216 if mode == 'fast': 

217 distmat = sp.csr_matrix(self.sqdist_matrix) 

218 distmat.data = np.sqrt(self.sqdist_matrix.data) 

219 indices = list(npr.permutation(npr.choice(self.n, size=self.n // factor, replace=False))) 

220 dm = distmat[indices, :][:, indices] 

221 

222 elif mode == 'full': 

223 dm = self.sqdist_matrix 

224 dm.data = np.sqrt(self.sqdist_matrix.data) 

225 

226 print('Shortest path computation started, distance matrix size: ', dm.shape) 

227 spmatrix = shortest_path(dm, method='D', directed=False) 

228 all_dists = spmatrix.flatten() 

229 all_dists = all_dists[all_dists != 0] 

230 

231 nbins = 500 

232 pre_hist = np.histogram(all_dists, bins=nbins, density=True) 

233 # plt.hist(all_dists, bins = 500) 

234 dx = pre_hist[1][1] - pre_hist[1][0] 

235 dmax_bin = np.argmax(pre_hist[0]) 

236 dmax = pre_hist[1][dmax_bin] + dx / 2.0 

237 

238 hist = np.histogram(all_dists / dmax, bins=nbins, density=True) 

239 distr_x = hist[1][:-1] + dx / 2 

240 distr_y = hist[0] / max(hist[0][0:nbins]) 

241 avg = np.mean(all_dists) 

242 std = np.std(all_dists / dmax) 

243 

244 res = [] 

245 # print(distr_x) 

246 # print(distr_y) 

247 left_distr_x = distr_x[(distr_x > 1 - 2. * std) & (distr_x <= 1) & (distr_y > 1e-6)] 

248 left_distr_y = np.log10(distr_y[(distr_x[:] > 1 - 2. * std) & (distr_x[:] <= 1)]) # & (distr_y[:]>1e-6)]) 

249 

250 for D in [0.1 * x for x in range(10, 260)]: 

251 y = func(left_distr_x, D - 1) 

252 res.append(np.linalg.norm(y - left_distr_y) / np.sqrt(len(y))) 

253 

254 plot = 0 

255 if plot: 

256 fig = plt.figure(2, figsize=(12, 10)) 

257 plt.plot(np.linspace(0, len(res) / 10.0, num=len(res)), res) 

258 

259 Dmin = 0.1 * (np.argmax(-np.array(res)) + 1) 

260 print('Dmin = ', Dmin) 

261 fit = curve_fit(func2, left_distr_x, left_distr_y) 

262 # print(fit) 

263 a = (fit[0][0]) 

264 # print('Dfit = ', Dfit) 

265 

266 plot = 0 

267 if plot: 

268 fig = plt.figure(1, figsize=(12, 10)) 

269 ax = fig.add_subplot(111) 

270 plt.hist(all_dists / dmax, bins=nbins, histtype='stepfilled', density=True, log=True) 

271 

272 alpha = 2.0 

273 R = np.sqrt(2 * a) 

274 print('R = ', R) 

275 Dpr = 1 - alpha ** 2 / (2 * np.log(np.cos(alpha * np.pi / 2.0 / R))) 

276 print('D_calc = ', Dpr) 

277 

278 return Dmin, Dpr 

279 

280 def scaling(self): 

281 mat = self.adj.astype(bool).A.astype(int) 

282 diagsums = [] 

283 for i in range(self.n - 1): 

284 diagsums.append(np.trace(mat, offset=i, dtype=None, out=None) / (self.n - i)) 

285 

286 return diagsums