Coverage for src/driada/dim_reduction/graph.py: 49.43%
174 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
1# @title Graph class { form-width: "200px" }
3import pynndescent
4import scipy.sparse as sp
5import numpy as np
6import matplotlib.pyplot as plt
7import numpy.random as npr
9from scipy.optimize import curve_fit
11from sklearn.neighbors import kneighbors_graph
12from sklearn.utils.validation import check_symmetric
13from scipy.sparse.csgraph import shortest_path
14from umap.umap_ import fuzzy_simplicial_set
15from .dr_base import *
17from ..network.net_base import Network
20class ProximityGraph(Network):
21 '''
22 Graph built on data points which represents the underlying manifold
23 '''
25 def __init__(self, d, m_params, g_params, create_nx_graph=False):
26 self.all_metric_params = m_param_filter(m_params)
27 self.metric = m_params['metric_name']
28 self.metric_args = {key: self.all_metric_params[key] for key in self.all_metric_params.keys()
29 if key not in ['metric_name', 'sigma']}
31 all_params = g_param_filter(g_params)
32 for key in all_params:
33 setattr(self, key, g_params[key])
35 self.data = d
37 self.construct_adjacency()
38 # TODO: add graph_preprocessing to changeable graph params
39 super(ProximityGraph, self).__init__(adj=self.adj,
40 preprocessing='giant_cc',
41 create_nx_graph=create_nx_graph,
42 directed=False,
43 weighted=all_params['weighted'])
45 node_mapping = self._init_to_final_node_mapping
46 original_n = self.data.shape[1] # Data is (features, samples)
47 lost_nodes = set(range(original_n)) - set(list(node_mapping.keys()))
48 if len(lost_nodes) > 0:
49 print(f'{len(lost_nodes)} nodes lost after giant component creation!')
51 if len(lost_nodes) >= self.max_deleted_nodes * original_n:
52 raise Exception(f'more than {self.max_deleted_nodes * 100} % of nodes discarded during gc creation!')
53 else:
54 self.lost_nodes = lost_nodes
55 connected = [i for i in range(self.n) if i not in self.lost_nodes]
56 self.bin_adj = self.bin_adj[connected, :].tocsc()[:, connected].tocsr()
57 self.neigh_distmat = self.neigh_distmat[connected, :].tocsc()[:, connected].tocsr()
59 self._checkpoint()
61 #arr = np.array(range(self.n)).reshape(-1, 1)
62 #self.timediff = cdist(arr, arr, 'cityblock')
63 #self.norm_timediff = self.timediff / (self.n / 3)
65 def distances_to_affinities(self):
66 if self.neigh_distmat is None:
67 raise Exception('distances between nearest neighbors not available')
69 if not self.weighted:
70 raise Exception('no need to construct affinities for binary graph weights')
72 if self.dist_to_aff == 'hk':
73 sigma = self.all_metric_params['sigma']
74 self.adj = self.neigh_distmat.copy()
75 sqdist_matrix = self.neigh_distmat.multiply(self.neigh_distmat)
76 mean_sqdist = sqdist_matrix.sum() / sqdist_matrix.nnz
77 self.adj.data = np.exp(-sqdist_matrix.data / (1.0 * sigma * mean_sqdist))
78 # Ensure symmetry after transformation
79 self.adj = (self.adj + self.adj.T) / 2.0
81 def construct_adjacency(self):
82 construct_fn = getattr(self, 'create_' + self.g_method_name + '_graph_')
83 construct_fn()
85 def _checkpoint(self):
86 if self.adj is not None:
87 if not sp.issparse(self.adj):
88 # check for sparsity violation
89 raise Exception('Adjacency matrix is not sparse!')
90 self.adj = check_symmetric(self.adj, raise_exception=True)
91 self.bin_adj = check_symmetric(self.bin_adj, raise_exception=True)
92 self.neigh_distmat = check_symmetric(self.neigh_distmat, raise_exception=True)
93 print('Adjacency symmetry confirmed')
95 else:
96 raise Exception('Adjacency matrix is not constructed, checkpoint routines are unavailable')
98 def create_umap_graph_(self):
99 RAND = np.random.RandomState(42)
100 adj, _, _, dists = fuzzy_simplicial_set(self.data.T, self.nn, metric=self.metric,
101 metric_kwds=self.metric_args,
102 random_state=RAND,
103 return_dists=True)
105 self.adj = sp.csr_matrix(adj)
106 #print('WARNING: distmat is not yet implemented in this branch')
108 def create_knn_graph_(self):
109 if self.metric in named_distances:
110 curr_metric = self.metric
111 else:
112 curr_metric = globals()[self.metric]
114 N = self.data.shape[1]
115 index = pynndescent.NNDescent(self.data.T,
116 metric=curr_metric,
117 metric_kwds=self.metric_args,
118 n_neighbors=self.nn + 1,
119 diversify_prob=1.0,
120 pruning_degree_multiplier=1.5)
122 neighs, dists = index.neighbor_graph
124 neigh_cols = neighs[:, 1:].flatten()
125 dist_vals = dists[:, 1:].flatten()
126 neigh_rows = np.repeat(np.arange(N), self.nn)
128 all_neigh_cols = np.concatenate((neigh_cols, neigh_rows))
129 all_neigh_rows = np.concatenate((neigh_rows, neigh_cols))
130 all_dist_vals = np.concatenate((dist_vals, dist_vals))
132 self.bin_adj = sp.csr_matrix((np.array([True for _ in range(2 * N * self.nn)]),
133 (all_neigh_rows, all_neigh_cols)))
135 self.neigh_distmat = sp.csr_matrix((all_dist_vals, (all_neigh_rows, all_neigh_cols)))
137 if self.weighted:
138 self.distances_to_affinities()
139 else:
140 self.adj = self.bin_adj.copy()
142 def create_auto_knn_graph_(self):
143 A = kneighbors_graph(self.data.T, self.nn, mode='connectivity', include_self=False)
144 A.setdiag(0)
145 A = A.astype(bool, casting='unsafe', copy=True)
146 A = A + A.T
147 self.adj = A
148 #print('WARNING: distmat is not yet implemented in this branch')
150 def create_eps_graph_(self):
151 """Create epsilon-ball graph where edges connect points within distance eps."""
152 from sklearn.neighbors import radius_neighbors_graph
154 # Use radius_neighbors_graph to create epsilon-ball graph
155 # eps is the maximum distance between points
156 # mode='connectivity' gives binary adjacency matrix
157 # include_self=False excludes self-loops
158 A = radius_neighbors_graph(self.data.T, self.eps, mode='connectivity',
159 metric=self.metric, metric_params=self.metric_args,
160 include_self=False)
162 # Ensure symmetry
163 A = A + A.T
164 A = A.astype(bool, casting='unsafe', copy=True)
166 # Check if graph is too sparse or too dense
167 nnz_ratio = A.nnz / (self.data.shape[1] * (self.data.shape[1] - 1))
168 if nnz_ratio < self.eps_min:
169 raise ValueError(f'Epsilon graph too sparse (density={nnz_ratio:.4f} < eps_min={self.eps_min}). '
170 f'Consider increasing eps parameter.')
171 if nnz_ratio > 0.5:
172 print(f'WARNING: Epsilon graph is dense (density={nnz_ratio:.4f}). '
173 f'Consider decreasing eps parameter.')
175 self.adj = A
176 self.bin_adj = A.copy()
178 # For weighted graphs, compute distance matrix
179 if self.weighted:
180 # Get distance matrix for connected pairs
181 D = radius_neighbors_graph(self.data.T, self.eps, mode='distance',
182 metric=self.metric, metric_params=self.metric_args,
183 include_self=False)
184 D = D + D.T
185 self.neigh_distmat = D
186 self.distances_to_affinities()
187 else:
188 # For unweighted graphs, create sparse zero matrix for distances
189 self.neigh_distmat = sp.csr_matrix(A.shape)
191 def calculate_indim(self, mode, factor=2):
193 def normalizing_const(dim):
194 if dim == 0:
195 return 1
196 if dim == 1:
197 return 2
198 elif dim == 2:
199 return np.pi / 2
200 else:
201 return (dim - 1.0) / dim * normalizing_const(dim - 2)
203 def func(x, a):
204 # C = normalizing_const(a)
205 return a * (np.log10(np.sin(1.0 * x * np.pi / 2))) # + np.log10(C)
206 # return a*(np.log10(np.sin(1.0*x/xmax*np.pi/2))) - np.log10(valmax)
208 def func2(x, a):
209 return -a / 2. * (x - max(x)) ** 2
211 if self.g_method not in ['w_knn', 'w_auto_knn']:
212 raise Exception('Distance matrix construction missed!')
214 print('Calculating graph internal dimension...')
216 if mode == 'fast':
217 distmat = sp.csr_matrix(self.sqdist_matrix)
218 distmat.data = np.sqrt(self.sqdist_matrix.data)
219 indices = list(npr.permutation(npr.choice(self.n, size=self.n // factor, replace=False)))
220 dm = distmat[indices, :][:, indices]
222 elif mode == 'full':
223 dm = self.sqdist_matrix
224 dm.data = np.sqrt(self.sqdist_matrix.data)
226 print('Shortest path computation started, distance matrix size: ', dm.shape)
227 spmatrix = shortest_path(dm, method='D', directed=False)
228 all_dists = spmatrix.flatten()
229 all_dists = all_dists[all_dists != 0]
231 nbins = 500
232 pre_hist = np.histogram(all_dists, bins=nbins, density=True)
233 # plt.hist(all_dists, bins = 500)
234 dx = pre_hist[1][1] - pre_hist[1][0]
235 dmax_bin = np.argmax(pre_hist[0])
236 dmax = pre_hist[1][dmax_bin] + dx / 2.0
238 hist = np.histogram(all_dists / dmax, bins=nbins, density=True)
239 distr_x = hist[1][:-1] + dx / 2
240 distr_y = hist[0] / max(hist[0][0:nbins])
241 avg = np.mean(all_dists)
242 std = np.std(all_dists / dmax)
244 res = []
245 # print(distr_x)
246 # print(distr_y)
247 left_distr_x = distr_x[(distr_x > 1 - 2. * std) & (distr_x <= 1) & (distr_y > 1e-6)]
248 left_distr_y = np.log10(distr_y[(distr_x[:] > 1 - 2. * std) & (distr_x[:] <= 1)]) # & (distr_y[:]>1e-6)])
250 for D in [0.1 * x for x in range(10, 260)]:
251 y = func(left_distr_x, D - 1)
252 res.append(np.linalg.norm(y - left_distr_y) / np.sqrt(len(y)))
254 plot = 0
255 if plot:
256 fig = plt.figure(2, figsize=(12, 10))
257 plt.plot(np.linspace(0, len(res) / 10.0, num=len(res)), res)
259 Dmin = 0.1 * (np.argmax(-np.array(res)) + 1)
260 print('Dmin = ', Dmin)
261 fit = curve_fit(func2, left_distr_x, left_distr_y)
262 # print(fit)
263 a = (fit[0][0])
264 # print('Dfit = ', Dfit)
266 plot = 0
267 if plot:
268 fig = plt.figure(1, figsize=(12, 10))
269 ax = fig.add_subplot(111)
270 plt.hist(all_dists / dmax, bins=nbins, histtype='stepfilled', density=True, log=True)
272 alpha = 2.0
273 R = np.sqrt(2 * a)
274 print('R = ', R)
275 Dpr = 1 - alpha ** 2 / (2 * np.log(np.cos(alpha * np.pi / 2.0 / R)))
276 print('D_calc = ', Dpr)
278 return Dmin, Dpr
280 def scaling(self):
281 mat = self.adj.astype(bool).A.astype(int)
282 diagsums = []
283 for i in range(self.n - 1):
284 diagsums.append(np.trace(mat, offset=i, dtype=None, out=None) / (self.n - i))
286 return diagsums