Coverage for src/driada/dimensionality/intrinsic.py: 90.38%
52 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
1"""
2Intrinsic dimensionality estimation using nearest neighbor methods.
4This module implements methods to estimate the intrinsic dimensionality of datasets
5based on the statistics of nearest neighbor distances.
6"""
8import numpy as np
9from sklearn.neighbors import NearestNeighbors
10import pynndescent
13def nn_dimension(data, k=2, graph_method='sklearn'):
14 """
15 Estimate intrinsic dimension using the k-NN algorithm.
17 This method estimates the intrinsic dimensionality by analyzing the ratios
18 of distances to the k-th and (k-1)-th nearest neighbors. For k=2, this
19 is the TWO-NN algorithm [1].
21 Parameters
22 ----------
23 data : array-like of shape (n_samples, n_features)
24 The input data matrix where rows are samples and columns are features.
26 k : int, default=2
27 The number of nearest neighbors to consider. k=2 gives the TWO-NN algorithm.
28 Higher values can provide more robust estimates.
30 graph_method : {'sklearn', 'pynndescent'}, default='sklearn'
31 Method to use for nearest neighbor graph construction:
32 - 'sklearn': Uses scikit-learn's NearestNeighbors (exact)
33 - 'pynndescent': Uses PyNNDescent (approximate, faster for large datasets)
35 Returns
36 -------
37 d : float
38 The estimated intrinsic dimension of the dataset.
40 Notes
41 -----
42 The method is based on the principle that in a d-dimensional manifold,
43 the ratio of distances to successive nearest neighbors follows a specific
44 distribution that depends on d.
46 References
47 ----------
48 [1] Facco, E., d'Errico, M., Rodriguez, A., & Laio, A. (2017).
49 Estimating the intrinsic dimension of datasets by a minimal
50 neighborhood information. Scientific Reports, 7(1), 12140.
51 https://doi.org/10.1038/s41598-017-11873-y
53 Examples
54 --------
55 >>> import numpy as np
56 >>> from driada.dimensionality.intrinsic import nn_dimension
57 >>> # Generate data on a 2D manifold embedded in 10D
58 >>> theta = np.random.uniform(0, 2*np.pi, 1000)
59 >>> r = np.random.uniform(0, 1, 1000)
60 >>> x = r * np.cos(theta)
61 >>> y = r * np.sin(theta)
62 >>> data = np.column_stack([x, y] + [np.random.randn(1000)*0.01 for _ in range(8)])
63 >>> d_est = nn_dimension(data, k=2)
64 >>> print(f"Estimated dimension: {d_est:.2f}") # Should be close to 2
65 """
66 data = np.asarray(data)
67 n_samples = len(data)
69 if k < 2:
70 raise ValueError("k must be at least 2")
72 if n_samples <= k:
73 raise ValueError(f"Number of samples ({n_samples}) must be greater than k ({k})")
75 # Compute nearest neighbor distances
76 if graph_method == 'sklearn':
77 nbrs = NearestNeighbors(n_neighbors=k+1, metric='euclidean')
78 nbrs.fit(data)
79 distances, indices = nbrs.kneighbors(data)
80 elif graph_method == 'pynndescent':
81 index = pynndescent.NNDescent(
82 data,
83 metric='euclidean',
84 n_neighbors=k+1
85 )
86 indices, distances = index.neighbor_graph
87 else:
88 raise ValueError(f"Unknown graph construction method: {graph_method}. "
89 "Choose from 'sklearn' or 'pynndescent'.")
91 # Compute ratios of successive nearest neighbor distances
92 # distances[:, 0] is the distance to self (0), so we start from index 1
93 ratios = np.zeros((k-1, n_samples))
94 for i in range(k-1):
95 # Ratio of (i+2)-th neighbor distance to (i+1)-th neighbor distance
96 ratios[i, :] = distances[:, i+2] / distances[:, i+1]
98 # Maximum likelihood estimation for intrinsic dimension
99 if k == 2:
100 # TWO-NN estimator: equation (3) from [1]
101 d = (n_samples - 1) / np.sum(np.log(ratios[0, :]))
102 else:
103 # Generalized k-NN estimator
104 log_sum = 0
105 for j in range(k-1):
106 log_sum += (j + 1) * np.sum(np.log(ratios[j, :]))
107 d = (n_samples * (k - 1) - 1) / log_sum
109 return d
112def correlation_dimension(data, r_min=None, r_max=None, n_bins=20):
113 """
114 Estimate the correlation dimension using the Grassberger-Procaccia algorithm.
116 The correlation dimension measures how the number of neighbor pairs scales
117 with distance, providing another estimate of intrinsic dimensionality.
119 Parameters
120 ----------
121 data : array-like of shape (n_samples, n_features)
122 The input data matrix.
124 r_min : float, optional
125 Minimum distance to consider. If None, automatically determined.
127 r_max : float, optional
128 Maximum distance to consider. If None, automatically determined.
130 n_bins : int, default=20
131 Number of distance bins for the correlation sum.
133 Returns
134 -------
135 dimension : float
136 The estimated correlation dimension.
138 References
139 ----------
140 Grassberger, P., & Procaccia, I. (1983). Characterization of strange
141 attractors. Physical Review Letters, 50(5), 346.
142 """
143 from scipy.spatial.distance import pdist
145 # Compute pairwise distances
146 distances = pdist(data)
147 distances = distances[distances > 0] # Remove zero distances
149 if r_min is None:
150 r_min = np.percentile(distances, 1)
151 if r_max is None:
152 r_max = np.percentile(distances, 50)
154 # Create log-spaced bins
155 r_values = np.logspace(np.log10(r_min), np.log10(r_max), n_bins)
157 # Compute correlation sum C(r) for each r
158 correlation_sum = []
159 for r in r_values:
160 C_r = np.mean(distances < r)
161 if C_r > 0:
162 correlation_sum.append(C_r)
163 else:
164 correlation_sum.append(np.nan)
166 correlation_sum = np.array(correlation_sum)
167 valid = ~np.isnan(correlation_sum) & (correlation_sum > 0)
169 if np.sum(valid) < 2:
170 return np.nan
172 # Fit line in log-log space
173 log_r = np.log(r_values[valid])
174 log_C = np.log(correlation_sum[valid])
176 # Linear regression to find slope (dimension)
177 coeffs = np.polyfit(log_r, log_C, 1)
178 dimension = coeffs[0]
180 return dimension