Coverage for src/driada/dimensionality/intrinsic.py: 90.38%

52 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-25 15:40 +0300

1""" 

2Intrinsic dimensionality estimation using nearest neighbor methods. 

3 

4This module implements methods to estimate the intrinsic dimensionality of datasets 

5based on the statistics of nearest neighbor distances. 

6""" 

7 

8import numpy as np 

9from sklearn.neighbors import NearestNeighbors 

10import pynndescent 

11 

12 

13def nn_dimension(data, k=2, graph_method='sklearn'): 

14 """ 

15 Estimate intrinsic dimension using the k-NN algorithm. 

16  

17 This method estimates the intrinsic dimensionality by analyzing the ratios 

18 of distances to the k-th and (k-1)-th nearest neighbors. For k=2, this 

19 is the TWO-NN algorithm [1]. 

20  

21 Parameters 

22 ---------- 

23 data : array-like of shape (n_samples, n_features) 

24 The input data matrix where rows are samples and columns are features. 

25  

26 k : int, default=2 

27 The number of nearest neighbors to consider. k=2 gives the TWO-NN algorithm. 

28 Higher values can provide more robust estimates. 

29  

30 graph_method : {'sklearn', 'pynndescent'}, default='sklearn' 

31 Method to use for nearest neighbor graph construction: 

32 - 'sklearn': Uses scikit-learn's NearestNeighbors (exact) 

33 - 'pynndescent': Uses PyNNDescent (approximate, faster for large datasets) 

34  

35 Returns 

36 ------- 

37 d : float 

38 The estimated intrinsic dimension of the dataset. 

39  

40 Notes 

41 ----- 

42 The method is based on the principle that in a d-dimensional manifold, 

43 the ratio of distances to successive nearest neighbors follows a specific 

44 distribution that depends on d. 

45  

46 References 

47 ---------- 

48 [1] Facco, E., d'Errico, M., Rodriguez, A., & Laio, A. (2017). 

49 Estimating the intrinsic dimension of datasets by a minimal  

50 neighborhood information. Scientific Reports, 7(1), 12140. 

51 https://doi.org/10.1038/s41598-017-11873-y 

52  

53 Examples 

54 -------- 

55 >>> import numpy as np 

56 >>> from driada.dimensionality.intrinsic import nn_dimension 

57 >>> # Generate data on a 2D manifold embedded in 10D 

58 >>> theta = np.random.uniform(0, 2*np.pi, 1000) 

59 >>> r = np.random.uniform(0, 1, 1000) 

60 >>> x = r * np.cos(theta) 

61 >>> y = r * np.sin(theta) 

62 >>> data = np.column_stack([x, y] + [np.random.randn(1000)*0.01 for _ in range(8)]) 

63 >>> d_est = nn_dimension(data, k=2) 

64 >>> print(f"Estimated dimension: {d_est:.2f}") # Should be close to 2 

65 """ 

66 data = np.asarray(data) 

67 n_samples = len(data) 

68 

69 if k < 2: 

70 raise ValueError("k must be at least 2") 

71 

72 if n_samples <= k: 

73 raise ValueError(f"Number of samples ({n_samples}) must be greater than k ({k})") 

74 

75 # Compute nearest neighbor distances 

76 if graph_method == 'sklearn': 

77 nbrs = NearestNeighbors(n_neighbors=k+1, metric='euclidean') 

78 nbrs.fit(data) 

79 distances, indices = nbrs.kneighbors(data) 

80 elif graph_method == 'pynndescent': 

81 index = pynndescent.NNDescent( 

82 data, 

83 metric='euclidean', 

84 n_neighbors=k+1 

85 ) 

86 indices, distances = index.neighbor_graph 

87 else: 

88 raise ValueError(f"Unknown graph construction method: {graph_method}. " 

89 "Choose from 'sklearn' or 'pynndescent'.") 

90 

91 # Compute ratios of successive nearest neighbor distances 

92 # distances[:, 0] is the distance to self (0), so we start from index 1 

93 ratios = np.zeros((k-1, n_samples)) 

94 for i in range(k-1): 

95 # Ratio of (i+2)-th neighbor distance to (i+1)-th neighbor distance 

96 ratios[i, :] = distances[:, i+2] / distances[:, i+1] 

97 

98 # Maximum likelihood estimation for intrinsic dimension 

99 if k == 2: 

100 # TWO-NN estimator: equation (3) from [1] 

101 d = (n_samples - 1) / np.sum(np.log(ratios[0, :])) 

102 else: 

103 # Generalized k-NN estimator 

104 log_sum = 0 

105 for j in range(k-1): 

106 log_sum += (j + 1) * np.sum(np.log(ratios[j, :])) 

107 d = (n_samples * (k - 1) - 1) / log_sum 

108 

109 return d 

110 

111 

112def correlation_dimension(data, r_min=None, r_max=None, n_bins=20): 

113 """ 

114 Estimate the correlation dimension using the Grassberger-Procaccia algorithm. 

115  

116 The correlation dimension measures how the number of neighbor pairs scales 

117 with distance, providing another estimate of intrinsic dimensionality. 

118  

119 Parameters 

120 ---------- 

121 data : array-like of shape (n_samples, n_features) 

122 The input data matrix. 

123  

124 r_min : float, optional 

125 Minimum distance to consider. If None, automatically determined. 

126  

127 r_max : float, optional 

128 Maximum distance to consider. If None, automatically determined. 

129  

130 n_bins : int, default=20 

131 Number of distance bins for the correlation sum. 

132  

133 Returns 

134 ------- 

135 dimension : float 

136 The estimated correlation dimension. 

137  

138 References 

139 ---------- 

140 Grassberger, P., & Procaccia, I. (1983). Characterization of strange  

141 attractors. Physical Review Letters, 50(5), 346. 

142 """ 

143 from scipy.spatial.distance import pdist 

144 

145 # Compute pairwise distances 

146 distances = pdist(data) 

147 distances = distances[distances > 0] # Remove zero distances 

148 

149 if r_min is None: 

150 r_min = np.percentile(distances, 1) 

151 if r_max is None: 

152 r_max = np.percentile(distances, 50) 

153 

154 # Create log-spaced bins 

155 r_values = np.logspace(np.log10(r_min), np.log10(r_max), n_bins) 

156 

157 # Compute correlation sum C(r) for each r 

158 correlation_sum = [] 

159 for r in r_values: 

160 C_r = np.mean(distances < r) 

161 if C_r > 0: 

162 correlation_sum.append(C_r) 

163 else: 

164 correlation_sum.append(np.nan) 

165 

166 correlation_sum = np.array(correlation_sum) 

167 valid = ~np.isnan(correlation_sum) & (correlation_sum > 0) 

168 

169 if np.sum(valid) < 2: 

170 return np.nan 

171 

172 # Fit line in log-log space 

173 log_r = np.log(r_values[valid]) 

174 log_C = np.log(correlation_sum[valid]) 

175 

176 # Linear regression to find slope (dimension) 

177 coeffs = np.polyfit(log_r, log_C, 1) 

178 dimension = coeffs[0] 

179 

180 return dimension