Coverage for src/driada/dimensionality/linear.py: 94.44%

54 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-25 15:40 +0300

1""" 

2Linear dimensionality estimation methods. 

3 

4This module implements dimensionality estimation based on linear methods 

5such as Principal Component Analysis (PCA). 

6""" 

7 

8import numpy as np 

9from sklearn.decomposition import PCA 

10 

11 

12def pca_dimension(data, threshold=0.95, standardize=True): 

13 """ 

14 Estimate dimensionality using Principal Component Analysis (PCA). 

15  

16 This method determines the number of principal components needed to 

17 explain a specified fraction of the total variance in the data. 

18  

19 Parameters 

20 ---------- 

21 data : array-like of shape (n_samples, n_features) 

22 The input data matrix where rows are samples and columns are features. 

23  

24 threshold : float, default=0.95 

25 The fraction of total variance that should be explained by the 

26 selected components. Must be between 0 and 1. 

27  

28 standardize : bool, default=True 

29 Whether to standardize the data (zero mean, unit variance) before 

30 applying PCA. Standardization is recommended when features have 

31 different scales. 

32  

33 Returns 

34 ------- 

35 n_components : int 

36 The number of principal components needed to explain the specified 

37 fraction of variance. 

38  

39 Notes 

40 ----- 

41 This method provides a linear estimate of dimensionality based on variance 

42 explained. It may overestimate the intrinsic dimension for nonlinear 

43 manifolds but is useful for understanding the effective linear dimension 

44 of the data. 

45  

46 Examples 

47 -------- 

48 >>> import numpy as np 

49 >>> from driada.dimensionality.linear import pca_dimension 

50 >>> # Generate data with 3 effective dimensions 

51 >>> data = np.random.randn(1000, 10) 

52 >>> data[:, 3:] *= 0.1 # Make dimensions 4-10 have small variance 

53 >>> n_dim = pca_dimension(data, threshold=0.95) 

54 >>> print(f"Number of components for 95% variance: {n_dim}") 

55 """ 

56 if not 0 < threshold <= 1: 

57 raise ValueError(f"threshold must be between 0 and 1, got {threshold}") 

58 

59 data = np.asarray(data) 

60 

61 if data.shape[0] < 2: 

62 raise ValueError("Need at least 2 samples to estimate dimensionality") 

63 

64 # Standardize data if requested 

65 if standardize: 

66 # Compute mean and std, handling zero variance features 

67 data_mean = np.mean(data, axis=0) 

68 data_std = np.std(data, axis=0) 

69 

70 # Avoid division by zero for constant features 

71 data_std[data_std == 0] = 1.0 

72 

73 data_standardized = (data - data_mean) / data_std 

74 else: 

75 data_standardized = data 

76 

77 # Apply PCA 

78 pca = PCA() 

79 pca.fit(data_standardized) 

80 

81 # Compute cumulative explained variance 

82 cumulative_variance = np.cumsum(pca.explained_variance_ratio_) 

83 

84 # Find number of components exceeding threshold 

85 n_components = np.argmax(cumulative_variance >= threshold) + 1 

86 

87 return n_components 

88 

89 

90def pca_dimension_profile(data, thresholds=None, standardize=True): 

91 """ 

92 Compute PCA dimensionality estimates for multiple variance thresholds. 

93  

94 This function provides a profile of how many components are needed 

95 for different levels of variance explained, which can help in 

96 understanding the distribution of variance across components. 

97  

98 Parameters 

99 ---------- 

100 data : array-like of shape (n_samples, n_features) 

101 The input data matrix. 

102  

103 thresholds : array-like, optional 

104 Variance thresholds to evaluate. If None, uses 

105 [0.5, 0.8, 0.9, 0.95, 0.99]. 

106  

107 standardize : bool, default=True 

108 Whether to standardize the data before PCA. 

109  

110 Returns 

111 ------- 

112 profile : dict 

113 Dictionary containing: 

114 - 'thresholds': array of variance thresholds 

115 - 'n_components': array of components needed for each threshold 

116 - 'explained_variance_ratio': variance explained by each component 

117 - 'cumulative_variance': cumulative variance explained 

118  

119 Examples 

120 -------- 

121 >>> data = np.random.randn(1000, 20) 

122 >>> profile = pca_dimension_profile(data) 

123 >>> for thresh, n_comp in zip(profile['thresholds'], profile['n_components']): 

124 ... print(f"{thresh*100:.0f}% variance: {n_comp} components") 

125 """ 

126 if thresholds is None: 

127 thresholds = np.array([0.5, 0.8, 0.9, 0.95, 0.99]) 

128 else: 

129 thresholds = np.asarray(thresholds) 

130 

131 data = np.asarray(data) 

132 

133 # Standardize if requested 

134 if standardize: 

135 data_mean = np.mean(data, axis=0) 

136 data_std = np.std(data, axis=0) 

137 data_std[data_std == 0] = 1.0 

138 data_standardized = (data - data_mean) / data_std 

139 else: 

140 data_standardized = data 

141 

142 # Apply PCA 

143 pca = PCA() 

144 pca.fit(data_standardized) 

145 

146 # Compute cumulative variance 

147 cumulative_variance = np.cumsum(pca.explained_variance_ratio_) 

148 

149 # Find components needed for each threshold 

150 n_components = [] 

151 for threshold in thresholds: 

152 if threshold > cumulative_variance[-1]: 

153 # Can't reach this threshold 

154 n_comp = len(cumulative_variance) 

155 else: 

156 n_comp = np.argmax(cumulative_variance >= threshold) + 1 

157 n_components.append(n_comp) 

158 

159 return { 

160 'thresholds': thresholds, 

161 'n_components': np.array(n_components), 

162 'explained_variance_ratio': pca.explained_variance_ratio_, 

163 'cumulative_variance': cumulative_variance 

164 } 

165 

166 

167def effective_rank(data, standardize=True): 

168 """ 

169 Compute the effective rank (Roy & Vetterli, 2007) of the data matrix. 

170  

171 The effective rank is a continuous measure of dimensionality based on 

172 the entropy of the normalized eigenvalue distribution. 

173  

174 Parameters 

175 ---------- 

176 data : array-like of shape (n_samples, n_features) 

177 The input data matrix. 

178  

179 standardize : bool, default=True 

180 Whether to standardize the data before computation. 

181  

182 Returns 

183 ------- 

184 eff_rank : float 

185 The effective rank of the data matrix. 

186  

187 References 

188 ---------- 

189 Roy, O., & Vetterli, M. (2007). The effective rank: A measure of  

190 effective dimensionality. In 2007 15th European Signal Processing  

191 Conference (pp. 606-610). IEEE. 

192 """ 

193 from scipy.stats import entropy 

194 

195 data = np.asarray(data) 

196 

197 # Standardize if requested 

198 if standardize: 

199 data_mean = np.mean(data, axis=0) 

200 data_std = np.std(data, axis=0) 

201 data_std[data_std == 0] = 1.0 

202 data_standardized = (data - data_mean) / data_std 

203 else: 

204 data_standardized = data 

205 

206 # Compute singular values 

207 _, s, _ = np.linalg.svd(data_standardized, full_matrices=False) 

208 

209 # Normalize to get probability distribution 

210 s_normalized = s / np.sum(s) 

211 

212 # Compute entropy (base e) 

213 ent = entropy(s_normalized) 

214 

215 # Convert to effective rank 

216 eff_rank = np.exp(ent) 

217 

218 return eff_rank