Coverage for src/driada/dimensionality/linear.py: 94.44%
54 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-25 15:40 +0300
1"""
2Linear dimensionality estimation methods.
4This module implements dimensionality estimation based on linear methods
5such as Principal Component Analysis (PCA).
6"""
8import numpy as np
9from sklearn.decomposition import PCA
12def pca_dimension(data, threshold=0.95, standardize=True):
13 """
14 Estimate dimensionality using Principal Component Analysis (PCA).
16 This method determines the number of principal components needed to
17 explain a specified fraction of the total variance in the data.
19 Parameters
20 ----------
21 data : array-like of shape (n_samples, n_features)
22 The input data matrix where rows are samples and columns are features.
24 threshold : float, default=0.95
25 The fraction of total variance that should be explained by the
26 selected components. Must be between 0 and 1.
28 standardize : bool, default=True
29 Whether to standardize the data (zero mean, unit variance) before
30 applying PCA. Standardization is recommended when features have
31 different scales.
33 Returns
34 -------
35 n_components : int
36 The number of principal components needed to explain the specified
37 fraction of variance.
39 Notes
40 -----
41 This method provides a linear estimate of dimensionality based on variance
42 explained. It may overestimate the intrinsic dimension for nonlinear
43 manifolds but is useful for understanding the effective linear dimension
44 of the data.
46 Examples
47 --------
48 >>> import numpy as np
49 >>> from driada.dimensionality.linear import pca_dimension
50 >>> # Generate data with 3 effective dimensions
51 >>> data = np.random.randn(1000, 10)
52 >>> data[:, 3:] *= 0.1 # Make dimensions 4-10 have small variance
53 >>> n_dim = pca_dimension(data, threshold=0.95)
54 >>> print(f"Number of components for 95% variance: {n_dim}")
55 """
56 if not 0 < threshold <= 1:
57 raise ValueError(f"threshold must be between 0 and 1, got {threshold}")
59 data = np.asarray(data)
61 if data.shape[0] < 2:
62 raise ValueError("Need at least 2 samples to estimate dimensionality")
64 # Standardize data if requested
65 if standardize:
66 # Compute mean and std, handling zero variance features
67 data_mean = np.mean(data, axis=0)
68 data_std = np.std(data, axis=0)
70 # Avoid division by zero for constant features
71 data_std[data_std == 0] = 1.0
73 data_standardized = (data - data_mean) / data_std
74 else:
75 data_standardized = data
77 # Apply PCA
78 pca = PCA()
79 pca.fit(data_standardized)
81 # Compute cumulative explained variance
82 cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
84 # Find number of components exceeding threshold
85 n_components = np.argmax(cumulative_variance >= threshold) + 1
87 return n_components
90def pca_dimension_profile(data, thresholds=None, standardize=True):
91 """
92 Compute PCA dimensionality estimates for multiple variance thresholds.
94 This function provides a profile of how many components are needed
95 for different levels of variance explained, which can help in
96 understanding the distribution of variance across components.
98 Parameters
99 ----------
100 data : array-like of shape (n_samples, n_features)
101 The input data matrix.
103 thresholds : array-like, optional
104 Variance thresholds to evaluate. If None, uses
105 [0.5, 0.8, 0.9, 0.95, 0.99].
107 standardize : bool, default=True
108 Whether to standardize the data before PCA.
110 Returns
111 -------
112 profile : dict
113 Dictionary containing:
114 - 'thresholds': array of variance thresholds
115 - 'n_components': array of components needed for each threshold
116 - 'explained_variance_ratio': variance explained by each component
117 - 'cumulative_variance': cumulative variance explained
119 Examples
120 --------
121 >>> data = np.random.randn(1000, 20)
122 >>> profile = pca_dimension_profile(data)
123 >>> for thresh, n_comp in zip(profile['thresholds'], profile['n_components']):
124 ... print(f"{thresh*100:.0f}% variance: {n_comp} components")
125 """
126 if thresholds is None:
127 thresholds = np.array([0.5, 0.8, 0.9, 0.95, 0.99])
128 else:
129 thresholds = np.asarray(thresholds)
131 data = np.asarray(data)
133 # Standardize if requested
134 if standardize:
135 data_mean = np.mean(data, axis=0)
136 data_std = np.std(data, axis=0)
137 data_std[data_std == 0] = 1.0
138 data_standardized = (data - data_mean) / data_std
139 else:
140 data_standardized = data
142 # Apply PCA
143 pca = PCA()
144 pca.fit(data_standardized)
146 # Compute cumulative variance
147 cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
149 # Find components needed for each threshold
150 n_components = []
151 for threshold in thresholds:
152 if threshold > cumulative_variance[-1]:
153 # Can't reach this threshold
154 n_comp = len(cumulative_variance)
155 else:
156 n_comp = np.argmax(cumulative_variance >= threshold) + 1
157 n_components.append(n_comp)
159 return {
160 'thresholds': thresholds,
161 'n_components': np.array(n_components),
162 'explained_variance_ratio': pca.explained_variance_ratio_,
163 'cumulative_variance': cumulative_variance
164 }
167def effective_rank(data, standardize=True):
168 """
169 Compute the effective rank (Roy & Vetterli, 2007) of the data matrix.
171 The effective rank is a continuous measure of dimensionality based on
172 the entropy of the normalized eigenvalue distribution.
174 Parameters
175 ----------
176 data : array-like of shape (n_samples, n_features)
177 The input data matrix.
179 standardize : bool, default=True
180 Whether to standardize the data before computation.
182 Returns
183 -------
184 eff_rank : float
185 The effective rank of the data matrix.
187 References
188 ----------
189 Roy, O., & Vetterli, M. (2007). The effective rank: A measure of
190 effective dimensionality. In 2007 15th European Signal Processing
191 Conference (pp. 606-610). IEEE.
192 """
193 from scipy.stats import entropy
195 data = np.asarray(data)
197 # Standardize if requested
198 if standardize:
199 data_mean = np.mean(data, axis=0)
200 data_std = np.std(data, axis=0)
201 data_std[data_std == 0] = 1.0
202 data_standardized = (data - data_mean) / data_std
203 else:
204 data_standardized = data
206 # Compute singular values
207 _, s, _ = np.linalg.svd(data_standardized, full_matrices=False)
209 # Normalize to get probability distribution
210 s_normalized = s / np.sum(s)
212 # Compute entropy (base e)
213 ent = entropy(s_normalized)
215 # Convert to effective rank
216 eff_rank = np.exp(ent)
218 return eff_rank