Coverage for partipy/simulate.py: 73%
51 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-09 10:24 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-09 10:24 +0200
1import numpy as np
2from scipy.spatial import ConvexHull, QhullError, distance_matrix
5def _generate_extreme_archetypes(n_archetypes, n_dimensions, rng, distribution="normal", max_attempts=100):
6 """
7 Generate extreme archetypes, ensuring they are vertices of a convex hull
8 and maximally separated from each other.
10 Parameters
11 ----------
12 n_archetypes : int
13 Number of archetypes to generate.
14 n_dimensions : int
15 Number of dimensions for each archetype.
16 rng : numpy.random.Generator
17 Random number generator.
18 distribution : str, optional
19 Distribution to sample from, either "normal" or "uniform".
20 max_attempts : int, optional
21 Maximum number of attempts to generate valid archetypes. If the maximum number of attempts is exceeded
22 without generating enough archetypes, a ValueError is raised.
24 Returns
25 -------
26 np.ndarray
27 Array of shape (n_archetypes, n_dimensions) containing the archetypes.
28 """
29 # Determine minimum number of points needed based on dimensions
30 min_points_needed = n_dimensions + 1
32 for attempt in range(max_attempts):
33 # Increase candidate count each attempt, and ensure we have enough points
34 n_candidates = max(n_archetypes * (attempt + 2), min_points_needed)
36 # Generate candidate points
37 if distribution == "normal":
38 candidates = rng.normal(loc=0, scale=1, size=(n_candidates, n_dimensions))
39 elif distribution == "uniform":
40 candidates = rng.uniform(low=-1, high=1, size=(n_candidates, n_dimensions))
41 else:
42 raise NotImplementedError(f"{distribution} not implemented")
44 try:
45 # Compute convex hull
46 hull = ConvexHull(candidates)
48 # Get the vertices of the convex hull
49 vertices = candidates[hull.vertices]
51 # If we have more vertices than needed, select the most distant subset
52 if len(vertices) > n_archetypes:
53 # Select a subset with maximum distance between points
54 selected_vertices = _select_distant_vertices(vertices, n_archetypes, rng)
55 return selected_vertices
56 # If we have exactly the right number, return them
57 elif len(vertices) == n_archetypes:
58 return vertices
59 # Otherwise, try again with more candidates
61 except QhullError as e:
62 error_msg = str(e)
63 print("QhullError")
64 # Check if the error is about not having enough points
65 if "not enough points" in error_msg:
66 continue
67 # For other QhullError types, just try again
68 print(error_msg)
69 continue
70 raise ValueError("Increase the max number of attempts")
73def _select_distant_vertices(vertices, n_select, rng):
74 """
75 Select n_select vertices from the given set, maximizing the minimum distance between any two selected vertices.
76 Uses the maxmin (farthest point) sampling algorithm.
78 Parameters
79 ----------
80 vertices : np.ndarray
81 Array of shape (n_vertices, n_dimensions) containing the vertices.
82 n_select : int
83 Number of vertices to select.
84 rng : numpy.random.Generator
85 Random number generator.
87 Returns
88 -------
89 np.ndarray
90 Array of shape (n_select, n_dimensions) containing the selected vertices.
91 """
92 if len(vertices) <= n_select:
93 return vertices
95 # Calculate pairwise distance matrix
96 dist_matrix = distance_matrix(vertices, vertices)
98 # Set diagonal to infinity (distance to self)
99 np.fill_diagonal(dist_matrix, np.inf)
101 # Start with a random vertex
102 selected_indices = [rng.integers(0, len(vertices))]
104 # Greedily add vertices that maximize the minimum distance
105 while len(selected_indices) < n_select:
106 # Find the minimum distance from each remaining point to any selected point
107 remaining_indices = [i for i in range(len(vertices)) if i not in selected_indices]
108 min_distances = np.min(dist_matrix[remaining_indices][:, selected_indices], axis=1)
110 # Select the point with maximum minimum distance
111 max_idx = np.argmax(min_distances)
112 selected_indices.append(remaining_indices[max_idx])
114 return vertices[selected_indices]
117def simulate_archetypes(
118 n_samples: int,
119 n_archetypes: int,
120 n_dimensions: int,
121 noise_std: float,
122 max_attempts: int = 100,
123 seed: int = 42,
124):
125 """
126 Simulate synthetic data for benchmarking archetypal analysis on datasets with known ground truth (archetypes Z and coefficients A).
128 Candidate archetypes are randomly sampled from the range [-1, 1].
129 Then, we compute the convex hull of these candiate archetypes to ensure that in the final set of archetypes we only include points that are vertices in the convex hull
130 The coefficients in A, which map each data point to a convex combination of archetypes, are sampled from an exponential distribution and normalized to have a row sum of 1.
131 The coordinates of the data points (X) are then calculated as X = A @ Z. Optionally, Gaussian noise can be added to simulate real-world noise.
133 Parameters
134 ----------
135 n_samples : int
136 Number of data points (samples) to generate.
137 n_archetypes : int
138 Number of archetypes to use for generating the data.
139 n_dimensions : int
140 Number of dimensions (features) for each data point and archetype.
141 noise_std : float
142 Standard deviation of Gaussian noise added to the data. Set to 0 for no noise.
143 seed : int, optional (default=42)
144 Random seed for reproducibility.
146 Returns
147 -------
148 X : np.ndarray
149 Generated data matrix of shape (n_samples, n_dimensions).
150 A : np.ndarray
151 Coefficient matrix of shape (n_samples, n_archetypes), representing the convex combinations
152 of archetypes for each data point.
153 Z : np.ndarray
154 Archetype matrix of shape (n_archetypes, n_dimensions), representing the archetypes.
155 """
156 assert noise_std >= 0
158 rng = np.random.default_rng(seed=seed)
159 Z = _generate_extreme_archetypes(n_archetypes, n_dimensions, rng, max_attempts=max_attempts)
160 A = rng.exponential(scale=1, size=(n_samples, n_archetypes))
161 A /= A.sum(axis=1, keepdims=True)
162 X = A @ Z
163 if noise_std > 0:
164 X += rng.normal(loc=0, scale=noise_std, size=X.shape)
166 assert np.all(np.isclose(A.sum(axis=1), 1))
167 assert A.min() > 0
169 return X, A, Z