Coverage for partipy/simulate.py: 73%

51 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-09 10:24 +0200

1import numpy as np 

2from scipy.spatial import ConvexHull, QhullError, distance_matrix 

3 

4 

5def _generate_extreme_archetypes(n_archetypes, n_dimensions, rng, distribution="normal", max_attempts=100): 

6 """ 

7 Generate extreme archetypes, ensuring they are vertices of a convex hull 

8 and maximally separated from each other. 

9 

10 Parameters 

11 ---------- 

12 n_archetypes : int 

13 Number of archetypes to generate. 

14 n_dimensions : int 

15 Number of dimensions for each archetype. 

16 rng : numpy.random.Generator 

17 Random number generator. 

18 distribution : str, optional 

19 Distribution to sample from, either "normal" or "uniform". 

20 max_attempts : int, optional 

21 Maximum number of attempts to generate valid archetypes. If the maximum number of attempts is exceeded 

22 without generating enough archetypes, a ValueError is raised. 

23 

24 Returns 

25 ------- 

26 np.ndarray 

27 Array of shape (n_archetypes, n_dimensions) containing the archetypes. 

28 """ 

29 # Determine minimum number of points needed based on dimensions 

30 min_points_needed = n_dimensions + 1 

31 

32 for attempt in range(max_attempts): 

33 # Increase candidate count each attempt, and ensure we have enough points 

34 n_candidates = max(n_archetypes * (attempt + 2), min_points_needed) 

35 

36 # Generate candidate points 

37 if distribution == "normal": 

38 candidates = rng.normal(loc=0, scale=1, size=(n_candidates, n_dimensions)) 

39 elif distribution == "uniform": 

40 candidates = rng.uniform(low=-1, high=1, size=(n_candidates, n_dimensions)) 

41 else: 

42 raise NotImplementedError(f"{distribution} not implemented") 

43 

44 try: 

45 # Compute convex hull 

46 hull = ConvexHull(candidates) 

47 

48 # Get the vertices of the convex hull 

49 vertices = candidates[hull.vertices] 

50 

51 # If we have more vertices than needed, select the most distant subset 

52 if len(vertices) > n_archetypes: 

53 # Select a subset with maximum distance between points 

54 selected_vertices = _select_distant_vertices(vertices, n_archetypes, rng) 

55 return selected_vertices 

56 # If we have exactly the right number, return them 

57 elif len(vertices) == n_archetypes: 

58 return vertices 

59 # Otherwise, try again with more candidates 

60 

61 except QhullError as e: 

62 error_msg = str(e) 

63 print("QhullError") 

64 # Check if the error is about not having enough points 

65 if "not enough points" in error_msg: 

66 continue 

67 # For other QhullError types, just try again 

68 print(error_msg) 

69 continue 

70 raise ValueError("Increase the max number of attempts") 

71 

72 

73def _select_distant_vertices(vertices, n_select, rng): 

74 """ 

75 Select n_select vertices from the given set, maximizing the minimum distance between any two selected vertices. 

76 Uses the maxmin (farthest point) sampling algorithm. 

77 

78 Parameters 

79 ---------- 

80 vertices : np.ndarray 

81 Array of shape (n_vertices, n_dimensions) containing the vertices. 

82 n_select : int 

83 Number of vertices to select. 

84 rng : numpy.random.Generator 

85 Random number generator. 

86 

87 Returns 

88 ------- 

89 np.ndarray 

90 Array of shape (n_select, n_dimensions) containing the selected vertices. 

91 """ 

92 if len(vertices) <= n_select: 

93 return vertices 

94 

95 # Calculate pairwise distance matrix 

96 dist_matrix = distance_matrix(vertices, vertices) 

97 

98 # Set diagonal to infinity (distance to self) 

99 np.fill_diagonal(dist_matrix, np.inf) 

100 

101 # Start with a random vertex 

102 selected_indices = [rng.integers(0, len(vertices))] 

103 

104 # Greedily add vertices that maximize the minimum distance 

105 while len(selected_indices) < n_select: 

106 # Find the minimum distance from each remaining point to any selected point 

107 remaining_indices = [i for i in range(len(vertices)) if i not in selected_indices] 

108 min_distances = np.min(dist_matrix[remaining_indices][:, selected_indices], axis=1) 

109 

110 # Select the point with maximum minimum distance 

111 max_idx = np.argmax(min_distances) 

112 selected_indices.append(remaining_indices[max_idx]) 

113 

114 return vertices[selected_indices] 

115 

116 

117def simulate_archetypes( 

118 n_samples: int, 

119 n_archetypes: int, 

120 n_dimensions: int, 

121 noise_std: float, 

122 max_attempts: int = 100, 

123 seed: int = 42, 

124): 

125 """ 

126 Simulate synthetic data for benchmarking archetypal analysis on datasets with known ground truth (archetypes Z and coefficients A). 

127 

128 Candidate archetypes are randomly sampled from the range [-1, 1]. 

129 Then, we compute the convex hull of these candiate archetypes to ensure that in the final set of archetypes we only include points that are vertices in the convex hull 

130 The coefficients in A, which map each data point to a convex combination of archetypes, are sampled from an exponential distribution and normalized to have a row sum of 1. 

131 The coordinates of the data points (X) are then calculated as X = A @ Z. Optionally, Gaussian noise can be added to simulate real-world noise. 

132 

133 Parameters 

134 ---------- 

135 n_samples : int 

136 Number of data points (samples) to generate. 

137 n_archetypes : int 

138 Number of archetypes to use for generating the data. 

139 n_dimensions : int 

140 Number of dimensions (features) for each data point and archetype. 

141 noise_std : float 

142 Standard deviation of Gaussian noise added to the data. Set to 0 for no noise. 

143 seed : int, optional (default=42) 

144 Random seed for reproducibility. 

145 

146 Returns 

147 ------- 

148 X : np.ndarray 

149 Generated data matrix of shape (n_samples, n_dimensions). 

150 A : np.ndarray 

151 Coefficient matrix of shape (n_samples, n_archetypes), representing the convex combinations 

152 of archetypes for each data point. 

153 Z : np.ndarray 

154 Archetype matrix of shape (n_archetypes, n_dimensions), representing the archetypes. 

155 """ 

156 assert noise_std >= 0 

157 

158 rng = np.random.default_rng(seed=seed) 

159 Z = _generate_extreme_archetypes(n_archetypes, n_dimensions, rng, max_attempts=max_attempts) 

160 A = rng.exponential(scale=1, size=(n_samples, n_archetypes)) 

161 A /= A.sum(axis=1, keepdims=True) 

162 X = A @ Z 

163 if noise_std > 0: 

164 X += rng.normal(loc=0, scale=noise_std, size=X.shape) 

165 

166 assert np.all(np.isclose(A.sum(axis=1), 1)) 

167 assert A.min() > 0 

168 

169 return X, A, Z