Coverage for partipy/initialize.py: 96%

68 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-09 10:24 +0200

1import numpy as np 

2from scipy.spatial.distance import cdist 

3 

4from .optim import _compute_A_projected_gradients 

5 

6 

7def _construct_B_from_indices(X: np.ndarray, indices: list) -> np.ndarray: 

8 X = X.astype(np.float32) 

9 n_samples, _ = X.shape 

10 B = np.zeros((len(indices), n_samples), dtype=np.float32) 

11 for arch_idx in range(len(indices)): 

12 B[arch_idx, indices[arch_idx]] = 1.0 

13 B = np.ascontiguousarray(B, dtype=np.float32) 

14 return B 

15 

16 

17def _init_A(n_samples: int, n_archetypes: int, seed: int, epsilon: float = 1e-9) -> np.ndarray: 

18 rng = np.random.default_rng(seed) # Use a fixed seed 

19 A = -np.log(rng.random((n_samples, n_archetypes), dtype=np.float32) + epsilon) 

20 A /= np.sum(A, axis=1, keepdims=True) 

21 A = np.ascontiguousarray(A, dtype=np.float32) 

22 return A 

23 

24 

25def _init_uniform( 

26 X: np.ndarray, n_archetypes: int, seed: int = 42, return_indices: bool = False 

27) -> np.ndarray | tuple[np.ndarray, list[int]]: 

28 """Random selection of data points from X to form initial archetypes. 

29 

30 Parameters 

31 ---------- 

32 X: numpy 2d-array 

33 Data matrix with shape (n_samples x n_features). 

34 

35 n_archetypes: int 

36 Number of candidate archetypes to extract. 

37 

38 seed: int 

39 Seed for reproducibility 

40 

41 Returns 

42 ------- 

43 B : numpy 2d-array 

44 Matrix B with shape (n_archetypes, n_samples). 

45 """ 

46 assert n_archetypes >= 2 

47 X = np.ascontiguousarray(X, dtype=np.float32) 

48 n_samples, _ = X.shape 

49 rng = np.random.default_rng(seed=seed) 

50 selected_indices = list(rng.choice(a=n_samples, size=n_archetypes, replace=False)) 

51 B = _construct_B_from_indices(X=X, indices=selected_indices) 

52 if return_indices: 

53 return B, selected_indices 

54 else: 

55 return B 

56 

57 

58def _init_furthest_sum( 

59 X: np.ndarray, n_archetypes: int, seed: int = 42, return_indices: bool = False 

60) -> np.ndarray | tuple[np.ndarray, list[int]]: 

61 """Furthest sum initialization for archetypes. 

62 

63 Reference: M. Mørup and L. K. Hansen, “Archetypal analysis for machine learning and data mining,” Neurocomputing, vol. 80, pp. 54-63, Mar. 2012, doi: https://doi.org/10.1016/j.neucom.2011.06.033. 

64 

65 Parameters 

66 ---------- 

67 X: numpy 2d-array 

68 Data matrix with shape (n_samples x n_features). 

69 

70 n_archetypes: int 

71 Number of candidate archetypes to extract. 

72 

73 seed: int 

74 Seed for reproducibility 

75 

76 Returns 

77 ------- 

78 B : numpy 2d-array 

79 Matrix B with shape (n_archetypes, n_samples). 

80 """ 

81 assert n_archetypes >= 2 

82 X = np.ascontiguousarray(X, dtype=np.float32) 

83 n_samples, _ = X.shape 

84 rng = np.random.default_rng(seed=seed) 

85 

86 first_idx = rng.choice(n_samples, size=1)[0] 

87 all_indices = np.arange(n_samples, dtype=int) 

88 selected_indices = [first_idx] 

89 

90 # we replace the first 10 archetypes, see in the orginal implementation 

91 # https://github.com/ulfaslak/py_pcha/blob/f398d28c6a28c4a8121cb75443b221fc58fc10e4/py_pcha/furthest_sum.py#L46 

92 for iter in range(1, n_archetypes + 11): 

93 if iter > (n_archetypes - 1): 

94 selected_indices = selected_indices[1:] 

95 Z = X[selected_indices, :] 

96 distances = cdist(Z, X, metric="euclidean") 

97 mean_distances = distances.mean(axis=0) 

98 zero_distances = distances.min(axis=0) 

99 allowed_indices = all_indices[zero_distances > 0] 

100 selected_indices.append(allowed_indices[np.argmax(mean_distances[zero_distances > 0])]) 

101 

102 B = _construct_B_from_indices(X=X, indices=selected_indices) 

103 if return_indices: 

104 return B, selected_indices 

105 else: 

106 return B 

107 

108 

109def _init_plus_plus( 

110 X: np.ndarray, 

111 n_archetypes: int, 

112 seed: int = 42, 

113 return_indices: bool = False, 

114 epsilon: float = 1e-9, 

115) -> np.ndarray | tuple[np.ndarray, list[int]]: 

116 """Archetypal++ initialization for archetypes. 

117 

118 Reference: Mair, S., Sjölund, J., 2024. Archetypal Analysis++: Rethinking the Initialization Strategy. doi: https://doi.org/10.48550/arXiv.2301.13748 

119 

120 

121 Parameters 

122 ---------- 

123 X: numpy 2d-array 

124 Data matrix with shape (n_samples x n_features). 

125 

126 n_archetypes: int 

127 Number of candidate archetypes to extract. 

128 

129 seed: int 

130 Seed for reproducibility 

131 

132 Returns 

133 ------- 

134 B : numpy 2d-array 

135 Matrix B with shape (n_archetypes, n_samples). 

136 """ 

137 assert n_archetypes >= 2 

138 X = X = np.ascontiguousarray(X, dtype=np.float32) 

139 n_samples, _ = X.shape 

140 rng = np.random.default_rng(seed=seed) 

141 

142 first_idx = rng.choice(n_samples, size=1)[0] 

143 selected_indices = [first_idx] 

144 

145 for iter in range(1, n_archetypes): 

146 Z = X[selected_indices, :] 

147 if iter == 1: 

148 A = np.ones((n_samples, 1)) 

149 else: 

150 A = _init_A(n_samples=n_samples, n_archetypes=len(selected_indices), seed=seed) 

151 A = _compute_A_projected_gradients(X=X, Z=Z, A=A) 

152 p = np.sum(np.square(X - np.dot(A, Z)), axis=1) + epsilon 

153 p /= p.sum() 

154 selected_indices.append(rng.choice(a=n_samples, size=1, p=p)[0]) 

155 

156 B = _construct_B_from_indices(X=X, indices=selected_indices) 

157 if return_indices: 

158 return B, selected_indices 

159 else: 

160 return B