Coverage for partipy/initialize.py: 96%
68 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-09 10:24 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-09 10:24 +0200
1import numpy as np
2from scipy.spatial.distance import cdist
4from .optim import _compute_A_projected_gradients
7def _construct_B_from_indices(X: np.ndarray, indices: list) -> np.ndarray:
8 X = X.astype(np.float32)
9 n_samples, _ = X.shape
10 B = np.zeros((len(indices), n_samples), dtype=np.float32)
11 for arch_idx in range(len(indices)):
12 B[arch_idx, indices[arch_idx]] = 1.0
13 B = np.ascontiguousarray(B, dtype=np.float32)
14 return B
17def _init_A(n_samples: int, n_archetypes: int, seed: int, epsilon: float = 1e-9) -> np.ndarray:
18 rng = np.random.default_rng(seed) # Use a fixed seed
19 A = -np.log(rng.random((n_samples, n_archetypes), dtype=np.float32) + epsilon)
20 A /= np.sum(A, axis=1, keepdims=True)
21 A = np.ascontiguousarray(A, dtype=np.float32)
22 return A
25def _init_uniform(
26 X: np.ndarray, n_archetypes: int, seed: int = 42, return_indices: bool = False
27) -> np.ndarray | tuple[np.ndarray, list[int]]:
28 """Random selection of data points from X to form initial archetypes.
30 Parameters
31 ----------
32 X: numpy 2d-array
33 Data matrix with shape (n_samples x n_features).
35 n_archetypes: int
36 Number of candidate archetypes to extract.
38 seed: int
39 Seed for reproducibility
41 Returns
42 -------
43 B : numpy 2d-array
44 Matrix B with shape (n_archetypes, n_samples).
45 """
46 assert n_archetypes >= 2
47 X = np.ascontiguousarray(X, dtype=np.float32)
48 n_samples, _ = X.shape
49 rng = np.random.default_rng(seed=seed)
50 selected_indices = list(rng.choice(a=n_samples, size=n_archetypes, replace=False))
51 B = _construct_B_from_indices(X=X, indices=selected_indices)
52 if return_indices:
53 return B, selected_indices
54 else:
55 return B
58def _init_furthest_sum(
59 X: np.ndarray, n_archetypes: int, seed: int = 42, return_indices: bool = False
60) -> np.ndarray | tuple[np.ndarray, list[int]]:
61 """Furthest sum initialization for archetypes.
63 Reference: M. Mørup and L. K. Hansen, “Archetypal analysis for machine learning and data mining,” Neurocomputing, vol. 80, pp. 54-63, Mar. 2012, doi: https://doi.org/10.1016/j.neucom.2011.06.033.
65 Parameters
66 ----------
67 X: numpy 2d-array
68 Data matrix with shape (n_samples x n_features).
70 n_archetypes: int
71 Number of candidate archetypes to extract.
73 seed: int
74 Seed for reproducibility
76 Returns
77 -------
78 B : numpy 2d-array
79 Matrix B with shape (n_archetypes, n_samples).
80 """
81 assert n_archetypes >= 2
82 X = np.ascontiguousarray(X, dtype=np.float32)
83 n_samples, _ = X.shape
84 rng = np.random.default_rng(seed=seed)
86 first_idx = rng.choice(n_samples, size=1)[0]
87 all_indices = np.arange(n_samples, dtype=int)
88 selected_indices = [first_idx]
90 # we replace the first 10 archetypes, see in the orginal implementation
91 # https://github.com/ulfaslak/py_pcha/blob/f398d28c6a28c4a8121cb75443b221fc58fc10e4/py_pcha/furthest_sum.py#L46
92 for iter in range(1, n_archetypes + 11):
93 if iter > (n_archetypes - 1):
94 selected_indices = selected_indices[1:]
95 Z = X[selected_indices, :]
96 distances = cdist(Z, X, metric="euclidean")
97 mean_distances = distances.mean(axis=0)
98 zero_distances = distances.min(axis=0)
99 allowed_indices = all_indices[zero_distances > 0]
100 selected_indices.append(allowed_indices[np.argmax(mean_distances[zero_distances > 0])])
102 B = _construct_B_from_indices(X=X, indices=selected_indices)
103 if return_indices:
104 return B, selected_indices
105 else:
106 return B
109def _init_plus_plus(
110 X: np.ndarray,
111 n_archetypes: int,
112 seed: int = 42,
113 return_indices: bool = False,
114 epsilon: float = 1e-9,
115) -> np.ndarray | tuple[np.ndarray, list[int]]:
116 """Archetypal++ initialization for archetypes.
118 Reference: Mair, S., Sjölund, J., 2024. Archetypal Analysis++: Rethinking the Initialization Strategy. doi: https://doi.org/10.48550/arXiv.2301.13748
121 Parameters
122 ----------
123 X: numpy 2d-array
124 Data matrix with shape (n_samples x n_features).
126 n_archetypes: int
127 Number of candidate archetypes to extract.
129 seed: int
130 Seed for reproducibility
132 Returns
133 -------
134 B : numpy 2d-array
135 Matrix B with shape (n_archetypes, n_samples).
136 """
137 assert n_archetypes >= 2
138 X = X = np.ascontiguousarray(X, dtype=np.float32)
139 n_samples, _ = X.shape
140 rng = np.random.default_rng(seed=seed)
142 first_idx = rng.choice(n_samples, size=1)[0]
143 selected_indices = [first_idx]
145 for iter in range(1, n_archetypes):
146 Z = X[selected_indices, :]
147 if iter == 1:
148 A = np.ones((n_samples, 1))
149 else:
150 A = _init_A(n_samples=n_samples, n_archetypes=len(selected_indices), seed=seed)
151 A = _compute_A_projected_gradients(X=X, Z=Z, A=A)
152 p = np.sum(np.square(X - np.dot(A, Z)), axis=1) + epsilon
153 p /= p.sum()
154 selected_indices.append(rng.choice(a=n_samples, size=1, p=p)[0])
156 B = _construct_B_from_indices(X=X, indices=selected_indices)
157 if return_indices:
158 return B, selected_indices
159 else:
160 return B