Coverage for src / autoencodix / evaluate / _general_evaluator.py: 11%
261 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-21 10:09 +0200
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-21 10:09 +0200
1from typing import Union, no_type_check
2import warnings
4import numpy as np
5import pandas as pd
6import sklearn
7from sklearn import linear_model
8from sklearn.model_selection import cross_validate
9from sklearn.decomposition import PCA
10from sklearn.metrics import get_scorer
12from umap import UMAP
13from sklearn.manifold import TSNE
14from sklearn.base import ClassifierMixin, RegressorMixin
17from autoencodix.utils._result import Result
18from autoencodix.data._datasetcontainer import DatasetContainer
19from autoencodix.base._base_evaluator import BaseEvaluator
20from autoencodix.base._base_visualizer import BaseVisualizer
# Module-level side effect: switches on scikit-learn metadata routing for the
# whole process as soon as this module is imported.
# NOTE(review): presumably required because GeneralEvaluator._single_ml_presplit
# passes an extra ``labels=`` keyword to the scorer, which scikit-learn only
# accepts when metadata routing is enabled — TODO confirm.
sklearn.set_config(enable_metadata_routing=True)
class GeneralEvaluator(BaseEvaluator):
    """Score feature representations by training scikit-learn models on them.

    For every feature representation ("Latent" plus the requested reference
    methods) and every annotation column, a classifier or regressor is trained
    and scored. The scores are collected in long format and stored in
    ``result.embedding_evaluation``.
    """

    def __init__(self):
        """Create the evaluator; no instance state is set up here."""
        # NOTE(review): the base-class initialiser is intentionally not called
        # (it was commented out in the original) — confirm BaseEvaluator needs
        # no set-up before enabling it.
        # super().__init__()
        pass

    @no_type_check
    def evaluate(
        self,
        datasets: DatasetContainer,
        result: Result,
        ml_model_class: ClassifierMixin = linear_model.LogisticRegression(
            max_iter=1000
        ),
        ml_model_regression: RegressorMixin = linear_model.LinearRegression(),
        params: Union[list, str] = "all",
        metric_class: str = "roc_auc_ovo",
        metric_regression: str = "r2",
        reference_methods: list = [],  # read-only: never mutated below
        reference_reducer: dict = {},  # read-only: never mutated below
        split_type: str = "use-split",
        n_downsample: Union[int, None] = 10000,
        top_k_classes: Union[int, None] = 20,
        exclude_classes: Union[list, None] = None,
    ) -> Result:
        """Evaluate ML models on several feature representations and annotations.

        Classification or regression models are trained on each feature set
        (latent space, PCA, UMAP, TSNE, RandomFeature) for each annotation
        column. Supported strategies are the pre-defined train/valid/test
        split, k-fold cross-validation and leave-one-out cross-validation.

        - Samples with a missing annotation value are excluded only from the
          evaluation of that annotation column.
        - For "RandomFeature", five random feature sets are evaluated.
        - Results are appended to any existing ``embedding_evaluation`` of the
          result object.
        - Splits found in ``result.new_datasets`` overwrite the corresponding
          splits of ``datasets`` (the container is modified in place).

        Args:
            datasets: Container with train, valid and test datasets. ``None``
                is replaced by an empty ``DatasetContainer``.
            result: A Result object; its ``embedding_evaluation`` attribute is
                updated with the new scores.
            ml_model_class: scikit-learn classifier used for classification
                tasks (default: ``LogisticRegression(max_iter=1000)``).
            ml_model_regression: scikit-learn regressor used for regression
                tasks (default: ``LinearRegression()``).
            params: List of annotation columns to evaluate, a single column
                name, or "all" to use every column (default: "all").
            metric_class: Scoring metric for classification tasks
                (default: "roc_auc_ovo").
            metric_regression: Scoring metric for regression tasks
                (default: "r2").
            reference_methods: Additional feature representations to evaluate
                (e.g. "PCA", "UMAP", "TSNE", "RandomFeature"). "Latent" is
                always included. The passed list is not modified.
            reference_reducer: Optional mapping of pre-fitted reducers, e.g.
                ``{"PCA": pca, "UMAP": umap, "TSNE": tsne}``.
            split_type: "use-split" for the pre-defined splits, "CV-N" for
                N-fold cross-validation, or "LOOCV" for leave-one-out
                cross-validation (default: "use-split").
            n_downsample: If given, downsample to at most this many samples.
                ``None`` disables downsampling (default: 10000).
            top_k_classes: If given, keep only the k most frequent classes in
                classification tasks and combine the rest into "other".
            exclude_classes: Classes whose samples are dropped before
                evaluation (default: None).

        Returns:
            The updated result object with scores in ``embedding_evaluation``.

        Raises:
            ValueError: If no dataset split is available or an unsupported
                split type is specified.
        """
        already_warned = False

        # Build a fresh list: appending in place would grow the shared default
        # argument on every call and silently modify the caller's list.
        reference_methods = [*reference_methods, "Latent"]
        reference_methods = self._expand_reference_methods(
            reference_methods=reference_methods, result=result
        )

        ## Overwrite original datasets with new_datasets if available after predict with other data
        if datasets is None:
            datasets = DatasetContainer()

        if bool(result.new_datasets.train):
            datasets.train = result.new_datasets.train
        if bool(result.new_datasets.valid):
            datasets.valid = result.new_datasets.valid
        if bool(result.new_datasets.test):
            datasets.test = result.new_datasets.test

        if not bool(datasets.train or datasets.valid or datasets.test):
            raise ValueError(
                "No datasets found in result object. Please run predict with new data or save/load with all datasets by using save_all=True."
            )
        elif split_type == "use-split" and not bool(datasets.train):
            warnings.warn(
                "Warning: No train split found in result datasets for 'use-split' evaluation. ML model cannot be trained without a train split. Switch to cross-validation (CV-5) instead."
            )
            split_type = "CV-5"

        # Validate the strategy once, before any expensive feature loading.
        use_presplit = split_type == "use-split"
        fixed_folds = None  # stays None for LOOCV: one fold per sample
        if split_type.startswith("CV-"):
            fixed_folds = int(split_type.split("-")[1])
        elif not use_presplit and split_type != "LOOCV":
            raise ValueError(
                f"Your split type {split_type} is not supported. Please use 'use-split', 'CV-5', 'LOOCV' or 'CV-N'."
            )

        # Loop-invariant inputs: annotations, split table and target columns.
        clin_data = BaseVisualizer._collect_all_metadata(result=result)
        sample_split = self._build_sample_split(datasets) if use_presplit else None
        if isinstance(params, str):
            # A bare column name would otherwise be iterated char by char.
            params = clin_data.columns.tolist() if params == "all" else [params]

        collected = []
        for task in reference_methods:
            print(f"Perform ML task with feature df: {task}")

            subtasks = [task]
            if "RandomFeature" in task:
                subtasks = [task + "_R" + str(x) for x in range(1, 6)]

            for sub in subtasks:
                print(sub)
                df = self._load_input_for_ml(
                    task,
                    datasets,
                    result,
                    n_downsample,
                    reference_reducer=reference_reducer,
                )

                for task_param in params:
                    print(f"Perform ML task for target parameter: {task_param}")
                    ## Check if classification or regression task
                    ml_type = self._get_ml_type(clin_data, task_param)

                    # Per-parameter views: filtering for one annotation column
                    # must not shrink the data used for the following columns.
                    df_param = df
                    split_param = sample_split

                    if pd.isna(clin_data[task_param]).sum() > 0:
                        if not already_warned:
                            print(
                                "There are NA values in the annotation file. Samples with missing data will be removed for ML task evaluation."
                            )
                            already_warned = True
                        samples_nonna = clin_data.loc[
                            pd.notna(clin_data[task_param]), task_param
                        ].index
                        df_param = df.loc[samples_nonna.intersection(df.index), :]
                        if use_presplit:
                            split_param = sample_split.loc[
                                samples_nonna.intersection(sample_split.index), :
                            ]

                    if n_downsample is not None and df_param.shape[0] > n_downsample:
                        sample_idx = np.random.choice(
                            df_param.shape[0], n_downsample, replace=False
                        )
                        df_param = df_param.iloc[sample_idx]
                        if use_presplit:
                            split_param = split_param.loc[df_param.index, :]

                    if use_presplit:
                        # Drop split entries without features (e.g. samples
                        # removed by per-split downsampling of the latent
                        # space); a no-op when both are already aligned.
                        split_param = split_param.loc[
                            split_param.index.isin(df_param.index)
                        ]

                    if ml_type == "classification":
                        metric = metric_class
                        sklearn_ml = ml_model_class
                    else:  # _get_ml_type only returns these two values
                        metric = metric_regression
                        sklearn_ml = ml_model_regression

                    if use_presplit:
                        results = self._single_ml_presplit(
                            sample_split=split_param,
                            df=df_param,
                            clin_data=clin_data,
                            task_param=task_param,
                            sklearn_ml=sklearn_ml,
                            metric=metric,
                            ml_type=ml_type,
                            top_k_classes=top_k_classes,
                            exclude_classes=exclude_classes,
                        )
                    else:
                        results = self._single_ml(
                            df=df_param,
                            clin_data=clin_data,
                            task_param=task_param,
                            sklearn_ml=sklearn_ml,
                            metric=metric,
                            ml_type=ml_type,
                            cv_folds=(
                                len(df_param) if fixed_folds is None else fixed_folds
                            ),
                            top_k_classes=top_k_classes,
                            exclude_classes=exclude_classes,
                        )

                    results = self._enrich_results(
                        results=results,
                        sklearn_ml=sklearn_ml,
                        ml_type=ml_type,
                        task=task,
                        sub=sub,
                    )
                    if not results.empty:
                        collected.append(results)

        # Concatenate once instead of re-copying the frame in every iteration.
        df_results = pd.concat(collected) if collected else pd.DataFrame()

        ## Check if embedding_evaluation is empty
        existing = getattr(result, "embedding_evaluation", None)
        if existing is None or len(existing) == 0:
            result.embedding_evaluation = df_results
        else:
            # merge with existing results
            result.embedding_evaluation = pd.concat([existing, df_results], axis=0)

        return result

    @staticmethod
    def _build_sample_split(datasets: DatasetContainer) -> pd.DataFrame:
        """Build the SAMPLE_ID/SPLIT table for the available dataset splits.

        For each of train, valid and test that is not ``None`` the paired
        sample ids are used when the split provides them, otherwise its plain
        ``sample_ids``.

        Args:
            datasets: Container holding the train, valid and test datasets.

        Returns:
            DataFrame with columns "SAMPLE_ID" and "SPLIT", indexed by
            "SAMPLE_ID" (the column is kept as well).
        """
        frames = []
        for split_name in ("train", "valid", "test"):
            split_data = getattr(datasets, split_name)
            if split_data is None:
                continue
            # getattr with a default also covers datasets that do not define
            # ``paired_sample_ids`` at all.
            sample_ids = getattr(split_data, "paired_sample_ids", None)
            if sample_ids is None:
                sample_ids = split_data.sample_ids
            frames.append(
                pd.DataFrame(
                    {
                        "SAMPLE_ID": sample_ids,
                        "SPLIT": [split_name] * len(sample_ids),
                    }
                )
            )

        if frames:
            sample_split = pd.concat(frames, axis=0, ignore_index=True)
        else:
            sample_split = pd.DataFrame(columns=["SAMPLE_ID", "SPLIT"])
        return sample_split.set_index("SAMPLE_ID", drop=False)

    @staticmethod
    def _collapse_to_other(labels: pd.Series, keep) -> pd.Series:
        """Replace every label that is not in ``keep`` by the string "other".

        Args:
            labels: Class labels.
            keep: Iterable of labels that stay unchanged.

        Returns:
            Object-dtype Series with the same index as ``labels``.
        """
        # Cast first: a categorical Series cannot take the new value "other".
        as_object = labels.astype(object)
        return as_object.where(as_object.isin(list(keep)), "other")

    @staticmethod
    def _single_ml(
        df: pd.DataFrame,
        clin_data: pd.DataFrame,
        task_param: str,
        sklearn_ml: Union[ClassifierMixin, RegressorMixin],
        metric: str,
        ml_type: str,
        cv_folds: int = 5,
        top_k_classes: Union[int, None] = 20,
        exclude_classes: Union[list, None] = None,
    ):
        """Cross-validate the given sklearn model on ``df`` against one label column.

        Args:
            df: Dataframe with input data
            clin_data: Dataframe with label data
            task_param: Column name with label data
            sklearn_ml: Sklearn ML module specifying the ML algorithm
            metric: string specifying the metric to be calculated by cross validation
            ml_type: "classification" or "regression"; class merging and top-k
                restriction are only applied for classification
            cv_folds: Number of cross validation folds
            top_k_classes: Number of top classes to keep, others combined into "other"
            exclude_classes: List of classes to exclude from evaluation (default: None)

        Returns:
            score_df: data frame containing train and test scores for all CV
            runs (long format). It is empty (no columns) when the labels hold
            fewer than two distinct values.
        """
        # X -> df
        # Y -> task_param
        y: Union[pd.Series, pd.DataFrame] = clin_data.loc[df.index, task_param]
        if exclude_classes is not None:
            keep_rows = ~y.isin(exclude_classes)
            df = df[keep_rows]
            y = y[keep_rows]

        is_classification = ml_type == "classification"
        # Unused categories would show up with count 0 in value_counts() and
        # trigger the rare-class merge spuriously. Plain object columns have
        # no ``.cat`` accessor, hence the dtype check.
        if is_classification and isinstance(y.dtype, pd.CategoricalDtype):
            y = y.cat.remove_unused_categories()

        score_df = dict()
        if len(y.unique()) <= 1:  # ty: ignore
            warnings.warn(
                f"Warning: There is only one class in the data for task parameter {task_param}. Skipping evaluation for this task."
            )
            return pd.DataFrame(score_df)

        if is_classification:
            # Check that more samples per class than cv_folds. Counts are taken
            # from the labels actually evaluated, not from the full annotation.
            class_counts = y.value_counts()  # ty: ignore
            min_class_count = class_counts.min()
            if min_class_count < cv_folds:
                warnings.warn(
                    f"Warning: For task parameter {task_param}, some classes have less samples ({min_class_count}) than the number of CV folds ({cv_folds}). Combining these classes into one class 'other' for evaluation."
                )
                frequent = class_counts[class_counts >= cv_folds].index
                y = GeneralEvaluator._collapse_to_other(y, frequent)

            # Restrict number of classes to top k classes
            if top_k_classes is not None and len(y.unique()) > top_k_classes:
                top_k_classes_list = y.value_counts().nlargest(top_k_classes).index
                y = GeneralEvaluator._collapse_to_other(y, top_k_classes_list)

        ## Cross Validation
        scores = cross_validate(
            sklearn_ml,
            df,
            y,
            cv=cv_folds,
            scoring=metric,
            return_train_score=True,
            n_jobs=-1,
        )

        # Output Format
        # CV_RUN | SCORE_SPLIT | TASK_PARAM | METRIC | VALUE
        score_df = {
            "cv_run": [],
            "score_split": [],
            "CLINIC_PARAM": [],
            "metric": [],
            "value": [],
        }
        cv_runs = ["CV_" + str(x) for x in range(1, cv_folds + 1)]
        for key, values in scores.items():
            score_split = key.split("_")[0]
            # Skip the timing entries returned by cross_validate.
            if score_split not in ("test", "train"):
                continue
            score_df["cv_run"].extend(cv_runs)
            score_df["score_split"].extend([score_split] * cv_folds)
            score_df["CLINIC_PARAM"].extend([task_param] * cv_folds)
            score_df["metric"].extend([metric] * cv_folds)
            score_df["value"].extend(values)

        return pd.DataFrame(score_df)

    def _enrich_results(
        self,
        results: pd.DataFrame,
        sklearn_ml: Union[ClassifierMixin, RegressorMixin],
        ml_type: str,
        task: str,
        sub: str,
    ) -> pd.DataFrame:
        """Add constant descriptor columns to a score table (modified in place).

        Args:
            results: Score table produced by one evaluation run.
            sklearn_ml: Model that was evaluated; the text before the first
                "(" of its string representation is stored as "ML_ALG".
            ml_type: Task type, stored as "ML_TYPE".
            task: Feature representation, stored as "ML_TASK".
            sub: Sub-task name, stored as "ML_SUBTASK".

        Returns:
            The same ``results`` object with the four columns added.
        """
        # Scalars broadcast to every row; no per-row lists are needed.
        results["ML_ALG"] = str(sklearn_ml).split("(")[0]
        results["ML_TYPE"] = ml_type
        results["ML_TASK"] = task
        results["ML_SUBTASK"] = sub

        return results

    @staticmethod
    def _single_ml_presplit(
        sample_split: pd.DataFrame,
        df: pd.DataFrame,
        clin_data: pd.DataFrame,
        task_param: str,
        sklearn_ml: Union[ClassifierMixin, RegressorMixin],
        metric: str,
        ml_type: str,
        top_k_classes: Union[int, None] = 20,
        exclude_classes: Union[list, None] = None,
    ):
        """Fit the model on the train split and score it on train, valid and test.

        Args:
            sample_split: DataFrame with sample IDs and their corresponding split ("train", "valid", "test").
            df: DataFrame with input features, indexed by sample IDs.
            clin_data: DataFrame with label/annotation data, indexed by sample IDs.
            task_param: Column name in clin_data specifying the target variable.
            sklearn_ml: Instantiated sklearn model; it is fitted in place.
            metric: Scoring metric compatible with sklearn's get_scorer.
            ml_type: Type of machine learning task ("classification" or "regression").
            top_k_classes: If provided, restrict classification tasks to the top k classes, combining others into "other" (default: 20).
            exclude_classes: If provided, samples labelled with one of these classes are dropped from every split (default: None).

        Returns:
            DataFrame with one score per non-empty split for the given metric.
            It has no rows when the training labels hold fewer than two
            distinct values.

        Raises:
            ValueError: If the metric is not supported by sklearn or
                ``ml_type`` is neither "classification" nor "regression".
        """
        score_df = {"score_split": [], "CLINIC_PARAM": [], "metric": [], "value": []}

        def split_xy(split: str):
            """Return features and labels of one split after class exclusion."""
            ids = sample_split.loc[sample_split.SPLIT == split, "SAMPLE_ID"]
            features = df.loc[ids, :]
            labels = clin_data.loc[list(features.index), task_param]
            if exclude_classes is not None:
                keep_rows = (~labels.isin(exclude_classes)).to_numpy()
                features = features.loc[keep_rows]
                labels = labels[keep_rows]
            return features, labels

        X_train, Y_train = split_xy("train")
        if len(Y_train.unique()) <= 1:  # ty: ignore
            ## Warning that there is only one class in the training data
            warnings.warn(
                f"Warning: There is only one class in the training data for task parameter {task_param}. Skipping evaluation for this task."
            )
            return pd.DataFrame(score_df)

        if ml_type not in ("classification", "regression"):
            raise ValueError(
                f"Your ML type {ml_type} is not supported. Please use 'classification' or 'regression'."
            )
        is_classification = ml_type == "classification"

        # Restrict number of classes to top k classes. The kept classes are
        # remembered explicitly so the evaluation splits are mapped the same way.
        top_k_classes_list = None
        if (
            is_classification
            and top_k_classes is not None
            and len(Y_train.unique()) > top_k_classes
        ):
            top_k_classes_list = Y_train.value_counts().nlargest(top_k_classes).index
            Y_train = GeneralEvaluator._collapse_to_other(Y_train, top_k_classes_list)

        # train model once on training data
        sklearn_ml.fit(X_train, Y_train)  # ty: ignore

        sklearn_scorer = get_scorer(metric)
        if sklearn_scorer is None:
            raise ValueError(
                f"Your metric {metric} is not supported by sklearn. Please use a valid metric."
            )

        train_classes = Y_train.unique()  # ty: ignore

        # eval on all splits
        for split in ("train", "valid", "test"):
            X, Y = split_xy(split)
            if X.shape[0] == 0:
                # No samples in this split, skip
                continue

            if is_classification:
                if top_k_classes_list is not None:
                    # Adjust Y to only contain top k classes and other as for Y_train
                    Y = GeneralEvaluator._collapse_to_other(Y, top_k_classes_list)
                # Check that Y has only classes which are present in Y_train
                if set(Y.unique()).difference(set(train_classes)):  # ty: ignore
                    print(f"Classes in split {split} are not present in training data")
                    seen = Y.isin(train_classes).to_numpy()  # ty: ignore
                    Y = Y[seen]
                    X = X.loc[seen]
                    if X.shape[0] == 0:
                        # Nothing left to score in this split
                        continue
                score_temp = sklearn_scorer(
                    sklearn_ml, X, Y, labels=np.sort(train_classes)
                )
            else:
                score_temp = sklearn_scorer(sklearn_ml, X, Y)

            # Performance on train, valid and test data split; rows are added
            # only once a score exists so all columns keep the same length.
            score_df["score_split"].append(split)
            score_df["CLINIC_PARAM"].append(task_param)
            score_df["metric"].append(metric)
            score_df["value"].append(score_temp)

        return pd.DataFrame(score_df)

    @staticmethod
    def _get_ml_type(clin_data: pd.DataFrame, task_param: str) -> str:
        """Decide whether a clinical column is a classification or regression target.

        Args:
            clin_data: The clinical data as a pandas DataFrame.
            task_param: The column name in clin_data to inspect for determining the task type.

        Returns:
            "classification" if the first non-missing value of the column is a
            string or the column holds fewer than three distinct values
            (missing values count as one value); otherwise "regression".
        """
        ## Auto-Detection
        column = clin_data[task_param]
        # Look at the first *observed* value: a leading NaN in a string column
        # must not turn the task into a regression.
        observed = column.dropna()
        if not observed.empty and isinstance(observed.iloc[0], str):
            return "classification"
        if column.unique().shape[0] < 3:
            return "classification"
        return "regression"

    @staticmethod
    def _load_input_for_ml(
        task: str,
        dataset: DatasetContainer,
        result: Result,
        n_downsample: Union[int, None] = None,
        reference_reducer: dict = {},  # read-only: never mutated below
    ) -> pd.DataFrame:
        """Load the feature matrix for one feature representation.

        Task Details:
            - "Latent": Concatenates latent representations of the train and
              valid splits at the final epoch and of the test split at epoch -1.
            - "UMAP": Applies UMAP dimensionality reduction to the concatenated dataset splits.
            - "PCA": Applies PCA dimensionality reduction to the concatenated dataset splits.
            - "TSNE": Applies t-SNE dimensionality reduction to the concatenated dataset splits.
            - "RandomFeature": Randomly samples columns (features) from the concatenated dataset splits.

        Args:
            task: The type of ML task. Supported values are "Latent", "UMAP", "PCA", "TSNE", and "RandomFeature".
            dataset: The dataset container object holding train, validation, and test splits.
            result: The result object containing model configuration and methods to retrieve latent representations.
            n_downsample: If provided, each latent split is downsampled to this
                number of samples. Only applied for "Latent"; default is None.
            reference_reducer: Optional dictionary of pre-fitted dimensionality reduction objects for PCA, UMAP, or TSNE (default: {}).

        Returns:
            A DataFrame containing the processed input data suitable for the specified ML task.

        Raises:
            ValueError: If the task is not supported, no dataset split is
                available, or a provided PCA/UMAP reducer does not match the
                latent dimension of the model.
        """
        final_epoch = result.model.config.epochs - 1

        if task == "Latent":
            dfs = []
            for split in ["train", "valid", "test"]:
                df_split = result.get_latent_df(
                    epoch=final_epoch if split != "test" else -1, split=split
                )
                if df_split is None or df_split.empty:
                    continue
                if n_downsample is not None and df_split.shape[0] > n_downsample:
                    print("Downsampling data for Latent representation...")
                    sample_idx = np.random.choice(
                        df_split.shape[0], n_downsample, replace=False
                    )
                    df_split = df_split.iloc[sample_idx]
                dfs.append(df_split)

            return pd.concat(dfs) if dfs else pd.DataFrame()

        if task not in ("UMAP", "PCA", "TSNE", "RandomFeature"):
            raise ValueError(
                f"Your ML task {task} is not supported. Please use Latent, UMAP, PCA, TSNE or RandomFeature."
            )

        dfs = []
        for split_name in ["train", "valid", "test"]:
            split_data = getattr(dataset, split_name, None)
            if split_data is not None:
                dfs.append(split_data._to_df())

        if not dfs:
            raise ValueError(
                "No available dataset splits (train, valid, test) to process."
            )

        df_processed = pd.concat(dfs)
        latent_dim = result.model.config.latent_dim

        if task == "RandomFeature":
            return df_processed.sample(n=latent_dim, axis=1)

        if task == "UMAP":
            if task in reference_reducer:
                reducer = reference_reducer[task]
                if reducer.n_components != latent_dim:
                    raise ValueError(
                        f"The provided UMAP reducer has n_components={reducer.n_components}, which does not match the latent dimension {latent_dim} specified in the model config."
                    )
            else:
                reducer = UMAP(n_components=latent_dim)
                reducer.fit(df_processed)
            embedded = reducer.transform(df_processed)
        elif task == "PCA":
            if task in reference_reducer:
                reducer = reference_reducer[task]
                if reducer.n_components_ != latent_dim:
                    raise ValueError(
                        f"The provided PCA reducer has n_components={reducer.n_components_}, which does not match the latent dimension {latent_dim} specified in the model config."
                    )
                print("Using pre-fitted PCA reducer for dimensionality reduction.")
            else:
                reducer = PCA(n_components=latent_dim)
                reducer.fit(df_processed)
            embedded = reducer.transform(df_processed)
        else:  # task == "TSNE"
            if task in reference_reducer:
                reducer = reference_reducer[task]
            else:
                # The default Barnes-Hut solver only supports fewer than four
                # output dimensions; larger latent spaces need the exact one.
                reducer = TSNE(
                    n_components=latent_dim,
                    method="barnes_hut" if latent_dim < 4 else "exact",
                )
            if task in reference_reducer and hasattr(reducer, "transform"):
                embedded = reducer.transform(df_processed)
            else:
                # scikit-learn's TSNE has no transform(); the embedding can
                # only be obtained through fit_transform().
                embedded = reducer.fit_transform(df_processed)

        return pd.DataFrame(embedded, index=df_processed.index)