# Source code for MED3pa.datasets.loading_strategies

"""
This module provides strategies for loading data from files into usable Python formats, focusing on converting data into **NumPy** arrays. 
It includes an abstract base class ``DataLoadingStrategy`` for defining common interfaces and concrete implementations of this class, such as ``CSVDataLoadingStrategy`` for handling CSV files.
This setup allows easy extension to support additional file types as needed.
"""

import numpy as np
import pandas as pd
from typing import List, Tuple
from abc import ABC, abstractmethod


class DataLoadingStrategy(ABC):
    """
    Abstract base class for data loading strategies.

    Defines a common interface for all data loading strategies: a single
    ``execute`` entry point that turns a file into column labels, an
    observations array and a target array.
    """

    # ``@staticmethod`` must be the outer decorator so that the abstract flag
    # set by ``@abstractmethod`` is still visible to ``ABC``. The method takes
    # no ``self``, matching the static ``execute`` of CSVDataLoadingStrategy.
    @staticmethod
    @abstractmethod
    def execute(
        path_to_file: str, target_column_name: str
    ) -> Tuple[List[str], np.ndarray, np.ndarray]:
        """
        Abstract method to execute the data loading strategy.

        Args:
            path_to_file (str): The path to the file to be loaded.
            target_column_name (str): The name of the target column in the dataset.

        Returns:
            Tuple[List[str], np.ndarray, np.ndarray]: A tuple containing the column labels,
            observations as a NumPy array, and the target as a NumPy array.
        """
        pass
class CSVDataLoadingStrategy(DataLoadingStrategy):
    """
    Strategy class for loading CSV data.

    Implements the abstract ``execute`` method to handle CSV files.

    Methods:
        execute(path_to_file: str, target_column_name: str) -> Tuple[List[str], np.ndarray, np.ndarray]:
            Loads CSV data from the given path, separates observations and target,
            and converts them to NumPy arrays.
    """

    @staticmethod
    def execute(
        path_to_file: str, target_column_name: str
    ) -> Tuple[List[str], np.ndarray, np.ndarray]:
        """
        Loads CSV data from the given path, separates observations and target,
        and converts them to NumPy arrays.

        Args:
            path_to_file (str): The path to the CSV file to be loaded.
            target_column_name (str): The name of the target column in the dataset.

        Returns:
            Tuple[List[str], np.ndarray, np.ndarray]: Column labels of the observation
            columns (every column except the target), observations as a NumPy array,
            and target as a NumPy array.

        Raises:
            KeyError: If ``target_column_name`` is not a column of the loaded CSV.
        """
        # Read the CSV file
        df = pd.read_csv(path_to_file)

        # Fail early with an actionable message; the exception type is the same
        # KeyError that the column drop below would raise for a missing column.
        if target_column_name not in df.columns:
            raise KeyError(
                f"Target column '{target_column_name}' not found in '{path_to_file}'. "
                f"Available columns: {df.columns.tolist()}"
            )

        # Separate observations and target
        observations = df.drop(columns=[target_column_name])
        target = df[target_column_name]
        column_labels = observations.columns.tolist()

        # Convert to NumPy arrays
        obs_np = observations.to_numpy()
        target_np = target.to_numpy()

        return column_labels, obs_np, target_np