import warnings
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
class EmulatorDataset(Dataset):
    """
    A PyTorch dataset for loading emulator data, designed to handle sequence-based inputs and projections.

    Every sample is a window of the ``sequence_length`` most recent timesteps
    ending at the requested timestep. Windows never cross a projection
    boundary: when fewer than ``sequence_length`` timesteps precede the
    requested one inside its projection, the start of the window is
    zero-padded.

    Args:
        X (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The input data, either
            3-D ``(projections, timesteps, features)`` or 2-D
            ``(projections * timesteps, features)``.
        y (pandas.DataFrame, numpy.ndarray, torch.Tensor, or None): The target data.
            It is indexed with the flat sample index, so it is expected to hold
            one entry per sample. May be None for inference.
        sequence_length (int, optional): The length of the input sequence. Default is 5.
        projection_length (int or tuple, optional): The number of timesteps per projection,
            given as an integer or a one-element tuple. Only used to split 2-D
            input into projections. Default is 86.

    Attributes:
        X (torch.Tensor): The input data converted to a float PyTorch tensor.
        y (torch.Tensor or None): The target data converted to a float PyTorch tensor.
        sequence_length (int): The length of the input sequence.
        xdim (int): The number of dimensions in X (2 or 3).
        num_projections (int): The number of (complete) projections in the dataset.
        num_timesteps (int): The number of timesteps per projection.
        num_features (int): The number of features in the dataset.
        features (int): Alias of ``num_features``, kept for backward compatibility.
        projections_and_timesteps (int): Number of rows in X (2-D input only).

    Raises:
        ValueError: If ``projection_length`` is a tuple that does not contain exactly
            one element, if X is not 2-D or 3-D, or if the data is of an
            unsupported type.
    """

    def __init__(self, X, y, sequence_length=5, projection_length=86):
        super().__init__()

        if isinstance(projection_length, tuple):
            if len(projection_length) != 1:
                raise ValueError(
                    "Projection length must be a single integer or a tuple containing a single integer."
                )
            projection_length = projection_length[0]

        self.X = self._to_tensor(X)
        self.y = self._to_tensor(y)
        self.sequence_length = sequence_length

        self.xdim = len(self.X.shape)
        if self.xdim not in (2, 3):
            raise ValueError(
                f"X must be 2-dimensional or 3-dimensional, got {self.xdim} dimension(s)."
            )

        # The timestep axis is axis 1 for batched (3-D) input and axis 0 for
        # flat (2-D) input; compare the right one against the projection length.
        available_timesteps = self.X.shape[1] if self.xdim == 3 else self.X.shape[0]
        if available_timesteps < projection_length:
            warnings.warn(
                f"Full projections of {projection_length} timesteps are not present in the dataset. This may lead to unexpected behavior."
            )

        if self.xdim == 3:  # Batched by projection
            self.num_projections, self.num_timesteps, self.num_features = self.X.shape
        else:  # Unbatched (rows of projections*timesteps)
            self.projections_and_timesteps, self.num_features = self.X.shape
            self.num_timesteps = projection_length
            self.num_projections = self.projections_and_timesteps // self.num_timesteps

        # ``features`` is the historical attribute name; keep it in sync for
        # both layouts so existing callers keep working.
        self.features = self.num_features

    def _to_tensor(self, x):
        """
        Converts input data to a PyTorch tensor of type float.

        Args:
            x (pandas.DataFrame, numpy.ndarray, torch.Tensor, or None): The input data.

        Returns:
            torch.Tensor or None: A float tensor, or None when ``x`` is None.

        Raises:
            ValueError: If ``x`` is of an unsupported type.
        """
        if x is None:
            return None
        if isinstance(x, pd.DataFrame):
            x = torch.tensor(x.values)
        elif isinstance(x, np.ndarray):
            x = torch.tensor(x)
        elif not isinstance(x, torch.Tensor):
            raise ValueError("Data must be a pandas dataframe, numpy array, or PyTorch tensor")
        return x.float()

    def __len__(self):
        """
        Returns the total number of samples (one per timestep of every projection).

        Returns:
            int: The dataset length.
        """
        if self.xdim == 2:
            return self.X.shape[0]
        return self.X.shape[0] * self.X.shape[1]

    def __getitem__(self, i):
        """
        Retrieves the i-th sample from the dataset, applying padding if necessary.

        Args:
            i (int): The flat index of the item to retrieve. Negative indices
                count from the end of the dataset.

        Returns:
            torch.Tensor or tuple: The ``(sequence_length, num_features)`` input
            sequence alone when no targets were given, otherwise a tuple of the
            sequence and ``y[i]``.

        Raises:
            IndexError: If ``i`` is outside the dataset.
        """
        total = len(self)
        requested = i
        if i < 0:
            i = i + total  # not ``+=``: avoid mutating a tensor index in place
        if not 0 <= i < total:
            raise IndexError(f"Index {requested} is out of range for a dataset of length {total}.")

        # Split the flat index into (projection, timestep within projection).
        projection_index = i // self.num_timesteps
        time_step_index = i % self.num_timesteps

        # Zero-initialised so that any unfilled leading rows act as padding.
        sequence = torch.zeros((self.sequence_length, self.num_features))

        # Window of real data, clipped at the start of the projection.
        start_point = max(0, time_step_index - self.sequence_length + 1)
        end_point = time_step_index + 1
        length_of_data = end_point - start_point

        # Right-align the data so the most recent timestep is always last.
        if self.xdim == 3:
            sequence[-length_of_data:] = self.X[projection_index, start_point:end_point]
        else:
            row_offset = projection_index * self.num_timesteps
            sequence[-length_of_data:] = self.X[row_offset + start_point : row_offset + end_point]

        if self.y is None:
            return sequence
        return sequence, self.y[i]
class PyTorchDataset(Dataset):
    """
    A thin general-purpose dataset wrapping pre-built inputs and optional targets.

    Args:
        X (torch.Tensor): The input data.
        y (torch.Tensor): The target data, or None when no targets are available.

    Attributes:
        X_data: The inputs exactly as passed in.
        y_data: The targets exactly as passed in (possibly None).
    """

    def __init__(self, X, y):
        self.X_data, self.y_data = X, y

    def __getitem__(self, index):
        """
        Looks up one sample.

        Args:
            index (int): Position of the sample.

        Returns:
            The input at ``index`` alone when there are no targets, otherwise
            an ``(input, target)`` tuple.
        """
        inputs = self.X_data[index]
        if self.y_data is not None:
            return inputs, self.y_data[index]
        return inputs

    def __len__(self):
        """
        Reports how many samples the dataset holds.

        Returns:
            int: ``len`` of the wrapped inputs.
        """
        n_samples = len(self.X_data)
        return n_samples
class TSDataset(Dataset):
    """
    A PyTorch dataset for handling time series data with sequence-based input.

    Sample ``i`` is the window of the ``sequence_length`` rows of ``X`` ending
    at row ``i``. Near the start of the series, where not enough history
    exists, the window is left-padded with copies of the first row.

    Args:
        X (torch.Tensor): The input data; indexed as a 2-D ``(rows, features)`` tensor.
        y (torch.Tensor): The target data, or None when no targets are available.
        sequence_length (int, optional): The length of the input sequence. Default is 5.

    Attributes:
        X (torch.Tensor): The input data.
        y (torch.Tensor): The target data.
        sequence_length (int): The sequence length.
    """

    def __init__(self, X, y, sequence_length=5):
        super().__init__()
        self.X = X
        self.y = y
        self.sequence_length = sequence_length

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
            int: The number of rows in X.
        """
        return len(self.X)

    def __getitem__(self, i):
        """
        Retrieves the i-th sample, applying padding if needed.

        Args:
            i (int): The index of the sample. Negative indices count from the
                end of the dataset.

        Returns:
            torch.Tensor or tuple: The input window alone when no targets were
            given, otherwise a tuple of the window and ``y[i]``.

        Raises:
            IndexError: If ``i`` is outside the dataset. Without this check an
                out-of-range index silently yields a truncated or empty window.
        """
        total = len(self)
        requested = i
        if i < 0:
            i = i + total  # not ``+=``: avoid mutating a tensor index in place
        if not 0 <= i < total:
            raise IndexError(f"Index {requested} is out of range for a dataset of length {total}.")

        window_start = i - self.sequence_length + 1
        if window_start >= 0:
            x = self.X[window_start : i + 1, :]
        else:
            # Not enough history: repeat the first row to fill the missing
            # ``-window_start`` leading positions.
            padding = self.X[0].repeat(-window_start, 1)
            x = torch.cat((padding, self.X[0 : i + 1, :]), 0)

        if self.y is None:
            return x
        return x, self.y[i]
class ScenarioDataset(Dataset):
    """
    A PyTorch dataset pairing scenario features with their labels.

    Args:
        features (torch.Tensor): The input features.
        labels (torch.Tensor): The target labels.

    Attributes:
        features (torch.Tensor): The input features, stored as given.
        labels (torch.Tensor): The target labels, stored as given.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        """
        Reports how many samples the dataset holds.

        Returns:
            int: ``len`` of the stored features.
        """
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        """
        Looks up one ``(features, labels)`` pair.

        Args:
            idx (int): Position of the sample.

        Returns:
            tuple: The features and the labels found at ``idx``.
        """
        sample_features = self.features[idx]
        sample_labels = self.labels[idx]
        return sample_features, sample_labels