# Source code for scitex_ml.classification.timeseries._TimeSeriesMetadata
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-09-21 20:48:00 (ywatanabe)"
# File: _TimeSeriesMetadata.py
"""
Time series metadata dataclass.
Stores comprehensive metadata about time series datasets for informed
cross-validation strategy selection.
"""
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple
[docs]
@dataclass
class TimeSeriesMetadata:
    """
    Metadata about the time series data.

    This dataclass captures essential characteristics of time series data
    that inform the selection of appropriate cross-validation strategies.

    Attributes
    ----------
    n_samples : int
        Total number of samples in the dataset
    n_features : int
        Number of features per sample
    n_classes : Optional[int]
        Number of unique classes (None for regression)
    has_groups : bool
        Whether data contains group/subject identifiers
    group_sizes : Optional[Dict[Any, int]]
        Mapping of group IDs to their sample counts
    time_range : Optional[Tuple[float, float]]
        Minimum and maximum timestamp values
    sampling_rate : Optional[float]
        Samples per time unit (e.g., Hz for sensor data)
    has_gaps : bool
        Whether the time series has temporal gaps
    max_gap_size : Optional[float]
        Maximum gap between consecutive timestamps
    is_balanced : bool
        Whether classes are balanced (for classification)
    class_distribution : Optional[Dict[Any, float]]
        Mapping of class labels to their proportions

    Examples
    --------
    >>> import numpy as np
    >>> from scitex_ml.classification import TimeSeriesMetadata
    >>>
    >>> # Create metadata for a dataset
    >>> metadata = TimeSeriesMetadata(
    ...     n_samples=1000,
    ...     n_features=10,
    ...     n_classes=2,
    ...     has_groups=True,
    ...     group_sizes={0: 250, 1: 250, 2: 250, 3: 250},
    ...     time_range=(0.0, 999.0),
    ...     sampling_rate=1.0,
    ...     has_gaps=False,
    ...     max_gap_size=None,
    ...     is_balanced=True,
    ...     class_distribution={0: 0.5, 1: 0.5}
    ... )
    >>>
    >>> print(f"Dataset has {metadata.n_samples} samples")
    Dataset has 1000 samples
    >>> print(f"Number of groups: {len(metadata.group_sizes) if metadata.group_sizes else 0}")
    Number of groups: 4
    """

    n_samples: int
    n_features: int
    n_classes: Optional[int] = None
    has_groups: bool = False
    group_sizes: Optional[Dict[Any, int]] = None
    time_range: Optional[Tuple[float, float]] = None
    sampling_rate: Optional[float] = None
    has_gaps: bool = False
    max_gap_size: Optional[float] = None
    is_balanced: bool = True
    class_distribution: Optional[Dict[Any, float]] = None

    def get_summary(self) -> str:
        """
        Generate human-readable summary of the metadata.

        Optional sections (classes, groups, time range, sampling rate,
        gaps) are only included when the corresponding fields are set.

        Returns
        -------
        str
            Formatted summary string, one item per line
        """
        lines = [
            "Time Series Dataset Metadata:",
            f" Samples: {self.n_samples}",
            f" Features: {self.n_features}",
        ]

        if self.n_classes is not None:
            lines.append(f" Classes: {self.n_classes}")
            if self.class_distribution:
                lines.append(f" Class balance: {self.class_distribution}")

        # The truthiness check on group_sizes also guards the division
        # below against an empty mapping.
        if self.has_groups and self.group_sizes:
            n_groups = len(self.group_sizes)
            avg_size = sum(self.group_sizes.values()) / n_groups
            lines.append(f" Groups: {n_groups} (avg size: {avg_size:.1f})")

        if self.time_range:
            duration = self.time_range[1] - self.time_range[0]
            lines.append(f" Time range: {duration:.2f} units")

        if self.sampling_rate:
            lines.append(f" Sampling rate: {self.sampling_rate:.2f} Hz")

        if self.has_gaps:
            # max_gap_size is optional and defaults to None; applying a
            # float format spec to None raises TypeError, so only print
            # the size when it is actually known.
            if self.max_gap_size is not None:
                lines.append(f" Has gaps: Yes (max: {self.max_gap_size:.2f})")
            else:
                lines.append(" Has gaps: Yes")

        return "\n".join(lines)

    def suggest_strategy(self) -> str:
        """
        Suggest appropriate CV strategy based on metadata.

        The checks are applied in priority order: grouped data first, then
        imbalanced classification, then high sampling rate, and finally
        the default.

        Returns
        -------
        str
            Suggested strategy name: one of "blocking", "stratified",
            "sliding" or "expanding"
        """
        if self.has_groups:
            return "blocking"
        elif self.n_classes and not self.is_balanced:
            return "stratified"
        elif self.sampling_rate and self.sampling_rate > 10:
            return "sliding"  # High frequency data
        else:
            return "expanding"  # Default for simple time series