Coverage for src/tracekit/loaders/hdf5_loader.py: 86%
201 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-11 23:04 +0000
1"""HDF5 file loader for waveform data.
3This module provides loading of waveform data from HDF5 (.h5) files
4with automatic dataset discovery and attribute-based metadata extraction.
7Example:
8 >>> from tracekit.loaders.hdf5_loader import load_hdf5
9 >>> trace = load_hdf5("data.h5")
10 >>> print(f"Sample rate: {trace.metadata.sample_rate} Hz")
11"""
13from __future__ import annotations
15from pathlib import Path
16from typing import TYPE_CHECKING, Any
18import numpy as np
20from tracekit.core.exceptions import FormatError, LoaderError
21from tracekit.core.types import TraceMetadata, WaveformTrace
23if TYPE_CHECKING:
24 from os import PathLike
26# Try to import h5py
27try:
28 import h5py
30 H5PY_AVAILABLE = True
31except ImportError:
32 H5PY_AVAILABLE = False
# Candidate dataset names checked (case-insensitive substring match against
# the leaf name) when auto-detecting the waveform dataset in an HDF5 file.
DATASET_NAMES = [
    "data",
    "waveform",
    "signal",
    "samples",
    "voltage",
    "trace",
    "ch1",
    "ch2",
    "channel1",
    "channel2",
    "analog",
]

# Attribute names scanned, in priority order, when looking for the sample
# rate. "sample_interval" and "dt" hold a period in seconds and are inverted
# to obtain a rate in Hz; the others are taken as Hz directly.
SAMPLE_RATE_ATTRS = [
    "sample_rate",
    "samplerate",
    "sampling_rate",
    "fs",
    "rate",
    "sample_interval",
    "dt",
]
class HDF5MmapTrace:
    """Memory-mapped waveform trace backed by an HDF5 dataset.

    Provides lazy access to an HDF5 dataset without loading it into memory.
    Useful for huge files that don't fit in RAM. The underlying file is
    opened on first data access and released via :meth:`close`, context
    manager exit, or garbage collection.

    Attributes:
        file_path: Path to the HDF5 file.
        dataset_path: Path to dataset within HDF5 file.
        sample_rate: Sample rate in Hz.
        length: Number of samples in the trace.
        metadata: TraceMetadata object.

    Example:
        >>> trace = HDF5MmapTrace("huge.h5", "/data", metadata)
        >>> # Access subset without loading entire file
        >>> subset = trace[1000:2000]
    """

    def __init__(
        self,
        file_path: str | Path,
        dataset_path: str,
        metadata: TraceMetadata,
    ) -> None:
        """Initialize HDF5 memory-mapped trace.

        Args:
            file_path: Path to HDF5 file.
            dataset_path: Path to dataset within file (e.g., "/data").
            metadata: TraceMetadata with sample rate and other info.

        Raises:
            LoaderError: If file not found or invalid.
        """
        self._file_path = Path(file_path)
        self._dataset_path = dataset_path
        self._metadata = metadata
        # Handles are opened lazily on first data access.
        self._h5_file: h5py.File | None = None
        self._dataset: h5py.Dataset | None = None

        if not self._file_path.exists():
            raise LoaderError(f"File not found: {self._file_path}")

    @property
    def sample_rate(self) -> float:
        """Sample rate in Hz."""
        return self._metadata.sample_rate

    @property
    def length(self) -> int:
        """Number of samples (opens the file on first access)."""
        self._ensure_open()
        assert self._dataset is not None
        return len(self._dataset)

    @property
    def metadata(self) -> TraceMetadata:
        """Trace metadata."""
        return self._metadata

    def _ensure_open(self) -> None:
        """Ensure the HDF5 file is open and the dataset handle is resolved.

        The file and the dataset are checked independently: if the file is
        already open but only the dataset reference was dropped, the existing
        handle is reused instead of opening (and leaking) a second one.
        """
        if self._h5_file is None:
            self._h5_file = h5py.File(self._file_path, "r")
        if self._dataset is None:
            self._dataset = self._h5_file[self._dataset_path]

    def __getitem__(self, key: int | slice) -> np.ndarray[Any, Any]:
        """Access data by index or slice.

        Args:
            key: Index or slice.

        Returns:
            Numpy array of data as float64.
        """
        self._ensure_open()
        assert self._dataset is not None
        data = self._dataset[key]
        return np.asarray(data, dtype=np.float64)

    def __len__(self) -> int:
        """Return number of samples."""
        return self.length

    def close(self) -> None:
        """Close the HDF5 file handle and drop the dataset reference."""
        if self._h5_file is not None:
            self._h5_file.close()
            self._h5_file = None
        self._dataset = None

    def __del__(self) -> None:
        """Best-effort cleanup when the object is garbage-collected."""
        # A destructor must never raise: it may run during interpreter
        # shutdown when module globals (e.g. h5py internals) are already
        # torn down.
        try:
            self.close()
        except Exception:
            pass

    def __enter__(self) -> HDF5MmapTrace:
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit: release the file handle."""
        self.close()

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"HDF5MmapTrace("
            f"file={self._file_path.name}, "
            f"dataset={self._dataset_path}, "
            f"sample_rate={self.sample_rate:.2e} Hz, "
            f"length={self.length:,} samples)"
        )
def load_hdf5(
    path: str | PathLike[str],
    *,
    dataset: str | None = None,
    channel: str | int | None = None,
    sample_rate: float | None = None,
    mmap: bool = False,
) -> WaveformTrace | HDF5MmapTrace:
    """Load waveform data from an HDF5 file.

    Loads waveform data and metadata from HDF5 files. Automatically
    discovers datasets and extracts sample rate from attributes.

    Args:
        path: Path to the HDF5 file.
        dataset: Specific dataset path to load. If None, auto-detects.
        channel: Alias for dataset (for API consistency with other loaders).
        sample_rate: Override sample rate (if not found in attributes).
        mmap: If True, return memory-mapped trace for large files.

    Returns:
        WaveformTrace containing the waveform data and metadata.
        If mmap=True, returns HDF5MmapTrace instead.

    Raises:
        LoaderError: If the file cannot be loaded.
        FormatError: If no valid waveform data is found.

    Example:
        >>> trace = load_hdf5("data.h5")
        >>> print(f"Sample rate: {trace.metadata.sample_rate} Hz")

        >>> # Load specific dataset
        >>> trace = load_hdf5("multi.h5", dataset="/measurements/ch1")

        >>> # Load as memory-mapped for large files
        >>> trace = load_hdf5("huge_data.h5", mmap=True)
    """
    if not H5PY_AVAILABLE:
        raise LoaderError(
            "HDF5 support not available",
            details="h5py package is required for HDF5 loading",
            fix_hint="Install h5py: pip install h5py",
        )

    path = Path(path)

    if not path.exists():
        raise LoaderError(
            "File not found",
            file_path=str(path),
        )

    # ``channel`` is an alias so callers can use one keyword across loaders.
    if dataset is None and channel is not None:
        dataset = str(channel)

    try:
        with h5py.File(path, "r") as f:
            # Resolve the dataset: exact path, then fuzzy name match, then
            # full auto-detection.
            if dataset is not None:
                if dataset in f:
                    ds = f[dataset]
                else:
                    # Try to find by name
                    ds = _find_dataset_by_name(f, dataset)
                    if ds is None:
                        available = list_datasets(path)
                        raise FormatError(
                            f"Dataset not found: {dataset}",
                            file_path=str(path),
                            expected=dataset,
                            got=f"Available: {', '.join(available)}",
                        )
            else:
                # Auto-detect dataset
                ds = _find_waveform_dataset(f)
                if ds is None:
                    available = list_datasets(path)
                    raise FormatError(
                        "No waveform data found in HDF5 file",
                        file_path=str(path),
                        expected=f"Dataset named: {', '.join(DATASET_NAMES)}",
                        got=f"Datasets: {', '.join(available)}",
                    )

            # Guard against the path resolving to a group, not a dataset.
            if not isinstance(ds, h5py.Dataset):
                raise FormatError(
                    "Selected path is not a dataset",
                    file_path=str(path),
                    got=type(ds).__name__,
                )

            # Flatten multi-dimensional data into a 1D float64 array.
            data = np.asarray(ds, dtype=np.float64)
            if data.ndim > 1:
                data = data.ravel()

            # Sample rate: explicit override wins, then file attributes,
            # then a 1 MHz fallback.
            detected_sample_rate = sample_rate
            if detected_sample_rate is None:
                detected_sample_rate = _find_sample_rate(f, ds)

            if detected_sample_rate is None:
                detected_sample_rate = 1e6  # Default

            # Get other metadata
            vertical_scale = _get_attr(ds, ["vertical_scale", "v_scale", "scale"])
            vertical_offset = _get_attr(ds, ["vertical_offset", "v_offset", "offset"])
            channel_name = _get_attr(ds, ["channel_name", "name", "channel"])

            if channel_name is None:
                channel_name = ds.name.split("/")[-1] if ds.name else "CH1"

            metadata = TraceMetadata(
                sample_rate=float(detected_sample_rate),
                # Explicit ``is not None`` checks: a legitimate stored value
                # of 0.0 is falsy and must not be silently dropped to None.
                vertical_scale=float(vertical_scale) if vertical_scale is not None else None,
                vertical_offset=float(vertical_offset) if vertical_offset is not None else None,
                source_file=str(path),
                channel_name=str(channel_name),
            )

            # Return memory-mapped trace if requested
            if mmap:
                return HDF5MmapTrace(
                    file_path=path,
                    dataset_path=ds.name,
                    metadata=metadata,
                )

            return WaveformTrace(data=data, metadata=metadata)

    except (LoaderError, FormatError):
        # Domain errors above already carry full context; re-raise as-is.
        raise
    except OSError as e:
        raise LoaderError(
            "Failed to read HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e
    except Exception as e:
        raise LoaderError(
            "Failed to load HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e
def _find_waveform_dataset(f: h5py.File) -> h5py.Dataset | None:
    """Find a waveform dataset in the HDF5 file.

    Datasets whose leaf name contains one of DATASET_NAMES take priority
    over generic 1D numeric datasets, regardless of traversal order. (The
    previous single-pass logic let whichever qualifying dataset appeared
    first in the file win, so a generic array could shadow e.g. "/ch1".)

    Args:
        f: Open HDF5 file handle.

    Returns:
        The preferred dataset, or None if nothing suitable is found.
    """
    named: h5py.Dataset | None = None
    fallback: h5py.Dataset | None = None

    def visitor(name: str, obj: Any) -> None:
        nonlocal named, fallback
        if named is not None:
            # A name-matched dataset has already been found; nothing better
            # can appear, so skip the rest of the traversal cheaply.
            return
        if not isinstance(obj, h5py.Dataset):
            return
        leaf = name.lower().split("/")[-1]
        if any(ds_name in leaf for ds_name in DATASET_NAMES):
            named = obj
            return
        # Remember the first plausible generic dataset as a fallback:
        # 1D, numeric, and more than a handful of samples.
        if fallback is None and obj.ndim == 1 and obj.size > 10 and np.issubdtype(obj.dtype, np.number):
            fallback = obj

    f.visititems(visitor)
    return named if named is not None else fallback
def _find_dataset_by_name(f: h5py.File, name: str) -> h5py.Dataset | None:
    """Locate the first dataset whose full path contains *name*.

    The match is case-insensitive and partial (substring), so e.g. "ch1"
    matches "/measurements/CH1".

    Args:
        f: Open HDF5 file handle.
        name: Name fragment to search for.

    Returns:
        The first matching dataset in traversal order, or None.
    """
    target = name.lower()
    found: h5py.Dataset | None = None

    def visit(path: str, obj: Any) -> None:
        nonlocal found
        if found is None and isinstance(obj, h5py.Dataset) and target in path.lower():
            found = obj

    f.visititems(visit)
    return found
def _rate_from_attrs(attrs: Any) -> float | None:
    """Extract a sample rate in Hz from one HDF5 attribute collection.

    Scans SAMPLE_RATE_ATTRS in priority order. "sample_interval" and "dt"
    store a period in seconds and are inverted to Hz; a nonpositive period
    is invalid and is skipped (the previous code returned the raw period
    verbatim as if it were a rate).
    """
    for attr_name in SAMPLE_RATE_ATTRS:
        if attr_name not in attrs:
            continue
        value = float(attrs[attr_name])
        if attr_name in ("sample_interval", "dt"):
            if value > 0:
                return 1.0 / value
            continue  # invalid period: keep scanning remaining names
        return value
    return None


def _find_sample_rate(f: h5py.File, ds: h5py.Dataset) -> float | None:
    """Find the sample rate from HDF5 attributes.

    Search order: the dataset's own attributes, its parent group, the file
    root, and finally a top-level "metadata" group (if present).

    Args:
        f: Open HDF5 file handle.
        ds: The selected waveform dataset.

    Returns:
        Sample rate in Hz, or None if no attribute was found.
    """
    rate = _rate_from_attrs(ds.attrs)
    if rate is not None:
        return rate

    if ds.parent is not None:
        rate = _rate_from_attrs(ds.parent.attrs)
        if rate is not None:
            return rate

    rate = _rate_from_attrs(f.attrs)
    if rate is not None:
        return rate

    if "metadata" in f:
        meta = f["metadata"]
        # Any object h5py hands back here (Group or Dataset) carries .attrs;
        # duck-typing keeps this robust without an isinstance on h5py types.
        meta_attrs = getattr(meta, "attrs", None)
        if meta_attrs is not None:
            return _rate_from_attrs(meta_attrs)

    return None
408def _get_attr(obj: h5py.Dataset | h5py.Group, names: list[str]) -> Any | None:
409 """Get attribute value by trying multiple names."""
410 for name in names:
411 if name in obj.attrs:
412 value = obj.attrs[name]
413 if isinstance(value, bytes): 413 ↛ 414line 413 didn't jump to line 414 because the condition on line 413 was never true
414 return value.decode("utf-8")
415 return value
416 return None
def list_datasets(path: str | PathLike[str]) -> list[str]:
    """List all datasets in an HDF5 file.

    Args:
        path: Path to the HDF5 file.

    Returns:
        List of dataset paths.

    Raises:
        LoaderError: If h5py is not available or file not found.

    Example:
        >>> datasets = list_datasets("data.h5")
        >>> print(datasets)
        ['/measurements/ch1', '/measurements/ch2', '/time']
    """
    if not H5PY_AVAILABLE:
        raise LoaderError(
            "HDF5 support not available",
            details="h5py package is required",
        )

    path = Path(path)
    if not path.exists():
        raise LoaderError("File not found", file_path=str(path))

    found: list[str] = []

    # visititems yields paths without a leading slash; prepend one so the
    # results are absolute HDF5 paths.
    def collect(name: str, obj: Any) -> None:
        if isinstance(obj, h5py.Dataset):
            found.append("/" + name)

    try:
        with h5py.File(path, "r") as handle:
            handle.visititems(collect)
    except Exception as e:
        raise LoaderError(
            "Failed to read HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e

    return found
def get_attributes(
    path: str | PathLike[str],
    dataset: str | None = None,
) -> dict[str, Any]:
    """Get attributes from an HDF5 file or dataset.

    Args:
        path: Path to the HDF5 file.
        dataset: Dataset path. If None, returns root attributes.

    Returns:
        Dictionary of attributes.

    Raises:
        LoaderError: If h5py is not available or file not found.
    """
    if not H5PY_AVAILABLE:
        raise LoaderError("HDF5 support not available")

    path = Path(path)
    if not path.exists():
        raise LoaderError("File not found", file_path=str(path))

    def _normalize(value: Any) -> Any:
        # Decode bytes to text and convert numpy arrays to plain lists so
        # the returned dict contains only ordinary Python values.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value

    try:
        with h5py.File(path, "r") as f:
            target = f if dataset is None else f[dataset]
            return {key: _normalize(val) for key, val in target.attrs.items()}
    except Exception as e:
        raise LoaderError(
            "Failed to read HDF5 attributes",
            file_path=str(path),
            details=str(e),
        ) from e
510__all__ = ["get_attributes", "list_datasets", "load_hdf5"]