Coverage for src/tracekit/loaders/hdf5_loader.py: 86% (201 statements)

coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

"""HDF5 file loader for waveform data.

This module provides loading of waveform data from HDF5 (.h5) files
with automatic dataset discovery and attribute-based metadata extraction.

Example:
    >>> from tracekit.loaders.hdf5_loader import load_hdf5
    >>> trace = load_hdf5("data.h5")
    >>> print(f"Sample rate: {trace.metadata.sample_rate} Hz")
"""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any

import numpy as np

from tracekit.core.exceptions import FormatError, LoaderError
from tracekit.core.types import TraceMetadata, WaveformTrace

if TYPE_CHECKING:
    from os import PathLike

# Try to import h5py
try:
    import h5py

    H5PY_AVAILABLE = True
except ImportError:
    H5PY_AVAILABLE = False


# Common dataset names for waveform data
DATASET_NAMES = [
    "data",
    "waveform",
    "signal",
    "samples",
    "voltage",
    "trace",
    "ch1",
    "ch2",
    "channel1",
    "channel2",
    "analog",
]

# Common attribute names for sample rate
SAMPLE_RATE_ATTRS = [
    "sample_rate",
    "samplerate",
    "sampling_rate",
    "fs",
    "rate",
    "sample_interval",
    "dt",
]

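# Illustrative sketch of a file these constants would match (the file name
# "capture.h5" and the attribute values are hypothetical, not part of this
# module); a 1-D dataset whose name appears in DATASET_NAMES is auto-discovered,
# and a "sample_rate" attribute is picked up via SAMPLE_RATE_ATTRS:
#
#     import h5py
#     import numpy as np
#
#     with h5py.File("capture.h5", "w") as f:
#         ds = f.create_dataset("waveform", data=np.zeros(10_000))
#         ds.attrs["sample_rate"] = 1e6  # Hz
#
#     trace = load_hdf5("capture.h5")  # finds "waveform" automatically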

class HDF5MmapTrace:
    """Memory-mapped waveform trace backed by an HDF5 dataset.

    Provides lazy access to an HDF5 dataset without loading it into memory.
    Useful for huge files that don't fit in RAM.

    Attributes:
        file_path: Path to the HDF5 file.
        dataset_path: Path to the dataset within the HDF5 file.
        sample_rate: Sample rate in Hz.
        length: Number of samples in the trace.
        metadata: TraceMetadata object.

    Example:
        >>> trace = HDF5MmapTrace("huge.h5", "/data", metadata)
        >>> # Access a subset without loading the entire file
        >>> subset = trace[1000:2000]
    """

    def __init__(
        self,
        file_path: str | Path,
        dataset_path: str,
        metadata: TraceMetadata,
    ) -> None:
        """Initialize an HDF5 memory-mapped trace.

        Args:
            file_path: Path to the HDF5 file.
            dataset_path: Path to the dataset within the file (e.g., "/data").
            metadata: TraceMetadata with sample rate and other info.

        Raises:
            LoaderError: If the file is not found or invalid.
        """
        self._file_path = Path(file_path)
        self._dataset_path = dataset_path
        self._metadata = metadata
        self._h5_file: h5py.File | None = None
        self._dataset: h5py.Dataset | None = None

        if not self._file_path.exists():
            raise LoaderError(f"File not found: {self._file_path}")

    @property
    def sample_rate(self) -> float:
        """Sample rate in Hz."""
        return self._metadata.sample_rate

    @property
    def length(self) -> int:
        """Number of samples."""
        self._ensure_open()
        assert self._dataset is not None
        return len(self._dataset)

    @property
    def metadata(self) -> TraceMetadata:
        """Trace metadata."""
        return self._metadata

    def _ensure_open(self) -> None:
        """Ensure the HDF5 file is open."""
        if self._h5_file is None or self._dataset is None:
            self._h5_file = h5py.File(self._file_path, "r")
            self._dataset = self._h5_file[self._dataset_path]

    def __getitem__(self, key: int | slice) -> np.ndarray[Any, Any]:
        """Access data by index or slice.

        Args:
            key: Index or slice.

        Returns:
            Numpy array of data.
        """
        self._ensure_open()
        assert self._dataset is not None
        data = self._dataset[key]
        return np.asarray(data, dtype=np.float64)

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.length

    def close(self) -> None:
        """Close the HDF5 file handle."""
        if self._h5_file is not None:
            self._h5_file.close()
            self._h5_file = None
            self._dataset = None

    def __del__(self) -> None:
        """Cleanup on deletion."""
        self.close()

    def __enter__(self) -> HDF5MmapTrace:
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit."""
        self.close()

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"HDF5MmapTrace("
            f"file={self._file_path.name}, "
            f"dataset={self._dataset_path}, "
            f"sample_rate={self.sample_rate:.2e} Hz, "
            f"length={self.length:,} samples)"
        )

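# Illustrative sketch of memory-mapped access (the file "huge.h5" and dataset
# path "/data" are hypothetical, and it is assumed that TraceMetadata fields
# other than sample_rate are optional); only the requested slice is read:
#
#     meta = TraceMetadata(sample_rate=1e6)
#     with HDF5MmapTrace("huge.h5", "/data", meta) as trace:
#         subset = trace[1_000_000:2_000_000]  # float64 array of 1,000,000 samples
#         total = len(trace)                   # total sample count, data stays on disk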

def load_hdf5(
    path: str | PathLike[str],
    *,
    dataset: str | None = None,
    channel: str | int | None = None,
    sample_rate: float | None = None,
    mmap: bool = False,
) -> WaveformTrace | HDF5MmapTrace:
    """Load waveform data from an HDF5 file.

    Loads waveform data and metadata from HDF5 files. Automatically
    discovers datasets and extracts sample rate from attributes.

    Args:
        path: Path to the HDF5 file.
        dataset: Specific dataset path to load. If None, auto-detects.
        channel: Alias for dataset (for API consistency with other loaders).
        sample_rate: Override sample rate (if not found in attributes).
        mmap: If True, return memory-mapped trace for large files.

    Returns:
        WaveformTrace containing the waveform data and metadata.
        If mmap=True, returns HDF5MmapTrace instead.

    Raises:
        LoaderError: If the file cannot be loaded.
        FormatError: If no valid waveform data is found.

    Example:
        >>> trace = load_hdf5("data.h5")
        >>> print(f"Sample rate: {trace.metadata.sample_rate} Hz")

        >>> # Load specific dataset
        >>> trace = load_hdf5("multi.h5", dataset="/measurements/ch1")

        >>> # Load as memory-mapped for large files
        >>> trace = load_hdf5("huge_data.h5", mmap=True)
    """
    if not H5PY_AVAILABLE:
        raise LoaderError(
            "HDF5 support not available",
            details="h5py package is required for HDF5 loading",
            fix_hint="Install h5py: pip install h5py",
        )

    path = Path(path)

    if not path.exists():
        raise LoaderError(
            "File not found",
            file_path=str(path),
        )

    # Use channel as dataset if dataset not specified
    if dataset is None and channel is not None:
        dataset = str(channel)

    try:
        with h5py.File(path, "r") as f:
            # Find dataset
            if dataset is not None:
                if dataset in f:
                    ds = f[dataset]
                else:
                    # Try to find by name
                    ds = _find_dataset_by_name(f, dataset)
                    if ds is None:
                        available = list_datasets(path)
                        raise FormatError(
                            f"Dataset not found: {dataset}",
                            file_path=str(path),
                            expected=dataset,
                            got=f"Available: {', '.join(available)}",
                        )
            else:
                # Auto-detect dataset
                ds = _find_waveform_dataset(f)
                if ds is None:
                    available = list_datasets(path)
                    raise FormatError(
                        "No waveform data found in HDF5 file",
                        file_path=str(path),
                        expected=f"Dataset named: {', '.join(DATASET_NAMES)}",
                        got=f"Datasets: {', '.join(available)}",
                    )

            # Extract data
            if not isinstance(ds, h5py.Dataset):
                raise FormatError(
                    "Selected path is not a dataset",
                    file_path=str(path),
                    got=type(ds).__name__,
                )

            data = np.asarray(ds, dtype=np.float64)
            if data.ndim > 1:
                data = data.ravel()

            # Extract metadata from attributes
            detected_sample_rate = sample_rate
            if detected_sample_rate is None:
                detected_sample_rate = _find_sample_rate(f, ds)

            if detected_sample_rate is None:
                detected_sample_rate = 1e6  # Default

            # Get other metadata
            vertical_scale = _get_attr(ds, ["vertical_scale", "v_scale", "scale"])
            vertical_offset = _get_attr(ds, ["vertical_offset", "v_offset", "offset"])
            channel_name = _get_attr(ds, ["channel_name", "name", "channel"])

            if channel_name is None:
                channel_name = ds.name.split("/")[-1] if ds.name else "CH1"

            metadata = TraceMetadata(
                sample_rate=float(detected_sample_rate),
                vertical_scale=float(vertical_scale) if vertical_scale else None,
                vertical_offset=float(vertical_offset) if vertical_offset else None,
                source_file=str(path),
                channel_name=str(channel_name),
            )

            # Return memory-mapped trace if requested
            if mmap:
                return HDF5MmapTrace(
                    file_path=path,
                    dataset_path=ds.name,
                    metadata=metadata,
                )

            return WaveformTrace(data=data, metadata=metadata)

    except OSError as e:
        raise LoaderError(
            "Failed to read HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e
    except Exception as e:
        if isinstance(e, LoaderError | FormatError):
            raise
        raise LoaderError(
            "Failed to load HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e

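# Illustrative sketch of recovering from a missing dataset (the file "scope.h5"
# and the dataset paths are hypothetical); a failed explicit lookup raises
# FormatError listing what is available, and a partial name can be retried:
#
#     try:
#         trace = load_hdf5("scope.h5", dataset="/acq/ch3", sample_rate=2.5e9)
#     except FormatError:
#         print(list_datasets("scope.h5"))              # e.g. ['/acq/ch1', '/acq/ch2']
#         trace = load_hdf5("scope.h5", dataset="ch1")  # case-insensitive partial match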

def _find_waveform_dataset(f: h5py.File) -> h5py.Dataset | None:
    """Find a waveform dataset in the HDF5 file."""
    result: h5py.Dataset | None = None

    def visitor(name: str, obj: Any) -> None:
        nonlocal result
        if result is not None:
            return
        if isinstance(obj, h5py.Dataset):
            name_lower = name.lower().split("/")[-1]
            # Check for common names
            for ds_name in DATASET_NAMES:
                if ds_name in name_lower:
                    result = obj
                    return
            # Check if it's a 1D numeric array
            if obj.ndim == 1 and obj.size > 10 and np.issubdtype(obj.dtype, np.number):
                if result is None:
                    result = obj

    f.visititems(visitor)
    return result


def _find_dataset_by_name(f: h5py.File, name: str) -> h5py.Dataset | None:
    """Find a dataset by name (case-insensitive partial match)."""
    name_lower = name.lower()
    result: h5py.Dataset | None = None

    def visitor(path: str, obj: Any) -> None:
        nonlocal result
        if result is not None:
            return
        if isinstance(obj, h5py.Dataset):
            path_lower = path.lower()
            if name_lower in path_lower:
                result = obj

    f.visititems(visitor)
    return result


def _find_sample_rate(f: h5py.File, ds: h5py.Dataset) -> float | None:
    """Find sample rate from HDF5 attributes."""
    # Check dataset attributes first
    for attr_name in SAMPLE_RATE_ATTRS:
        if attr_name in ds.attrs:
            value = ds.attrs[attr_name]
            if attr_name in ("sample_interval", "dt") and value > 0:
                return 1.0 / float(value)
            return float(value)

    # Check parent group attributes
    if ds.parent is not None:
        for attr_name in SAMPLE_RATE_ATTRS:
            if attr_name in ds.parent.attrs:
                value = ds.parent.attrs[attr_name]
                if attr_name in ("sample_interval", "dt") and value > 0:
                    return 1.0 / float(value)
                return float(value)

    # Check root attributes
    for attr_name in SAMPLE_RATE_ATTRS:
        if attr_name in f.attrs:
            value = f.attrs[attr_name]
            if attr_name in ("sample_interval", "dt") and value > 0:
                return 1.0 / float(value)
            return float(value)

    # Check for metadata group
    if "metadata" in f:
        meta = f["metadata"]
        if isinstance(meta, h5py.Group | h5py.Dataset):
            for attr_name in SAMPLE_RATE_ATTRS:
                if attr_name in meta.attrs:
                    value = meta.attrs[attr_name]
                    if attr_name in ("sample_interval", "dt") and value > 0:
                        return 1.0 / float(value)
                    return float(value)

    return None

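# Illustrative attribute layouts this lookup handles (the file "attrs.h5" and its
# contents are hypothetical); period-style attributes ("sample_interval", "dt")
# are inverted into a rate, and dataset attributes win over parent, root, and
# /metadata locations:
#
#     with h5py.File("attrs.h5", "w") as f:
#         ds = f.create_dataset("trace", data=np.zeros(1000))
#         ds.attrs["dt"] = 1e-9          # dataset-level period -> 1.0e9 Hz
#         f.attrs["sample_rate"] = 5e8   # root-level, used only if no dataset
#                                        # or parent-group attribute matches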

def _get_attr(obj: h5py.Dataset | h5py.Group, names: list[str]) -> Any | None:
    """Get attribute value by trying multiple names."""
    for name in names:
        if name in obj.attrs:
            value = obj.attrs[name]
            if isinstance(value, bytes):
                return value.decode("utf-8")
            return value
    return None


def list_datasets(path: str | PathLike[str]) -> list[str]:
    """List all datasets in an HDF5 file.

    Args:
        path: Path to the HDF5 file.

    Returns:
        List of dataset paths.

    Raises:
        LoaderError: If h5py is not available or file not found.

    Example:
        >>> datasets = list_datasets("data.h5")
        >>> print(datasets)
        ['/measurements/ch1', '/measurements/ch2', '/time']
    """
    if not H5PY_AVAILABLE:
        raise LoaderError(
            "HDF5 support not available",
            details="h5py package is required",
        )

    path = Path(path)
    if not path.exists():
        raise LoaderError("File not found", file_path=str(path))

    datasets: list[str] = []

    def visitor(name: str, obj: Any) -> None:
        if isinstance(obj, h5py.Dataset):
            datasets.append("/" + name)

    try:
        with h5py.File(path, "r") as f:
            f.visititems(visitor)
    except Exception as e:
        raise LoaderError(
            "Failed to read HDF5 file",
            file_path=str(path),
            details=str(e),
        ) from e

    return datasets

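# Illustrative sketch of enumerating channels in a multi-channel capture (the
# file "multi.h5" and the "ch" naming convention are hypothetical):
#
#     traces = {}
#     for ds_path in list_datasets("multi.h5"):
#         if "ch" in ds_path.lower():
#             traces[ds_path] = load_hdf5("multi.h5", dataset=ds_path)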

def get_attributes(
    path: str | PathLike[str],
    dataset: str | None = None,
) -> dict[str, Any]:
    """Get attributes from an HDF5 file or dataset.

    Args:
        path: Path to the HDF5 file.
        dataset: Dataset path. If None, returns root attributes.

    Returns:
        Dictionary of attributes.

    Raises:
        LoaderError: If h5py is not available or file not found.
    """
    if not H5PY_AVAILABLE:
        raise LoaderError("HDF5 support not available")

    path = Path(path)
    if not path.exists():
        raise LoaderError("File not found", file_path=str(path))

    try:
        with h5py.File(path, "r") as f:
            obj = f[dataset] if dataset is not None else f

            attrs = {}
            for key, value in obj.attrs.items():
                if isinstance(value, bytes):
                    value = value.decode("utf-8")
                elif isinstance(value, np.ndarray):
                    value = value.tolist()
                attrs[key] = value

            return attrs

    except Exception as e:
        raise LoaderError(
            "Failed to read HDF5 attributes",
            file_path=str(path),
            details=str(e),
        ) from e

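# Illustrative sketch of inspecting instrument metadata before loading (the file
# "scope.h5" and its "/measurements/ch1" dataset are hypothetical):
#
#     root_attrs = get_attributes("scope.h5")
#     ch1_attrs = get_attributes("scope.h5", dataset="/measurements/ch1")
#     print(ch1_attrs.get("sample_rate"), ch1_attrs.get("channel_name"))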

__all__ = ["get_attributes", "list_datasets", "load_hdf5"]