Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ groups \ features.py: 42%

216 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-27 20:09 -0800

1# -*- coding: utf-8 -*- 

2""" 

3Created on Fri Dec 13 12:40:34 2024 

4 

5@author: jpeacock 

6""" 

7 

8from __future__ import annotations 

9 

10# ============================================================================= 

11# Imports 

12# ============================================================================= 

13from typing import Optional 

14 

15import h5py 

16import numpy as np 

17import pandas as pd 

18import xarray as xr 

19from mt_metadata.features import FeatureDecimationChannel 

20from mt_metadata.processing.fourier_coefficients.decimation import Decimation 

21 

22from mth5.groups import BaseGroup, FeatureChannelDataset, RunGroup 

23from mth5.helpers import validate_name 

24from mth5.utils.exceptions import MTH5Error 

25 

26 

27# ============================================================================= 

28"""feature -> FeatureMasterGroup -> FeatureGroup -> DecimationLevelGroup -> ChannelGroup -> FeatureChannelDataset""" 

29 

# Spellings of the ``domain`` argument that select a time-series feature run
# (see FeatureGroup.add_feature_run_group / get_feature_run_group).
TIME_DOMAIN = [
    "ts",
    "time",
    "time series",
    "time_series",
]

# Spellings of the ``domain`` argument that select a Fourier-coefficient
# feature run.
FREQUENCY_DOMAIN = [
    "fc",
    "frequency",
    "fourier",
    "fourier_domain",
]

32 

33 

class MasterFeaturesGroup(BaseGroup):
    """
    Top-level container holding one :class:`FeatureGroup` per feature.

    Features may be derived from Fourier Coefficients or from time series.
    Each feature stored in the file gets its own named sub-group directly
    beneath this master group.

    Hierarchy
    ---------
    MasterFeaturesGroup -> FeatureGroup -> FeatureFCRunGroup / FeatureTSRunGroup ->

        - FC: FeatureDecimationGroup -> FeatureChannelDataset
        - Time Series: channel datasets (added through RunGroup)

    Parameters
    ----------
    group : h5py.Group
        HDF5 group that backs this MasterFeaturesGroup.
    **kwargs
        Forwarded unchanged to ``BaseGroup.__init__``.

    Examples
    --------
    >>> import h5py
    >>> from mth5.groups.features import MasterFeaturesGroup
    >>> with h5py.File('data.h5', 'r') as f:
    ...     master = MasterFeaturesGroup(f['features'])
    ...     feature_list = master.groups_list
    """

    def __init__(self, group: h5py.Group, **kwargs) -> None:
        super().__init__(group, **kwargs)

    def add_feature_group(
        self,
        feature_name: str,
        feature_metadata: Optional[FeatureDecimationChannel] = None,
    ) -> FeatureGroup:
        """
        Create a :class:`FeatureGroup` beneath this master group.

        The work is delegated to ``BaseGroup._add_group`` with
        ``match="name"``.

        Parameters
        ----------
        feature_name : str
            Name of the feature group to create.
        feature_metadata : FeatureDecimationChannel, optional
            Metadata describing the feature; passed on as ``group_metadata``.
            Default is None.

        Returns
        -------
        FeatureGroup
            Whatever ``_add_group`` returns for the requested feature.

        Examples
        --------
        >>> master = MasterFeaturesGroup(h5_group)
        >>> feature = master.add_feature_group('coherency')
        >>> print(feature.name)
        'coherency'
        """
        new_feature = self._add_group(
            feature_name,
            FeatureGroup,
            group_metadata=feature_metadata,
            match="name",
        )
        return new_feature

    def get_feature_group(self, feature_name: str) -> FeatureGroup:
        """
        Look up an existing feature group by name.

        Parameters
        ----------
        feature_name : str
            Name of the feature group to fetch.

        Returns
        -------
        FeatureGroup
            The group returned by ``BaseGroup._get_group``.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the feature group does not
            exist (raised inside ``BaseGroup._get_group``).

        Examples
        --------
        >>> master = MasterFeaturesGroup(h5_group)
        >>> feature = master.get_feature_group('coherency')
        >>> print(feature.name)
        'coherency'
        """
        found = self._get_group(feature_name, FeatureGroup)
        return found

    def remove_feature_group(self, feature_name: str) -> None:
        """
        Delete a feature group from this master group.

        Delegates to ``BaseGroup._remove_group``. Removing a group from an
        HDF5 file drops the reference but does not shrink the file; copy the
        data you want to keep into a new file to reclaim space.

        Parameters
        ----------
        feature_name : str
            Name of the feature group to delete.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the feature group does not
            exist (raised inside ``BaseGroup._remove_group``).

        Examples
        --------
        >>> master = MasterFeaturesGroup(h5_group)
        >>> master.remove_feature_group('coherency')
        """
        self._remove_group(feature_name)

159 

160 

class FeatureGroup(BaseGroup):
    """
    Container for one feature and all of its runs.

    A feature can hold Fourier Coefficient runs, time series runs, or both.
    Which run-group class is used is decided by a ``domain`` keyword (see
    ``FREQUENCY_DOMAIN`` and ``TIME_DOMAIN``).

    Hierarchy
    ---------
    FeatureGroup -> FeatureFCRunGroup / FeatureTSRunGroup ->

        - FC: FeatureDecimationGroup -> FeatureChannelDataset
        - TS: channel datasets (added through RunGroup)

    Parameters
    ----------
    group : h5py.Group
        HDF5 group object for this FeatureGroup.
    feature_metadata : optional
        Metadata specific to this feature; passed to ``BaseGroup`` as
        ``group_metadata``. Should describe the feature and the parameters
        used to compute it.
    **kwargs
        Additional keyword arguments passed to BaseGroup.

    Examples
    --------
    >>> feature = FeatureGroup(h5_group, feature_metadata=metadata)
    >>> run_group = feature.add_feature_run_group('run_1', domain='fc')
    """

    def __init__(
        self,
        group: h5py.Group,
        feature_metadata: Optional[object] = None,
        **kwargs,
    ) -> None:
        super().__init__(group, group_metadata=feature_metadata, **kwargs)

    @staticmethod
    def _run_group_class(domain: str) -> type:
        """Return the run-group class that stores data of the given domain.

        Raises
        ------
        ValueError
            If ``domain`` is in neither FREQUENCY_DOMAIN nor TIME_DOMAIN.
        """
        if domain in FREQUENCY_DOMAIN:
            return FeatureFCRunGroup
        if domain in TIME_DOMAIN:
            return FeatureTSRunGroup
        raise ValueError(
            f"feature_type {domain} not supported. Use either 'fc' "
            "for Fourier Coefficent or 'ts' for time series."
        )

    def add_feature_run_group(
        self,
        feature_name: str,
        feature_run_metadata: Optional[object] = None,
        domain: str = "fc",
    ) -> FeatureFCRunGroup | FeatureTSRunGroup:
        """
        Add a feature run group for a single feature.

        Creates a Fourier Coefficient run group or a time series run group
        depending on the domain. When metadata is supplied, its ``domain``
        attribute takes precedence over the ``domain`` argument.

        Parameters
        ----------
        feature_name : str
            Name for the feature run group.
        feature_run_metadata : optional
            Metadata for the feature run. If provided, the domain is read
            from ``feature_run_metadata.domain``. Default is None.
        domain : str, default='fc'
            Domain type for the data. Must be one of:

            - 'fc', 'frequency', 'fourier', 'fourier_domain': Fourier Coefficients
            - 'ts', 'time', 'time series', 'time_series': Time series

        Returns
        -------
        FeatureFCRunGroup or FeatureTSRunGroup
            The group returned by ``BaseGroup._add_group``.

        Raises
        ------
        ValueError
            If the resolved domain is not recognized.
        AttributeError
            If metadata is provided but has no ``domain`` attribute.

        Examples
        --------
        >>> feature = FeatureGroup(h5_group)
        >>> fc_run = feature.add_feature_run_group('processing_run_1', domain='fc')
        >>> ts_run = feature.add_feature_run_group('ts_analysis', domain='ts')
        """
        if feature_run_metadata is not None:
            try:
                domain = feature_run_metadata.domain
            except AttributeError as error:
                # Chain the original error so the traceback shows which
                # object lacked the attribute.
                raise AttributeError(
                    "Could not find attribute 'domain' in metadata object"
                ) from error

        return self._add_group(
            feature_name,
            self._run_group_class(domain),
            group_metadata=feature_run_metadata,
            match="id",
        )

    def get_feature_run_group(
        self,
        feature_name: str,
        domain: str = "frequency",
    ) -> FeatureFCRunGroup | FeatureTSRunGroup:
        """
        Retrieve a feature run group by name and domain type.

        Parameters
        ----------
        feature_name : str
            Name of the feature run group to retrieve.
        domain : str, default='frequency'
            Domain type. Must be one of:

            - 'fc', 'frequency', 'fourier', 'fourier_domain': Fourier Coefficients
            - 'ts', 'time', 'time series', 'time_series': Time series

        Returns
        -------
        FeatureFCRunGroup or FeatureTSRunGroup
            The group returned by ``BaseGroup._get_group``.

        Raises
        ------
        ValueError
            If domain is not recognized.
        MTH5Error
            Documented upstream as raised when the feature run group does
            not exist (raised inside ``BaseGroup._get_group``).

        Examples
        --------
        >>> feature = FeatureGroup(h5_group)
        >>> fc_run = feature.get_feature_run_group('processing_run_1', domain='fc')
        """
        return self._get_group(feature_name, self._run_group_class(domain))

    def remove_feature_run_group(self, feature_name: str) -> None:
        """
        Remove a feature run group.

        Delegates to ``BaseGroup._remove_group``. Deleting from an HDF5 file
        removes the reference but does not reduce the file size.

        Parameters
        ----------
        feature_name : str
            Name of the feature run group to remove.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the feature run group does
            not exist (raised inside ``BaseGroup._remove_group``).

        Examples
        --------
        >>> feature = FeatureGroup(h5_group)
        >>> feature.remove_feature_run_group('processing_run_1')
        """
        self._remove_group(feature_name)

344 

345 

class FeatureTSRunGroup(BaseGroup):
    """
    Run-level container for time series features.

    Channel handling is not implemented here; every channel operation is
    forwarded to a :class:`RunGroup` built on the same HDF5 group.

    Parameters
    ----------
    group : h5py.Group
        HDF5 group object for this FeatureTSRunGroup.
    feature_run_metadata : optional
        Metadata for the feature run; passed to ``BaseGroup`` as
        ``group_metadata``.
    **kwargs
        Additional keyword arguments passed to BaseGroup.

    Notes
    -----
    A second group object (the internal ``RunGroup``) is instantiated for
    every FeatureTSRunGroup, which may have a performance cost.

    Examples
    --------
    >>> ts_run = FeatureTSRunGroup(h5_group, feature_run_metadata=metadata)
    >>> channel = ts_run.add_feature_channel('Ex', 'electric', data)
    """

    def __init__(
        self,
        group: h5py.Group,
        feature_run_metadata: Optional[object] = None,
        **kwargs,
    ) -> None:
        super().__init__(group, group_metadata=feature_run_metadata, **kwargs)

        # Delegate object for all channel operations.
        # NOTE(review): the keyword ``feature_run_metadata`` is handed to
        # RunGroup as-is; confirm RunGroup expects that name (and not e.g.
        # ``run_metadata``) and that ``**kwargs`` need not be forwarded too.
        self._run_group = RunGroup(group, feature_run_metadata=None)

    def add_feature_channel(
        self,
        channel_name: str,
        channel_type: str,
        data: Optional[np.ndarray] = None,
        channel_dtype: str = "int32",
        shape: Optional[tuple] = None,
        max_shape: tuple = (None,),
        chunks: bool = True,
        channel_metadata: Optional[object] = None,
        **kwargs,
    ) -> object:
        """
        Add a time series channel through the wrapped RunGroup.

        All arguments are forwarded to ``RunGroup.add_channel``.

        Parameters
        ----------
        channel_name : str
            Name for the channel.
        channel_type : str
            Type of channel (e.g., 'electric', 'magnetic').
        data : np.ndarray, optional
            Initial data for the channel. Default is None.
        channel_dtype : str, default='int32'
            Data type for the channel.
        shape : tuple, optional
            Shape of the channel data. Default is None.
        max_shape : tuple, default=(None,)
            Maximum shape for expandable dimensions.
        chunks : bool, default=True
            Whether to use chunking for the dataset.
        channel_metadata : optional
            Metadata object (timeseries.Channel type). Default is None.
        **kwargs
            Extra keyword arguments forwarded to ``RunGroup.add_channel``.

        Returns
        -------
        object
            Whatever ``RunGroup.add_channel`` returns.

        Examples
        --------
        >>> ts_run = FeatureTSRunGroup(h5_group)
        >>> channel = ts_run.add_feature_channel(
        ...     'Ex', 'electric', data=np.arange(1000))
        """
        dataset_options = {
            "channel_dtype": channel_dtype,
            "shape": shape,
            "max_shape": max_shape,
            "chunks": chunks,
            "channel_metadata": channel_metadata,
        }
        return self._run_group.add_channel(
            channel_name,
            channel_type,
            data,
            **dataset_options,
            **kwargs,
        )

    def get_feature_channel(self, channel_name: str) -> object:
        """
        Fetch a channel by name through the wrapped RunGroup.

        Parameters
        ----------
        channel_name : str
            Name of the channel to retrieve.

        Returns
        -------
        object
            Whatever ``RunGroup.get_channel`` returns.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the channel does not exist
            (raised inside ``RunGroup.get_channel``).

        Examples
        --------
        >>> ts_run = FeatureTSRunGroup(h5_group)
        >>> channel = ts_run.get_feature_channel('Ex')
        """
        channel = self._run_group.get_channel(channel_name)
        return channel

    def remove_feature_channel(self, channel_name: str) -> None:
        """
        Delete a channel through the wrapped RunGroup.

        Parameters
        ----------
        channel_name : str
            Name of the channel to remove.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the channel does not exist
            (raised inside ``RunGroup.remove_channel``).

        Examples
        --------
        >>> ts_run = FeatureTSRunGroup(h5_group)
        >>> ts_run.remove_feature_channel('Ex')
        """
        self._run_group.remove_channel(channel_name)

495 

496 

class FeatureFCRunGroup(BaseGroup):
    """
    Run-level container for Fourier Coefficient features.

    The run is organised into decimation levels; each level is a
    :class:`FeatureDecimationGroup` that in turn holds the channels.

    Hierarchy
    ---------
    FeatureFCRunGroup -> FeatureDecimationGroup -> FeatureChannelDataset

    Attributes
    ----------
    metadata : Decimation
        Metadata including:

            - list of decimation levels
            - start time (earliest)
            - end time (latest)
            - method (fft, wavelet, ...)
            - list of channels used
            - starting sample rate
            - bands used
            - type (TS or FC)

    Parameters
    ----------
    group : h5py.Group
        HDF5 group object for this FeatureFCRunGroup.
    feature_run_metadata : optional
        Decimation metadata for the feature run; passed to ``BaseGroup`` as
        ``group_metadata``. Default is None.
    **kwargs
        Additional keyword arguments passed to BaseGroup.

    Examples
    --------
    >>> fc_run = FeatureFCRunGroup(h5_group, feature_run_metadata=metadata)
    >>> decimation = fc_run.add_decimation_level('level_0', dec_metadata)
    """

    def __init__(
        self,
        group: h5py.Group,
        feature_run_metadata: Optional[Decimation] = None,
        **kwargs,
    ) -> None:
        super().__init__(group, group_metadata=feature_run_metadata, **kwargs)

    @BaseGroup.metadata.getter
    def metadata(self) -> Decimation:
        """Return the run metadata with its HDF5 reference refreshed."""

        # Aggregating the decimation-level metadata is currently disabled:
        # self._metadata.channels = []
        # for dl in self.groups_list:
        #     dl_group = self.get_decimation_level(dl)
        #     self._metadata.levels.append(dl_group.metadata)
        run_metadata = self._metadata
        run_metadata.hdf5_reference = self.hdf5_group.ref
        return run_metadata

    @property
    def decimation_level_summary(self) -> pd.DataFrame:
        """
        Tabulate the decimation levels stored in this run.

        Every member of the HDF5 group whose ``mth5_type`` attribute equals
        ``"FeatureDecimation"`` contributes one row. Members missing any of
        the required attributes are skipped with a debug log message.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns:

            - name : str
                Value of the member's ``decimation_level`` attribute
            - start : datetime64[ns]
                ``time_period.start`` with any ``+`` UTC-offset suffix removed
            - end : datetime64[ns]
                ``time_period.end`` with any ``+`` UTC-offset suffix removed
            - hdf5_reference : h5py.ref_dtype
                HDF5 reference to the decimation level group

        Examples
        --------
        >>> fc_run = FeatureFCRunGroup(h5_group)
        >>> summary = fc_run.decimation_level_summary
        >>> print(summary[['name', 'start', 'end']])
        """
        summary_dtype = np.dtype(
            [
                ("name", "U20"),
                ("start", "datetime64[ns]"),
                ("end", "datetime64[ns]"),
                ("hdf5_reference", h5py.ref_dtype),
            ]
        )

        rows = []
        for _key, member in self.hdf5_group.items():
            try:
                if member.attrs["mth5_type"] not in ["FeatureDecimation"]:
                    continue
                rows.append(
                    (
                        member.attrs["decimation_level"],
                        # keep only the part before the UTC offset
                        member.attrs["time_period.start"].split("+")[0],
                        member.attrs["time_period.end"].split("+")[0],
                        member.ref,
                    )
                )
            except KeyError as error:
                self.logger.debug(f"Could not find key: {error}")

        return pd.DataFrame(np.array(rows, dtype=summary_dtype))

    def add_decimation_level(
        self,
        decimation_level_name: str,
        feature_decimation_level_metadata: Optional[object] = None,
    ) -> FeatureDecimationGroup:
        """
        Create a :class:`FeatureDecimationGroup` inside this run.

        Delegates to ``BaseGroup._add_group`` with ``match="id"``.

        Parameters
        ----------
        decimation_level_name : str
            Name for the decimation level.
        feature_decimation_level_metadata : optional
            Metadata for the decimation level; passed on as
            ``group_metadata``. Default is None.

        Returns
        -------
        FeatureDecimationGroup
            Whatever ``_add_group`` returns for the requested level.

        Examples
        --------
        >>> fc_run = FeatureFCRunGroup(h5_group)
        >>> decimation = fc_run.add_decimation_level('level_0', dec_metadata)
        >>> print(decimation.name)
        'level_0'
        """
        new_level = self._add_group(
            decimation_level_name,
            FeatureDecimationGroup,
            group_metadata=feature_decimation_level_metadata,
            match="id",
        )
        return new_level

    def get_decimation_level(
        self, decimation_level_name: str
    ) -> FeatureDecimationGroup:
        """
        Fetch a decimation level group by name.

        Parameters
        ----------
        decimation_level_name : str
            Name of the decimation level to retrieve.

        Returns
        -------
        FeatureDecimationGroup
            The group returned by ``BaseGroup._get_group``.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the decimation level does
            not exist (raised inside ``BaseGroup._get_group``).

        Examples
        --------
        >>> fc_run = FeatureFCRunGroup(h5_group)
        >>> decimation = fc_run.get_decimation_level('level_0')
        """
        level = self._get_group(decimation_level_name, FeatureDecimationGroup)
        return level

    def remove_decimation_level(self, decimation_level_name: str) -> None:
        """
        Delete a decimation level from this run.

        Parameters
        ----------
        decimation_level_name : str
            Name of the decimation level to remove.

        Raises
        ------
        MTH5Error
            Documented upstream as raised when the decimation level does
            not exist (raised inside ``BaseGroup._remove_group``).

        Examples
        --------
        >>> fc_run = FeatureFCRunGroup(h5_group)
        >>> fc_run.remove_decimation_level('level_0')
        """
        self._remove_group(decimation_level_name)

    def update_metadata(self) -> None:
        """
        Refresh the run's time period from its decimation levels and save it.

        The run start becomes the earliest level start and the run end the
        latest level end. When the run holds no decimation levels the time
        period is left untouched. The metadata is then written.

        Examples
        --------
        >>> fc_run = FeatureFCRunGroup(h5_group)
        >>> fc_run.update_metadata()
        """
        levels = self.decimation_level_summary.copy()
        if not levels.empty:
            earliest = levels.start.min().isoformat()
            self._metadata.time_period.start = earliest
            latest = levels.end.max().isoformat()
            self._metadata.time_period.end = latest
        self.write_metadata()

720 

721 # def supports_aurora_processing_config( 

722 # self, processing_config, remote 

723 # ) -> bool: 

724 # """ 

725 

726 # An "all-or-nothing" check: Return True if every (valid) decimation needed to satisfy the processing_config 

727 # is available in the FCGroup (self) otherwise return False (and we will build all FCs). 

728 

729 # Logic: 

730 # 1. Get a list of all fc groups in the FCGroup (self) 

731 # 2. Loop the processing_config decimations, checking if there is a corresponding, already built FCDecimation 

732 # in the FCGroup. 

733 

734 # Parameters 

735 # ---------- 

736 # processing_config: aurora.config.metadata.processing.Processing 

737 # remote: bool 

738 

739 # Returns 

740 # ------- 

741 

742 # """ 

743 # pre_existing_fc_decimation_ids_to_check = self.groups_list 

744 # levels_present = np.full(processing_config.num_decimation_levels, False) 

745 # for i, dec_level in enumerate(processing_config.decimations): 

746 

747 # # Quit checking if dec_level wasn't there 

748 # if i > 0: 

749 # if not levels_present[i - 1]: 

750 # return False 

751 

752 # # iterate over existing decimations 

753 # for fc_decimation_id in pre_existing_fc_decimation_ids_to_check: 

754 # fc_dec_group = self.get_decimation_level(fc_decimation_id) 

755 # fc_decimation = fc_dec_group.metadata 

756 # levels_present[i] = fc_decimation.has_fcs_for_aurora_processing( 

757 # dec_level, remote 

758 # ) 

759 

760 # if levels_present[i]: 

761 # pre_existing_fc_decimation_ids_to_check.remove( 

762 # fc_decimation_id 

763 # ) # no need to check this one again 

764 # break # break inner for-loop over decimations 

765 

766 # return levels_present.all() 

767 

768 

769class FeatureDecimationGroup(BaseGroup): 

770 """ 

771 Container for a single decimation level with multiple Fourier Coefficient channels. 

772 

773 This class manages Fourier Coefficient data organized by frequency, time, and channel. 

774 Data is assumed to be uniformly sampled in both frequency and time domains. 

775 

776 Hierarchy 

777 --------- 

778 FeatureDecimationGroup -> FeatureChannelDataset (multiple channels) 

779 

780 Data Assumptions 

781 ---------------- 

782 1. Data are uniformly sampled in frequency domain 

783 2. Data are uniformly sampled in time domain 

784 3. FFT moving window has uniform step size 

785 

786 Attributes 

787 ---------- 

788 start time : datetime 

789 Start time of the decimation level 

790 end time : datetime 

791 End time of the decimation level 

792 channels : list 

793 List of channel names in this decimation level 

794 decimation_factor : int 

795 Factor by which data was decimated 

796 decimation_level : int 

797 Level index in decimation hierarchy 

798 decimation_sample_rate : float 

799 Sample rate after decimation (Hz) 

800 method : str 

801 Method used (FFT, wavelet, etc.) 

802 anti_alias_filter : optional 

803 Anti-aliasing filter used 

804 prewhitening_type : optional 

805 Type of prewhitening applied 

806 harmonics_kept : list or 'all' 

807 Harmonic indices kept in the data 

808 window : dict 

809 Window parameters (length, overlap, type, sample rate) 

810 bands : list 

811 Frequency bands in the data 

812 

813 Parameters 

814 ---------- 

815 group : h5py.Group 

816 HDF5 group object for this FeatureDecimationGroup. 

817 decimation_level_metadata : optional 

818 Metadata for the decimation level. Default is None. 

819 **kwargs 

820 Additional keyword arguments passed to BaseGroup. 

821 

822 Examples 

823 -------- 

824 >>> decimation = FeatureDecimationGroup(h5_group, metadata) 

825 >>> channel = decimation.add_channel('Ex', fc_data=fc_array, fc_metadata=ch_metadata) 

826 """ 

827 

828 def __init__( 

829 self, 

830 group: h5py.Group, 

831 decimation_level_metadata: Optional[object] = None, 

832 **kwargs, 

833 ) -> None: 

834 super().__init__(group, group_metadata=decimation_level_metadata, **kwargs) 

835 

836 @BaseGroup.metadata.getter 

837 def metadata(self): 

838 """Overwrite get metadata to include channel information in the runs""" 

839 

840 self._metadata.channels = [] 

841 for ch in self.groups_list: 

842 ch_group = self.get_channel(ch) 

843 self._metadata.channels.append(ch_group.metadata) 

844 self._metadata.hdf5_reference = self.hdf5_group.ref 

845 return self._metadata 

846 

847 @property 

848 def channel_summary(self) -> pd.DataFrame: 

849 """ 

850 Get a summary of all channels in this decimation level. 

851 

852 Returns a pandas DataFrame with detailed information about each Fourier 

853 Coefficient channel including time ranges, dimensions, and sampling rates. 

854 

855 Returns 

856 ------- 

857 pd.DataFrame 

858 DataFrame with columns: 

859 

860 - name : str 

861 Channel name 

862 - start : datetime64[ns] 

863 Start time of the channel data 

864 - end : datetime64[ns] 

865 End time of the channel data 

866 - n_frequency : int64 

867 Number of frequency bins 

868 - n_windows : int64 

869 Number of time windows 

870 - sample_rate_decimation_level : float64 

871 Decimation level sample rate (Hz) 

872 - sample_rate_window_step : float64 

873 Sample rate of window stepping (Hz) 

874 - units : str 

875 Physical units of the data 

876 - hdf5_reference : h5py.ref_dtype 

877 HDF5 reference to the channel dataset 

878 

879 Examples 

880 -------- 

881 >>> decimation = FeatureDecimationGroup(h5_group) 

882 >>> summary = decimation.channel_summary 

883 >>> print(summary[['name', 'n_frequency', 'n_windows']]) 

884 """ 

885 

886 ch_list = [] 

887 for key, group in self.hdf5_group.items(): 

888 try: 

889 ch_type = group.attrs["mth5_type"] 

890 if ch_type in ["FCChannel"]: 

891 ch_list.append( 

892 ( 

893 group.attrs["name"], 

894 group.attrs["time_period.start"].split("+")[0], 

895 group.attrs["time_period.end"].split("+")[0], 

896 group.shape[0], 

897 group.shape[1], 

898 group.attrs["sample_rate_decimation_level"], 

899 group.attrs["sample_rate_window_step"], 

900 group.attrs["units"], 

901 group.ref, 

902 ) 

903 ) 

904 except KeyError as error: 

905 self.logger.debug(f"Cannot find a key: {error}") 

906 ch_summary = np.array( 

907 ch_list, 

908 dtype=np.dtype( 

909 [ 

910 ("name", "U20"), 

911 ("start", "datetime64[ns]"), 

912 ("end", "datetime64[ns]"), 

913 ("n_frequency", np.int64), 

914 ("n_windows", np.int64), 

915 ("sample_rate_decimation_level", np.float64), 

916 ("sample_rate_window_step", np.float64), 

917 ("units", "U25"), 

918 ("hdf5_reference", h5py.ref_dtype), 

919 ] 

920 ), 

921 ) 

922 

923 return pd.DataFrame(ch_summary) 

924 

925 def from_dataframe( 

926 self, 

927 df: pd.DataFrame, 

928 channel_key: str, 

929 time_key: str = "time", 

930 frequency_key: str = "frequency", 

931 ) -> None: 

932 """ 

933 Load Fourier Coefficient data from a pandas DataFrame. 

934 

935 Assumes the channel_key column contains complex coefficient values 

936 organized with time and frequency dimensions. 

937 

938 Parameters 

939 ---------- 

940 df : pd.DataFrame 

941 Input DataFrame containing the coefficient data. 

942 channel_key : str 

943 Name of the column containing coefficient values. 

944 time_key : str, default='time' 

945 Name of the time coordinate column. 

946 frequency_key : str, default='frequency' 

947 Name of the frequency coordinate column. 

948 

949 Raises 

950 ------ 

951 TypeError 

952 If df is not a pandas DataFrame. 

953 

954 Examples 

955 -------- 

956 >>> decimation = FeatureDecimationGroup(h5_group) 

957 >>> decimation.from_dataframe(df, channel_key='Ex', time_key='time') 

958 """ 

959 

960 if not isinstance(df, pd.DataFrame): 

961 msg = f"Must input a pandas dataframe not {type(df)}" 

962 self.logger.error(msg) 

963 raise TypeError(msg) 

964 for col in df.columns: 

965 df[col] = np.complex128(df[col]) 

966 xrds = df[col].to_xarray() 

967 self.add_channel(col, fc_data=xrds.to_numpy()) 

968 

969 def from_xarray( 

970 self, 

971 data_array: xr.DataArray | xr.Dataset, 

972 sample_rate_decimation_level: float, 

973 ) -> None: 

974 """ 

975 Load Fourier Coefficient data from an xarray DataArray or Dataset. 

976 

977 Automatically extracts metadata (time, frequency, units) from the xarray 

978 object and creates appropriate FeatureChannelDataset instances for each 

979 variable or the single DataArray. 

980 

981 Parameters 

982 ---------- 

983 data_array : xr.DataArray or xr.Dataset 

984 Input xarray object with 'time' and 'frequency' coordinates and 

985 dimensions ['time', 'frequency'] (or transposed variant). 

986 sample_rate_decimation_level : float 

987 Sample rate of the decimation level (Hz). 

988 

989 Raises 

990 ------ 

991 TypeError 

992 If data_array is not an xarray Dataset or DataArray. 

993 

994 Notes 

995 ----- 

996 Automatically handles both (time, frequency) and (frequency, time) dimension ordering. 

997 Units are extracted from xarray attributes if available. 

998 

999 Examples 

1000 -------- 

1001 >>> import xarray as xr 

1002 >>> import numpy as np 

1003 >>> decimation = FeatureDecimationGroup(h5_group) 

1004 

1005 Create sample xarray data: 

1006 

1007 >>> times = np.arange('2023-01-01', '2023-01-02', dtype='datetime64[s]') 

1008 >>> freqs = np.linspace(0.01, 100, 256) 

1009 >>> data_array = np.random.randn(len(times), len(freqs)) + \\ 

1010 ... 1j * np.random.randn(len(times), len(freqs)) 

1011 >>> xr_data = xr.DataArray( 

1012 ... data_array, 

1013 ... dims=['time', 'frequency'], 

1014 ... coords={'time': times, 'frequency': freqs}, 

1015 ... name='Ex', 

1016 ... attrs={'units': 'mV/km'} 

1017 ... ) 

1018 

1019 Load into decimation group: 

1020 

1021 >>> decimation.from_xarray(xr_data, sample_rate_decimation_level=0.5) 

1022 """ 

1023 

1024 if not isinstance(data_array, (xr.Dataset, xr.DataArray)): 

1025 msg = f"Must input a xarray Dataset or DataArray not {type(data_array)}" 

1026 self.logger.error(msg) 

1027 raise TypeError(msg) 

1028 ch_metadata = FeatureDecimationChannel() 

1029 ch_metadata.time_period.start = data_array.time[0].values 

1030 ch_metadata.time_period.end = data_array.time[-1].values 

1031 ch_metadata.sample_rate_decimation_level = sample_rate_decimation_level 

1032 ch_metadata.frequency_min = data_array.coords["frequency"].data.min() 

1033 ch_metadata.frequency_max = data_array.coords["frequency"].data.max() 

1034 step_size = ( 

1035 data_array.coords["time"].data[1] - data_array.coords["time"].data[0] 

1036 ) 

1037 ch_metadata.sample_rate_window_step = step_size / np.timedelta64(1, "s") 

1038 try: 

1039 ch_metadata.units = data_array.units 

1040 except AttributeError: 

1041 self.logger.debug("Could not find 'units' in xarray") 

1042 if isinstance(data_array, xr.DataArray): 

1043 self.add_channel( 

1044 data_array.name, 

1045 fc_data=data_array.to_numpy(), 

1046 fc_metadata=ch_metadata, 

1047 ) 

1048 else: 

1049 for ch in data_array.data_vars.keys(): 

1050 ch_metadata.name = ch 

1051 if ch in self.channel_summary.name.to_list(): 

1052 self.remove_channel(ch) 

1053 # time index should be the first index 

1054 if data_array[ch].time.size == data_array[ch].shape[0]: 

1055 self.add_channel( 

1056 ch, 

1057 fc_data=data_array[ch].to_numpy(), 

1058 fc_metadata=ch_metadata, 

1059 dtype=data_array[ch].dtype, 

1060 ) 

1061 elif data_array[ch].time.size == data_array[ch].shape[1]: 

1062 self.add_channel( 

1063 ch, 

1064 fc_data=data_array[ch].to_numpy().T, 

1065 fc_metadata=ch_metadata, 

1066 dtype=data_array[ch].dtype, 

1067 ) 

1068 return 

1069 

def to_xarray(self, channels: Optional[list] = None) -> xr.Dataset:
    """
    Build an xarray Dataset from channels stored in this decimation level.

    Every requested channel is fetched with ``get_channel`` and converted
    with that channel's own ``to_xarray``; the results are gathered into a
    single Dataset keyed by channel name.

    Parameters
    ----------
    channels : list, optional
        Channel names to include. When None (the default), every name in
        ``self.groups_list`` is used.

    Returns
    -------
    xr.Dataset
        Dataset with one data variable per requested channel.

    Examples
    --------
    >>> decimation = FeatureDecimationGroup(h5_group)
    >>> xr_data = decimation.to_xarray()

    Restrict the output to specific channels:

    >>> subset = decimation.to_xarray(channels=['Ex', 'Ey'])
    """
    names = self.groups_list if channels is None else channels
    return xr.Dataset({name: self.get_channel(name).to_xarray() for name in names})

1110 

1111 def from_numpy_array( 

1112 self, 

1113 nd_array: np.ndarray, 

1114 ch_name: str | list, 

1115 ) -> None: 

1116 """ 

1117 Load Fourier Coefficient data from a numpy array. 

1118 

1119 Assumes array shape is either (n_frequencies, n_windows) for a single 

1120 channel or (n_channels, n_frequencies, n_windows) for multiple channels. 

1121 

1122 Parameters 

1123 ---------- 

1124 nd_array : np.ndarray 

1125 Input numpy array containing coefficient data. 

1126 ch_name : str or list 

1127 Channel name (for 2D array) or list of channel names 

1128 (for 3D array). 

1129 

1130 Raises 

1131 ------ 

1132 TypeError 

1133 If nd_array is not a numpy ndarray. 

1134 ValueError 

1135 If array shape is not (n_frequencies, n_windows) or 

1136 (n_channels, n_frequencies, n_windows). 

1137 

1138 Examples 

1139 -------- 

1140 >>> decimation = FeatureDecimationGroup(h5_group) 

1141 

1142 Load single channel: 

1143 

1144 >>> data_2d = np.random.randn(256, 100) + 1j * np.random.randn(256, 100) 

1145 >>> decimation.from_numpy_array(data_2d, ch_name='Ex') 

1146 

1147 Load multiple channels: 

1148 

1149 >>> data_3d = np.random.randn(2, 256, 100) + 1j * np.random.randn(2, 256, 100) 

1150 >>> decimation.from_numpy_array(data_3d, ch_name=['Ex', 'Ey']) 

1151 """ 

1152 

1153 if not isinstance(nd_array, np.ndarray): 

1154 msg = f"Must input a numpy ndarray not {type(nd_array)}" 

1155 self.logger.error(msg) 

1156 raise TypeError(msg) 

1157 if len(nd_array.shape) == 3: 

1158 for index, ch in zip(nd_array.shape[0], ch_name): 

1159 self.add_channel(ch, fc_data=nd_array[index]) 

1160 elif len(nd_array.shape) == 2: 

1161 self.add_channel(ch_name, fc_data=nd_array) 

1162 else: 

1163 raise ValueError( 

1164 "input array must be shaped (n_frequencies, n_windows) or " 

1165 "(n_channels, n_frequencies, n_windows)" 

1166 ) 

1167 

def add_channel(
    self,
    fc_name: str,
    fc_data: Optional[np.ndarray | xr.DataArray | xr.Dataset | pd.DataFrame] = None,
    fc_metadata: Optional[FeatureDecimationChannel] = None,
    max_shape: tuple = (None, None),
    chunks: bool = True,
    dtype: type = complex,
    **kwargs,
) -> FeatureChannelDataset:
    """
    Add a Fourier Coefficient channel to the decimation level.

    Creates an HDF5 dataset named after the validated ``fc_name`` and wraps
    it in a FeatureChannelDataset. If creation fails with OSError,
    RuntimeError or ValueError, the failure is logged and the dataset
    already stored under that name is returned instead.

    Parameters
    ----------
    fc_name : str
        Name for the Fourier Coefficient channel; passed through
        ``validate_name`` before use.
    fc_data : np.ndarray, xr.DataArray, xr.Dataset, pd.DataFrame, optional
        Data handed to ``create_dataset`` as given. When None, a (1, 1)
        zero array of ``dtype`` is written as a placeholder.
    fc_metadata : FeatureDecimationChannel, optional
        Metadata for the channel. When None, a FeatureDecimationChannel
        named ``fc_name`` is created.
    max_shape : tuple, default=(None, None)
        ``maxshape`` passed to HDF5 dataset creation.
    chunks : bool, default=True
        ``chunks`` passed to HDF5 dataset creation. Forced to True when
        ``fc_data`` is None.
    dtype : type, default=complex
        Data type for the dataset.
    **kwargs
        Accepted for interface compatibility; currently not forwarded to
        dataset creation.

    Returns
    -------
    FeatureChannelDataset
        The newly created channel, or the existing one when creation
        failed.

    Raises
    ------
    TypeError
        If fc_data is not None and not one of the supported types.
    MTH5Error
        Propagated from ``get_channel`` when creation failed and no
        dataset named ``fc_name`` can be opened.

    Examples
    --------
    >>> decimation = FeatureDecimationGroup(h5_group)
    >>> metadata = FeatureDecimationChannel(name='Ex')

    Create from numpy array:

    >>> fc_data = np.random.randn(100, 256) + 1j * np.random.randn(100, 256)
    >>> channel = decimation.add_channel('Ex', fc_data=fc_data, fc_metadata=metadata)

    Create empty channel (expandable):

    >>> channel = decimation.add_channel('Ex', fc_metadata=metadata)
    """
    fc_name = validate_name(fc_name)

    if fc_metadata is None:
        fc_metadata = FeatureDecimationChannel(name=fc_name)

    if fc_data is None:
        # Placeholder dataset; chunking is forced on for this case.
        chunks = True
        fc_data = np.zeros((1, 1), dtype=dtype)
    elif not isinstance(
        fc_data, (np.ndarray, xr.DataArray, xr.Dataset, pd.DataFrame)
    ):
        msg = (
            "Need to input a numpy.array, xarray.DataArray, "
            f"xr.Dataset, pd.DataFrame not {type(fc_data)}"
        )
        # No exception is active here, so log at error level rather than
        # via logger.exception (which expects to run inside a handler).
        self.logger.error(msg)
        raise TypeError(msg)

    try:
        dataset = self.hdf5_group.create_dataset(
            fc_name,
            data=fc_data,
            dtype=dtype,
            chunks=chunks,
            maxshape=max_shape,
            **self.dataset_options,
        )
        fc_dataset = FeatureChannelDataset(dataset, dataset_metadata=fc_metadata)
    except (OSError, RuntimeError, ValueError) as error:
        self.logger.error(error)
        msg = f"estimate {fc_name} already exists, returning existing group."
        self.logger.debug(msg)

        # Look up the name the dataset was created under; the metadata
        # name may be unset or differ from it.
        fc_dataset = self.get_channel(fc_name)
    return fc_dataset

1275 

def get_channel(self, fc_name: str) -> FeatureChannelDataset:
    """
    Open an existing Fourier Coefficient channel by name.

    The name is passed through ``validate_name``, the dataset is read from
    ``self.hdf5_group``, and its HDF5 attributes are used to build the
    channel metadata.

    Parameters
    ----------
    fc_name : str
        Name of the channel to retrieve.

    Returns
    -------
    FeatureChannelDataset
        Wrapper around the stored dataset and its metadata.

    Raises
    ------
    MTH5Error
        If the lookup raises KeyError (channel not found) or OSError.

    Examples
    --------
    >>> decimation = FeatureDecimationGroup(h5_group)
    >>> channel = decimation.get_channel('Ex')
    >>> data = channel.to_numpy()
    """
    fc_name = validate_name(fc_name)

    try:
        h5_dataset = self.hdf5_group[fc_name]
        channel_metadata = FeatureDecimationChannel(**dict(h5_dataset.attrs))
        channel = FeatureChannelDataset(
            h5_dataset, dataset_metadata=channel_metadata
        )
    except OSError as error:
        self.logger.error(error)
        raise MTH5Error(error)
    except KeyError:
        msg = f"{fc_name} does not exist, check groups_list for existing names"
        self.logger.error(msg)
        raise MTH5Error(msg)
    return channel

1314 

def remove_channel(self, fc_name: str) -> None:
    """
    Remove a Fourier Coefficient channel from the decimation level.

    The channel is looked up under the validated name exactly as given,
    the same way ``add_channel`` and ``get_channel`` resolve names. If no
    such entry exists, the lower-cased name is tried, which was the only
    name this method used to look for.

    Parameters
    ----------
    fc_name : str
        Name of the channel to remove.

    Raises
    ------
    MTH5Error
        If the channel does not exist under either name.

    Notes
    -----
    Deleting removes the reference but does not shrink the HDF5 file.
    To reduce file size, copy the desired data to a new file.

    Examples
    --------
    >>> decimation = FeatureDecimationGroup(h5_group)
    >>> decimation.remove_channel('Ex')
    """
    exact_name = validate_name(fc_name)
    legacy_name = validate_name(fc_name.lower())
    # Prefer the exact name; fall back to the lower-cased form so existing
    # callers that relied on it keep working.
    target = exact_name if exact_name in self.hdf5_group else legacy_name

    try:
        del self.hdf5_group[target]
    except KeyError:
        msg = f"{target} does not exist, check groups_list for existing names"
        self.logger.error(msg)
        raise MTH5Error(msg)
    else:
        self.logger.info(
            "Deleting a estimate does not reduce the HDF5 "
            "file size it simply remove the reference. If "
            "file size reduction is your goal, simply copy"
            " what you want into another file."
        )

1355 

1356 def update_metadata(self) -> None: 

1357 """ 

1358 Refresh this decimation level's metadata from ``self.channel_summary``. 

1359 

1360 When the summary is non-empty, the time period is set to the earliest ``start`` and the 

1361 latest ``end``, and each sample-rate field takes the first value returned by ``unique()``. 

1362 

1363 Examples 

1364 -------- 

1365 >>> decimation = FeatureDecimationGroup(h5_group) 

1366 >>> decimation.update_metadata() 

1367 """ 

1368 channel_summary = self.channel_summary.copy() 

1369 

1370 if not channel_summary.empty: 

1371 self._metadata.time_period.start = channel_summary.start.min().isoformat() 

1372 self._metadata.time_period.end = channel_summary.end.max().isoformat() 

1373 self._metadata.sample_rate_decimation_level = ( 

1374 channel_summary.sample_rate_decimation_level.unique()[0] 

1375 ) 

1376 self._metadata.sample_rate_window_step = ( 

1377 channel_summary.sample_rate_window_step.unique()[0] 

1378 ) 

# NOTE(review): indentation is lost in this listing; confirm whether write_metadata() runs only when the summary is non-empty.
1379 self.write_metadata() 

1380 

1381 def add_weights( 

1382 self, 

1383 weight_name: str, 

1384 weight_data: Optional[np.ndarray] = None, 

1385 weight_metadata: Optional[object] = None, 

1386 max_shape: tuple = (None, None, None), 

1387 chunks: bool = True, 

1388 **kwargs, 

1389 ) -> None: 

1390 """ 

1391 Add weight or masking data for Fourier Coefficients. 

1392 

1393 Creates a dataset to store weights or masks for quality control, 

1394 frequency band selection, or time window filtering. 

1395 

1396 Parameters 

1397 ---------- 

1398 weight_name : str 

1399 Name for the weight dataset. 

1400 weight_data : np.ndarray, optional 

1401 Weight values. Default is None. 

1402 weight_metadata : optional 

1403 Metadata for the weight dataset. Default is None. 

1404 max_shape : tuple, default=(None, None, None) 

1405 Maximum shape for expandable dimensions. 

1406 chunks : bool, default=True 

1407 Whether to use HDF5 chunking. 

1408 **kwargs 

1409 Additional keyword arguments for HDF5 dataset creation. 

1410 

1411 Notes 

1412 ----- 

1413 Weight datasets can track: 

1414 

1415 - weight_channel: Per-channel weights 

1416 - weight_band: Per-frequency-band weights 

1417 - weight_time: Per-time-window weights 

1418 

1419 This method is a placeholder for future implementation. 

1420 

1421 Examples 

1422 -------- 

1423 >>> decimation = FeatureDecimationGroup(h5_group) 

1424 >>> decimation.add_weights('coherency_weights', weight_data=weights) 

1425 """