Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ groups \ feature_dataset.py: 47%

110 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-27 20:09 -0800

1# -*- coding: utf-8 -*- 

2""" 

3Created on Thu Mar 10 09:02:16 2022 

4 

5@author: jpeacock 

6""" 

7 

8# ============================================================================= 

9# Imports 

10# ============================================================================= 

11import weakref 

12from typing import Optional 

13 

14import h5py 

15import numpy as np 

16import xarray as xr 

17from loguru import logger 

18from mt_metadata.features import FeatureDecimationChannel 

19 

20from mth5.helpers import add_attributes_to_metadata_class_pydantic, to_numpy_type 

21from mth5.timeseries.ts_helpers import make_dt_coordinates 

22from mth5.utils.exceptions import MTH5Error 

23 

24 

25# ============================================================================= 

26 

27 

28class FeatureChannelDataset: 

29 """ 

30 Container for multi-dimensional Fourier Coefficients organized by time and frequency. 

31 

32 This class manages Fourier Coefficient data with frequency band organization, 

33 similar to FCDataset but with enhanced band tracking capabilities. The data array 

34 is organized with the following assumptions: 

35 

36 1. Data are grouped into frequency bands 

37 2. Data are uniformly sampled in time (uniform FFT moving window step size) 

38 

39 The dataset tracks temporal evolution of frequency content across multiple windows, 

40 making it suitable for time-frequency analysis of geophysical signals. 

41 

42 Parameters 

43 ---------- 

44 dataset : h5py.Dataset 

45 HDF5 dataset containing the Fourier coefficient data. 

46 dataset_metadata : FeatureDecimationChannel, optional 

47 Metadata for the dataset. See :class:`mt_metadata.features.FeatureDecimationChannel`. 

48 If provided, must be of the same type as the internal metadata class. 

49 Default is None. 

50 **kwargs 

51 Additional keyword arguments for future extensibility. 

52 

53 Attributes 

54 ---------- 

55 hdf5_dataset : h5py.Dataset 

56 Reference to the HDF5 dataset. 

57 metadata : FeatureDecimationChannel 

58 Metadata container with the following attributes: 

59 

60 - name : str 

61 Dataset name 

62 - time_period.start : datetime 

63 Start time of the data acquisition 

64 - time_period.end : datetime 

65 End time of the data acquisition 

66 - sample_rate_window_step : float 

67 Sample rate of the time window stepping (Hz) 

68 - frequency_min : float 

69 Minimum frequency in the band (Hz) 

70 - frequency_max : float 

71 Maximum frequency in the band (Hz) 

72 - units : str 

73 Physical units of the coefficient data 

74 - component : str 

75 Component identifier (e.g., 'Ex', 'Hy') 

76 - sample_rate_decimation_level : int 

77 Decimation level applied to acquire this data 

78 

79 Raises 

80 ------ 

81 MTH5Error 

82 If dataset_metadata type does not match the expected FeatureDecimationChannel type. 

83 

84 Examples 

85 -------- 

86 >>> import h5py 

87 >>> from mt_metadata.features import FeatureDecimationChannel 

88 >>> from mth5.groups.feature_dataset import FeatureChannelDataset 

89 

90 Create a feature dataset from an HDF5 group: 

91 

92 >>> with h5py.File('data.h5', 'r') as f: 

93 ... h5_dataset = f['feature_group']['Ex'] 

94 ... feature = FeatureChannelDataset(h5_dataset) 

95 ... print(f"Time windows: {feature.n_windows}") 

96 ... print(f"Frequencies: {feature.n_frequencies}") 

97 

98 Access time and frequency arrays: 

99 

100 >>> time_array = feature.time 

101 >>> freq_array = feature.frequency 

102 >>> data_array = feature.to_numpy() 

103 """ 

104 

105 def __init__( 

106 self, 

107 dataset: h5py.Dataset, 

108 dataset_metadata: Optional[FeatureDecimationChannel] = None, 

109 **kwargs, 

110 ) -> None: 

111 if dataset is not None and isinstance(dataset, (h5py.Dataset)): 

112 self.hdf5_dataset = weakref.ref(dataset)() 

113 self.logger = logger 

114 

115 # set metadata to the appropriate class. Standards is not a 

116 # Base object so should be skipped. If the class name is not 

117 # defined yet set to Base class. 

118 self.metadata = add_attributes_to_metadata_class_pydantic( 

119 FeatureDecimationChannel 

120 ) 

121 self.metadata.hdf5_reference = self.hdf5_dataset.ref 

122 self.metadata.mth5_type = self._class_name 

123 # if the input data set already has filled attributes, namely if the 

124 # channel data already exists then read them in with our writing back 

125 if "mth5_type" in list(self.hdf5_dataset.attrs.keys()): 

126 self.metadata.from_dict( 

127 {self.hdf5_dataset.attrs["mth5_type"]: dict(self.hdf5_dataset.attrs)} 

128 ) 

129 # if metadata is input, make sure that its the same class type amd write 

130 # to the hdf5 dataset 

131 if dataset_metadata is not None: 

132 if not isinstance(self.metadata, type(dataset_metadata)): 

133 msg = ( 

134 f"metadata must be type metadata.{self._class_name} not " 

135 "{type(dataset_metadata)}" 

136 ) 

137 self.logger.error(msg) 

138 raise MTH5Error(msg) 

139 # load from dict because of the extra attributes for MTH5 

140 self.metadata.from_dict(dataset_metadata.to_dict()) 

141 self.metadata.hdf5_reference = self.hdf5_dataset.ref 

142 self.metadata.mth5_type = self._class_name 

143 

144 # write out metadata to make sure that its in the file. 

145 try: 

146 self.write_metadata() 

147 except (RuntimeError, KeyError, OSError): 

148 # file is read only 

149 pass 

150 

151 # if the attrs don't have the proper metadata keys yet write them 

152 if not "mth5_type" in list(self.hdf5_dataset.attrs.keys()): 

153 self.write_metadata() 

154 

155 def __str__(self) -> str: 

156 """ 

157 String representation of the FeatureChannelDataset. 

158 

159 Returns 

160 ------- 

161 str 

162 JSON representation of the metadata. 

163 """ 

164 return self.metadata.to_json() 

165 

166 def __repr__(self) -> str: 

167 """ 

168 Official string representation of the FeatureChannelDataset. 

169 

170 Returns 

171 ------- 

172 str 

173 JSON representation of the metadata. 

174 """ 

175 return self.__str__() 

176 

177 @property 

178 def _class_name(self) -> str: 

179 """ 

180 Extract the class name prefix by removing 'Dataset' suffix. 

181 

182 Returns 

183 ------- 

184 str 

185 Class name without the 'Dataset' suffix. 

186 """ 

187 return self.__class__.__name__.split("Dataset")[0] 

188 

189 def read_metadata(self) -> None: 

190 """ 

191 Read metadata from the HDF5 file into the metadata container. 

192 

193 This method loads all attributes from the HDF5 dataset into the 

194 metadata container, enabling validation and type checking. 

195 

196 Examples 

197 -------- 

198 >>> feature.read_metadata() 

199 >>> print(feature.metadata.component) 

200 'Ex' 

201 """ 

202 

203 meta_dict = read_attrs_to_dict(dict(self.hdf5_dataset.attrs), self.metadata) 

204 # Defensive check: skip if meta_dict is empty 

205 if not meta_dict: 

206 self.logger.debug( 

207 f"No metadata found for {self._class_name}, skipping from_dict." 

208 ) 

209 return 

210 self.metadata.from_dict({self._class_name: meta_dict}) 

211 self._has_read_metadata = True 

212 

213 def write_metadata(self) -> None: 

214 """ 

215 Write metadata from the metadata container to the HDF5 attributes. 

216 

217 This method serializes the metadata container and writes all metadata 

218 as attributes to the HDF5 dataset. Raises exceptions are caught for 

219 read-only files. 

220 

221 Examples 

222 -------- 

223 >>> feature.metadata.component = 'Ey' 

224 >>> feature.write_metadata() 

225 """ 

226 meta_dict = self.metadata.to_dict()[self.metadata._class_name.lower()] 

227 for key, value in meta_dict.items(): 

228 value = to_numpy_type(value) 

229 self.hdf5_dataset.attrs.create(key, value) 

230 

231 @property 

232 def n_windows(self) -> int: 

233 """ 

234 Get the number of time windows in the dataset. 

235 

236 Returns 

237 ------- 

238 int 

239 Number of time windows (first dimension of the dataset). 

240 """ 

241 return self.hdf5_dataset.shape[0] 

242 

243 @property 

244 def time(self) -> np.ndarray: 

245 """ 

246 Get the time array for each window. 

247 

248 Returns an array of datetime64 values representing the start time 

249 of each time window. The time spacing is determined by the sample 

250 rate of the window stepping. 

251 

252 Returns 

253 ------- 

254 np.ndarray 

255 Array of datetime64 values with shape (n_windows,) representing 

256 the start time of each window. 

257 

258 Examples 

259 -------- 

260 >>> time_array = feature.time 

261 >>> print(time_array.shape) 

262 (100,) 

263 >>> print(time_array[0]) 

264 numpy.datetime64('2023-01-01T00:00:00') 

265 """ 

266 

267 return make_dt_coordinates( 

268 self.metadata.time_period.start, 

269 1.0 / self.metadata.sample_rate_window_step, 

270 self.n_windows, 

271 end_time=self.metadata.time_period.end, 

272 ) 

273 

274 @property 

275 def n_frequencies(self) -> int: 

276 """ 

277 Get the number of frequency bins in the dataset. 

278 

279 Returns 

280 ------- 

281 int 

282 Number of frequency bins (second dimension of the dataset). 

283 """ 

284 return self.hdf5_dataset.shape[1] 

285 

286 @property 

287 def frequency(self) -> np.ndarray: 

288 """ 

289 Get the frequency array for the dataset. 

290 

291 Returns a linearly-spaced frequency array from frequency_min to 

292 frequency_max with n_frequencies points. 

293 

294 Returns 

295 ------- 

296 np.ndarray 

297 Array of float64 frequencies in Hz with shape (n_frequencies,). 

298 

299 Examples 

300 -------- 

301 >>> freq_array = feature.frequency 

302 >>> print(freq_array.shape) 

303 (256,) 

304 >>> print(f"Frequency range: {freq_array[0]:.2f} - {freq_array[-1]:.2f} Hz") 

305 Frequency range: 0.01 - 100.00 Hz 

306 """ 

307 return np.linspace( 

308 self.metadata.frequency_min, 

309 self.metadata.frequency_max, 

310 self.n_frequencies, 

311 ) 

312 

313 def replace_dataset(self, new_data_array: np.ndarray) -> None: 

314 """ 

315 Replace the entire HDF5 dataset with new data. 

316 

317 This method resizes the HDF5 dataset as needed and replaces all data. 

318 The input array must have the same dtype as the existing dataset. 

319 

320 Parameters 

321 ---------- 

322 new_data_array : np.ndarray 

323 New data array to replace the existing dataset. Will be converted 

324 to numpy array if necessary. 

325 

326 Raises 

327 ------ 

328 TypeError 

329 If input cannot be converted to a numpy array or has incompatible shape. 

330 

331 Examples 

332 -------- 

333 >>> import numpy as np 

334 >>> new_data = np.random.randn(100, 256) 

335 >>> feature.replace_dataset(new_data) 

336 """ 

337 if not isinstance(new_data_array, np.ndarray): 

338 try: 

339 new_data_array = np.array(new_data_array) 

340 except (ValueError, TypeError) as error: 

341 msg = f"{error} Input must be a numpy array not {type(new_data_array)}" 

342 self.logger.exception(msg) 

343 raise TypeError(msg) 

344 if new_data_array.shape != self.hdf5_dataset.shape: 

345 self.hdf5_dataset.resize(new_data_array.shape) 

346 self.hdf5_dataset[...] = new_data_array 

347 

348 def to_xarray(self) -> xr.DataArray: 

349 """ 

350 Convert the feature dataset to an xarray DataArray. 

351 

352 Returns an xarray DataArray with proper time and frequency coordinates, 

353 metadata attributes, and component naming. The entire dataset is loaded 

354 into memory. 

355 

356 Returns 

357 ------- 

358 xr.DataArray 

359 DataArray with dimensions ['time', 'frequency'] and coordinates 

360 matching the dataset's time and frequency arrays. 

361 

362 Notes 

363 ----- 

364 Metadata stored in xarray attributes will not be validated if modified. 

365 The full dataset is loaded into memory; use with caution for large datasets. 

366 

367 Examples 

368 -------- 

369 >>> xr_data = feature.to_xarray() 

370 >>> print(xr_data.dims) 

371 ('time', 'frequency') 

372 >>> print(xr_data.name) 

373 'Ex' 

374 >>> subset = xr_data.sel(time=slice('2023-01-01', '2023-01-02')) 

375 """ 

376 

377 return xr.DataArray( 

378 data=self.hdf5_dataset[()], 

379 dims=["time", "frequency"], 

380 name=self.metadata.component, 

381 coords=[ 

382 ("time", self.time), 

383 ("frequency", self.frequency), 

384 ], 

385 attrs=self.metadata.to_dict(single=True), 

386 ) 

387 

388 def to_numpy(self) -> np.ndarray: 

389 """ 

390 Convert the feature dataset to a numpy array. 

391 

392 Returns the dataset as a numpy array by loading it from the HDF5 file 

393 into memory. The array shape is (n_windows, n_frequencies). 

394 

395 Returns 

396 ------- 

397 np.ndarray 

398 Numpy array containing all feature data with shape 

399 (n_windows, n_frequencies). 

400 

401 Examples 

402 -------- 

403 >>> data = feature.to_numpy() 

404 >>> print(data.shape) 

405 (100, 256) 

406 >>> print(data.dtype) 

407 complex128 

408 >>> mean_amplitude = np.abs(data).mean() 

409 """ 

410 

411 return self.hdf5_dataset[()] 

412 

413 def from_numpy(self, new_estimate: np.ndarray) -> None: 

414 """ 

415 Load data from a numpy array into the HDF5 dataset. 

416 

417 This method updates the HDF5 dataset with new data from a numpy array. 

418 The input array must match the dataset's dtype. The HDF5 dataset will 

419 be resized if necessary to accommodate the new data. 

420 

421 Parameters 

422 ---------- 

423 new_estimate : np.ndarray 

424 Numpy array to write to the HDF5 dataset. Must have compatible 

425 dtype with the existing dataset. 

426 

427 Raises 

428 ------ 

429 TypeError 

430 If input array dtype does not match the HDF5 dataset dtype or 

431 if input cannot be converted to numpy array. 

432 

433 Notes 

434 ----- 

435 The variable 'data' is a builtin in numpy and cannot be used as a parameter name. 

436 

437 Examples 

438 -------- 

439 >>> import numpy as np 

440 >>> new_data = np.random.randn(100, 256) + 1j * np.random.randn(100, 256) 

441 >>> feature.from_numpy(new_data) 

442 >>> loaded_data = feature.to_numpy() 

443 >>> assert loaded_data.shape == new_data.shape 

444 """ 

445 

446 if not isinstance(new_estimate, np.ndarray): 

447 try: 

448 new_estimate = np.array(new_estimate) 

449 except (ValueError, TypeError) as error: 

450 msg = f"{error} Input must be a numpy array not {type(new_estimate)}" 

451 self.logger.exception(msg) 

452 raise TypeError(msg) 

453 if new_estimate.dtype != self.hdf5_dataset.dtype: 

454 msg = ( 

455 f"Input array must be type {new_estimate.dtype} not " 

456 "{self.hdf5_dataset.dtype}" 

457 ) 

458 self.logger.error(msg) 

459 raise TypeError(msg) 

460 if new_estimate.shape != self.hdf5_dataset.shape: 

461 self.hdf5_dataset.resize(new_estimate.shape) 

462 self.hdf5_dataset[...] = new_estimate 

463 

464 def from_xarray( 

465 self, 

466 data: xr.DataArray, 

467 sample_rate_decimation_level: int, 

468 ) -> None: 

469 """ 

470 Load data and metadata from an xarray DataArray. 

471 

472 This method updates both the HDF5 dataset and metadata from an xarray 

473 DataArray. It extracts time coordinates, frequency range, and component 

474 information from the DataArray and its attributes. 

475 

476 Parameters 

477 ---------- 

478 data : xr.DataArray 

479 Input xarray DataArray with 'time' and 'frequency' coordinates. 

480 Expected dimensions are ['time', 'frequency']. 

481 sample_rate_decimation_level : int 

482 Decimation level applied to the original data to produce this 

483 feature dataset (integer ≥ 1). 

484 

485 Notes 

486 ----- 

487 Metadata stored in xarray attributes will be extracted and written to 

488 the HDF5 file. The full dataset is loaded into memory during this process. 

489 

490 Examples 

491 -------- 

492 >>> import xarray as xr 

493 >>> import numpy as np 

494 

495 Create sample xarray data: 

496 

497 >>> times = np.arange('2023-01-01', '2023-01-02', dtype='datetime64[s]') 

498 >>> freqs = np.linspace(0.01, 100, 256) 

499 >>> data_array = np.random.randn(len(times), len(freqs)) + \\ 

500 ... 1j * np.random.randn(len(times), len(freqs)) 

501 >>> xr_data = xr.DataArray( 

502 ... data_array, 

503 ... dims=['time', 'frequency'], 

504 ... coords={'time': times, 'frequency': freqs}, 

505 ... name='Ex', 

506 ... attrs={'units': 'mV/km'} 

507 ... ) 

508 

509 Load into feature dataset: 

510 

511 >>> feature.from_xarray(xr_data, sample_rate_decimation_level=2) 

512 >>> print(feature.metadata.component) 

513 'Ex' 

514 """ 

515 self.metadata.time_period.start = data.time[0].values 

516 self.metadata.time_period.end = data.time[-1].values 

517 

518 self.metadata.sample_rate_decimation_level = sample_rate_decimation_level 

519 self.metadata.frequency_min = data.coords["frequency"].data.min() 

520 self.metadata.frequency_max = data.coords["frequency"].data.max() 

521 step_size = data.coords["time"].data[1] - data.coords["time"].data[0] 

522 self.metadata.sample_rate_window_step = step_size / np.timedelta64(1, "s") 

523 self.metadata.component = data.name 

524 try: 

525 self.metadata.units = data.units 

526 except AttributeError: 

527 self.logger.debug("Could not find 'units' in xarray") 

528 self.write_metadata() 

529 

530 self.from_numpy(data.to_numpy())