Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ groups \ feature_dataset.py: 47%
110 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-27 20:09 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-27 20:09 -0800
1# -*- coding: utf-8 -*-
2"""
3Created on Thu Mar 10 09:02:16 2022
5@author: jpeacock
6"""
8# =============================================================================
9# Imports
10# =============================================================================
11import weakref
12from typing import Optional
14import h5py
15import numpy as np
16import xarray as xr
17from loguru import logger
18from mt_metadata.features import FeatureDecimationChannel
20from mth5.helpers import add_attributes_to_metadata_class_pydantic, to_numpy_type
21from mth5.timeseries.ts_helpers import make_dt_coordinates
22from mth5.utils.exceptions import MTH5Error
25# =============================================================================
28class FeatureChannelDataset:
29 """
30 Container for multi-dimensional Fourier Coefficients organized by time and frequency.
32 This class manages Fourier Coefficient data with frequency band organization,
33 similar to FCDataset but with enhanced band tracking capabilities. The data array
34 is organized with the following assumptions:
36 1. Data are grouped into frequency bands
37 2. Data are uniformly sampled in time (uniform FFT moving window step size)
39 The dataset tracks temporal evolution of frequency content across multiple windows,
40 making it suitable for time-frequency analysis of geophysical signals.
42 Parameters
43 ----------
44 dataset : h5py.Dataset
45 HDF5 dataset containing the Fourier coefficient data.
46 dataset_metadata : FeatureDecimationChannel, optional
47 Metadata for the dataset. See :class:`mt_metadata.features.FeatureDecimationChannel`.
48 If provided, must be of the same type as the internal metadata class.
49 Default is None.
50 **kwargs
51 Additional keyword arguments for future extensibility.
53 Attributes
54 ----------
55 hdf5_dataset : h5py.Dataset
56 Reference to the HDF5 dataset.
57 metadata : FeatureDecimationChannel
58 Metadata container with the following attributes:
60 - name : str
61 Dataset name
62 - time_period.start : datetime
63 Start time of the data acquisition
64 - time_period.end : datetime
65 End time of the data acquisition
66 - sample_rate_window_step : float
67 Sample rate of the time window stepping (Hz)
68 - frequency_min : float
69 Minimum frequency in the band (Hz)
70 - frequency_max : float
71 Maximum frequency in the band (Hz)
72 - units : str
73 Physical units of the coefficient data
74 - component : str
75 Component identifier (e.g., 'Ex', 'Hy')
76 - sample_rate_decimation_level : int
77 Decimation level applied to acquire this data
79 Raises
80 ------
81 MTH5Error
82 If dataset_metadata type does not match the expected FeatureDecimationChannel type.
84 Examples
85 --------
86 >>> import h5py
87 >>> from mt_metadata.features import FeatureDecimationChannel
88 >>> from mth5.groups.feature_dataset import FeatureChannelDataset
90 Create a feature dataset from an HDF5 group:
92 >>> with h5py.File('data.h5', 'r') as f:
93 ... h5_dataset = f['feature_group']['Ex']
94 ... feature = FeatureChannelDataset(h5_dataset)
95 ... print(f"Time windows: {feature.n_windows}")
96 ... print(f"Frequencies: {feature.n_frequencies}")
98 Access time and frequency arrays:
100 >>> time_array = feature.time
101 >>> freq_array = feature.frequency
102 >>> data_array = feature.to_numpy()
103 """
105 def __init__(
106 self,
107 dataset: h5py.Dataset,
108 dataset_metadata: Optional[FeatureDecimationChannel] = None,
109 **kwargs,
110 ) -> None:
111 if dataset is not None and isinstance(dataset, (h5py.Dataset)):
112 self.hdf5_dataset = weakref.ref(dataset)()
113 self.logger = logger
115 # set metadata to the appropriate class. Standards is not a
116 # Base object so should be skipped. If the class name is not
117 # defined yet set to Base class.
118 self.metadata = add_attributes_to_metadata_class_pydantic(
119 FeatureDecimationChannel
120 )
121 self.metadata.hdf5_reference = self.hdf5_dataset.ref
122 self.metadata.mth5_type = self._class_name
123 # if the input data set already has filled attributes, namely if the
124 # channel data already exists then read them in with our writing back
125 if "mth5_type" in list(self.hdf5_dataset.attrs.keys()):
126 self.metadata.from_dict(
127 {self.hdf5_dataset.attrs["mth5_type"]: dict(self.hdf5_dataset.attrs)}
128 )
129 # if metadata is input, make sure that its the same class type amd write
130 # to the hdf5 dataset
131 if dataset_metadata is not None:
132 if not isinstance(self.metadata, type(dataset_metadata)):
133 msg = (
134 f"metadata must be type metadata.{self._class_name} not "
135 "{type(dataset_metadata)}"
136 )
137 self.logger.error(msg)
138 raise MTH5Error(msg)
139 # load from dict because of the extra attributes for MTH5
140 self.metadata.from_dict(dataset_metadata.to_dict())
141 self.metadata.hdf5_reference = self.hdf5_dataset.ref
142 self.metadata.mth5_type = self._class_name
144 # write out metadata to make sure that its in the file.
145 try:
146 self.write_metadata()
147 except (RuntimeError, KeyError, OSError):
148 # file is read only
149 pass
151 # if the attrs don't have the proper metadata keys yet write them
152 if not "mth5_type" in list(self.hdf5_dataset.attrs.keys()):
153 self.write_metadata()
155 def __str__(self) -> str:
156 """
157 String representation of the FeatureChannelDataset.
159 Returns
160 -------
161 str
162 JSON representation of the metadata.
163 """
164 return self.metadata.to_json()
166 def __repr__(self) -> str:
167 """
168 Official string representation of the FeatureChannelDataset.
170 Returns
171 -------
172 str
173 JSON representation of the metadata.
174 """
175 return self.__str__()
177 @property
178 def _class_name(self) -> str:
179 """
180 Extract the class name prefix by removing 'Dataset' suffix.
182 Returns
183 -------
184 str
185 Class name without the 'Dataset' suffix.
186 """
187 return self.__class__.__name__.split("Dataset")[0]
189 def read_metadata(self) -> None:
190 """
191 Read metadata from the HDF5 file into the metadata container.
193 This method loads all attributes from the HDF5 dataset into the
194 metadata container, enabling validation and type checking.
196 Examples
197 --------
198 >>> feature.read_metadata()
199 >>> print(feature.metadata.component)
200 'Ex'
201 """
203 meta_dict = read_attrs_to_dict(dict(self.hdf5_dataset.attrs), self.metadata)
204 # Defensive check: skip if meta_dict is empty
205 if not meta_dict:
206 self.logger.debug(
207 f"No metadata found for {self._class_name}, skipping from_dict."
208 )
209 return
210 self.metadata.from_dict({self._class_name: meta_dict})
211 self._has_read_metadata = True
213 def write_metadata(self) -> None:
214 """
215 Write metadata from the metadata container to the HDF5 attributes.
217 This method serializes the metadata container and writes all metadata
218 as attributes to the HDF5 dataset. Raises exceptions are caught for
219 read-only files.
221 Examples
222 --------
223 >>> feature.metadata.component = 'Ey'
224 >>> feature.write_metadata()
225 """
226 meta_dict = self.metadata.to_dict()[self.metadata._class_name.lower()]
227 for key, value in meta_dict.items():
228 value = to_numpy_type(value)
229 self.hdf5_dataset.attrs.create(key, value)
231 @property
232 def n_windows(self) -> int:
233 """
234 Get the number of time windows in the dataset.
236 Returns
237 -------
238 int
239 Number of time windows (first dimension of the dataset).
240 """
241 return self.hdf5_dataset.shape[0]
243 @property
244 def time(self) -> np.ndarray:
245 """
246 Get the time array for each window.
248 Returns an array of datetime64 values representing the start time
249 of each time window. The time spacing is determined by the sample
250 rate of the window stepping.
252 Returns
253 -------
254 np.ndarray
255 Array of datetime64 values with shape (n_windows,) representing
256 the start time of each window.
258 Examples
259 --------
260 >>> time_array = feature.time
261 >>> print(time_array.shape)
262 (100,)
263 >>> print(time_array[0])
264 numpy.datetime64('2023-01-01T00:00:00')
265 """
267 return make_dt_coordinates(
268 self.metadata.time_period.start,
269 1.0 / self.metadata.sample_rate_window_step,
270 self.n_windows,
271 end_time=self.metadata.time_period.end,
272 )
274 @property
275 def n_frequencies(self) -> int:
276 """
277 Get the number of frequency bins in the dataset.
279 Returns
280 -------
281 int
282 Number of frequency bins (second dimension of the dataset).
283 """
284 return self.hdf5_dataset.shape[1]
286 @property
287 def frequency(self) -> np.ndarray:
288 """
289 Get the frequency array for the dataset.
291 Returns a linearly-spaced frequency array from frequency_min to
292 frequency_max with n_frequencies points.
294 Returns
295 -------
296 np.ndarray
297 Array of float64 frequencies in Hz with shape (n_frequencies,).
299 Examples
300 --------
301 >>> freq_array = feature.frequency
302 >>> print(freq_array.shape)
303 (256,)
304 >>> print(f"Frequency range: {freq_array[0]:.2f} - {freq_array[-1]:.2f} Hz")
305 Frequency range: 0.01 - 100.00 Hz
306 """
307 return np.linspace(
308 self.metadata.frequency_min,
309 self.metadata.frequency_max,
310 self.n_frequencies,
311 )
313 def replace_dataset(self, new_data_array: np.ndarray) -> None:
314 """
315 Replace the entire HDF5 dataset with new data.
317 This method resizes the HDF5 dataset as needed and replaces all data.
318 The input array must have the same dtype as the existing dataset.
320 Parameters
321 ----------
322 new_data_array : np.ndarray
323 New data array to replace the existing dataset. Will be converted
324 to numpy array if necessary.
326 Raises
327 ------
328 TypeError
329 If input cannot be converted to a numpy array or has incompatible shape.
331 Examples
332 --------
333 >>> import numpy as np
334 >>> new_data = np.random.randn(100, 256)
335 >>> feature.replace_dataset(new_data)
336 """
337 if not isinstance(new_data_array, np.ndarray):
338 try:
339 new_data_array = np.array(new_data_array)
340 except (ValueError, TypeError) as error:
341 msg = f"{error} Input must be a numpy array not {type(new_data_array)}"
342 self.logger.exception(msg)
343 raise TypeError(msg)
344 if new_data_array.shape != self.hdf5_dataset.shape:
345 self.hdf5_dataset.resize(new_data_array.shape)
346 self.hdf5_dataset[...] = new_data_array
348 def to_xarray(self) -> xr.DataArray:
349 """
350 Convert the feature dataset to an xarray DataArray.
352 Returns an xarray DataArray with proper time and frequency coordinates,
353 metadata attributes, and component naming. The entire dataset is loaded
354 into memory.
356 Returns
357 -------
358 xr.DataArray
359 DataArray with dimensions ['time', 'frequency'] and coordinates
360 matching the dataset's time and frequency arrays.
362 Notes
363 -----
364 Metadata stored in xarray attributes will not be validated if modified.
365 The full dataset is loaded into memory; use with caution for large datasets.
367 Examples
368 --------
369 >>> xr_data = feature.to_xarray()
370 >>> print(xr_data.dims)
371 ('time', 'frequency')
372 >>> print(xr_data.name)
373 'Ex'
374 >>> subset = xr_data.sel(time=slice('2023-01-01', '2023-01-02'))
375 """
377 return xr.DataArray(
378 data=self.hdf5_dataset[()],
379 dims=["time", "frequency"],
380 name=self.metadata.component,
381 coords=[
382 ("time", self.time),
383 ("frequency", self.frequency),
384 ],
385 attrs=self.metadata.to_dict(single=True),
386 )
388 def to_numpy(self) -> np.ndarray:
389 """
390 Convert the feature dataset to a numpy array.
392 Returns the dataset as a numpy array by loading it from the HDF5 file
393 into memory. The array shape is (n_windows, n_frequencies).
395 Returns
396 -------
397 np.ndarray
398 Numpy array containing all feature data with shape
399 (n_windows, n_frequencies).
401 Examples
402 --------
403 >>> data = feature.to_numpy()
404 >>> print(data.shape)
405 (100, 256)
406 >>> print(data.dtype)
407 complex128
408 >>> mean_amplitude = np.abs(data).mean()
409 """
411 return self.hdf5_dataset[()]
413 def from_numpy(self, new_estimate: np.ndarray) -> None:
414 """
415 Load data from a numpy array into the HDF5 dataset.
417 This method updates the HDF5 dataset with new data from a numpy array.
418 The input array must match the dataset's dtype. The HDF5 dataset will
419 be resized if necessary to accommodate the new data.
421 Parameters
422 ----------
423 new_estimate : np.ndarray
424 Numpy array to write to the HDF5 dataset. Must have compatible
425 dtype with the existing dataset.
427 Raises
428 ------
429 TypeError
430 If input array dtype does not match the HDF5 dataset dtype or
431 if input cannot be converted to numpy array.
433 Notes
434 -----
435 The variable 'data' is a builtin in numpy and cannot be used as a parameter name.
437 Examples
438 --------
439 >>> import numpy as np
440 >>> new_data = np.random.randn(100, 256) + 1j * np.random.randn(100, 256)
441 >>> feature.from_numpy(new_data)
442 >>> loaded_data = feature.to_numpy()
443 >>> assert loaded_data.shape == new_data.shape
444 """
446 if not isinstance(new_estimate, np.ndarray):
447 try:
448 new_estimate = np.array(new_estimate)
449 except (ValueError, TypeError) as error:
450 msg = f"{error} Input must be a numpy array not {type(new_estimate)}"
451 self.logger.exception(msg)
452 raise TypeError(msg)
453 if new_estimate.dtype != self.hdf5_dataset.dtype:
454 msg = (
455 f"Input array must be type {new_estimate.dtype} not "
456 "{self.hdf5_dataset.dtype}"
457 )
458 self.logger.error(msg)
459 raise TypeError(msg)
460 if new_estimate.shape != self.hdf5_dataset.shape:
461 self.hdf5_dataset.resize(new_estimate.shape)
462 self.hdf5_dataset[...] = new_estimate
464 def from_xarray(
465 self,
466 data: xr.DataArray,
467 sample_rate_decimation_level: int,
468 ) -> None:
469 """
470 Load data and metadata from an xarray DataArray.
472 This method updates both the HDF5 dataset and metadata from an xarray
473 DataArray. It extracts time coordinates, frequency range, and component
474 information from the DataArray and its attributes.
476 Parameters
477 ----------
478 data : xr.DataArray
479 Input xarray DataArray with 'time' and 'frequency' coordinates.
480 Expected dimensions are ['time', 'frequency'].
481 sample_rate_decimation_level : int
482 Decimation level applied to the original data to produce this
483 feature dataset (integer ≥ 1).
485 Notes
486 -----
487 Metadata stored in xarray attributes will be extracted and written to
488 the HDF5 file. The full dataset is loaded into memory during this process.
490 Examples
491 --------
492 >>> import xarray as xr
493 >>> import numpy as np
495 Create sample xarray data:
497 >>> times = np.arange('2023-01-01', '2023-01-02', dtype='datetime64[s]')
498 >>> freqs = np.linspace(0.01, 100, 256)
499 >>> data_array = np.random.randn(len(times), len(freqs)) + \\
500 ... 1j * np.random.randn(len(times), len(freqs))
501 >>> xr_data = xr.DataArray(
502 ... data_array,
503 ... dims=['time', 'frequency'],
504 ... coords={'time': times, 'frequency': freqs},
505 ... name='Ex',
506 ... attrs={'units': 'mV/km'}
507 ... )
509 Load into feature dataset:
511 >>> feature.from_xarray(xr_data, sample_rate_decimation_level=2)
512 >>> print(feature.metadata.component)
513 'Ex'
514 """
515 self.metadata.time_period.start = data.time[0].values
516 self.metadata.time_period.end = data.time[-1].values
518 self.metadata.sample_rate_decimation_level = sample_rate_decimation_level
519 self.metadata.frequency_min = data.coords["frequency"].data.min()
520 self.metadata.frequency_max = data.coords["frequency"].data.max()
521 step_size = data.coords["time"].data[1] - data.coords["time"].data[0]
522 self.metadata.sample_rate_window_step = step_size / np.timedelta64(1, "s")
523 self.metadata.component = data.name
524 try:
525 self.metadata.units = data.units
526 except AttributeError:
527 self.logger.debug("Could not find 'units' in xarray")
528 self.write_metadata()
530 self.from_numpy(data.to_numpy())