Coverage for C:\Users\peaco\OneDrive\Documents\GitHub\mth5\mth5\data\station_config.py: 90%
174 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
1"""
3This module contains tools for building MTH5 files from synthetic data.
5Development Notes:
6 - These tools are a work in progress and ideally will be able to
7 generalize to more than just the legacy EMTF ascii datasets that they
8 initially served.
10Definitions used in the creation of synthetic mth5 files.
12Survey level: 'mth5_path', Path to output h5
14Station level: mt_metadata Station() object with station info.
15 - the `id` field (name of the station) is required.
16 - other station metadata can be added
17 - channel_nomenclature
18 - The channel_nomenclature was previously stored at the run level. It makes more sense to store
19 this info at the station level, as the only reason the nomenclature would change (that I can
20 think of) would be if the acquisition system changed, in which case it would make the most
21 sense to initialize a new station object.
23Run level: 'channels', channel names as a list; ["hx", "hy", "hz", "ex", "ey"]
24Run level: 'raw_data_path', Path to ascii data source
25Run level: 'noise_scalars', dict keyed by channel, default is zero,
26Run level: 'nan_indices', dict keyed by channel; values are lists of [index, num_nan_to_add] pairs
27Run level: 'filters', dict of filters keyed by columns
28Run level: 'run_id', name of the run
29Run level: 'sample_rate', 1.0
32"""
34import pathlib
35from typing import List, Optional, Union
37import pandas as pd
38import scipy.signal as ssig
39from loguru import logger
40from mt_metadata.processing.aurora import ChannelNomenclature
41from mt_metadata.processing.aurora.channel_nomenclature import SupportedNomenclatureEnum
42from mt_metadata.timeseries import Run, Station
43from mt_metadata.timeseries.filters.helper_functions import make_coefficient_filter
# Absolute directory of this module; the legacy EMTF ASCII files used below
# (e.g. "test1.asc", "test2.asc") are resolved relative to it.
ASCII_DATA_PATH = pathlib.Path(__file__).parent.resolve()
def make_filters(as_list: Optional[bool] = False) -> Union[dict, list]:
    """
    Build the collection of placeholder coefficient filters.

    Because the synthetic data from EMTF are already in mV/km and nT, no
    calibration filters are required. The filters created here only show where
    instrument response information would be assigned.

    :param as_list: If True return the filters as a list, otherwise as a dict
     keyed by "1x", "10x" and "0.1x".
    :type as_list: bool
    :return: Filters for populating the filters lists of synthetic data. The
     list form is ordered unity, multiply-by-10, divide-by-10.
    :rtype: Union[List, Dict]
    """
    # label -> (filter name, gain); insertion order fixes the order of the list form.
    specs = {
        "1x": ("1", 1.0),
        "10x": ("10", 10.0),
        "0.1x": ("0.1", 0.1),
    }
    filters = {
        label: make_coefficient_filter(name=filter_name, gain=gain)
        for label, (filter_name, gain) in specs.items()
    }

    if as_list:
        return list(filters.values())
    return filters
# Module-level filter lookup built once at import time; the station factories
# below reference entries by key ("1x", "10x", "0.1x") and use their names.
FILTERS = make_filters()
class SyntheticRun(object):
    """
    Place to store information that will be needed to initialize an MTH5 Run object.

    Initially this class worked only with the synthetic ASCII data from legacy EMTF.
    """

    def __init__(
        self,
        id: str,
        sample_rate: float,
        channels: List[str],
        raw_data_path: Optional[Union[str, pathlib.Path]] = None,
        noise_scalars: Optional[dict] = None,
        nan_indices: Optional[dict] = None,
        filters: Optional[dict] = None,
        start: Optional[str] = None,
        timeseries_dataframe: Optional[pd.DataFrame] = None,
        data_source: str = "legacy emtf ascii",
    ) -> None:
        """
        Constructor.

        :param id: label for the run
        :type id: str
        :param sample_rate: sample rate of the time series in Hz.
        :type sample_rate: float
        :param channels: the channel names to include in the run.
        :type channels: List[str]
        :param raw_data_path: Path to ascii data source
        :type raw_data_path: Union[str, pathlib.Path, None]
        :param noise_scalars: Keys are channels, values are scale factors for noise to add.
         When None, every channel gets a scale factor of 0.0.
        :type noise_scalars: Union[dict, None]
        :param nan_indices: Keys are channels, values lists. List elements are pairs of
         (index, num_nan_to_add). When None, an empty dict is stored.
        :type nan_indices: Union[dict, None]
        :param filters: Keys are channels, values lists. List elements are Filter objects.
         When None, an empty dict is stored.
        :type filters: Union[dict, None]
        :param start: Setting the run start time. e.g. start="1980-01-01T00:00:00+00:00"
        :type start: Union[str, None]
        :param timeseries_dataframe: The time series data for the run.
         Added 2025 to try to allow more general data to be cast to mth5
        :type timeseries_dataframe: Optional[pd.DataFrame]
        :param data_source: Keyword to tell if data are a legacy EMTF ASCII file.
         Overridden with "dataframe" when ``timeseries_dataframe`` is a DataFrame.
        :type data_source: str

        """
        run_metadata = Run()
        run_metadata.id = id
        run_metadata.sample_rate = sample_rate
        run_metadata.time_period.start = start

        # normally None for legacy EMTF data
        self._timeseries_dataframe = timeseries_dataframe
        if isinstance(self._timeseries_dataframe, pd.DataFrame):
            self.data_source = "dataframe"
        else:
            self.data_source = data_source
        self.raw_data_path = raw_data_path

        # set channel names
        self.channels = channels

        # Set scale factors for adding noise to individual channels
        if noise_scalars is None:
            noise_scalars = {channel: 0.0 for channel in self.channels}
        self.noise_scalars = noise_scalars

        # Set indices for adding nan to individual channels.
        # Previously the attribute was only assigned when the argument was None,
        # so a caller-supplied dict was silently dropped and the attribute was missing.
        # TODO: make this consistent with noise_scalars, None or empty dict.
        self.nan_indices = {} if nan_indices is None else nan_indices

        # Set filters for individual channels (same fix as nan_indices above).
        # TODO: make this consistent with noise_scalars, None or empty dict.
        self.filters = {} if filters is None else filters

        self.run_metadata = run_metadata

    def _get_timeseries_dataframe(
        self,
    ) -> pd.DataFrame:
        """
        Returns time series data in a dataframe with columns named for EM field component.

        If a dataframe was supplied at construction it is returned as-is. Otherwise,
        when self.data_source == "legacy emtf ascii", the file at self.raw_data_path
        is loaded through LegacyEMTFAsciiFile, which also handles up-sampling to
        self.run_metadata.sample_rate. Up-sampling has only been tested for 8, to make
        8Hz data for testing. If run.sample_rate is default (1.0) then no up-sampling
        takes place.

        :rtype df: pandas.DataFrame
        :return df: The time series data for the synthetic run
        :raises NotImplementedError: if there is neither a dataframe nor a legacy
         EMTF ASCII data source.

        """
        if isinstance(self._timeseries_dataframe, pd.DataFrame):
            msg = "Run Data appear to be already set in dataframe"
            logger.info(msg)
            return self._timeseries_dataframe

        if self.data_source == "legacy emtf ascii":
            ascii_file = LegacyEMTFAsciiFile(file_path=self.raw_data_path)
            return ascii_file.load_dataframe(
                channel_names=self.channels,
                sample_rate=self.run_metadata.sample_rate,
            )

        msg = "No dataframe associated with run, nor a legacy EMTF ASCII file"
        msg += ".. add support for your filetype or declare dataframe"
        raise NotImplementedError(msg)
class SyntheticStation(object):
    """
    Class used to contain information needed to generate MTH5 file from synthetic data.

    TODO: could add channel_nomenclature to this obj (instead of run) but would need to decide that
    runs cannot change channel nomenclature first. If that were decided, the channel_map() could go here as well.

    """

    def __init__(
        self,
        station_metadata: Station,
        mth5_name: Optional[Union[str, pathlib.Path]] = None,
        channel_nomenclature_keyword: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
    ) -> None:
        """
        Constructor.

        :param station_metadata: mt_metadata object with station metadata
        :type station_metadata: Station
        :param mth5_name: The name of the h5 file to which the station data and metadata will be written.
        :type mth5_name: Optional[Union[str, pathlib.Path]]
        :param channel_nomenclature_keyword: the keyword for the channel nomenclature
        :type channel_nomenclature_keyword: SupportedNomenclatureEnum

        """
        # The keyword and the empty cache must exist before the
        # channel_nomenclature property is read at the end of this method.
        self.channel_nomenclature_keyword = channel_nomenclature_keyword
        self._channel_nomenclature = None

        self.station_metadata = station_metadata
        self.mth5_name = mth5_name
        self.runs = []

        # Record the nomenclature's channel names on the station metadata.
        self.station_metadata.channels_recorded = self.channel_nomenclature.channels

    @property
    def channel_nomenclature(self):
        """ChannelNomenclature for this station's keyword, built on first access and cached."""
        nomenclature = self._channel_nomenclature
        if nomenclature is None:
            nomenclature = ChannelNomenclature(
                keyword=self.channel_nomenclature_keyword
            )
            self._channel_nomenclature = nomenclature
        return nomenclature
def make_station_01(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Prepare the metadata needed to generate an mth5 with synthetic data (station "test1").

    :param channel_nomenclature: Must be one of the nomenclatures defined in SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum

    :return: Object with all info needed to generate MTH5 file from synthetic data.
    :rtype: SyntheticStation

    """
    station_metadata = Station()
    station_metadata.id = "test1"
    # TODO: Add more metadata here as an example
    station_metadata.location.latitude = 17.996

    # initialize SyntheticStation; the nomenclature keyword is needed to
    # assign channel types in RunTS
    station = SyntheticStation(
        station_metadata=station_metadata,
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = f"{station_metadata.id}.h5"

    nomenclature = station.channel_nomenclature

    run_001 = SyntheticRun(
        id="001",
        sample_rate=1.0,
        channels=nomenclature.channels,
        raw_data_path=ASCII_DATA_PATH.joinpath("test1.asc"),
        start="1980-01-01T00:00:00+00:00",
    )

    # assign indices to set to Nan (not used 2024-06-06);
    # each entry is an [index, num_nan_to_add] pair
    nan_spans = {ch: [] for ch in run_001.channels}
    for ch, spans in nan_spans.items():
        if ch == nomenclature.hx:
            spans.append([11, 100])
        if ch == nomenclature.hy:
            spans.extend([[11, 100], [20000, 444]])
    run_001.nan_indices = nan_spans

    # assign some filters (by name) to the channels
    channel_filters = {}
    for ch in run_001.channels:
        if ch in nomenclature.ex_ey:
            channel_filters[ch] = [FILTERS["1x"].name]
        elif ch in nomenclature.hx_hy_hz:
            channel_filters[ch] = [FILTERS["10x"].name, FILTERS["0.1x"].name]
    run_001.filters = channel_filters

    station.runs = [run_001]

    return station
def make_station_02(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Just like station 1, but the data come from "test2.asc" and no nan indices are set.

    :param channel_nomenclature: Must be one of the nomenclatures defined in SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :return: Object with all info needed to generate MTH5 file from synthetic data.
    :rtype: SyntheticStation

    """
    # Start from the station-01 configuration and override what differs.
    station = make_station_01(channel_nomenclature=channel_nomenclature)
    station.station_metadata.id = "test2"
    station.mth5_name = "test2.h5"

    run = station.runs[0]
    run.raw_data_path = ASCII_DATA_PATH.joinpath("test2.asc")
    # Replace station 01's nan indices with an empty list per channel.
    run.nan_indices = {channel: [] for channel in run.channels}

    return station
def make_station_03(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Create a synthetic station with multiple runs. Rather than generate fresh
    synthetic data, we just reuse test1.asc for each run.

    Four runs ("001".."004") are created, one per day starting 1980-01-01.
    Run 001 uses the default noise scalars; runs 002, 003 and 004 use a noise
    scale factor of 2.0, 5.0 and 10.0 respectively on every channel.

    :param channel_nomenclature: Must be one of the nomenclatures defined in SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :rtype: SyntheticStation
    :return: Object with all info needed to generate MTH5 file from synthetic data.

    """
    station_metadata = Station()
    station_metadata.id = "test3"
    station = SyntheticStation(
        station_metadata=station_metadata,
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = "test3.h5"

    nomenclature = station.channel_nomenclature
    channels = nomenclature.channels
    raw_data_path = ASCII_DATA_PATH.joinpath("test1.asc")

    def _empty_nan_indices() -> dict:
        # No nan insertion for this station: an empty list per channel.
        return {ch: [] for ch in channels}

    def _channel_filters() -> dict:
        # Filter names per channel: electric channels get "1x",
        # magnetic channels get "10x" followed by "0.1x".
        channel_filters = {}
        for ch in channels:
            if ch in nomenclature.ex_ey:
                channel_filters[ch] = [FILTERS["1x"].name]
            elif ch in nomenclature.hx_hy_hz:
                channel_filters[ch] = [FILTERS["10x"].name, FILTERS["0.1x"].name]
        return channel_filters

    # (run id, noise scale factor for every channel or None for the default, start time)
    run_specs = (
        ("001", None, "1980-01-01T00:00:00+00:00"),
        ("002", 2.0, "1980-01-02T00:00:00+00:00"),
        ("003", 5.0, "1980-01-03T00:00:00+00:00"),
        ("004", 10.0, "1980-01-04T00:00:00+00:00"),
    )

    runs = []
    for run_id, noise_level, start in run_specs:
        # Build fresh dicts for every run. The run stores the dict it is given
        # by reference, so reusing and mutating a single noise_scalars dict
        # (as was done before) left every noisy run with the last value (10.0).
        if noise_level is None:
            noise_scalars = None
        else:
            noise_scalars = {ch: noise_level for ch in channels}
        nan_indices = _empty_nan_indices()
        channel_filters = _channel_filters()

        run = SyntheticRun(
            id=run_id,
            sample_rate=1.0,
            channels=channels,
            raw_data_path=raw_data_path,
            noise_scalars=noise_scalars,
            nan_indices=nan_indices,
            filters=channel_filters,
            start=start,
        )
        # Also set as attributes (as was already done for filters) so both are
        # present on the run irrespective of how the constructor treats
        # non-None arguments.
        run.nan_indices = nan_indices
        run.filters = channel_filters
        runs.append(run)

    station.runs = runs

    return station
def make_station_04(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Just like station 01, but data are resampled to 8Hz

    The run has no start time and no nan indices, and the output file is
    named "test_04_8Hz.h5".

    :param channel_nomenclature: Must be one of the nomenclatures defined in SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :rtype: SyntheticStation
    :return: Object with all info needed to generate MTH5 file from synthetic data.
    """
    station_metadata = Station()
    station_metadata.id = "test1"

    station = SyntheticStation(
        station_metadata=station_metadata,
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = "test_04_8Hz.h5"

    nomenclature = station.channel_nomenclature

    # sample_rate=8.0 differs from the 1 Hz source file, which triggers
    # up-sampling when the data are loaded.
    run_001 = SyntheticRun(
        id="001",
        sample_rate=8.0,
        channels=nomenclature.channels,
        raw_data_path=ASCII_DATA_PATH.joinpath("test1.asc"),
        start=None,
    )
    run_001.nan_indices = {}

    # assign filters (by name) to the channels
    channel_filters = {}
    for ch in run_001.channels:
        if ch in nomenclature.ex_ey:
            channel_filters[ch] = [FILTERS["1x"].name]
        elif ch in nomenclature.hx_hy_hz:
            channel_filters[ch] = [FILTERS["10x"].name, FILTERS["0.1x"].name]
    run_001.filters = channel_filters

    station.runs = [run_001]
    station_metadata.run_list = [run_001.run_metadata.id]
    station.station_metadata = station_metadata
    return station
class LegacyEMTFAsciiFile:
    """
    This class can be used to interact with the legacy synthetic data files
    that were originally in EMTF.

    Development Notes:
     As of 2025-02-03 the only LegacyEMTFAsciiFile data sources are sampled at 1Hz.
     One-off resampling can be handled in this class if the requested sample rate differs.

    """

    IMPLICIT_SAMPLE_RATE = 1.0  # Hz

    def __init__(self, file_path: pathlib.Path):
        """
        :param file_path: Location of the legacy EMTF ASCII file.
        :type file_path: pathlib.Path
        """
        self.file_path = file_path

    def load_dataframe(
        self,
        channel_names: list,
        sample_rate: float,
    ) -> pd.DataFrame:
        """
        Loads an EMTF legacy ASCII time series into a dataframe.

        These files have an awkward whitespace separator, and also need to have the
        electric field channels inverted to fix a phase swap.

        When ``sample_rate`` differs from IMPLICIT_SAMPLE_RATE every channel is
        resampled with scipy.signal.resample to
        round(n_samples * sample_rate / IMPLICIT_SAMPLE_RATE) samples. For integer
        rates this equals the former ``int(sample_rate) * n_samples``; fractional
        rates are no longer truncated toward zero.

        :param channel_names: The names of the channels in the legacy EMTF file, in order.
         The last two are treated as the electric channels.
        :type channel_names: list
        :param sample_rate: The sample rate of the output time series in Hz.
        :type sample_rate: float

        :return df: The labelled time series from the legacy EMTF file.
        :rtype df: pd.DataFrame
        :raises ValueError: if ``sample_rate`` is not positive.

        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

        # read in data
        df = pd.read_csv(self.file_path, names=channel_names, sep=r"\s+")

        # Invert electric channels to fix phase swap due to modeling coordinates.
        # Column positions are used to avoid handling channel nomenclature here.
        for electric_column in df.columns[-2:]:
            df[electric_column] = -df[electric_column]

        # Temporary kludge: One-off handling for a test case to resample data.
        # TODO: delete this once synthetic data module is built can offer multiple sample rates
        if sample_rate != self.IMPLICIT_SAMPLE_RATE:
            num_samples = int(round(len(df) * sample_rate / self.IMPLICIT_SAMPLE_RATE))
            df = pd.DataFrame(
                data={
                    ch: ssig.resample(df[ch].to_numpy(), num_samples)
                    for ch in df.columns
                }
            )

        return df
def main():
    """Script entry point: build the make_station_04 station configuration and discard it."""
    make_station_04()


if __name__ == "__main__":
    main()