Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ data \ station_config.py: 90%

174 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-10 00:01 -0800

1""" 

2 

3This module contains tools for building MTH5 files from synthetic data. 

4 

5Development Notes: 

6 - These tools are a work in progress and ideally will eventually

7 generalize to more than just the legacy EMTF ascii datasets that they

8 initially served.

9 

10Definitions used in the creation of synthetic mth5 files. 

11 

12Survey level: 'mth5_path', Path to output h5 

13 

14Station level: mt_metadata Station() object with station info. 

15 - the `id` field (name of the station) is required. 

16 - other station metadata can be added 

17 - channel_nomenclature 

18 - The channel_nomenclature was previously stored at the run level. It makes more sense to store 

19 this info at the station level, as the only reason the nomenclature would change (that I can 

20 think of) would be if the acquisition system changed, in which case it would make the most

21 sense to initialize a new station object. 

22 

23Run level: 'columns', channel names as a list, e.g. ["hx", "hy", "hz", "ex", "ey"]

24Run level: 'raw_data_path', Path to ascii data source 

25Run level: 'noise_scalars', dict keyed by channel, default is zero, 

26Run level: 'nan_indices', dict keyed by channel; values are lists of (index, num_nan_to_add) pairs

27Run level: 'filters', dict of filters keyed by columns 

28Run level: 'run_id', name of the run 

29Run level: 'sample_rate', 1.0 

30 

31 

32""" 

33 

34import pathlib 

35from typing import List, Optional, Union 

36 

37import pandas as pd 

38import scipy.signal as ssig 

39from loguru import logger 

40from mt_metadata.processing.aurora import ChannelNomenclature 

41from mt_metadata.processing.aurora.channel_nomenclature import SupportedNomenclatureEnum 

42from mt_metadata.timeseries import Run, Station 

43from mt_metadata.timeseries.filters.helper_functions import make_coefficient_filter 

44 

45 

# Absolute path of the directory holding this module.  The station builders
# below join the legacy EMTF ASCII file names (e.g. "test1.asc") onto it.
ASCII_DATA_PATH = pathlib.Path(__file__).parent.resolve()

47 

48 

def make_filters(as_list: Optional[bool] = False) -> Union[dict, list]:
    """
    Build the placeholder coefficient filters used by the synthetic stations.

    The synthetic EMTF data are already in mV/km and nT, so no calibration
    filters are required.  The filters created here only mark where instrument
    response information would be assigned.

    :param as_list: If truthy return the filters as a list, otherwise as a dict
     keyed by "1x", "10x" and "0.1x".
    :type as_list: bool
    :return: The three coefficient filters (gains 1.0, 10.0 and 0.1).
    :rtype: Union[list, dict]
    """
    # Insertion order matters: the list form is unity, x10, x0.1.
    filters = {
        "1x": make_coefficient_filter(name="1", gain=1.0),
        "10x": make_coefficient_filter(name="10", gain=10.0),
        "0.1x": make_coefficient_filter(name="0.1", gain=0.1),
    }
    if not as_list:
        return filters
    return list(filters.values())


# Module-level filter registry, keyed by "1x", "10x", "0.1x".
FILTERS = make_filters()

75 

76 

class SyntheticRun(object):
    """
    Container for the information needed to initialize an MTH5 Run object.

    Initially this class worked only with the synthetic ASCII data from legacy
    EMTF; a pre-built dataframe can also be supplied directly.
    """

    def __init__(
        self,
        id: str,
        sample_rate: float,
        channels: List[str],
        raw_data_path: Optional[Union[str, pathlib.Path]] = None,
        noise_scalars: Optional[dict] = None,
        nan_indices: Optional[dict] = None,
        filters: Optional[dict] = None,
        start: Optional[str] = None,
        timeseries_dataframe: Optional[pd.DataFrame] = None,
        data_source: str = "legacy emtf ascii",
    ) -> None:
        """
        Constructor.

        :param id: label for the run
        :type id: str
        :param sample_rate: sample rate of the time series in Hz.
        :type sample_rate: float
        :param channels: the channel names to include in the run.
        :type channels: List[str]
        :param raw_data_path: Path to ascii data source
        :type raw_data_path: Union[str, pathlib.Path, None]
        :param noise_scalars: Keys are channels, values are scale factors for
         noise to add.  Defaults to 0.0 for every channel.
        :type noise_scalars: Union[dict, None]
        :param nan_indices: Keys are channels, values lists. List elements are
         pairs of (index, num_nan_to_add).  Defaults to an empty dict.
        :type nan_indices: Union[dict, None]
        :param filters: Keys are channels, values lists. List elements are
         Filter objects.  Defaults to an empty dict.
        :type filters: Union[dict, None]
        :param start: Setting the run start time.
         e.g. start="1980-01-01T00:00:00+00:00"
        :type start: Union[str, None]
        :param timeseries_dataframe: The time series data for the run.
         When a DataFrame is given, data_source is forced to "dataframe".
        :type timeseries_dataframe: Optional[pd.DataFrame]
        :param data_source: Keyword to tell if data are a legacy EMTF ASCII file
        :type data_source: str

        """
        run_metadata = Run()
        run_metadata.id = id
        run_metadata.sample_rate = sample_rate
        run_metadata.time_period.start = start

        # Normally None for legacy EMTF data.
        self._timeseries_dataframe = timeseries_dataframe

        # Always define raw_data_path so the attribute exists for
        # dataframe-backed runs as well (previously it was only set in the
        # non-dataframe branch).
        self.raw_data_path = raw_data_path
        if isinstance(timeseries_dataframe, pd.DataFrame):
            self.data_source = "dataframe"
        else:
            self.data_source = data_source

        # set channel names
        self.channels = channels

        # Scale factors for adding noise to individual channels.
        if noise_scalars is None:
            noise_scalars = {channel: 0.0 for channel in channels}
        self.noise_scalars = noise_scalars

        # Previously these two attributes were assigned only when the argument
        # was None, so caller-supplied values were dropped and the attribute
        # was left undefined.  Store whatever was passed in.
        self.nan_indices = {} if nan_indices is None else nan_indices
        self.filters = {} if filters is None else filters

        self.run_metadata = run_metadata

    def _get_timeseries_dataframe(
        self,
    ) -> pd.DataFrame:
        """
        Returns time series data in a dataframe with columns named for EM field component.

        If a dataframe was supplied at construction it is returned as-is.
        Otherwise, when self.data_source == "legacy emtf ascii", the file at
        self.raw_data_path is loaded through LegacyEMTFAsciiFile, which is
        handed self.run_metadata.sample_rate as the requested output rate.

        :rtype df: pandas.DataFrame
        :return df: The time series data for the synthetic run
        :raises NotImplementedError: if there is no dataframe and the data
         source is not a legacy EMTF ASCII file.

        """
        if isinstance(self._timeseries_dataframe, pd.DataFrame):
            logger.info("Run Data appear to be already set in dataframe")
            return self._timeseries_dataframe

        if self.data_source == "legacy emtf ascii":
            ascii_file = LegacyEMTFAsciiFile(file_path=self.raw_data_path)
            return ascii_file.load_dataframe(
                channel_names=self.channels,
                sample_rate=self.run_metadata.sample_rate,
            )

        msg = "No dataframe associated with run, nor a legacy EMTF ASCII file"
        msg += ".. add support for your filetype or declare dataframe"
        raise NotImplementedError(msg)

193 

194 

class SyntheticStation(object):
    """
    Holds what is needed to generate an MTH5 file from synthetic data:
    the station metadata, the list of runs, the output file name and the
    channel nomenclature.
    """

    def __init__(
        self,
        station_metadata: Station,
        mth5_name: Optional[Union[str, pathlib.Path]] = None,
        channel_nomenclature_keyword: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
    ) -> None:
        """
        Constructor.

        :param station_metadata: mt_metadata object with station metadata
        :type station_metadata: Station
        :param mth5_name: The name of the h5 file to which the station data
         and metadata will be written.
        :type mth5_name: Optional[Union[str, pathlib.Path]]
        :param channel_nomenclature_keyword: the keyword for the channel
         nomenclature
        :type channel_nomenclature_keyword: SupportedNomenclatureEnum

        """
        # Nomenclature object is created lazily by the property below.
        self._channel_nomenclature = None
        self.channel_nomenclature_keyword = channel_nomenclature_keyword
        self.mth5_name = mth5_name
        self.runs = []
        self.station_metadata = station_metadata

        # Record the nomenclature's channel names on the station metadata.
        self.station_metadata.channels_recorded = self.channel_nomenclature.channels

    @property
    def channel_nomenclature(self):
        """ChannelNomenclature for this station, built on first access and cached."""
        nomenclature = self._channel_nomenclature
        if nomenclature is None:
            nomenclature = ChannelNomenclature(
                keyword=self.channel_nomenclature_keyword
            )
            self._channel_nomenclature = nomenclature
        return nomenclature

237 

238 

def make_station_01(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Prepare the metadata needed to generate an mth5 with synthetic data
    for station "test1" (single 1 Hz run read from test1.asc).

    :param channel_nomenclature: Must be one of the nomenclatures defined in
     SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum

    :return: Object with all info needed to generate MTH5 file from synthetic data.
    :rtype: SyntheticStation

    """
    station_metadata = Station()
    station_metadata.id = "test1"
    station_metadata.location.latitude = 17.996

    station = SyntheticStation(
        station_metadata=station_metadata,
        # Needed to assign channel types in RunTS
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = f"{station_metadata.id}.h5"
    nomenclature = station.channel_nomenclature

    run_001 = SyntheticRun(
        id="001",
        sample_rate=1.0,
        channels=nomenclature.channels,
        raw_data_path=ASCII_DATA_PATH / "test1.asc",
        start="1980-01-01T00:00:00+00:00",
    )

    unity_name = FILTERS["1x"].name
    times_ten_name = FILTERS["10x"].name
    tenth_name = FILTERS["0.1x"].name

    nan_indices = {}
    filters = {}
    for ch in run_001.channels:
        # [index, num_nan_to_add] pairs for this channel
        gaps = []
        if ch == nomenclature.hx:
            gaps.append([11, 100])
        if ch == nomenclature.hy:
            gaps.append([11, 100])
            gaps.append([20000, 444])
        nan_indices[ch] = gaps

        # electric channels get the unity filter, magnetic channels get the
        # x10 followed by the x0.1 filter
        if ch in nomenclature.ex_ey:
            filters[ch] = [unity_name]
        elif ch in nomenclature.hx_hy_hz:
            filters[ch] = [times_ten_name, tenth_name]

    run_001.nan_indices = nan_indices
    run_001.filters = filters

    station.runs = [run_001]
    return station

301 

302 

def make_station_02(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Same configuration as station 1, but renamed "test2", reading test2.asc,
    and with no NaN indices on any channel.

    :param channel_nomenclature: Must be one of the nomenclatures defined in
     SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :return: Object with all info needed to generate MTH5 file from synthetic data.
    :rtype: SyntheticStation

    """
    station = make_station_01(channel_nomenclature=channel_nomenclature)
    station.station_metadata.id = "test2"
    station.mth5_name = "test2.h5"

    run = station.runs[0]
    run.raw_data_path = ASCII_DATA_PATH / "test2.asc"
    # Replace the NaN indices inherited from station 1 with empty lists.
    run.nan_indices = {channel: [] for channel in run.channels}

    return station

326 

327 

def make_station_03(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Create a synthetic station with multiple runs. Rather than generate fresh
    synthetic data, we just reuse test1.asc for each run.

    Runs 001-004 start on consecutive days from 1980-01-01.  Run 001 uses the
    default (zero) noise scalars; runs 002, 003 and 004 use noise scalars of
    2.0, 5.0 and 10.0 respectively on every channel.

    :param channel_nomenclature: Must be one of the nomenclatures defined in
     SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :rtype: SyntheticStation
    :return: Object with all info needed to generate MTH5 file from synthetic data.

    """
    station_metadata = Station()
    station_metadata.id = "test3"
    station = SyntheticStation(
        station_metadata=station_metadata,
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = "test3.h5"

    nomenclature = station.channel_nomenclature
    channels = nomenclature.channels

    # No NaNs are inserted for this station.
    nan_indices = {ch: [] for ch in channels}

    filters = {}
    for ch in channels:
        if ch in nomenclature.ex_ey:
            filters[ch] = [FILTERS["1x"].name]
        elif ch in nomenclature.hx_hy_hz:
            filters[ch] = [FILTERS["10x"].name, FILTERS["0.1x"].name]

    # (run id, noise scale applied to every channel or None for default, start)
    run_specs = (
        ("001", None, "1980-01-01T00:00:00+00:00"),
        ("002", 2.0, "1980-01-02T00:00:00+00:00"),
        ("003", 5.0, "1980-01-03T00:00:00+00:00"),
        ("004", 10.0, "1980-01-04T00:00:00+00:00"),
    )

    runs = []
    for run_id, noise_level, start in run_specs:
        # Build a fresh dict per run.  Previously one dict was mutated in place
        # and handed to runs 002-004; SyntheticRun keeps a reference, so all
        # three runs ended up with the last value (10.0).
        if noise_level is None:
            noise_scalars = None
        else:
            noise_scalars = {ch: noise_level for ch in channels}

        run = SyntheticRun(
            id=run_id,
            sample_rate=1.0,
            channels=channels,
            raw_data_path=ASCII_DATA_PATH.joinpath("test1.asc"),
            noise_scalars=noise_scalars,
            nan_indices=nan_indices,
            filters=filters,
            start=start,
        )
        # Assign explicitly after construction (as was already done for
        # filters) so both attributes are guaranteed to be present on the run.
        run.nan_indices = nan_indices
        run.filters = filters
        runs.append(run)

    station.runs = runs

    return station

423 

424 

def make_station_04(
    channel_nomenclature: SupportedNomenclatureEnum = SupportedNomenclatureEnum.default,
) -> SyntheticStation:
    """
    Like station 01 (same station id "test1" and source file test1.asc), but
    the single run requests an 8 Hz sample rate, has no start time and no NaN
    indices.  Output file name is "test_04_8Hz.h5".

    :param channel_nomenclature: Must be one of the nomenclatures defined in
     SupportedNomenclatureEnum
    :type channel_nomenclature: SupportedNomenclatureEnum
    :rtype: SyntheticStation
    :return: Object with all info needed to generate MTH5 file from synthetic data.
    """
    station_metadata = Station()
    station_metadata.id = "test1"

    station = SyntheticStation(
        station_metadata=station_metadata,
        channel_nomenclature_keyword=channel_nomenclature,
    )
    station.mth5_name = "test_04_8Hz.h5"
    nomenclature = station.channel_nomenclature

    run_001 = SyntheticRun(
        id="001",
        sample_rate=8.0,
        channels=nomenclature.channels,
        raw_data_path=ASCII_DATA_PATH / "test1.asc",
        start=None,
    )
    run_001.nan_indices = {}

    # electric channels: unity filter; magnetic channels: x10 then x0.1
    channel_filters = {}
    for ch in run_001.channels:
        if ch in nomenclature.ex_ey:
            channel_filters[ch] = [FILTERS["1x"].name]
        elif ch in nomenclature.hx_hy_hz:
            channel_filters[ch] = [FILTERS["10x"].name, FILTERS["0.1x"].name]
    run_001.filters = channel_filters

    station.runs = [run_001]
    station_metadata.run_list = [run_001.run_metadata.id]
    station.station_metadata = station_metadata
    return station

473 

474 

class LegacyEMTFAsciiFile:
    """
    This class can be used to interact with the legacy synthetic data files
    that were originally in EMTF.

    Development Notes:
     As of 2025-02-03 the only LegacyEMTFAsciiFile data sources are sampled at 1Hz.
     One-off resampling is handled in this class if the requested sample rate differs.

    """

    IMPLICIT_SAMPLE_RATE = 1.0  # Hz

    def __init__(self, file_path: pathlib.Path):
        """
        :param file_path: Location of the legacy EMTF ASCII file.
        :type file_path: pathlib.Path
        """
        self.file_path = file_path

    def load_dataframe(
        self,
        channel_names: list,
        sample_rate: float,
    ) -> pd.DataFrame:
        """
        Loads an EMTF legacy ASCII time series into a dataframe.

        These files have an awkward whitespace separator, and also need to have the
        electric field channels inverted to fix a phase swap.

        If sample_rate differs from IMPLICIT_SAMPLE_RATE every channel is
        resampled with scipy.signal.resample to
        round(n_samples * sample_rate / IMPLICIT_SAMPLE_RATE) samples.

        :param channel_names: The names of the channels in the legacy EMTF file, in order.
         The last two are treated as the electric channels.
        :type channel_names: list
        :param sample_rate: The sample rate of the output time series in Hz.
        :type sample_rate: float

        :return df: The labelled time series from the legacy EMTF file.
        :rtype df: pd.DataFrame
        :raises ValueError: if sample_rate is not positive, or is so small that
         the resampled series would be empty.

        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

        # read in data
        df = pd.read_csv(self.file_path, names=channel_names, sep=r"\s+")

        # Invert electric channels to fix phase swap due to modeling coordinates.
        # Column positions are used to avoid handling channel nomenclature here.
        for electric_column in (df.columns[-2], df.columns[-1]):
            df[electric_column] = -df[electric_column]

        # Temporary kludge: One-off handling for a test case to resample data.
        # TODO: delete this once the synthetic data module can offer multiple sample rates
        if sample_rate != self.IMPLICIT_SAMPLE_RATE:
            # Scale the length by the rate ratio instead of int(sample_rate):
            # truncating the rate gave zero samples for rates below 1 Hz and
            # ignored the fractional part of e.g. 2.5 Hz.  Integer rates give
            # the same length as before.
            ratio = sample_rate / self.IMPLICIT_SAMPLE_RATE
            n_output = int(round(len(df) * ratio))
            if n_output < 1:
                raise ValueError(
                    f"sample_rate {sample_rate} leaves no samples after resampling"
                )
            df = pd.DataFrame(
                data={
                    ch: ssig.resample(df[ch].to_numpy(), n_output)
                    for ch in df.columns
                }
            )

        return df

531 

532 

def main():
    """Script entry point: build the station-04 configuration; the result is discarded."""
    make_station_04()


if __name__ == "__main__":
    main()