Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ io \ nims \ nims_collection.py: 89%
55 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-27 20:09 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-27 20:09 -0800
# -*- coding: utf-8 -*-
"""
NIMS Collection
===============

Collection of NIMS binary files combined into runs for magnetotelluric data processing.

Created on Wed Aug 31 10:32:44 2022

@author: jpeacock
"""
13# =============================================================================
14# Imports
15# =============================================================================
16from __future__ import annotations
18from pathlib import Path
19from typing import Any
21import pandas as pd
23from mth5.io.collection import Collection
24from mth5.io.nims import NIMS
27# =============================================================================
class NIMSCollection(Collection):
    """
    Collection of NIMS binary files into runs.

    This class provides functionality for organizing and processing multiple NIMS
    binary files into a structured format for magnetotelluric data analysis.

    Parameters
    ----------
    file_path : str | Path | None, optional
        Path to the directory containing NIMS binary files.
    **kwargs : dict
        Additional keyword arguments passed to the parent Collection class.

    Attributes
    ----------
    file_ext : str
        File extension for NIMS binary files ('bin').
    survey_id : str
        Survey identifier, defaults to 'mt'.

    Examples
    --------
    >>> from mth5.io.nims import NIMSCollection
    >>> nc = NIMSCollection(r"/path/to/nims/station")
    >>> nc.survey_id = "mt001"
    >>> df = nc.to_dataframe()

    See Also
    --------
    mth5.io.collection.Collection : Base collection class
    mth5.io.nims.NIMS : NIMS file reader
    """

    # Comma-separated channel list written to the ``component`` column
    # wherever an entry does not already carry a value.
    _DEFAULT_COMPONENT: str = "hx,hy,hz,ex,ey,temperature"

    # Columns of the placeholder frame built when no files are found.
    _EXPECTED_COLUMNS: tuple[str, ...] = (
        "survey",
        "station",
        "run",
        "start",
        "end",
        "fn",
        "sample_rate",
        "file_size",
        "n_samples",
        "dipole",
        "channel_id",
        "sequence_number",
        "component",
        "instrument_id",
    )

    def __init__(self, file_path: str | Path | None = None, **kwargs: Any) -> None:
        """
        Initialize NIMSCollection instance.

        Parameters
        ----------
        file_path : str | Path | None, optional
            Path to the directory containing NIMS binary files.
        **kwargs : dict
            Additional keyword arguments passed to the parent Collection class.
        """
        super().__init__(file_path=file_path, **kwargs)
        self.file_ext: str = "bin"
        self.survey_id: str = "mt"

    @staticmethod
    def _fill_missing(
        df: pd.DataFrame, column: str, default: Any, as_int: bool = False
    ) -> None:
        """
        Ensure ``column`` exists in ``df`` and has no missing values.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to update in place.
        column : str
            Name of the column to create or fill.
        default : Any
            Value used for a newly created column and for missing cells.
        as_int : bool, default False
            If True, coerce the existing column to numeric (unparsable
            values become missing), fill, and cast to ``int64``.
        """
        if column not in df.columns:
            df[column] = default
            return

        values = df.loc[:, column]
        if as_int:
            # Coerce to numeric before filling to avoid pandas' downcast
            # warning when filling an object column.
            values = (
                pd.to_numeric(values, errors="coerce").fillna(default).astype("int64")
            )
        else:
            values = values.fillna(default)
        df.loc[:, column] = values

    def to_dataframe(
        self,
        sample_rates: int | list[int] = [1],
        run_name_zeros: int = 2,
        calibration_path: str | Path | None = None,
    ) -> pd.DataFrame:
        """
        Create a DataFrame of each NIMS binary file in the collection directory.

        Every file returned by ``get_files`` is opened with ``NIMS``, its
        header is read, and one row of metadata is recorded. The rows are
        then given default values for the constant columns, passed through
        ``_set_df_dtypes`` and finally ``_sort_df``.

        Parameters
        ----------
        sample_rates : int | list[int], default [1]
            Present for interface consistency only; it is never read (nor
            mutated) here, so all files are processed regardless of their
            sample rate.
        run_name_zeros : int, default 2
            Forwarded to ``_sort_df``; presumably the zero-padded width of
            the run number (see ``assign_run_names``) -- TODO confirm
            against the parent class.
        calibration_path : str | Path | None, optional
            Present for interface consistency only; not used here.

        Returns
        -------
        pd.DataFrame
            DataFrame containing metadata for each NIMS file with columns:

            - survey : Survey identifier (``self.survey_id``)
            - station : Station name from NIMS file
            - run : Run identifier from NIMS file
            - start : Start time in ISO format
            - end : End time in ISO format
            - fn : File path
            - sample_rate : Sampling rate
            - file_size : File size
            - n_samples : Number of samples
            - dipole : Electric dipole lengths [Ex, Ey]
            - channel_id : Channel identifier (1 where not otherwise set)
            - sequence_number : Sequence number (0 where not otherwise set)
            - component : Comma-separated component list
            - instrument_id : Instrument identifier ('NIMS' where not set)

            Any further columns supplied by ``get_empty_entry_dict`` are
            kept as returned by that helper.

        Notes
        -----
        Each NIMS file is read to extract header information including timing,
        station identification, and measurement parameters.

        Examples
        --------
        >>> from mth5.io.nims import NIMSCollection
        >>> nc = NIMSCollection("/path/to/nims/station")
        >>> df = nc.to_dataframe(run_name_zeros=3)
        >>> print(df[['station', 'run', 'start', 'sample_rate']])
        """
        # ``file_ext`` is normally already lower case, so the three variants
        # contain a repeat; drop duplicates (order preserved) so the same
        # pattern is never searched twice.
        extensions = list(
            dict.fromkeys(
                [self.file_ext, self.file_ext.lower(), self.file_ext.upper()]
            )
        )

        entries = []
        for fn in self.get_files(extensions):
            nims_obj = NIMS(fn)
            nims_obj.read_header()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = nims_obj.station
            entry["run"] = nims_obj.run_id
            entry["start"] = nims_obj.start_time.isoformat()
            entry["end"] = nims_obj.end_time.isoformat()
            entry["fn"] = fn
            entry["sample_rate"] = nims_obj.sample_rate
            entry["file_size"] = nims_obj.file_size
            entry["n_samples"] = nims_obj.n_samples
            entry["dipole"] = [nims_obj.ex_length, nims_obj.ey_length]
            entries.append(entry)

        df = pd.DataFrame(entries)

        # With no entries the frame has no columns at all; rebuild it with
        # the expected columns so the default filling and dtype enforcement
        # below work without raising.
        if df.empty:
            df = pd.DataFrame(columns=list(self._EXPECTED_COLUMNS))

        # Constant columns: create them if absent, otherwise fill the gaps.
        self._fill_missing(df, "channel_id", 1, as_int=True)
        self._fill_missing(df, "sequence_number", 0, as_int=True)
        self._fill_missing(df, "component", self._DEFAULT_COMPONENT)
        self._fill_missing(df, "instrument_id", "NIMS")

        return self._sort_df(self._set_df_dtypes(df), run_name_zeros)

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 2) -> pd.DataFrame:
        """
        Assign run names and sequence numbers to DataFrame entries by station.

        Within each station the rows are visited in order of ``start`` and
        numbered 1, 2, 3, ...  Every row receives that number as its
        ``sequence_number``; rows whose ``run`` is missing additionally get
        the run name ``sr{sample_rate}_{number}`` with the number
        zero-padded to ``zeros`` digits.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing NIMS file metadata with required columns:
            'station', 'start', 'run', 'sample_rate'. The DataFrame is
            modified in place.
        zeros : int, default 2
            Minimum width of the zero-padded run number in generated run
            names (e.g., zeros=2 gives '01', '02', etc.).

        Returns
        -------
        pd.DataFrame
            The same DataFrame object with updated 'run' and
            'sequence_number' columns.

        Notes
        -----
        - Existing run names are preserved.
        - A run counts as missing when it is ``None`` or any other pandas
          missing value (``NaN``, ``pd.NA``, ``NaT``).
        - The counter advances for every row, named or not, so a generated
          run name always matches the row's sequence number.
        - ``sample_rate`` is formatted as-is, so a float rate of ``8.0``
          yields ``'sr8.0_01'``.

        Examples
        --------
        >>> import pandas as pd
        >>> from mth5.io.nims import NIMSCollection
        >>> df = pd.DataFrame(
        ...     {
        ...         "station": ["a", "a"],
        ...         "start": [2, 1],
        ...         "run": [None, None],
        ...         "sample_rate": [8, 8],
        ...         "sequence_number": [0, 0],
        ...     }
        ... )
        >>> nc = NIMSCollection()
        >>> nc.assign_run_names(df, zeros=3)["run"].tolist()
        ['sr8_002', 'sr8_001']
        """
        for station in df.station.unique():
            station_rows = df[df.station == station].sort_values("start")
            for count, row in enumerate(station_rows.itertuples(), start=1):
                # pd.isna also catches NaN / pd.NA, which an ``is None``
                # test would mistake for an existing run name.
                if pd.isna(row.run):
                    df.loc[row.Index, "run"] = f"sr{row.sample_rate}_{count:0{zeros}}"
                df.loc[row.Index, "sequence_number"] = count

        return df