Coverage for C:\Users\peaco\OneDrive\Documents\GitHub\mth5\mth5\io\nims\nims_collection.py: 89%

55 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-27 20:09 -0800

1# -*- coding: utf-8 -*- 

2""" 

3NIMS Collection 

4=============== 

5 

6Collection of NIMS binary files combined into runs for magnetotelluric data processing. 

7 

8Created on Wed Aug 31 10:32:44 2022 

9 

10@author: jpeacock 

11""" 

12 

13# ============================================================================= 

14# Imports 

15# ============================================================================= 

16from __future__ import annotations 

17 

18from pathlib import Path 

19from typing import Any 

20 

21import pandas as pd 

22 

23from mth5.io.collection import Collection 

24from mth5.io.nims import NIMS 

25 

26 

27# ============================================================================= 

28 

29 

class NIMSCollection(Collection):
    """
    Collection of NIMS binary files into runs.

    Gathers the NIMS binary files found under a directory, reads each file's
    header, and tabulates the header metadata in a pandas DataFrame.

    Parameters
    ----------
    file_path : str | Path | None, optional
        Path to the directory containing NIMS binary files.
    **kwargs : dict
        Additional keyword arguments passed to the parent Collection class.

    Attributes
    ----------
    file_ext : str
        File extension for NIMS binary files ('bin').
    survey_id : str
        Survey identifier, defaults to 'mt'.

    Examples
    --------
    >>> from mth5.io.nims import NIMSCollection
    >>> nc = NIMSCollection(r"/path/to/nims/station")
    >>> nc.survey_id = "mt001"
    >>> df = nc.to_dataframe()

    See Also
    --------
    mth5.io.collection.Collection : Base collection class
    mth5.io.nims.NIMS : NIMS file reader
    """

    # Comma-separated component list written to the ``component`` column
    # wherever a row does not already carry one.
    _COMPONENTS: str = ",".join(["hx", "hy", "hz", "ex", "ey", "temperature"])

    def __init__(self, file_path: str | Path | None = None, **kwargs: Any) -> None:
        """
        Initialize NIMSCollection instance.

        Parameters
        ----------
        file_path : str | Path | None, optional
            Path to the directory containing NIMS binary files.
        **kwargs : dict
            Additional keyword arguments passed to the parent Collection class.
        """
        super().__init__(file_path=file_path, **kwargs)
        self.file_ext: str = "bin"
        self.survey_id: str = "mt"

    def to_dataframe(
        self,
        sample_rates: int | list[int] = [1],
        run_name_zeros: int = 2,
        calibration_path: str | Path | None = None,
    ) -> pd.DataFrame:
        """
        Create a DataFrame with one row per NIMS binary file in the collection.

        Every file returned by ``get_files`` for the configured extension
        (as given, lower-cased and upper-cased) is opened with ``NIMS``, its
        header is read, and the header values are copied into a row. Default
        values are then filled in for the bookkeeping columns and the frame
        is passed through ``_set_df_dtypes`` and ``_sort_df``.

        Parameters
        ----------
        sample_rates : int | list[int], default [1]
            Present for interface consistency only; it is neither read nor
            mutated here, so every file is processed regardless of its
            sample rate.
        run_name_zeros : int, default 2
            Forwarded to ``_sort_df``.
        calibration_path : str | Path | None, optional
            Present for interface consistency only; not used.

        Returns
        -------
        pd.DataFrame
            One row per NIMS file. Columns set by this method:

            - survey : ``self.survey_id``
            - station : station name from the NIMS header
            - run : run identifier from the NIMS header
            - start, end : header start/end times, written as ISO strings
              before dtype enforcement
            - fn : file path
            - sample_rate : sampling rate
            - file_size : file size in bytes
            - n_samples : number of samples
            - dipole : electric dipole lengths ``[ex_length, ey_length]``
            - channel_id : existing values kept, missing values filled with 1
            - sequence_number : existing values kept, missing values filled
              with 0 (before ``_sort_df`` runs)
            - component : missing values filled with
              ``"hx,hy,hz,ex,ey,temperature"``
            - instrument_id : missing values filled with ``"NIMS"``

            Any further keys supplied by ``get_empty_entry_dict`` are carried
            through unchanged.

        Notes
        -----
        This method assumes the directory contains files from a single
        station. When no files are found an empty frame with the columns
        listed above is built so the later steps still have columns to
        work on.

        Examples
        --------
        >>> from mth5.io.nims import NIMSCollection
        >>> nc = NIMSCollection("/path/to/nims/station")
        >>> df = nc.to_dataframe(run_name_zeros=3)
        >>> print(df[['station', 'run', 'start', 'sample_rate']])
        """
        entries = []
        for fn in self.get_files(
            [self.file_ext, self.file_ext.lower(), self.file_ext.upper()]
        ):
            nims_obj = NIMS(fn)
            nims_obj.read_header()
            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = nims_obj.station
            entry["run"] = nims_obj.run_id
            entry["start"] = nims_obj.start_time.isoformat()
            entry["end"] = nims_obj.end_time.isoformat()
            entry["fn"] = fn
            entry["sample_rate"] = nims_obj.sample_rate
            entry["file_size"] = nims_obj.file_size
            entry["n_samples"] = nims_obj.n_samples
            entry["dipole"] = [nims_obj.ex_length, nims_obj.ey_length]

            entries.append(entry)

        # make pandas dataframe and set data types
        df = pd.DataFrame(entries)

        # If there are no entries, create an empty DataFrame with the
        # expected columns so subsequent scalar assignments and dtype
        # enforcement work without raising (pandas raises when assigning
        # scalars into an empty frame with no defined index).
        if df.empty:
            expected_cols = [
                "survey",
                "station",
                "run",
                "start",
                "end",
                "fn",
                "sample_rate",
                "file_size",
                "n_samples",
                "dipole",
                "channel_id",
                "sequence_number",
                "component",
                "instrument_id",
            ]
            df = pd.DataFrame(columns=expected_cols)

        # Integer bookkeeping columns: create them with the default, or fill
        # gaps in an existing column. Explicitly coerce to numeric before
        # filling to avoid future downcast warnings.
        for column, default in (("channel_id", 1), ("sequence_number", 0)):
            if column not in df.columns:
                df[column] = default
            else:
                df.loc[:, column] = (
                    pd.to_numeric(df.loc[:, column], errors="coerce")
                    .fillna(default)
                    .astype("int64")
                )

        # Text bookkeeping columns: same create-or-fill treatment, without
        # any numeric coercion.
        for column, default in (
            ("component", self._COMPONENTS),
            ("instrument_id", "NIMS"),
        ):
            if column not in df.columns:
                df[column] = default
            else:
                df.loc[:, column] = df.loc[:, column].fillna(default)

        df = self._sort_df(self._set_df_dtypes(df), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 2) -> pd.DataFrame:
        """
        Assign run names and sequence numbers to DataFrame rows by station.

        Rows are visited station by station in order of ``start``. A row
        whose ``run`` is missing receives the name
        ``f"sr{sample_rate}_{count:0{zeros}}"``, where ``count`` is the
        row's 1-based position within its station. Every visited row has
        its ``sequence_number`` set to that position.

        Parameters
        ----------
        df : pd.DataFrame
            Frame with at least the columns 'station', 'start', 'run' and
            'sample_rate'. It is modified in place.
        zeros : int, default 2
            Minimum width of the zero-padded run number (``zeros=3`` gives
            '001', '002', ...).

        Returns
        -------
        pd.DataFrame
            The same frame object that was passed in, with 'run' and
            'sequence_number' updated.

        Notes
        -----
        - Existing run names are preserved.
        - A run name counts as missing when it is ``None``, ``NaN`` or
          ``pd.NA``, so the check is independent of the dtype pandas chose
          for the column.
        - The counter restarts at 1 for each station and advances for every
          row, named or not, so generated names reflect the row's
          chronological position within the station.

        Examples
        --------
        >>> from mth5.io.nims import NIMSCollection
        >>> # Assuming df has columns: station, start, run, sample_rate
        >>> nc = NIMSCollection()
        >>> df_updated = nc.assign_run_names(df, zeros=3)
        >>> print(df_updated['run'].tolist())
        """

        for station in df.station.unique():
            count = 1
            for row in df[df.station == station].sort_values("start").itertuples():
                # pd.isna (rather than ``is None``) also catches NaN / pd.NA,
                # which is how a missing run name appears once the column is
                # no longer plain object dtype.
                if pd.isna(row.run):
                    df.loc[row.Index, "run"] = f"sr{row.sample_rate}_{count:0{zeros}}"
                # NOTE(review): reconstructed as applying to every row, not
                # only to rows that were just named - confirm against the
                # original file's indentation.
                df.loc[row.Index, "sequence_number"] = count
                count += 1

        return df