Coverage for C:\Users\peaco\OneDrive\Documents\GitHub\mth5\mth5\io\lemi\lemi_collection.py: 97%

66 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-10 00:01 -0800

1# -*- coding: utf-8 -*- 

2""" 

3LEMI 424 Collection 

4==================== 

5 

6Collection of TXT files combined into runs 

7 

8Created on Wed Aug 31 10:32:44 2022 

9 

10@author: jpeacock 

11""" 

12 

13import pathlib 

14from pathlib import Path 

15from typing import List 

16 

17# ============================================================================= 

18# Imports 

19# ============================================================================= 

20import pandas as pd 

21 

22from mth5.io.collection import Collection 

23from mth5.io.lemi import LEMI424 

24 

25 

26# ============================================================================= 

27 

28 

class LEMICollection(Collection):
    """
    Collection of LEMI 424 files into runs based on start and end times.

    Will assign the run name as 'sr1_{index:0{zeros}}' --> 'sr1_0001' for
    `zeros` = 4.

    Notes
    -----
    This class assumes that the given file path contains a single
    LEMI station.  If you want to do multiple stations merge the returned
    data frames.

    LEMI data comes with little metadata about the station or survey,
    therefore you should assign `station_id` and `survey_id`.

    Parameters
    ----------
    file_path : str or pathlib.Path, optional
        Full path to single station LEMI424 directory, by default None
    file_ext : list of str, optional
        Extension of LEMI424 files, by default ["txt", "TXT"]
    **kwargs
        Additional keyword arguments passed to parent Collection class

    Attributes
    ----------
    station_id : str
        Station identification string, defaults to "mt001"
    survey_id : str
        Survey identification string, defaults to "mt"
    calibration_dict : dict
        Mapping of component name to calibration file path, empty until
        :meth:`to_dataframe` populates it from :meth:`get_calibrations`

    Examples
    --------
    >>> from mth5.io.lemi import LEMICollection
    >>> lc = LEMICollection(r"/path/to/single/lemi/station")
    >>> lc.station_id = "mt001"
    >>> lc.survey_id = "test_survey"
    >>> run_dict = lc.get_runs(1)
    """

    def __init__(
        self,
        file_path: str | pathlib.Path | None = None,
        file_ext: List[str] | None = None,
        **kwargs,
    ) -> None:
        # Build the default here rather than in the signature so no list
        # object is shared between instances.
        if file_ext is None:
            file_ext = ["txt", "TXT"]
        super().__init__(file_path=file_path, file_ext=file_ext, **kwargs)

        self.station_id = "mt001"
        self.survey_id = "mt"
        self.calibration_dict = {}

    def get_calibrations(self, calibration_path: str | Path) -> dict:
        """
        Get calibration dictionary for LEMI424 files.  This assumes that the
        calibrations files are in JSON format and named as
        'LEMI-424-<component>.json'

        The directory is searched recursively for ``*.json`` files.  The
        component key is the text after the last ``-`` in the file stem, cut
        at the first ``.`` if one remains.

        Parameters
        ----------
        calibration_path : str or pathlib.Path
            Path to calibration files

        Returns
        -------
        dict
            Calibration dictionary for LEMI424 files, keyed by component
            with the calibration file path as the value

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> cal_dict = lc.get_calibrations(Path("/path/to/calibrations"))
        """
        calibration_path = Path(calibration_path)

        calibration_dict = {}
        # Sort the matches so that, when two files map to the same component,
        # the one that wins does not depend on file-system listing order.
        for fn in sorted(calibration_path.rglob("*.json")):
            comp = fn.stem.split("-")[-1].split(".", 1)[0]
            calibration_dict[comp] = fn

        return calibration_dict

    def to_dataframe(
        self,
        sample_rates: int | List[int] | None = None,
        run_name_zeros: int = 4,
        calibration_path: str | Path | None = None,
    ) -> pd.DataFrame:
        """
        Create a data frame of each TXT file in a given directory.

        Notes
        -----
        This assumes the given directory contains a single station

        As a side effect `calibration_dict` is refreshed from
        `calibration_path`; a warning is logged when no calibration files
        are found.

        Parameters
        ----------
        sample_rates : int or list of int, optional
            Sample rate to get, will always be 1 for LEMI data, by default [1].
            Accepted for interface compatibility; this method does not
            filter on it.
        run_name_zeros : int, optional
            Number of zeros to assign to the run name, by default 4
        calibration_path : str or pathlib.Path, optional
            Path to calibration files, by default None which searches
            `file_path`

        Returns
        -------
        pd.DataFrame
            DataFrame with information of each TXT file in the given directory.
            An empty DataFrame is returned when no files are found.

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> lemi_df = lc.to_dataframe()
        """
        if sample_rates is None:
            sample_rates = [1]

        # Fall back to the station directory when no calibration directory
        # was given.
        if calibration_path is None:
            calibration_path = Path(self.file_path)
        self.calibration_dict = self.get_calibrations(calibration_path)
        if not self.calibration_dict:
            self.logger.warning(
                f"No calibration files found in {calibration_path}, "
                "proceeding without calibrations."
            )

        entries = []
        for fn in self.get_files(self.file_ext):
            lemi_obj = LEMI424(fn)
            # A missing sample count is recorded as 0 rather than failing.
            n_samples = int(lemi_obj.n_samples or 0)
            lemi_obj.read_metadata()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = self.station_id
            # Missing start/end times are stored as empty strings.
            entry["start"] = lemi_obj.start.isoformat() if lemi_obj.start else ""
            entry["end"] = lemi_obj.end.isoformat() if lemi_obj.end else ""
            entry["component"] = ",".join(lemi_obj.run_metadata.channels_recorded_all)
            entry["fn"] = fn
            entry["sample_rate"] = lemi_obj.sample_rate
            entry["file_size"] = lemi_obj.file_size
            entry["n_samples"] = n_samples

            entries.append(entry)

        # make pandas dataframe and set data types
        if not entries:
            self.logger.warning("No entries found for LEMI collection")
            return pd.DataFrame()

        df = pd.DataFrame(entries)
        df.loc[:, "channel_id"] = 1
        df.loc[:, "sequence_number"] = 0
        df.loc[:, "instrument_id"] = "LEMI424"

        df = self._sort_df(self._set_df_dtypes(df), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 4) -> pd.DataFrame:
        """
        Assign run names based on start and end times.

        Rows are visited in their existing order.  A row stays in the current
        run when its start is exactly one sample period after the previous
        row's end; any other gap starts a new run.  Run names are assigned
        as sr1_{run_number:0{zeros}}.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with the appropriate columns (`start`, `end` and
            `sample_rate` are read, `run` is written).  It is modified in
            place and also returned.
        zeros : int, optional
            Number of zeros in run name, by default 4

        Returns
        -------
        pd.DataFrame
            DataFrame with run names assigned
        """
        count = 1
        previous_end = None
        # Detect the first row by position, not by index label, so a frame
        # whose index does not start at 0 is handled instead of failing on
        # an unbound `previous_end`.
        for position, row in enumerate(df.itertuples()):
            if position > 0:
                gap_seconds = (row.start - previous_end).total_seconds()
                # Contiguous files are one sample period (1 / sample_rate)
                # apart, i.e. gap * sample_rate == 1.  Anything else,
                # including an undefined gap, opens a new run.
                if gap_seconds * row.sample_rate != 1:
                    count += 1
            df.loc[row.Index, "run"] = f"sr1_{count:0{zeros}}"
            previous_end = row.end

        return df