Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ processing \ run_summary.py: 94%

87 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-10 00:01 -0800

1""" 

2 

3This module contains the RunSummary class. 

4 

5This is a helper class that summarizes the Runs in an mth5. 

6 

7TODO: This class and methods could be replaced by methods in MTH5. 

8 

9Functionality of RunSummary() 

101. User can get a list of local_station options, which correspond to unique pairs 

11of values: (survey, station) 

12 

132. User can see all possible ways of processing the data: 

14- one list per (survey, station) pair in the run_summary 

15 

16Some of the following functionalities may end up in KernelDataset: 

173. User can select local_station 

18-this can trigger a reduction of runs to only those that are from the local station

19and simultaneous runs at other stations 

204. Given a local station, a list of possible reference stations can be generated 

215. Given a remote reference station, a list of all relevant runs, truncated to

22maximize coverage of the local station runs is generated 

236. Given such a "restricted run list", runs can be dropped 

247. Time interval endpoints can be changed 

25 

26 

27Development Notes: 

28 TODO: consider adding methods: 

29 - drop_runs_shorter_than: removes short runs from summary

30 - fill_gaps_by_time_interval: allows runs to be merged if gaps between

31 are short

32 - fill_gaps_by_run_names: allows runs to be merged if gaps between are

33 short 

34 TODO: Consider whether this should return a copy or modify in-place when 

35 querying the df. 

36 

37""" 

38 

39# ============================================================================= 

40# Imports 

41# ============================================================================= 

42import copy 

43from typing import Optional, Union 

44 

45import pandas as pd 

46from loguru import logger 

47 

48import mth5 

49from mth5.processing import MINI_SUMMARY_COLUMNS, RUN_SUMMARY_COLUMNS 

50from mth5.utils.helpers import initialize_mth5 

51 

52 

53# ============================================================================= 

54 

55 

class RunSummary:
    """Container for a run-summary table built from one or more mth5s.

    The table is held as a ``pandas.DataFrame`` in :attr:`df`. Assigning to
    :attr:`df` validates that every column named in ``RUN_SUMMARY_COLUMNS``
    is present.

    WIP: For the full MMT case this may need modification to a channel based
    summary.
    """

    def __init__(
        self,
        input_dict: Optional[dict] = None,
        df: Optional[pd.DataFrame] = None,
    ):
        """Create a RunSummary, optionally seeded with a dataframe.

        Parameters
        ----------
        input_dict : dict, optional
            Stored as-is on ``self._input_dict``; not otherwise used by this
            class. By default, None.
        df : pd.DataFrame, optional
            Run-summary table. Validated by the :attr:`df` setter.
            By default, None.

        Raises
        ------
        TypeError
            If ``df`` is neither None nor a ``pandas.DataFrame``.
        ValueError
            If ``df`` lacks any column listed in ``RUN_SUMMARY_COLUMNS``.
        """
        self.column_dtypes = [str, str, pd.Timestamp, pd.Timestamp]
        self._input_dict = input_dict
        self.df = df  # goes through the validating setter
        self._mini_summary_columns = MINI_SUMMARY_COLUMNS

    def __str__(self) -> str:
        """Return the mini summary as text, or a placeholder if no df is set."""
        # A freshly constructed RunSummary() has df=None; printing it must not
        # raise (previously this failed with a TypeError on None[...]).
        if self.df is None:
            return f"{type(self).__name__}(df=None)"
        return str(self.mini_summary)

    def __repr__(self) -> str:
        """Same text as :meth:`__str__`."""
        return self.__str__()

    @property
    def df(self) -> Optional[pd.DataFrame]:
        """The run-summary dataframe, or None if it has not been set."""
        return self._df

    @df.setter
    def df(self, value: Optional[pd.DataFrame]) -> None:
        """Set the dataframe after checking its type and required columns.

        Parameters
        ----------
        value : pd.DataFrame or None
            New run-summary table. ``None`` clears the stored dataframe.

        Raises
        ------
        TypeError
            If ``value`` is not None and not a ``pandas.DataFrame``.
        ValueError
            If ``value`` is missing any column in ``RUN_SUMMARY_COLUMNS``.
        """
        if value is None:
            self._df = None
            return

        if not isinstance(value, pd.DataFrame):
            msg = f"Need to set df with a Pandas.DataFrame not type({type(value)})"
            logger.error(msg)
            raise TypeError(msg)

        need_columns = [col for col in RUN_SUMMARY_COLUMNS if col not in value.columns]
        if need_columns:
            msg = f"DataFrame needs columns {', '.join(need_columns)}"
            logger.error(msg)
            raise ValueError(msg)
        self._df = value

    def clone(self) -> "RunSummary":
        """Return a deep copy of this object.

        2022-10-20: Cloning may be causing issues with extra instances of
        open h5 files ...
        """
        return copy.deepcopy(self)

    def from_mth5s(self, mth5_list) -> None:
        """Populate :attr:`df` from a list of mth5s.

        Delegates to ``extract_run_summaries_from_mth5s`` and stores the
        resulting dataframe on this object; nothing is returned.

        Parameters
        ----------
        mth5_list : list
            Passed unchanged to ``extract_run_summaries_from_mth5s``.
        """
        self.df = extract_run_summaries_from_mth5s(mth5_list)

    def _warn_no_data_runs(self) -> None:
        """Log a warning for every row whose ``has_data`` value is False."""
        no_data_rows = self.df[self.df.has_data.eq(False)]
        if no_data_rows.empty:
            return

        for row in no_data_rows.itertuples():
            logger.warning(
                f"Found no data run in row {row.Index}: "
                f"survey: {row.survey}, station: {row.station}, run: {row.run}"
            )
        logger.info("To drop no data runs use `drop_no_data_rows`")

    @property
    def mini_summary(self) -> pd.DataFrame:
        """The dataframe restricted to a few columns for readability."""
        return self.df[self._mini_summary_columns]

    @property
    def print_mini_summary(self) -> None:
        """Send the mini summary through the logger so it is formatted.

        This is a property with a side effect only: accessing it logs the
        mini summary at INFO level and evaluates to None.
        """
        logger.info(self.mini_summary)

    def drop_no_data_rows(self) -> None:
        """Drop rows marked `has_data` = False and reset the index of self.df."""
        # Build the filtered, re-indexed frame first and assign it once, rather
        # than resetting the index in place on a filtered slice.
        self.df = self.df[self.df.has_data].reset_index(drop=True)

    def set_sample_rate(
        self, sample_rate: float, inplace: bool = False
    ) -> Optional["RunSummary"]:
        """Restrict the run summary to the runs of a single sample rate.

        Parameters
        ----------
        sample_rate : float
            Sample rate to keep. Must already occur in the ``sample_rate``
            column of :attr:`df`.
        inplace : bool, optional
            If True, filter this object's dataframe and return None.
            If False, leave this object untouched and return a filtered
            clone. By default, False.

        Returns
        -------
        RunSummary or None
            The filtered clone when ``inplace`` is False, otherwise None.
            The dataframe index is not reset by the filtering.

        Raises
        ------
        ValueError
            If ``sample_rate`` is not present in the dataframe.
        """
        if sample_rate not in self.df.sample_rate.values:
            msg = (
                f"Sample rate {sample_rate} is not in RunSummary. Unique "
                f"values are {self.df.sample_rate.unique()}"
            )
            logger.error(msg)
            raise ValueError(msg)

        if inplace:
            self.df = self.df[self.df.sample_rate == sample_rate]
            return None

        new_rs = self.clone()
        new_rs.df = new_rs.df[new_rs.df.sample_rate == sample_rate]
        return new_rs

197 

198 

199### this can be deprecated now

200# def extract_run_summary_from_mth5(mth5_obj, summary_type: Optional[str] = "run"): 

201# """Given a single mth5 object, get the channel_summary and compress it to a 

202# run_summary. 

203 

204# Development Notes: 

205# TODO: Move this into MTH5 or replace with MTH5 built-in run_summary method. 

206 

207# Parameters 

208# ---------- 

209# mth5_obj : mth5.mth5.MTH5 

210# The initialized mth5 object that will be interrogated. 

211# summary_type : Optional[str], optional 

212# One of ["run", "channel"]. Returns a run summary or a channel summary. By default, "run". 

213 

214# Returns 

215# ------- 

216# out_df : pd.Dataframe 

217# Table summarizing the available runs in the input mth5_obj. 

218# """ 

219 

220# if summary_type == "run": 

221# out_df = mth5_obj.run_summary 

222# else: 

223# out_df = mth5_obj.channel_summary.to_dataframe() 

224# out_df["mth5_path"] = str(mth5_obj.filename) 

225# return out_df 

226 

227 

def extract_run_summaries_from_mth5s(mth5_list, summary_type="run", deduplicate=True):
    """Collect the run summaries of several mth5s into one dataframe.

    Each element of ``mth5_list`` is either an already-open
    ``mth5.mth5.MTH5`` object, which is used directly and left open, or
    anything else (e.g. a ``pathlib.Path`` or ``str``), which is opened with
    ``initialize_mth5(..., mode="a")`` and closed again once its
    ``run_summary`` has been copied, even if reading the summary raises.

    Development Notes:
    ToDo: Move this method into mth5? or mth5_helpers?
    ToDo: Make this a class so that the __repr__ is a nice visual
    representation of the df, like what channel summary does in mth5
    - 2022-05-28 Modified to allow this method to accept mth5 objects as well
    as the already supported types of pathlib.Path or str

    In order to drop duplicates I used the solution here:
    https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working-on-dataframe-containing-lists

    Parameters
    ----------
    mth5_list : iterable
        MTH5 objects and/or paths or strings that point to mth5s.
    summary_type : string, optional
        One of ["channel", "run"]. NOTE: this argument is currently not read
        by the function body; a run summary is always extracted.
        By default, "run".
    deduplicate : bool, optional
        If True, drop rows that are identical once every value has been cast
        to ``str`` (the cast allows comparison of unhashable cells such as
        lists). By default, True.

    Returns
    -------
    super_summary : pd.DataFrame
        Given a list of mth5s, a dataframe of all available runs, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``mth5_list`` yields no elements.
    """
    dfs = []
    for mth5_elt in mth5_list:
        if isinstance(mth5_elt, mth5.mth5.MTH5):
            # The caller owns this object: read from it but do not close it.
            dfs.append(mth5_elt.run_summary.copy())
            continue

        # mth5_elt is a path or a string: open it here and always close it
        # again, even when building the summary fails.
        mth5_obj = initialize_mth5(mth5_elt, mode="a")
        try:
            dfs.append(mth5_obj.run_summary.copy())
        finally:
            mth5_obj.close_mth5()

    if not dfs:
        # pd.concat would raise the same exception type with a less helpful
        # "No objects to concatenate" message.
        raise ValueError("mth5_list is empty; no run summaries to extract.")

    # merge all summaries into a super_summary
    super_summary = pd.concat(dfs, ignore_index=True)

    # drop rows that correspond to TFs:
    super_summary = super_summary[super_summary.sample_rate != 0]

    if deduplicate:
        # Compare rows on their string form so cells holding lists work.
        is_repeat = super_summary.astype(str).duplicated()
        super_summary = super_summary[~is_repeat]

    return super_summary.reset_index(drop=True)