Coverage for C: \ Users \ peaco \ OneDrive \ Documents \ GitHub \ mth5 \ mth5 \ processing \ run_summary.py: 94%
87 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
1"""
3This module contains the RunSummary class.
5This is a helper class that summarizes the Runs in an mth5.
7TODO: This class and methods could be replaced by methods in MTH5.
9Functionality of RunSummary()
101. User can get a list of local_station options, which correspond to unique pairs
11of values: (survey, station)
132. User can see all possible ways of processing the data:
14- one list per (survey, station) pair in the run_summary
16Some of the following functionalities may end up in KernelDataset:
173. User can select local_station
18-this can trigger a reduction of runs to only those that are from the local station
19and simultaneous runs at other stations
204. Given a local station, a list of possible reference stations can be generated
215. Given a remote reference station, a list of all relevant runs, truncated to
22maximize coverage of the local station runs is generated
236. Given such a "restricted run list", runs can be dropped
247. Time interval endpoints can be changed
27Development Notes:
28 TODO: consider adding methods:
29 - "drop_runs_shorter_than": removes short runs from summary
30 - "fill_gaps_by_time_interval": allows runs to be merged if gaps between
31 are short
32 - "fill_gaps_by_run_names": allows runs to be merged if gaps between are
33 short
34 TODO: Consider whether this should return a copy or modify in-place when
35 querying the df.
37"""
39# =============================================================================
40# Imports
41# =============================================================================
42import copy
43from typing import Optional, Union
45import pandas as pd
46from loguru import logger
48import mth5
49from mth5.processing import MINI_SUMMARY_COLUMNS, RUN_SUMMARY_COLUMNS
50from mth5.utils.helpers import initialize_mth5
53# =============================================================================
class RunSummary:
    """Container for a run-summary table built from one or more mth5 files.

    The table is held as a pandas DataFrame in ``df``. Assigning to ``df``
    checks that every column named in ``RUN_SUMMARY_COLUMNS`` is present.

    WIP: For the full MMT case this may need modification to a channel based
    summary.
    """

    def __init__(
        self,
        input_dict: Optional[dict] = None,
        df: Optional[pd.DataFrame] = None,
    ):
        """Store the inputs; ``df`` is validated by its property setter.

        Parameters
        ----------
        input_dict : dict, optional
            Kept on the instance as ``_input_dict``; nothing else in this
            class reads it. By default, None.
        df : pd.DataFrame, optional
            Run-summary table. By default, None.

        Raises
        ------
        TypeError
            If ``df`` is neither None nor a DataFrame.
        ValueError
            If ``df`` lacks any column listed in ``RUN_SUMMARY_COLUMNS``.
        """
        self.column_dtypes = [str, str, pd.Timestamp, pd.Timestamp]
        self._input_dict = input_dict
        self.df = df
        self._mini_summary_columns = MINI_SUMMARY_COLUMNS

    def __str__(self) -> str:
        """Return the mini summary as text, or a placeholder when no table is set."""
        # A freshly constructed RunSummary() has df=None; indexing None in
        # mini_summary would raise TypeError, so report the empty state instead.
        if self.df is None:
            return f"{type(self).__name__}(df=None)"
        return str(self.mini_summary)

    def __repr__(self) -> str:
        """Return the same text as ``__str__``."""
        return self.__str__()

    @property
    def df(self) -> Optional[pd.DataFrame]:
        """The run-summary table, or None if it has not been set."""
        return self._df

    @df.setter
    def df(self, value: Optional[pd.DataFrame]) -> None:
        """Set the table after checking its type and required columns.

        Parameters
        ----------
        value : pd.DataFrame or None
            New table. None clears the stored table without any checks.

        Raises
        ------
        TypeError
            If ``value`` is not None and not a ``pd.DataFrame``.
        ValueError
            If any column in ``RUN_SUMMARY_COLUMNS`` is absent from ``value``.
        """
        if value is None:
            self._df = None
            return

        if not isinstance(value, pd.DataFrame):
            msg = f"Need to set df with a Pandas.DataFrame not type({type(value)})"
            logger.error(msg)
            raise TypeError(msg)

        need_columns = [col for col in RUN_SUMMARY_COLUMNS if col not in value.columns]
        if need_columns:
            msg = f"DataFrame needs columns {', '.join(need_columns)}"
            logger.error(msg)
            raise ValueError(msg)
        self._df = value

    def clone(self):
        """Return a deep copy of this object.

        2022-10-20:
        Cloning may be causing issues with extra instances of open h5 files ...
        """
        return copy.deepcopy(self)

    def from_mth5s(self, mth5_list) -> None:
        """Build one combined run-summary table from ``mth5_list`` and store it.

        Parameters
        ----------
        mth5_list : list
            Passed unchanged to ``extract_run_summaries_from_mth5s``.
        """
        self.df = extract_run_summaries_from_mth5s(mth5_list)

    def _warn_no_data_runs(self) -> None:
        """Log a warning for every row whose ``has_data`` value is False."""
        no_data_rows = self.df[self.df.has_data.eq(False)]
        if no_data_rows.empty:
            return
        for row in no_data_rows.itertuples():
            logger.warning(
                f"Found no data run in row {row.Index}: "
                f"survey: {row.survey}, station: {row.station}, run: {row.run}"
            )
        logger.info("To drop no data runs use `drop_no_data_rows`")

    @property
    def mini_summary(self) -> pd.DataFrame:
        """The table restricted to the mini-summary columns, for readability."""
        return self.df[self._mini_summary_columns]

    @property
    def print_mini_summary(self) -> None:
        """Send the mini summary through the logger so it is formatted.

        Kept as a property for backward compatibility; accessing it logs the
        table and evaluates to None.
        """
        logger.info(self.mini_summary)

    def drop_no_data_rows(self) -> None:
        """Drop rows marked `has_data` = False and reset the index of self.df."""
        # Build the re-indexed frame in one expression instead of resetting
        # the index in place on a filtered selection of the old frame.
        self.df = self.df[self.df.has_data].reset_index(drop=True)

    def set_sample_rate(self, sample_rate: float, inplace: bool = False):
        """Restrict the summary to the runs recorded at one sample rate.

        Parameters
        ----------
        sample_rate : float
            Value to keep; it must occur in the ``sample_rate`` column.
        inplace : bool, optional
            If True, filter this object's table and return None. If False,
            leave this object untouched and return a filtered clone.
            By default, False.

        Returns
        -------
        RunSummary or None
            The filtered clone when ``inplace`` is False, otherwise None.

        Raises
        ------
        ValueError
            If ``sample_rate`` does not occur in the table.
        """
        if sample_rate not in self.df.sample_rate.values:
            msg = (
                f"Sample rate {sample_rate} is not in RunSummary. Unique "
                f"values are {self.df.sample_rate.unique()}"
            )
            logger.error(msg)
            raise ValueError(msg)

        if inplace:
            self.df = self.df[self.df.sample_rate == sample_rate]
            return None

        new_rs = self.clone()
        new_rs.df = new_rs.df[new_rs.df.sample_rate == sample_rate]
        return new_rs
199### this can be deprecated now
200# def extract_run_summary_from_mth5(mth5_obj, summary_type: Optional[str] = "run"):
201# """Given a single mth5 object, get the channel_summary and compress it to a
202# run_summary.
204# Development Notes:
205# TODO: Move this into MTH5 or replace with MTH5 built-in run_summary method.
207# Parameters
208# ----------
209# mth5_obj : mth5.mth5.MTH5
210# The initialized mth5 object that will be interrogated.
211# summary_type : Optional[str], optional
212# One of ["run", "channel"]. Returns a run summary or a channel summary. By default, "run".
214# Returns
215# -------
216# out_df : pd.Dataframe
217# Table summarizing the available runs in the input mth5_obj.
218# """
220# if summary_type == "run":
221# out_df = mth5_obj.run_summary
222# else:
223# out_df = mth5_obj.channel_summary.to_dataframe()
224# out_df["mth5_path"] = str(mth5_obj.filename)
225# return out_df
def extract_run_summaries_from_mth5s(mth5_list, summary_type="run", deduplicate=True):
    """Collect the run summaries of several mth5s into one table.

    Each element of ``mth5_list`` is either an already-open MTH5 object, which
    is used as is and left open, or a path/string, which is opened with
    ``initialize_mth5`` and closed again once its summary has been read.

    Development Notes:
    ToDo: Move this method into mth5? or mth5_helpers?
    ToDo: Make this a class so that the __repr__ is a nice visual representation
    of the df, like what channel summary does in mth5
    - 2022-05-28 Modified to allow this method to accept mth5 objects as well
    as the already supported types of pathlib.Path or str

    Duplicate detection compares the string form of every cell, following
    https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working-on-dataframe-containing-lists

    Parameters
    ----------
    mth5_list : list
        MTH5 objects, or paths or strings that point to mth5s.
    summary_type : string, optional
        One of ["channel", "run"]. By default, "run".
        NOTE(review): this argument is accepted but never read here; the run
        summary is always returned. Confirm whether "channel" should be
        supported.
    deduplicate : bool, optional
        Drop rows that repeat an earlier row. By default, True.

    Returns
    -------
    super_summary : pd.DataFrame
        Given a list of mth5s, a dataframe of all available runs.

    Raises
    ------
    ValueError
        If ``mth5_list`` is empty.
    """
    # pd.concat raises ValueError on an empty list anyway; fail early with a
    # message that names the actual problem.
    if len(mth5_list) == 0:
        raise ValueError("No objects to concatenate: mth5_list is empty")

    dfs = []
    for mth5_elt in mth5_list:
        opened_here = not isinstance(mth5_elt, mth5.mth5.MTH5)
        if opened_here:
            # mth5_elt is a path or a string
            # NOTE(review): opened in append mode as before; confirm whether
            # read-only access would be sufficient.
            mth5_obj = initialize_mth5(mth5_elt, mode="a")
        else:
            mth5_obj = mth5_elt

        try:
            dfs.append(mth5_obj.run_summary.copy())
        finally:
            # close it back up if you opened it, even when reading failed
            if opened_here:
                mth5_obj.close_mth5()

    # merge all summaries into a super_summary with a fresh 0..n-1 index
    super_summary = pd.concat(dfs, ignore_index=True)

    # drop rows that correspond to TFs:
    run_rows = super_summary.sample_rate != 0
    super_summary = super_summary[run_rows].reset_index(drop=True)

    if deduplicate:
        # Compare on the string form so that cells holding lists, which are
        # unhashable, can still be checked for repeats.
        is_repeat = super_summary.astype(str).duplicated()
        super_summary = super_summary[~is_repeat].reset_index(drop=True)
    return super_summary