Coverage for C:\Users\peaco\OneDrive\Documents\GitHub\mth5\mth5\io\lemi\lemi_collection.py: 97%
66 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-10 00:01 -0800
1# -*- coding: utf-8 -*-
2"""
3LEMI 424 Collection
4====================
6Collection of TXT files combined into runs
8Created on Wed Aug 31 10:32:44 2022
10@author: jpeacock
11"""
13import pathlib
14from pathlib import Path
15from typing import List
17# =============================================================================
18# Imports
19# =============================================================================
20import pandas as pd
22from mth5.io.collection import Collection
23from mth5.io.lemi import LEMI424
26# =============================================================================
class LEMICollection(Collection):
    """
    Collection of LEMI 424 files into runs based on start and end times.

    Will assign the run name as 'sr1_{index:0{zeros}}' --> 'sr1_0001' for
    `zeros` = 4.

    Notes
    -----
    This class assumes that the given file path contains a single
    LEMI station. If you want to do multiple stations merge the returned
    data frames.

    LEMI data comes with little metadata about the station or survey,
    therefore you should assign `station_id` and `survey_id`.

    Parameters
    ----------
    file_path : str or pathlib.Path, optional
        Full path to single station LEMI424 directory, by default None
    file_ext : list of str, optional
        Extension of LEMI424 files, by default ["txt", "TXT"]
    **kwargs
        Additional keyword arguments passed to parent Collection class

    Attributes
    ----------
    station_id : str
        Station identification string, defaults to "mt001"
    survey_id : str
        Survey identification string, defaults to "mt"
    calibration_dict : dict
        Component name -> calibration file path. Empty after construction;
        refreshed on every call to `to_dataframe`.

    Examples
    --------
    >>> from mth5.io.lemi import LEMICollection
    >>> lc = LEMICollection(r"/path/to/single/lemi/station")
    >>> lc.station_id = "mt001"
    >>> lc.survey_id = "test_survey"
    >>> run_dict = lc.get_runs(1)
    """

    def __init__(
        self,
        file_path: str | pathlib.Path | None = None,
        file_ext: List[str] | None = None,
        **kwargs,
    ) -> None:
        # Build the default list per call so instances never share one
        # mutable default object.
        if file_ext is None:
            file_ext = ["txt", "TXT"]
        super().__init__(file_path=file_path, file_ext=file_ext, **kwargs)

        self.station_id = "mt001"
        self.survey_id = "mt"
        self.calibration_dict = {}

    def get_calibrations(self, calibration_path: str | Path) -> dict:
        """
        Get calibration dictionary for LEMI424 files. This assumes that the
        calibrations files are in JSON format and named as
        'LEMI-424-<component>.json'

        Every ``*.json`` file found recursively below `calibration_path` is
        included; the name pattern above is not enforced. The key is the
        part of the file stem after the last ``-``, truncated at the first
        ``.``. If several files produce the same key, the one visited last
        wins.

        Parameters
        ----------
        calibration_path : str or pathlib.Path
            Path to calibration files

        Returns
        -------
        dict
            Calibration dictionary for LEMI424 files, mapping component
            name to the `pathlib.Path` of its JSON file. Empty when no
            JSON file is found.

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> cal_dict = lc.get_calibrations(Path("/path/to/calibrations"))
        """
        calibration_path = Path(calibration_path)

        return {
            fn.stem.split("-")[-1].split(".", 1)[0]: fn
            for fn in calibration_path.rglob("*.json")
        }

    def to_dataframe(
        self,
        sample_rates: int | List[int] | None = None,
        run_name_zeros: int = 4,
        calibration_path: str | Path | None = None,
    ) -> pd.DataFrame:
        """
        Create a data frame of each TXT file in a given directory.

        As a side effect `self.calibration_dict` is replaced with the result
        of `get_calibrations(calibration_path)`.

        Notes
        -----
        This assumes the given directory contains a single station

        Parameters
        ----------
        sample_rates : int or list of int, optional
            Sample rate to get, will always be 1 for LEMI data, by default [1].
            The value is not used to filter files in this method.
        run_name_zeros : int, optional
            Number of zeros to assign to the run name, by default 4
        calibration_path : str or pathlib.Path, optional
            Path to calibration files, by default None which searches
            `self.file_path`

        Returns
        -------
        pd.DataFrame
            DataFrame with information of each TXT file in the given
            directory. An empty DataFrame (no columns) is returned when no
            files are found.

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> lemi_df = lc.to_dataframe()
        """
        if sample_rates is None:
            sample_rates = [1]

        if calibration_path is None:
            calibration_path = Path(self.file_path)
        self.calibration_dict = self.get_calibrations(calibration_path)
        if not self.calibration_dict:
            self.logger.warning(
                f"No calibration files found in {calibration_path}, "
                "proceeding without calibrations."
            )

        entries = []
        for fn in self.get_files(self.file_ext):
            lemi_obj = LEMI424(fn)
            # NOTE(review): n_samples is deliberately captured BEFORE
            # read_metadata(), as in the original code. Presumably
            # read_metadata() changes what n_samples reports -- confirm
            # against LEMI424 before reordering these two statements.
            n_samples = int(lemi_obj.n_samples or 0)
            lemi_obj.read_metadata()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = self.station_id
            # Missing start/end times are stored as empty strings.
            entry["start"] = lemi_obj.start.isoformat() if lemi_obj.start else ""
            entry["end"] = lemi_obj.end.isoformat() if lemi_obj.end else ""
            entry["component"] = ",".join(lemi_obj.run_metadata.channels_recorded_all)
            entry["fn"] = fn
            entry["sample_rate"] = lemi_obj.sample_rate
            entry["file_size"] = lemi_obj.file_size
            entry["n_samples"] = n_samples

            entries.append(entry)

        # make pandas dataframe and set data types
        if not entries:
            self.logger.warning("No entries found for LEMI collection")
            return pd.DataFrame()

        df = pd.DataFrame(entries)
        # Constant columns: every row gets the same value.
        df.loc[:, "channel_id"] = 1
        df.loc[:, "sequence_number"] = 0
        df.loc[:, "instrument_id"] = "LEMI424"

        df = self._sort_df(self._set_df_dtypes(df), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 4) -> pd.DataFrame:
        """
        Assign run names based on start and end times.

        Rows are visited in the order they appear in `df`. The first row
        starts run 1; each later row stays in the current run when it is
        contiguous with the previous row's end, otherwise the run counter
        is incremented. Run names are assigned as 'sr1_{run_number:0{zeros}}'
        (the 'sr1' prefix is fixed).

        The first row is identified by position rather than by an index
        label of 0, so data frames with any index work.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with the appropriate columns (`start`, `end` and
            `sample_rate` are read, `run` is written). Modified in place.
        zeros : int, optional
            Number of zeros in run name, by default 4

        Returns
        -------
        pd.DataFrame
            DataFrame with run names assigned (the same object as `df`)
        """
        count = 1
        previous_end = None
        for position, row in enumerate(df.itertuples()):
            if position > 0:
                gap_seconds = (row.start - previous_end).total_seconds()
                # NOTE(review): continuity test kept exactly as written
                # originally; for the 1 Hz LEMI sample rate it means
                # "the file starts one second after the previous end".
                if gap_seconds / row.sample_rate != row.sample_rate:
                    count += 1
            df.loc[row.Index, "run"] = f"sr1_{count:0{zeros}}"
            previous_end = row.end

        return df