1"""
2Functions for reading and writing iMOD Point Files (IDFs) to ``pandas.DataFrame``.
The primary functions to use are :func:`imod.ipf.read` and
:func:`imod.ipf.save`, though lower level functions are also available.
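
A minimal usage sketch (``wells.ipf`` is a hypothetical file):

>>> import imod
>>> df = imod.ipf.read("wells.ipf")
>>> imod.ipf.save("wells-copy.ipf", df)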
6"""

import collections
import csv
import glob
import io
import pathlib
import warnings
from typing import Tuple

import numpy as np
import pandas as pd

import imod

def _infer_delimwhitespace(line, ncol):
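    """
    Infer whether a data line is whitespace delimited rather than comma
    delimited, given the expected number of columns.

    A sketch of the heuristic (values are illustrative):

    >>> _infer_delimwhitespace("1.0 2.0 3.0", 3)
    True
    >>> _infer_delimwhitespace("1.0,2.0,3.0", 3)
    False
    """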
    n_elem = len(next(csv.reader([line])))
    if n_elem == 1:
        return True
    elif n_elem == ncol:
        return False
    else:
        warnings.warn(
            f"Inconsistent IPF: header states {ncol} columns, first line contains {n_elem}"
        )
        return False


def _read_ipf(path, kwargs=None) -> Tuple[pd.DataFrame, int, str]:
    path = pathlib.Path(path)
    if kwargs is None:
        kwargs = {}
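
    # The IPF header parsed below looks like this (illustrative values; the
    # last header line holds the index column of the associated files plus
    # their extension):
    #
    #     2        <- nrow
    #     3        <- ncol
    #     x
    #     y
    #     id
    #     3,txt    <- indexcol, ext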

    with open(path) as f:
        nrow = int(f.readline().strip())
        ncol = int(f.readline().strip())
        colnames = [f.readline().strip().strip("'").strip('"') for _ in range(ncol)]
        line = f.readline()
        try:
            # csv.reader parses one line
            # this catches commas in quotes
            indexcol, ext = map(str.strip, next(csv.reader([line])))
        except ValueError:  # then try whitespace delimited
            indexcol, ext = map(str.strip, next(csv.reader([line], delimiter=" ")))

        position = f.tell()
        line = f.readline()
        delim_whitespace = _infer_delimwhitespace(line, ncol)
        f.seek(position)

        ipf_kwargs = {
            "delim_whitespace": delim_whitespace,
            "header": None,
            "names": colnames,
            "nrows": nrow,
            "skipinitialspace": True,
        }
        ipf_kwargs.update(kwargs)
        df = pd.read_csv(f, **ipf_kwargs)

    return df, int(indexcol), ext


def _read(path, kwargs=None, assoc_kwargs=None):
    """
    Read one IPF file to a single pandas.DataFrame, including associated (TXT) files.

    Parameters
    ----------
    path : pathlib.Path or str
        Path to the IPF file to read.
    kwargs : dict
        Dictionary containing the ``pandas.read_csv()`` keyword arguments for the
        IPF file (e.g. ``{"delim_whitespace": True}``)
    assoc_kwargs : dict
        Dictionary containing the ``pandas.read_csv()`` keyword arguments for the
        associated (TXT) files (e.g. ``{"delim_whitespace": True}``)

    Returns
    -------
    pandas.DataFrame
    """
    path = pathlib.Path(path)
    df, indexcol, ext = _read_ipf(path, kwargs)
    if assoc_kwargs is None:
        assoc_kwargs = {}

    # See if reading associated files is necessary
    if indexcol > 1:
        colnames = df.columns
        dfs = []
        for row in df.itertuples():
            filename = row[indexcol]
            # associated paths are relative to the IPF file
            path_assoc = path.parent.joinpath(f"{filename}.{ext}")
            # Note that these kwargs handle all associated files, which might differ
            # within an IPF. If this happens we could consider supporting a dict
            # or function that maps assoc filenames to different kwargs.
            try:  # Re-raise the error with the offending path included
                df_assoc = read_associated(path_assoc, assoc_kwargs)
            except Exception as e:
                raise type(e)(
                    f'{e}\nWhile reading associated file "{path_assoc}" of IPF file "{path}"'
                ) from e

            # Include records of the "mother" ipf file.
            for name, value in zip(colnames, row[1:]):  # ignores df.index in row
                df_assoc[name] = value
            # Append to list
            dfs.append(df_assoc)
        # Merge into a single whole
        df = pd.concat(dfs, ignore_index=True, sort=False)

    return df


def read_associated(path, kwargs={}):
    """
    Read an IPF associated file (TXT).

    Parameters
    ----------
    path : pathlib.Path or str
        Path to associated file.
    kwargs : dict
        Dictionary containing the ``pandas.read_csv()`` keyword arguments for the
        associated (TXT) file (e.g. ``{"delim_whitespace": True}``).

    Returns
    -------
    pandas.DataFrame
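
    Examples
    --------
    Read a single associated timeseries file (the path is hypothetical):

    >>> df = imod.ipf.read_associated("B1_id83.txt")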
138 """

    # deal with e.g. incorrect capitalization
    path = pathlib.Path(path).resolve()
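
    # The header parsed below looks like this (illustrative values; the
    # second line may omit itype, in which case a timeseries is assumed):
    #
    #     5                    <- nrow
    #     2,1                  <- ncol, itype
    #     time,-999.0          <- column name, nodata value
    #     head,-999.0
    #     20200101000000,1.2   <- data block, nrow lines
    #     ...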

    with open(path) as f:
        nrow = int(f.readline().strip())
        line = f.readline()
        try:
            # csv.reader parses one line
            # this catches commas in quotes
            ncol, itype = map(int, map(str.strip, next(csv.reader([line]))))
        # itype can be implicit, in which case it's a timeseries
        except ValueError:
            try:
                ncol = int(line.strip())
                itype = 1
            except ValueError:  # then try whitespace delimited
                ncol, itype = map(
                    int, map(str.strip, next(csv.reader([line], delimiter=" ")))
                )

        # use pandas for csv parsing: stuff like commas within quotes
        # this is a workaround for a pandas bug, probably related issue:
        # https://github.com/pandas-dev/pandas/issues/19827#issuecomment-398649163
        lines = [f.readline() for _ in range(ncol)]
        delim_whitespace = _infer_delimwhitespace(lines[0], 2)
        # Normally, this ought to work:
        # metadata = pd.read_csv(f, header=None, nrows=ncol).values
        # TODO: replace when bugfix is released
        # try both comma and whitespace delimited, everything can be mixed
        # in a single file...
        lines = "".join(lines)

        # TODO: find out whether this can be replaced by csv.reader;
        # the challenge lies in replacing the pd.notnull for nodata values.
        # csv.reader is otherwise quite a bit faster for such a header block.
        metadata_kwargs = {
            "delim_whitespace": delim_whitespace,
            "header": None,
            "nrows": ncol,
            "skipinitialspace": True,
        }
        metadata_kwargs.update(kwargs)
        metadata = pd.read_csv(io.StringIO(lines), **metadata_kwargs)
        # header description possibly includes nodata
        usecols = np.arange(ncol)[pd.notnull(metadata[0])]
        metadata = metadata.iloc[usecols, :]

        # Collect column names and nodata values
        colnames = []
        na_values = collections.OrderedDict()
        for colname, nodata in metadata.values:
            na_values[colname] = [nodata, "-"]  # "-" seems common enough to ignore
            if isinstance(colname, str):
                colnames.append(colname.strip())
            else:
                colnames.append(colname)

        # Sniff the first line of the data block
        position = f.tell()
        line = f.readline()
        f.seek(position)
        delim_whitespace = _infer_delimwhitespace(line, ncol)

        itype_kwargs = {
            "delim_whitespace": delim_whitespace,
            "header": None,
            "names": colnames,
            "usecols": usecols,
            "nrows": nrow,
            "na_values": na_values,
            "skipinitialspace": True,
        }
        if itype == 1:  # Timevariant information: timeseries
            # check if first column is time in [yyyymmdd] or [yyyymmddhhmmss]
            itype_kwargs["dtype"] = {colnames[0]: str}
        elif itype == 2:  # 1D borehole
            # enforce first column is a float
            itype_kwargs["dtype"] = {colnames[0]: np.float64}
        elif itype == 3:  # cpt
            # all columns must be numeric
            itype_kwargs["dtype"] = {colname: np.float64 for colname in colnames}
        elif itype == 4:  # 3D borehole
            # enforce first 3 columns are float
            itype_kwargs["dtype"] = {
                colnames[0]: np.float64,
                colnames[1]: np.float64,
                colnames[2]: np.float64,
            }
        itype_kwargs.update(kwargs)
        df = pd.read_csv(f, **itype_kwargs)

    if nrow > 0 and itype == 1:
        time_column = colnames[0]
        len_date = len(df[time_column].iloc[0])
        if len_date == 14:
            df[time_column] = pd.to_datetime(df[time_column], format="%Y%m%d%H%M%S")
        elif len_date == 8:
            df[time_column] = pd.to_datetime(df[time_column], format="%Y%m%d")
        else:
            raise ValueError(
                f"{path.name}: datetime format must be yyyymmddhhmmss or yyyymmdd"
            )
    return df


def read(path, kwargs={}, assoc_kwargs={}):
    """
    Read one or more IPF files to a single pandas.DataFrame, including associated
    (TXT) files.

    The different IPF files can be from different model layers,
    and column names may differ between them.

    Note that this function always returns a ``pandas.DataFrame``. IPF files
    always contain spatial information, for which ``geopandas.GeoDataFrame``
    is a better fit, in principle. However, GeoDataFrames are not the best fit
    for the associated data.

    To perform spatial operations on the points, you're likely best served by
    (temporarily) creating a GeoDataFrame, doing the spatial operation, and
    then using the output to select values in the original DataFrame. Please
    refer to the examples.

    Parameters
    ----------
    path : str, Path or list
        This can be a single file, 'wells_l1.ipf', a glob pattern expansion,
        'wells_l*.ipf', or a list of files, ['wells_l1.ipf', 'wells_l2.ipf'].
        Note that each file needs to have the same columns, such that they can
        be combined in a single pd.DataFrame.
    kwargs : dict
        Dictionary containing the ``pandas.read_csv()`` keyword arguments for the
        IPF files (e.g. ``{"delim_whitespace": True}``)
    assoc_kwargs : dict
        Dictionary containing the ``pandas.read_csv()`` keyword arguments for the
        associated (TXT) files (e.g. ``{"delim_whitespace": True}``)

    Returns
    -------
    pandas.DataFrame

    Examples
    --------
    Read an IPF file into a dataframe:

    >>> import imod
    >>> df = imod.ipf.read("example.ipf")

    Convert the x and y data into a GeoDataFrame, do a spatial operation, and
    use it to select points within a polygon.
    Note: ``gpd.points_from_xy()`` requires a geopandas version >= 0.5.

    >>> import geopandas as gpd
    >>> polygon = gpd.read_file("polygon.shp").geometry[0]
    >>> ipf_points = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df["x"], df["y"]))
    >>> within_polygon = ipf_points.within(polygon)
    >>> selection = df[within_polygon]

    The same exercise is a little more complicated when associated files (like
    timeseries) are involved, since many duplicate values of x and y will exist.
    The easiest way to isolate these is by applying a groupby, and then taking
    the first x and y of every group:

    >>> df = imod.ipf.read("example_with_time.ipf")
    >>> first = df.groupby("id").first()  # replace "id" by whatever your ID column is called
    >>> x = first["x"]
    >>> y = first["y"]
    >>> id_code = first.index  # "id" shadows a Python builtin
    >>> ipf_points = gpd.GeoDataFrame(geometry=gpd.points_from_xy(x, y))
    >>> within_polygon = ipf_points.within(polygon)

    Using the result is a little more complicated as well, since it has to be
    mapped back to the many duplicate values of the original dataframe.
    There are two options. First, by using the index:

    >>> within_polygon.index = id_code
    >>> df = df.set_index("id")
    >>> selection = df[within_polygon]

    If you do not wish to change the index of the original dataframe, use
    ``pandas.DataFrame.merge()`` instead.

    >>> import pandas as pd
    >>> within_polygon = pd.DataFrame({"within": within_polygon})
    >>> within_polygon["id"] = id_code
    >>> df = df.merge(within_polygon, on="id")
    >>> df = df[df["within"]]
    """
    if isinstance(path, list):
        paths = path
    elif isinstance(path, (str, pathlib.Path)):
        # convert to str, since non-relative patterns are unsupported by Path.glob
        path = str(path)
        paths = [pathlib.Path(p) for p in glob.glob(path)]
    else:
        raise ValueError("Path should be either a list, str or pathlib.Path")

    n = len(paths)
    if n == 0:
        raise FileNotFoundError(f"Could not find any files matching {path}")
    elif n == 1:
        bigdf = _read(paths[0], kwargs, assoc_kwargs)
    else:
        dfs = []
        for p in paths:
            layer = imod.util.path.decompose(p).get("layer")
            try:
                df = _read(p, kwargs, assoc_kwargs)
            except Exception as e:
                raise type(e)(f'{e}\nWhile reading IPF file "{p}"') from e
            if layer is not None:
                df["layer"] = layer
            dfs.append(df)
        bigdf = pd.concat(
            dfs, ignore_index=True, sort=False
        )  # pandas < 0.23 sorts here regardless of sort=False

    return bigdf


def _coerce_itype(itype):
    """Changes string itype to int"""
    if itype in [None, 1, 2, 3, 4]:
        pass
    elif itype.lower() == "timeseries":
        itype = 1
    elif itype.lower() == "borehole1d":
        itype = 2
    elif itype.lower() == "cpt":
        itype = 3
    elif itype.lower() == "borehole3d":
        itype = 4
    else:
        raise ValueError("Invalid IPF itype")
    return itype


def _lower(colnames):
    """Lowers colnames, checking for uniqueness"""
    lowered_colnames = [s.lower() for s in colnames]
    if len(set(lowered_colnames)) != len(colnames):
        seen = set()
        for name in lowered_colnames:
            if name in seen:
                raise ValueError(f'Column name "{name}" is not unique after lowering.')
            else:
                seen.add(name)
    return lowered_colnames


def write_assoc(path, df, itype=1, nodata=1.0e20, assoc_columns=None):
    """
    Writes a single IPF associated (TXT) file.

    Parameters
    ----------
    path : pathlib.Path or str
        Path for the written associated file.
    df : pandas.DataFrame
        DataFrame containing the data to write.
    itype : int or str
        IPF type.
        Possible values, either integer or string:

        * ``1`` or ``"timeseries"``
        * ``2`` or ``"borehole1d"``
        * ``3`` or ``"cpt"``
        * ``4`` or ``"borehole3d"``
    nodata : float
        The value given to nodata values. These are generally NaN (Not-a-Number)
        in pandas, but this leads to errors in iMOD(FLOW) for IPFs.
        Defaults to a value of 1.0e20 instead.
    assoc_columns : optional, list or dict
        Columns to store in the associated file. In case of a dictionary, the
        columns will be renamed according to the mapping in the dictionary.
        Defaults to None.

    Returns
    -------
    None
        Writes a file.
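
    Examples
    --------
    Write a timeseries DataFrame (``df`` must contain a "time" column; the
    path is hypothetical):

    >>> imod.ipf.write_assoc("timeseries.txt", df, itype="timeseries")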
421 """

    itype = _coerce_itype(itype)
    required_columns = {
        1: ["time"],
        2: ["top"],
        3: ["top"],
        4: ["x_offset", "y_offset", "top"],
    }

    # Ensure columns are in the right order for the itype
    colnames = _lower(list(df))
    df.columns = colnames
    columnorder = []
    for colname in required_columns[itype]:
        if colname not in colnames:
            raise ValueError(f'given itype requires column "{colname}"')
        colnames.remove(colname)
        columnorder.append(colname)
    columnorder += colnames

    # Check if columns have to be renamed
    if isinstance(assoc_columns, dict):
        columnorder = [assoc_columns[col] for col in columnorder]
        df = df.rename(columns=assoc_columns)

    nrecords, nfields = df.shape
    with open(path, "w") as f:
        f.write(f"{nrecords}\n{nfields},{itype}\n")
        for colname in columnorder:
            if "," in colname or " " in colname:
                colname = '"' + colname + '"'
            f.write(f"{colname},{nodata}\n")
    # workaround a pandas issue by closing the file first, see
    # https://github.com/pandas-dev/pandas/issues/19827#issuecomment-398649163

    df = df.fillna(nodata)
    df = df[columnorder]

    # We cannot rely on the quoting=QUOTE_NONNUMERIC policy.
    # The reason is that datetime columns are converted to string as well
    # and then quoted. This causes trouble with some iMOD(batch) functions.
    for column in df.columns:
        if df.loc[:, column].dtype == np.dtype("O"):
            df.loc[:, column] = df.loc[:, column].astype(str)
            df.loc[:, column] = '"' + df.loc[:, column] + '"'

    df.to_csv(
        path,
        index=False,
        header=False,
        mode="a",
        date_format="%Y%m%d%H%M%S",
        quoting=csv.QUOTE_NONE,
    )


def write(path, df, indexcolumn=0, assoc_ext="txt", nodata=1.0e20):
    """
    Writes a single IPF file.

    Parameters
    ----------
    path : pathlib.Path or str
        Path of the written IPF file.
        Any associated files are written relative to this path, based on the ID
        column.
    df : pandas.DataFrame
        DataFrame containing the data to write.
    indexcolumn : integer
        Number of the column containing the paths to the associated (TXT) files.
        Defaults to a value of 0 (no associated files).
    assoc_ext : str
        Extension of the associated files. Defaults to "txt".
    nodata : float
        The value given to nodata values. These are generally NaN (Not-a-Number)
        in pandas, but this leads to errors in iMOD(FLOW) for IPFs.
        Defaults to a value of 1.0e20 instead.

    Returns
    -------
    None
        Writes a file.
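
    Examples
    --------
    Write a DataFrame as a plain IPF without associated files (the path is
    hypothetical):

    >>> imod.ipf.write("static-data.ipf", df)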
500 """
    df = df.fillna(nodata)
    nrecords, nfields = df.shape
    with open(path, "w") as f:
        f.write(f"{nrecords}\n{nfields}\n")
        for colname in df.columns:
            if "," in colname or " " in colname:
                colname = '"' + colname + '"'
            f.write(f"{colname}\n")
        f.write(f"{indexcolumn},{assoc_ext}\n")

    # We cannot rely on the quoting=QUOTE_NONNUMERIC policy.
    # The reason is that datetime columns are converted to string as well
    # and then quoted. This causes trouble with some iMOD(batch) functions.
    for column in df.columns:
        if df.loc[:, column].dtype == np.dtype("O"):
            df.loc[:, column] = df.loc[:, column].astype(str)
            df.loc[:, column] = '"' + df.loc[:, column] + '"'

    # workaround a pandas issue by closing the file first, see
    # https://github.com/pandas-dev/pandas/issues/19827#issuecomment-398649163
    df.to_csv(path, index=False, header=False, mode="a", quoting=csv.QUOTE_NONE)


def _is_single_value(group):
    return len(pd.unique(group)) == 1


def _compose_ipf(path, df, itype, assoc_ext, nodata=1.0e20, assoc_columns=None):
    """
    When itype is not None, breaks down the pandas DataFrame into its IPF part
    and its associated TXT files, creating the IPF data structure.

    Parameters
    ----------
    path : pathlib.Path or str
        Path of the written IPF file.
        Any associated files are written relative to this path, based on the ID
        column.
    df : pandas.DataFrame
        DataFrame containing the data to write.
    itype : int or str or None
        If ``None`` no associated files are written.
        Other possible values, either integer or string:

        * ``1`` or ``"timeseries"``
        * ``2`` or ``"borehole1d"``
        * ``3`` or ``"cpt"``
        * ``4`` or ``"borehole3d"``
    assoc_ext : str
        Extension of the associated files. Normally "txt".
    nodata : float
        The value given to nodata values. These are generally NaN (Not-a-Number)
        in pandas, but this leads to errors in iMOD(FLOW) for IPFs.
        Defaults to a value of 1.0e20 instead.
    assoc_columns : optional, list or dict
        Columns to store in the associated file. In case of a dictionary, the
        columns will be renamed according to the mapping in the dictionary.
        Defaults to None.

    Returns
    -------
    None
        Writes files.
    """
    if itype is None:
        write(path, df, nodata=nodata)
    else:
        itype = _coerce_itype(itype)
        colnames = _lower(list(df))
        df.columns = colnames
        for refname in ["x", "y", "id"]:
            if refname not in colnames:
                raise ValueError(f'given itype requires column "{refname}"')
            colnames.remove(refname)

        grouped = df.groupby("id")
        if not grouped["x"].apply(_is_single_value).all():
            raise ValueError("column x contains more than one value per id")
        if not grouped["y"].apply(_is_single_value).all():
            raise ValueError("column y contains more than one value per id")
        # get columns that have only one value within a group, to save them in ipf
        ipf_columns = [
            (colname, "first")
            for colname in colnames
            if grouped[colname].apply(_is_single_value).all()
        ]
        ipf_colnames = [colname for colname, _ in ipf_columns]

        for idcode, group in grouped:
            assoc_path = path.parent.joinpath(str(idcode) + "." + str(assoc_ext))
            assoc_path.parent.mkdir(parents=True, exist_ok=True)
            if isinstance(assoc_columns, list):
                selection = assoc_columns
            elif isinstance(assoc_columns, dict):
                selection = list(assoc_columns.keys())
            else:
                # compare against names only: ipf_columns holds
                # (name, aggregation) tuples
                selection = [
                    colname for colname in colnames if colname not in ipf_colnames
                ]
            out_df = group[selection]
            write_assoc(assoc_path, out_df, itype, nodata, assoc_columns)

        # ensures the right order for x, y, id; so that indexcolumn == 3 holds
        agg_kwargs = collections.OrderedDict(
            [("x", "first"), ("y", "first"), ("id", "first")]
        )
        agg_kwargs.update(ipf_columns)
        agg_df = grouped.agg(agg_kwargs)
        write(path, agg_df, 3, assoc_ext, nodata=nodata)


def save(path, df, itype=None, assoc_ext="txt", nodata=1.0e20, assoc_columns=None):
    """
    Saves the contents of a pandas DataFrame to one or more IPF files, and
    associated (TXT) files.

    Can write multiple IPF files if one of the columns is named "layer". In
    turn, multiple associated (TXT) files may be written for each of these IPF
    files. Note that the ID must be unique for each layer. See the examples.

    Parameters
    ----------
    path : pathlib.Path or str
        Path of the written IPF file.
        Any associated files are written relative to this path, based on the ID
        column.
    df : pandas.DataFrame
        DataFrame containing the data to write.
    itype : int or str or None
        IPF type. Defaults to ``None``, in which case no associated files are
        created. Possible other values, either integer or string:

        * ``1`` or ``"timeseries"``
        * ``2`` or ``"borehole1d"``
        * ``3`` or ``"cpt"``
        * ``4`` or ``"borehole3d"``
    assoc_ext : str
        Extension of the associated files. Defaults to "txt".
    nodata : float
        The value given to nodata values. These are generally NaN (Not-a-Number)
        in pandas, but this leads to errors in iMOD(FLOW) for IPFs.
        Defaults to a value of 1.0e20 instead.
    assoc_columns : optional, list or dict
        Columns to store in the associated file. In case of a dictionary, the
        columns will be renamed according to the mapping in the dictionary.
        Defaults to None.

    Returns
    -------
    None
        Writes files.

    Examples
    --------
    To write a single IPF without associated timeseries or boreholes:

    >>> imod.ipf.save("static-data.ipf", df)

    To write timeseries data:

    >>> imod.ipf.save("transient-data.ipf", df, itype="timeseries")

    If a ``"layer"`` column is present, make sure the ID is unique per layer:

    >>> df["id"] = df["id"].str.cat(df["layer"].astype(str), sep="_")
    >>> imod.ipf.save("layered.ipf", df, itype="timeseries")

    An error will be raised otherwise.
    """

    path = pathlib.Path(path)

    d = {"extension": ".ipf", "name": path.stem, "directory": path.parent}
    d["directory"].mkdir(exist_ok=True, parents=True)

    colnames = _lower(list(df))
    # Lower assoc_columns as well if available
    if isinstance(assoc_columns, list):
        assoc_columns = _lower(assoc_columns)
    elif isinstance(assoc_columns, dict):
        keys = _lower(assoc_columns.keys())
        values = _lower(assoc_columns.values())
        assoc_columns = {k: v for k, v in zip(keys, values)}

    df.columns = colnames
    if "layer" in colnames:
        if "time" in colnames:
            groupcols = ["time", "id"]
        else:
            groupcols = "id"

        n_layer_per_id = df.groupby(groupcols)["layer"].nunique()
        if (n_layer_per_id > 1).any():
            raise ValueError(
                "Multiple layer values for a single ID detected. "
                "Unique IDs are required for each layer."
            )

        for layer, group in df.groupby("layer"):
            d["layer"] = layer
            fn = imod.util.path.compose(d)
            _compose_ipf(fn, group, itype, assoc_ext, nodata, assoc_columns)
    else:
        fn = imod.util.path.compose(d)
        _compose_ipf(fn, df, itype, assoc_ext, nodata, assoc_columns)