Coverage for C:\src\imod-python\imod\prepare\spatial.py: 92%
360 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-08 13:27 +0200
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-08 13:27 +0200
1import pathlib
2from typing import TYPE_CHECKING, Callable, Union
4import dask
5import numba
6import numpy as np
7import pandas as pd
8import scipy.ndimage
9import xarray as xr
11import imod
12from imod.prepare import common, pcg
13from imod.util.imports import MissingOptionalModule
14from imod.util.spatial import _polygonize
16# since rasterio is a big dependency that is sometimes hard to install
17# and not always required, we made this an optional dependency
18try:
19 import rasterio
20 import rasterio.features
21 import rasterio.warp
22except ImportError:
23 rasterio = MissingOptionalModule("rasterio")
25if TYPE_CHECKING:
26 import geopandas as gpd
28def round_extent(extent, cellsize):
29 """Increases the extent until all sides lie on a coordinate
30 divisible by cellsize."""
31 xmin, ymin, xmax, ymax = extent
32 xmin = np.floor(xmin / cellsize) * cellsize
33 ymin = np.floor(ymin / cellsize) * cellsize
34 xmax = np.ceil(xmax / cellsize) * cellsize
35 ymax = np.ceil(ymax / cellsize) * cellsize
36 return xmin, ymin, xmax, ymax
39def round_z(z_extent, dz):
40 """Increases the extent until all sides lie on a coordinate
41 divisible by dz."""
42 zmin, zmax = z_extent
43 zmin = np.floor(zmin / dz) * dz
44 zmax = np.ceil(zmax / dz) * dz
45 return zmin, zmax
48def _fill_np(data, invalid):
49 """Basic nearest neighbour interpolation"""
50 # see: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
51 ind = scipy.ndimage.distance_transform_edt(
52 invalid, return_distances=False, return_indices=True
53 )
54 return data[tuple(ind)]
57def fill(da, invalid=None, by=None):
58 """
59 Replace the value of invalid ``da`` cells (indicated by ``invalid``)
60 using basic nearest neighbour interpolation.
62 Parameters
63 ----------
64 da: xr.DataArray with gaps
65 array containing missing value
66 by: str, optional
67 dimension over which the array will be filled, one by one.
68 See the examples.
70 invalid: xr.DataArray
71 a binary array of same shape as ``da``.
72 data value are replaced where invalid is True
73 If None (default), uses: `invalid = np.isnan(data)`
75 Returns
76 -------
77 xarray.DataArray
78 with the same coordinates as the input.
80 Examples
81 --------
83 A common use case is filling holes in a DataArray, filling it with the
84 value of its nearest (valid) neighbor:
86 >>> filled = imod.prepare.fill(da)
88 In case of a tie (e.g. neighbors in x and y are both one cell removed), the
89 neighbor in the last dimension is chosen (for rasters, that's generally x).
91 A typical use case is filling a 3D array (layer, y, x), but only in the
92 horizontal dimensions. The ``by`` keyword can be used to do this:
94 >>> filled = imod.prepare.fill(da, by="layer")
96 In this case, the array is filled by one layer at a time.
97 """
99 out = xr.full_like(da, np.nan)
100 if invalid is None:
101 invalid = np.isnan(da)
102 if by:
103 for coordvalue in da[by]:
104 d = {by: coordvalue}
105 out.sel(d)[...] = _fill_np(da.sel(d).values, invalid.sel(d).values)
106 else:
107 out.values = _fill_np(da.values, invalid.values)
109 return out
112def laplace_interpolate(
113 source, ibound=None, close=0.01, mxiter=5, iter1=50, relax=0.98
114):
115 """
116 Fills gaps in `source` by interpolating from existing values using Laplace
117 interpolation.
119 Parameters
120 ----------
121 source : xr.DataArray of floats with dims (y, x)
122 Data values to interpolate.
123 ibound : xr.DataArray of bool with dims (y, x)
124 Precomputed array which marks where to interpolate.
125 close : float
126 Closure criteration of iterative solver. Should be one to two orders
127 of magnitude smaller than desired accuracy.
128 mxiter : int
129 Outer iterations of iterative solver.
130 iter1 : int
131 Inner iterations of iterative solver. Should not exceed 50.
132 relax : float
133 Iterative solver relaxation parameter. Should be between 0 and 1.
135 Returns
136 -------
137 interpolated : xr.DataArray with dims (y, x)
138 source, with interpolated values where ibound equals 1
139 """
140 solver = pcg.PreconditionedConjugateGradientSolver(
141 close, close * 1.0e6, mxiter, iter1, relax
142 )
144 if not source.dims == ("y", "x"):
145 raise ValueError('source dims must be ("y", "x")')
147 # expand dims to make 3d
148 source3d = source.expand_dims("layer")
149 if ibound is None:
150 iboundv = xr.full_like(source3d, 1, dtype=np.int32).values
151 else:
152 if not ibound.dims == ("y", "x"):
153 raise ValueError('ibound dims must be ("y", "x")')
154 if not ibound.shape == source.shape:
155 raise ValueError("ibound and source must have the same shape")
156 iboundv = ibound.expand_dims("layer").astype(np.int32).values
158 has_data = source3d.notnull().values
159 iboundv[has_data] = -1
160 hnew = source3d.fillna(0.0).values.astype(
161 np.float64
162 ) # Set start interpolated estimate to 0.0
164 shape = iboundv.shape
165 nlay, nrow, ncol = shape
166 nodes = nlay * nrow * ncol
167 # Allocate work arrays
168 # Not really used now, but might come in handy to implements weights
169 cc = np.ones(shape)
170 cr = np.ones(shape)
171 cv = np.ones(shape)
172 rhs = np.zeros(shape)
173 hcof = np.zeros(shape)
174 # Solver work arrays
175 res = np.zeros(nodes)
176 cd = np.zeros(nodes)
177 v = np.zeros(nodes)
178 ss = np.zeros(nodes)
179 p = np.zeros(nodes)
181 # Picard iteration
182 converged = False
183 outer_iteration = 0
184 while not converged and outer_iteration < mxiter:
185 # Mutates hnew
186 converged = solver.solve(
187 hnew=hnew,
188 cc=cc,
189 cr=cr,
190 cv=cv,
191 ibound=iboundv,
192 rhs=rhs,
193 hcof=hcof,
194 res=res,
195 cd=cd,
196 v=v,
197 ss=ss,
198 p=p,
199 )
200 outer_iteration += 1
201 else:
202 if not converged:
203 raise RuntimeError("Failed to converge")
205 hnew[iboundv == 0] = np.nan
206 return source.copy(data=hnew[0])
209def rasterize(geodataframe, like, column=None, fill=np.nan, **kwargs):
210 """
211 Rasterize a geopandas GeoDataFrame onto the given
212 xarray coordinates.
214 Parameters
215 ----------
216 geodataframe : geopandas.GeoDataFrame
217 column : str, int, float
218 column name of geodataframe to burn into raster
219 like : xarray.DataArray
220 Example DataArray. The rasterized result will match the shape and
221 coordinates of this DataArray.
222 fill : float, int
223 Fill value for nodata areas. Optional, default value is np.nan.
224 kwargs : additional keyword arguments for rasterio.features.rasterize.
225 See: https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize
227 Returns
228 -------
229 rasterized : xarray.DataArray
230 Vector data rasterized. Matches shape and coordinates of ``like``.
231 """
233 if column is not None:
234 shapes = [
235 (geom, value)
236 for geom, value in zip(geodataframe.geometry, geodataframe[column])
237 ]
238 else:
239 shapes = [geom for geom in geodataframe.geometry]
241 # shapes must be an iterable
242 try:
243 iter(shapes)
244 except TypeError:
245 shapes = (shapes,)
247 raster = rasterio.features.rasterize(
248 shapes,
249 out_shape=like.shape,
250 fill=fill,
251 transform=imod.util.spatial.transform(like),
252 **kwargs,
253 )
255 return xr.DataArray(raster, like.coords, like.dims)
258def polygonize(da: xr.DataArray) -> "gpd.GeoDataFrame":
259 """
260 Polygonize a 2D-DataArray into a GeoDataFrame of polygons.
262 Parameters
263 ----------
264 da : xr.DataArray
266 Returns
267 -------
268 polygonized : geopandas.GeoDataFrame
269 """
270 return _polygonize(da)
273def _handle_dtype(dtype, nodata):
274 # Largely taken from rasterio.dtypes
275 # https://github.com/mapbox/rasterio/blob/master/rasterio/dtypes.py
276 # Not supported:
277 # GDT_CInt16 = 8, GDT_CInt32 = 9, GDT_CFloat32 = 10, GDT_CFloat64 = 11
278 dtype_mapping = {
279 "uint8": 1, # GDT_Byte
280 "uint16": 2, # GDT_Uint16
281 "int16": 3, # GDT_Int16
282 "uint32": 4, # GDT_Uint32
283 "int32": 5, # GDT_Int32
284 "float32": 6, # GDT_Float32
285 "float64": 7, # GDT_Float64
286 }
287 dtype_ranges = {
288 "uint8": (0, 255),
289 "uint16": (0, 65535),
290 "int16": (-32768, 32767),
291 "uint32": (0, 4294967295),
292 "int32": (-2147483648, 2147483647),
293 "float32": (-3.4028235e38, 3.4028235e38),
294 "float64": (-1.7976931348623157e308, 1.7976931348623157e308),
295 }
297 def format_invalid(str_dtype):
298 str_dtypes = dtype_mapping.keys()
299 return "Invalid dtype: {0}, must be one of: {1}".format(
300 str_dtype, ", ".join(str_dtypes)
301 )
303 if dtype is np.dtype(np.int64):
304 dtype = np.int32
306 str_dtype = str(np.dtype(dtype))
307 if str_dtype not in dtype_mapping.keys():
308 raise ValueError(format_invalid(str_dtype))
309 gdal_dtype = dtype_mapping[str_dtype]
311 if nodata is None:
312 if np.issubdtype(dtype, np.integer):
313 # Default to lowest value in case of integers
314 nodata = dtype_ranges[str_dtype][0]
315 elif np.issubdtype(dtype, np.floating):
316 # Default to NaN in case of floats
317 nodata = np.nan
318 else:
319 lower, upper = dtype_ranges[str_dtype]
320 if nodata < lower or nodata > upper:
321 raise ValueError(f"Nodata value {nodata} is out of bounds for {str_dtype}")
323 return gdal_dtype, nodata
326def gdal_rasterize(
327 path,
328 column,
329 like=None,
330 nodata=None,
331 dtype=None,
332 spatial_reference=None,
333 all_touched=False,
334):
335 """
336 Use GDAL to rasterize a vector file into an xarray.DataArray.
338 Can be significantly more efficient than rasterize. This doesn't load the
339 vector data into a GeoDataFrame and loops over the individual shapely
340 geometries like rasterio.rasterize does, but loops over the features within
341 GDAL instead.
343 Parameters
344 ----------
345 path : str or pathlib.Path
346 path to OGR supported vector file (e.g. a shapefile)
347 column : str
348 column name of column to burn into raster
349 like : xr.DataArray, optional
350 example of raster
351 nodata : int, float; optional
352 dtype : numpy.dtype, optional
353 spatial_reference : dict, optional
354 Optional dict to avoid allocating the like DataArray. Used if template
355 is None. Dict has keys "bounds" and "cellsizes", with:
357 * bounds = (xmin, xmax, ymin, ymax)
358 * cellsizes = (dx, dy)
359 all_touched : bool
360 If True: all pixels touched by lines or polygons will be updated, not
361 just those on the line render path, or whose center point is within the
362 polygon. Default value is False.
364 Returns
365 -------
366 rasterized : xr.DataArray
367 """
368 from osgeo import gdal, ogr
370 if isinstance(path, pathlib.Path):
371 p = path
372 path = str(p)
373 else:
374 p = pathlib.Path(path)
375 if not p.exists():
376 raise FileNotFoundError(f"No such file: {path}")
378 if dtype is None:
379 if like is None:
380 raise ValueError("If ``like`` is not provided, ``dtype`` has to be given")
381 else:
382 dtype = like.dtype
383 gdal_dtype, nodata = _handle_dtype(dtype, nodata)
385 # Exceptions will get raised on anything >= gdal.CE_Failure
386 gdal.UseExceptions()
388 # An attempt at decent errors
389 class GdalErrorHandler:
390 def __init__(self):
391 self.err_level = gdal.CE_None
392 self.err_no = 0
393 self.err_msg = ""
395 def handler(self, err_level, err_no, err_msg):
396 self.err_level = err_level
397 self.err_no = err_no
398 self.err_msg = err_msg
400 error = GdalErrorHandler()
401 handler = error.handler
402 gdal.PushErrorHandler(handler)
404 # Get spatial data from template
405 if like is not None:
406 dx, xmin, _, dy, _, ymax = imod.util.spatial.spatial_reference(like)
407 nrow, ncol = like.shape
408 else:
409 cellsizes = spatial_reference["cellsizes"]
410 bounds = spatial_reference["bounds"]
411 dx, dy = cellsizes
412 if not isinstance(dx, (int, float)) or not isinstance(dy, (int, float)):
413 raise ValueError("Cannot rasterize to a non-equidistant grid")
414 coords = imod.util.spatial._xycoords(bounds, cellsizes)
415 xmin, _, _, ymax = bounds
416 nrow = coords["y"].size
417 ncol = coords["x"].size
418 dims = ("y", "x")
420 # File will be closed when vector is dereferenced, after return
421 vector = ogr.Open(path)
422 vector_layer = vector.GetLayer()
424 memory_driver = gdal.GetDriverByName("MEM")
425 memory_raster = memory_driver.Create("", ncol, nrow, 1, gdal_dtype)
426 memory_raster.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
427 memory_band = memory_raster.GetRasterBand(1)
428 memory_band.SetNoDataValue(nodata)
429 memory_band.Fill(nodata)
431 options = [f"ATTRIBUTE={column}", f"ALL_TOUCHED={str(all_touched).upper()}"]
432 gdal.RasterizeLayer(memory_raster, [1], vector_layer, None, None, [1], options)
433 if error.err_level >= gdal.CE_Warning:
434 message = error.err_msg
435 if message.startswith(
436 "Failed to fetch spatial reference on layer"
437 ) and message.endswith("assuming matching coordinate systems."):
438 pass
439 else:
440 raise RuntimeError("GDAL error: " + error.err_msg)
442 if like is not None:
443 rasterized = like.copy(data=memory_raster.ReadAsArray())
444 else:
445 rasterized = xr.DataArray(memory_raster.ReadAsArray(), coords, dims)
447 return rasterized
450@numba.njit(cache=True)
451def _cell_count(src, values, frequencies, nodata, *inds_weights):
452 """
453 numba compiled function to count the number of src cells occuring in the dst
454 cells.
456 Parameters
457 ----------
458 src : np.array
459 values : np.array
460 work array to store the unique values
461 frequencies : np.array
462 work array to store the tallied counts
463 nodata : int, float
464 inds_weights : tuple of np.arrays
465 Contains indices of dst, indices of src, and weights.
467 Returns
468 -------
469 tuple of np.arrays
471 * row_indices
472 * col_indices
473 * values
474 * frequencies
475 """
476 jj, blocks_iy, blocks_weights_y, kk, blocks_ix, blocks_weights_x = inds_weights
478 # Use list for dynamic allocation, since we don't know number of rows in
479 # advance.
480 row_indices = []
481 col_indices = []
482 value_list = []
483 count_list = []
485 # j, k are indices of dst array
486 # block_i contains indices of src array
487 # block_w contains weights of src array
488 for countj, j in enumerate(jj):
489 block_iy = blocks_iy[countj]
490 block_wy = blocks_weights_y[countj]
491 for countk, k in enumerate(kk):
492 block_ix = blocks_ix[countk]
493 block_wx = blocks_weights_x[countk]
495 # TODO: use weights in frequency count, and area sum?
496 # Since src is equidistant, normed weights are easy to calculate.
498 # Add the values and weights per cell in multi-dim block
499 value_count = 0
500 for iy, wy in zip(block_iy, block_wy):
501 if iy < 0:
502 break
503 for ix, wx in zip(block_ix, block_wx):
504 if ix < 0:
505 break
507 v = src[iy, ix]
508 if v == nodata: # Skip nodata cells
509 continue
510 # Work on a single destination cell
511 # Count the number of polygon id's occuring in the cell
512 # a new row per id
513 found = False
514 for i in range(value_count):
515 if v == values[i]:
516 frequencies[i] += 1
517 found = True
518 break
519 if not found:
520 values[value_count] = v
521 frequencies[value_count] = 1
522 value_count += 1
523 # Add a new entry
524 row_indices.append(j)
525 col_indices.append(k)
527 # Store for output
528 value_list.extend(values[:value_count])
529 count_list.extend(frequencies[:value_count])
531 # reset storage
532 values[:value_count] = 0
533 frequencies[:value_count] = 0
535 # Cast to numpy arrays
536 row_i_arr = np.array(row_indices)
537 col_i_arr = np.array(col_indices)
538 value_arr = np.array(value_list)
539 count_arr = np.array(count_list)
541 return row_i_arr, col_i_arr, value_arr, count_arr
544def _celltable(path, column, resolution, like, rowstart=0, colstart=0):
545 """
546 Returns a table of cell indices (row, column) with feature ID, and feature
547 area within cell. Essentially returns a COO sparse matrix, but with
548 duplicate values per cell, since more than one geometry may be present.
550 The feature area within the cell is approximated by first rasterizing the
551 feature, and then counting the number of occuring cells. This means the
552 accuracy of the area depends on the resolution of the rasterization step.
554 Parameters
555 ----------
556 path : str or pathlib.Path
557 path to OGR supported vector file (e.g. a shapefile)
558 column : str
559 column name of column to burn into raster
560 resolution : float
561 cellsize at which the rasterization, and determination of area within
562 cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
563 like : xarray.DataArray
564 Example DataArray of where the cells will be located. Used only for the
565 coordinates.
567 Returns
568 -------
569 cell_table : pandas.DataFrame
570 """
571 # Avoid side-effects
572 like = like.copy(deep=False)
573 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
574 dx = resolution
575 dy = -dx
576 nodata = -1
577 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
579 rasterized = gdal_rasterize(
580 path, column, nodata=nodata, dtype=np.int32, spatial_reference=spatial_reference
581 )
583 # Make sure the coordinates are increasing.
584 dims = ("y", "x")
585 rasterized, _ = common._increasing_dims(rasterized, dims)
586 like, flip_dst = common._increasing_dims(like, dims)
588 dst_coords = [imod.prepare.common._coord(like, dim) for dim in ("y", "x")]
589 src_coords = [imod.prepare.common._coord(rasterized, dim) for dim in ("y", "x")]
590 # Determine weights for every regrid dimension, and alloc_len,
591 # the maximum number of src cells that may end up in a single dst cell
592 inds_weights = []
593 alloc_len = 1
594 for src_x, dst_x in zip(src_coords, dst_coords):
595 size, i_w = imod.prepare.common._weights_1d(src_x, dst_x)
596 for elem in i_w:
597 inds_weights.append(elem)
598 alloc_len *= size
600 # Pre-allocate work arrays
601 values = np.full(alloc_len, 0)
602 frequencies = np.full(alloc_len, 0)
603 rows, cols, values, counts = _cell_count(
604 rasterized.values, values, frequencies, nodata, *inds_weights
605 )
607 if "y" in flip_dst:
608 rows = (like["y"].size - 1) - rows
609 if "x" in flip_dst:
610 cols = (like["x"].size - 1) - cols
612 df = pd.DataFrame()
613 df["row_index"] = rows + rowstart
614 df["col_index"] = cols + colstart
615 df[column] = values
616 df["area"] = counts * (dx * dx)
618 return df
621def _create_chunks(like, resolution, chunksize):
622 """
623 Cuts data into chunksize by chunksize.
625 Parameters
626 ----------
627 like : xarray.DataArray
628 resolution : float
629 chunksize : int
631 Returns
632 -------
633 chunks : list of xr.DataArray
634 """
636 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
637 # Compute how many rows and columns are necessary for fine resolution
638 nrow = int((ymax - ymin) / resolution)
639 ncol = int((xmax - xmin) / resolution)
640 # Find out where to cut
641 x_starts = np.arange(0, ncol, chunksize) * resolution + xmin
642 y_starts = np.arange(0, nrow, chunksize) * resolution + ymin
643 # Searchsorted assumes the arrays are pre-sorted.
644 x = np.sort(like.coords["x"].values)
645 y = np.sort(like.coords["y"].values)
646 # Get the matching indices of like.
647 ix_starts = list(np.searchsorted(x, x_starts))
648 iy_starts = list(np.searchsorted(y, y_starts))
649 # Append None. In python's slice object, None denotes "slice including
650 # first/last element"
651 ix_ends = ix_starts[1:] + [None]
652 iy_ends = iy_starts[1:] + [None]
653 # Use xarray to grab the chunks. The chunks have x and y coordinates.
654 # These will inform GDAL on which part to rasterize.
655 # GDAL will only rasterize within the boundaries of the chunks, so there's
656 # no need to clip the shapefile beforehand.
657 chunks = []
658 rowstarts = []
659 colstarts = []
660 for j0, j1 in zip(iy_starts, iy_ends):
661 for i0, i1 in zip(ix_starts, ix_ends):
662 chunks.append(like.isel(y=slice(j0, j1), x=slice(i0, i1)))
663 rowstarts.append(j0)
664 colstarts.append(i0)
665 return chunks, rowstarts, colstarts
668def celltable(path, column, resolution, like, chunksize=1e4):
669 r"""
670 Process area of features by rasterizing in a chunkwise manner to limit
671 memory usage.
673 Returns a table of cell indices (row, column) with for example feature ID,
674 and feature area within cell. Essentially returns a COO sparse matrix, but
675 with duplicate values per cell, since more than one geometry may be present.
677 The feature area within the cell is approximated by first rasterizing the
678 feature, and then counting the number of occuring cells. This means the
679 accuracy of the area depends on the cellsize of the rasterization step.
681 A celltable is returned, as a ``pandas.DataFrame``. It has the following
682 columns:
684 1. ``"row_index"``
685 2. ``"col_index"``
686 3. the value of the ``column`` argument
687 4. ``"area"``
689 ``"row_index"`` and ``"col_index"`` are the indices of the like array in
690 which the polygon is located. The ``column`` value holds the rasterized
691 value of the specified column. ``"area"`` contains the area of the
692 polygon within the cell.
694 The most convenient way of using this celltable is by specifying a feature
695 ID as ``column``. After creating a celltable, ``pandas.DataFrame.merge()``
696 can be used to join additional data on this ID. Refer to the examples.
698 Parameters
699 ----------
700 path : str or pathlib.Path
701 path to OGR supported vector file (e.g. a shapefile)
702 column : str
703 column name of column to burn into raster
704 resolution : float
705 cellsize at which the rasterization, and determination of area within
706 cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
707 like : xarray.DataArray
708 Example DataArray of where the cells will be located. Used only for the
709 coordinates.
710 chunksize : int, optional
711 The size of the chunksize. Used for both x and y dimension.
713 Returns
714 -------
715 celltable : pandas.DataFrame
717 Examples
718 --------
719 Assume we have a shapefile called ``waterways.shp`` and information on the
720 model discretization is described by a ``like`` DataArray. The feature ID is
721 provided by a column in the shapefile called "ID-code". Additionally, this
722 shapefile also specifies bed hydraulic resistance (c0). For this specific
723 discretization, we wish to calculate a conductance (area divided by
724 hydraulic resistance). To do so, we:
726 1. create a ``celltable``
727 2. join the additional attributes (such as c0)
728 3. compute the conductance per feature
729 4. sum conductances per cell
731 Import the required packages.
733 >>> import imod
734 >>> import geopandas as gpd
736 Generate the celltable.
738 >>> celltable = imod.prepare.celltable(
739 path="waterways.shp",
740 column="ID-code",
741 resolution=0.5,
742 like=like,
743 )
745 Load the shapefile with geopandas into a ``GeoDataFrame``.
747 >>> gdf = gpd.read_file("waterways.shp)
749 Select the relevant columns into a ``pandas.DataFrame`` and merge with the
750 celltable.
752 >>> df = gdf[["ID-code", "c0"]]
753 >>> joined = celltable.merge(gdf, on="ID-code")
755 We compute the conductance, and sum it per cell using ``pandas`` methods:
757 >>> joined["conductance"] = joined["area"] / joined["c0"]
758 >>> summed_conductance = joined.groupby(["row_index", "col_index"], as_index=False)[
759 "conductance"
760 ].sum()
762 Finally, turn the result into a DataArray so it can be used as model input:
764 >>> conductance = imod.prepare.rasterize_celltable(
765 table=summed_conductance,
766 column="conductance",
767 like=like,
768 )
770 """
771 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
772 if not imod.util.spatial.is_divisor(dx, resolution):
773 raise ValueError("resolution is not an (integer) divisor of dx")
774 if not imod.util.spatial.is_divisor(dy, resolution):
775 raise ValueError("resolution is not an (integer) divisor of dy")
777 like_chunks, rowstarts, colstarts = _create_chunks(like, resolution, chunksize)
778 collection = [
779 dask.delayed(_celltable)(path, column, resolution, chunk, rowstart, colstart)
780 for chunk, rowstart, colstart in zip(like_chunks, rowstarts, colstarts)
781 ]
782 result = dask.compute(collection)[0]
783 return pd.concat(result)
786@numba.njit
787def _burn_cells(raster, rows, cols, values):
788 """
789 Burn values of sparse COO-matrix into raster.
790 rows, cols, and values form a sparse matrix in coordinate format (COO)
791 (also known as "ijv" or "triplet" format).
793 Parameters
794 ----------
795 raster : np.array
796 raster to burn values into.
797 rows : np.array of integers
798 row indices (i)
799 cols : np.array of integers
800 column indices (j)
801 values : np.array of floats
802 values to burn (v)
803 """
804 for i, j, v in zip(rows, cols, values):
805 raster[i, j] = v
806 return raster
809def rasterize_celltable(table, column, like):
810 """
811 Rasterizes a table, such as produced by ``imod.prepare.spatial.celltable``.
812 Before rasterization, multiple values should be grouped and aggregated per
813 cell. Values will be overwritten otherwise.
815 Parameters
816 ----------
817 like : xr.DataArray
818 table : pandas.DataFrame
819 with columns: "row_index", "col_index"
820 column : str, int, float
821 column name of values to rasterize
823 Returns
824 -------
825 rasterized : xr.DataArray
826 """
827 rows = table["row_index"].values
828 cols = table["col_index"].values
829 area = table[column].values
830 dst = like.copy()
831 dst.values = _burn_cells(dst.values, rows, cols, area)
832 return dst
835def _zonal_aggregate_raster(
836 path: Union[str, pathlib.Path],
837 column: str,
838 resolution: float,
839 raster: xr.DataArray,
840 method: Union[str, Callable],
841) -> pd.DataFrame:
842 """
843 * Rasterize the polygon at given `resolution`
844 * Sample the raster at the rasterized polygon cells
845 * Store both in a dataframe, groupby and aggregate according to `method`
846 """
847 data_dx, xmin, xmax, data_dy, ymin, ymax = imod.util.spatial.spatial_reference(
848 raster
849 )
850 dx = resolution
851 dy = -dx
852 nodata = -1
853 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
855 rasterized = gdal_rasterize(
856 path,
857 column,
858 nodata=nodata,
859 dtype=np.int32,
860 spatial_reference=spatial_reference,
861 all_touched=True,
862 )
863 ravelled = rasterized.values.ravel()
865 y = np.arange(ymax + 0.5 * dy, ymin, dy)
866 x = np.arange(xmin + 0.5 * dx, xmax, dx)
867 is_data = ravelled != nodata
868 feature_id = ravelled[is_data]
869 yy, xx = [a.ravel()[is_data] for a in np.meshgrid(y, x, indexing="ij")]
871 dims = ("y", "x")
872 rasterized, _ = common._increasing_dims(rasterized, dims)
873 raster, _ = common._increasing_dims(raster, dims)
874 y_ind = ((yy - ymin) / abs(data_dy)).astype(int)
875 x_ind = ((xx - xmin) / abs(data_dx)).astype(int)
876 sample = raster.values[y_ind, x_ind]
878 df = pd.DataFrame({column: feature_id, "data": sample})
879 # Remove entries where the raster has nodata.
880 # This may result in areas significantly smaller than the polygon geometry,
881 # but should come in handy for weighting later?
882 df = df[df["data"].notnull()]
883 result = df.groupby(column, as_index=False).agg(["count", method])
884 # Reset index to set "column" as column again, make sure index is dropped
885 result = result.reset_index(drop=True)
886 # Compute the area from the counted number of cells
887 result["data", "count"] *= resolution * resolution
888 name = raster.name if raster.name else "aggregated"
889 result.columns = [column, "area", name]
890 return result
893def _zonal_aggregate_polygons(
894 path_a: Union[str, pathlib.Path],
895 path_b: Union[str, pathlib.Path],
896 column_a: str,
897 column_b: str,
898 resolution: float,
899 like: xr.DataArray,
900 method: Union[str, Callable],
901) -> pd.DataFrame:
902 """
903 * Rasterize a, rasterize b for the same domain
904 * Store both in a dataframe, groupby and aggregate according to `method`
905 """
906 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
907 dx = resolution
908 dy = -dx
909 nodata = -1
910 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
912 rasterized_a = gdal_rasterize(
913 path_a,
914 column_a,
915 nodata=nodata,
916 dtype=np.int32,
917 spatial_reference=spatial_reference,
918 all_touched=True,
919 )
920 rasterized_b = gdal_rasterize(
921 path_b,
922 column_b,
923 nodata=np.nan,
924 dtype=np.float64,
925 spatial_reference=spatial_reference,
926 all_touched=True,
927 )
928 is_data = ((rasterized_a != nodata) & (rasterized_b.notnull())).values
929 a = rasterized_a.values[is_data].ravel()
930 b = rasterized_b.values[is_data].ravel()
931 df = pd.DataFrame({column_a: a, column_b: b})
932 # Remove entries where the raster has nodata.
933 # This may result in areas significantly smaller than the polygon geometry,
934 # but should come in handy for weighting later?
935 result = df.groupby(column_a, as_index=False).agg(["count", method])
936 # Reset index to set "column_a" as column again, make sure index is dropped
937 result = result.reset_index(drop=True)
938 # Compute the area from the counted number of cells
939 result[column_b, "count"] *= resolution * resolution
940 result.columns = [column_a, "area", column_b]
941 return result
944def zonal_aggregate_raster(
945 path: Union[pathlib.Path, str],
946 column: str,
947 raster: xr.DataArray,
948 resolution: float,
949 method: Union[str, Callable],
950 chunksize: int = 1e4,
951) -> pd.DataFrame:
952 """
953 Compute a zonal aggregate of raster data for polygon geometries, e.g. a mean,
954 mode, or percentile.
956 Parameters
957 ----------
958 path : str or pathlib.Path
959 path to OGR supported vector file (e.g. a shapefile). Defines zones
960 of aggregation.
961 column : str
962 column name of path, integer IDs defining zones.
963 raster : xarray.DataArray
964 Raster data from which to sample and aggregate data
965 resolution : float
966 cellsize at which the rasterization of polygons and sampling occurs
967 method : Union[str, Callable]
968 Aggregation method.
969 Anything that is acceptable by a pandas groupby aggregate:
970 https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
971 chunksize : int, optional
972 The size of the chunksize. Used for both x and y dimension.
974 Returns
975 -------
976 zonal_aggregates : pandas.DataFrame
978 Examples
979 --------
981 To compute the mean surface level at polygons of water bodies:
983 >>> import imod
984 >>> surface_level = imod.rasterio.open("surface_level.tif")
985 >>> df = imod.prepare.spatial.zonal_aggregate_raster(
986 >>> path="water-bodies.shp",
987 >>> column="id",
988 >>> raster=surface_level,
989 >>> resolution=1.0,
990 >>> method="mean",
991 >>> )
993 For some functions, like the mode, a function should be passed instead:
995 >>> import pandas as pd
996 >>> df = imod.prepare.spatial.zonal_aggregate_raster(
997 >>> path="water-bodies.shp",
998 >>> column="id",
999 >>> raster=surface_level,
1000 >>> resolution=1.0,
1001 >>> method=pd.Series.mode,
1002 >>> )
1003 """
1004 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(raster)
1005 if not imod.util.spatial.is_divisor(dx, resolution):
1006 raise ValueError("resolution is not an (integer) divisor of dx")
1007 if not imod.util.spatial.is_divisor(dy, resolution):
1008 raise ValueError("resolution is not an (integer) divisor of dy")
1010 without_chunks = (raster.chunks is None) or (
1011 all(length == 1 for length in map(len, raster.chunks))
1012 )
1013 if without_chunks:
1014 raster = raster.compute()
1016 raster_chunks, _, _ = _create_chunks(raster, resolution, chunksize)
1017 collection = [
1018 dask.delayed(_zonal_aggregate_raster)(path, column, resolution, chunk, method)
1019 for chunk in raster_chunks
1020 ]
1021 result = dask.compute(collection)[0]
1022 return pd.concat(result)
1025def zonal_aggregate_polygons(
1026 path_a: Union[pathlib.Path, str],
1027 path_b: Union[pathlib.Path, str],
1028 column_a: str,
1029 column_b: str,
1030 like: xr.DataArray,
1031 resolution: float,
1032 method: Union[str, Callable],
1033 chunksize: int = 1e4,
1034) -> pd.DataFrame:
1035 """
1036 Compute a zonal aggregate of polygon data for (other) polygon geometries,
1037 e.g. a mean, mode, or percentile.
1039 Parameters
1040 ----------
1041 path_a : str or pathlib.Path
1042 path to OGR supported vector file (e.g. a shapefile)
1043 path_b : str or pathlib.Path
1044 path to OGR supported vector file (e.g. a shapefile)
1045 column_a : str
1046 column name of path_a. Defines zones of aggregation.
1047 column_b : str
1048 column name of path_b. Data to aggregate.
1049 like : xarray.DataArray with dims ("y", "x")
1050 Example DataArray of where the cells will be located. Used only for its
1051 x and y coordinates.
1052 resolution : float
1053 cellsize at which the rasterization, sampling, and area measurement occur.
1054 method: Union[str, Callable]
1055 Aggregation method.
1056 Anything that is acceptable by a pandas groupby aggregate:
1057 https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
1058 chunksize : int, optional
1059 The size of the chunksize. Used for both x and y dimension.
1061 Returns
1062 -------
1063 zonal_aggregates: pandas.DataFrame
1064 """
1065 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
1066 if not imod.util.spatial.is_divisor(dx, resolution):
1067 raise ValueError("resolution is not an (integer) divisor of dx")
1068 if not imod.util.spatial.is_divisor(dy, resolution):
1069 raise ValueError("resolution is not an (integer) divisor of dy")
1071 like_chunks, _, _ = _create_chunks(like, resolution, chunksize)
1072 collection = [
1073 dask.delayed(_zonal_aggregate_polygons)(
1074 path_a, path_b, column_a, column_b, resolution, chunk, method
1075 )
1076 for chunk in like_chunks
1077 ]
1078 result = dask.compute(collection)[0]
1079 return pd.concat(result)