Coverage for C:\src\imod-python\imod\formats\array_io\reading.py: 98%

171 statements  

coverage.py v7.4.4, created at 2024-04-08 13:27 +0200

import collections
import glob
import itertools
import pathlib

import dask
import numpy as np
import xarray as xr

from imod.util import nested_dict, spatial, time


def _all_equal(seq, elem):
    """Raise an error if not all elements of a list are equal"""
    if not seq.count(seq[0]) == len(seq):
        raise ValueError(f"All {elem} must be the same, found: {set(seq)}")


def _check_cellsizes(cellsizes):
    """
    Checks if cellsizes match, raises ValueError otherwise

    Parameters
    ----------
    cellsizes : list of tuples
        tuples may contain:
        * two floats, dx and dy, for equidistant files
        * two ndarrays, dx and dy, for nonequidistant files

    Returns
    -------
    None
    """
    msg = "Cellsizes of IDFs do not match"
    if len(cellsizes) == 1:
        return None
    try:
        if not (cellsizes.count(cellsizes[0]) == len(cellsizes)):
            raise ValueError(msg)
    except ValueError:  # contains ndarrays
        try:
            # all ndarrays
            dx0, dy0 = cellsizes[0]
            for dx, dy in cellsizes[1:]:
                if np.allclose(dx0, dx) and np.allclose(dy0, dy):
                    pass
                else:
                    raise ValueError(msg)
        except ValueError:
            # some ndarrays, some floats
            # create floats for comparison with allclose
            try:
                dx = cellsizes[0][0][0]
                dy = cellsizes[0][1][0]
            except TypeError:
                dx = cellsizes[0][0]
                dy = cellsizes[0][1]
            # comparison
            for cellsize in cellsizes:
                # Unfortunately this allocates by broadcasting dx and dy
                if not np.allclose(cellsize[0], dx):
                    raise ValueError(msg)
                if not np.allclose(cellsize[1], dy):
                    raise ValueError(msg)
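
# A minimal usage sketch (illustration only, not part of the original module).
# _check_cellsizes accepts float (dx, dy) tuples for equidistant files and
# ndarray (dx, dy) tuples for nonequidistant files, and raises a ValueError
# for any mismatch beyond np.allclose tolerance.
def _example_check_cellsizes():  # hypothetical helper, for illustration
    # equidistant: identical float tuples pass silently
    _check_cellsizes([(25.0, -25.0), (25.0, -25.0)])
    # nonequidistant: per-column/per-row ndarrays are compared with allclose
    _check_cellsizes(
        [
            (np.full(3, 25.0), np.full(4, -25.0)),
            (np.full(3, 25.0), np.full(4, -25.0)),
        ]
    )
    # mismatching cell sizes raise
    try:
        _check_cellsizes([(25.0, -25.0), (50.0, -50.0)])
    except ValueError:
        pass  # expected: "Cellsizes of IDFs do not match"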


def _has_dim(seq):
    """Check whether a dimension is present: either none or all of the
    elements may be None.

    Returns
    -------
    True if seq contains no None, False if all elements are None;
    raises ValueError for a mix of None and other values.
    """
    nones = [x is None for x in seq]
    if any(nones):
        if all(nones):
            return False
        else:
            raise ValueError("Either 0 or all None allowed")
    return True
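
# Illustration only (not part of the original module): _has_dim distinguishes
# a fully present dimension from a fully absent one.
def _example_has_dim():  # hypothetical helper, for illustration
    assert _has_dim([1, 2, 3]) is True  # no None: dimension is present
    assert _has_dim([None, None]) is False  # all None: dimension is absent
    # a mix such as [1, None] raises ValueError("Either 0 or all None allowed")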


def _to_nan(a, nodata):
    """Change all nodata values in the array to NaN"""
    # it needs to be NaN for xarray to deal with it properly
    # no need to store the nodata value if it is always NaN
    if np.isnan(nodata):
        return a
    else:
        isnodata = np.isclose(a, nodata)
        a[isnodata] = np.nan
        return a
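
# Illustration only (not part of the original module): nodata values are
# replaced by NaN in place, so xarray treats them as missing data.
def _example_to_nan():  # hypothetical helper, for illustration
    a = np.array([1.0, -9999.0, 3.0])
    return _to_nan(a, nodata=-9999.0)  # array([ 1., nan,  3.])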


def _array_z_coord(coords, tops, bots, unique_indices):
    top = np.array(tops)[unique_indices]
    bot = np.array(bots)[unique_indices]
    dz = top - bot
    z = top - 0.5 * dz
    if top[0] > top[1]:  # decreasing coordinate
        dz *= -1.0
    if np.allclose(dz, dz[0]):
        coords["dz"] = dz[0]
    else:
        coords["dz"] = ("layer", dz)
    coords["z"] = ("layer", z)
    return coords


def _scalar_z_coord(coords, tops, bots):
    # They must be all the same to be used, as they cannot be assigned
    # to layer.
    top = np.unique(tops)
    bot = np.unique(bots)
    if top.size == bot.size == 1:
        dz = top - bot
        z = top - 0.5 * dz
        coords["dz"] = float(dz)  # cast from array
        coords["z"] = float(z)
    return coords
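
# A hedged sketch of the z-coordinate bookkeeping above (illustration only):
# with three voxel layers of equal thickness, the layer midpoints become the
# "z" coordinate and the signed thickness becomes a scalar "dz".
def _example_z_coords():  # hypothetical helper, for illustration
    tops = [20.0, 10.0, 0.0]
    bots = [10.0, 0.0, -10.0]
    coords = _array_z_coord({}, tops, bots, unique_indices=np.array([0, 1, 2]))
    # coords["z"]  -> ("layer", array([15.,  5., -5.]))
    # coords["dz"] -> -10.0 (negative because z decreases with increasing layer)
    return coords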


def _initialize_groupby(ndims):
    """
    This function generates a data structure consisting of defaultdicts, to use
    for grouping arrays by dimension. The number of dimensions may vary, so the
    degree of nesting might vary as well.

    For a single dimension such as layer, it'll look like:
    d = {1: da1, 2: da2, etc.}

    For two dimensions, layer and time:
    d = {"2001-01-01": {1: da1, 2: da2}, "2001-01-02": {1: da3, 2: da4}, etc.}

    And so on for more dimensions.

    Defaultdicts are very well suited to this application. The
    itertools.groupby object does not provide any benefits in this case: it
    simply provides a generator, its entries have to come presorted, and it
    does not provide tools for this kind of variably nested groupby.

    Pandas.groupby does provide this functionality. However, pandas dataframes
    do not accept arbitrary field values, whereas these dictionaries do. It
    might be worthwhile to look into, if performance is an issue.

    Parameters
    ----------
    ndims : int
        Number of dimensions

    Returns
    -------
    groupby : defaultdicts, ndims - 1 times nested
    """
    return nested_dict.initialize_nested_dict(ndims - 1)
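
# A hedged sketch of the grouping described in the docstring above
# (illustration only), using the nested_dict helpers this module already
# imports. For two dimensions (time, layer) the intermediate structure is
# {"2001-01-01": {1: da1, 2: da2}, "2001-01-02": {1: da3, 2: da4}}.
def _example_groupby():  # hypothetical helper, for illustration
    groupby = _initialize_groupby(ndims=2)
    for keys, da in [
        (["2001-01-01", 1], "da1"),
        (["2001-01-01", 2], "da2"),
        (["2001-01-02", 1], "da3"),
        (["2001-01-02", 2], "da4"),
    ]:
        nested_dict.set_nested(groupby, keys, da)
    # sorted_nested_dict is assumed (as in _load below) to return the values
    # as nested, key-sorted lists, ready to be stacked by _ndconcat.
    return nested_dict.sorted_nested_dict(groupby)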


def _ndconcat(arrays, ndim):
    """
    Parameters
    ----------
    arrays : list of lists, n levels deep.
        E.g. [[da1, da2], [da3, da4]] for n = 2.
        (compare with docstring for _initialize_groupby)
    ndim : int
        number of dimensions over which to concatenate.

    Returns
    -------
    concatenated : dask.array
        Input stacked over n dimensions.
    """
    if ndim == 1:  # base case
        return dask.array.stack(arrays, axis=0)
    else:  # recursive case
        ndim = ndim - 1
        out = [_ndconcat(arrays_in, ndim) for arrays_in in arrays]
        return dask.array.stack(out, axis=0)
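
# A minimal sketch (illustration only): stacking a two-level nested list of
# (nrow, ncol) blocks yields a (2, 2, nrow, ncol) dask array, matching the
# recursion above.
def _example_ndconcat():  # hypothetical helper, for illustration
    block = dask.array.zeros((3, 4), chunks=(3, 4))
    stacked = _ndconcat([[block, block], [block, block]], ndim=2)
    assert stacked.shape == (2, 2, 3, 4)
    return stacked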


def _dims_coordinates(header_coords, bounds, cellsizes, tops, bots, use_cftime):
    # create coordinates
    coords = spatial._xycoords(bounds[0], cellsizes[0])
    dims = ["y", "x"]
    # Time and layer have to be special cased.
    # Time due to the multitude of datetimes possible.
    # Layer because layer is required to properly assign top and bot data.
    haslayer = False
    hastime = False
    otherdims = []
    for dim in list(header_coords.keys()):
        if dim == "layer":
            haslayer = True
            coords["layer"], unique_indices = np.unique(
                header_coords["layer"], return_index=True
            )
        elif dim == "time":
            hastime = True
            times, use_cftime = time._convert_datetimes(
                header_coords["time"], use_cftime
            )
            if use_cftime:
                coords["time"] = xr.CFTimeIndex(np.unique(times))
            else:
                coords["time"] = np.unique(times)
        else:
            otherdims.append(dim)
            coords[dim] = np.unique(header_coords[dim])

    # Ensure the right dimension order
    if haslayer:
        dims.insert(0, "layer")
    if hastime:
        dims.insert(0, "time")
    for dim in otherdims:
        dims.insert(0, dim)

    # Deal with voxel idf top and bottom data
    all_have_z = all(map(lambda v: v is not None, itertools.chain(tops, bots)))
    if all_have_z:
        if haslayer and coords["layer"].size > 1:
            coords = _array_z_coord(coords, tops, bots, unique_indices)
        else:
            coords = _scalar_z_coord(coords, tops, bots)

    return dims, coords


def _dask(path, attrs=None, pattern=None, _read=None, header=None):
    """
    Read a single IDF file to a dask.array

    Parameters
    ----------
    path : str or Path
        Path to the IDF file to be read
    attrs : dict, optional
        A dict as returned by imod.idf.header; if not supplied, that function
        is called here. Used to minimize unneeded filesystem calls.
    pattern : str, regex pattern, optional
        If the filenames do not match the default naming convention of
        {name}_{time}_l{layer}, a custom pattern can be defined here either
        as a string, or as a compiled regular expression pattern. Please refer
        to the examples in ``imod.idf.open``.

    Returns
    -------
    dask.array
        A float32 dask.array with shape (nrow, ncol) of the values
        in the IDF file. On opening all nodata values are changed
        to NaN in the dask.array.
    dict
        A dict with all metadata.
    """

    path = pathlib.Path(path)

    if attrs is None:
        attrs = header(path, pattern)
    # If we don't unpack, it seems we run into trouble with the dask array later
    # on, probably because attrs isn't immutable. This works fine instead.
    headersize = attrs.pop("headersize")
    nrow = attrs["nrow"]
    ncol = attrs["ncol"]
    dtype = attrs["dtype"]
    # In case of floating point data, nodata is always represented by nan.
    if "float" in dtype:
        nodata = attrs.pop("nodata")
    else:
        nodata = attrs["nodata"]

    # Dask delayed caches the input arguments. If the working directory changes
    # before .compute(), the file cannot be found if the path is relative.
    abspath = path.resolve()
    # dask.delayed requires currying
    a = dask.delayed(_read)(abspath, headersize, nrow, ncol, nodata, dtype)
    x = dask.array.from_delayed(a, shape=(nrow, ncol), dtype=dtype)
    return x, attrs


def _load(paths, use_cftime, _read, headers):
    """Combine a list of paths to IDFs to a single xarray.DataArray"""
    # this function also works for single IDFs
    names = [h["name"] for h in headers]
    _all_equal(names, "names")

    # Extract data from headers
    bounds = []
    cellsizes = []
    tops = []
    bots = []
    header_coords = collections.defaultdict(list)
    for h in headers:
        bounds.append((h["xmin"], h["xmax"], h["ymin"], h["ymax"]))
        cellsizes.append((h["dx"], h["dy"]))
        tops.append(h.get("top", None))
        bots.append(h.get("bot", None))
        for key in h["dims"]:
            header_coords[key].append(h[key])
    # Do a basic check whether IDFs align in x and y
    _all_equal(bounds, "bounding boxes")
    _check_cellsizes(cellsizes)
    # Generate coordinates
    dims, coords = _dims_coordinates(
        header_coords, bounds, cellsizes, tops, bots, use_cftime
    )
    # This part makes use of recursion to deal with an arbitrary number
    # of dimensions. It may therefore be a little hard to read.
    groupbydims = dims[:-2]  # skip y and x
    ndim = len(groupbydims)
    groupby = _initialize_groupby(ndim)
    if ndim == 0:  # Single idf
        dask_array, _ = _dask(paths[0], headers[0], _read=_read)
    else:
        for path, attrs in zip(paths, headers):
            da, _ = _dask(path, attrs=attrs, _read=_read)
            groupbykeys = [attrs[k] for k in groupbydims]
            nested_dict.set_nested(groupby, groupbykeys, da)
        dask_arrays = nested_dict.sorted_nested_dict(groupby)
        dask_array = _ndconcat(dask_arrays, ndim)

    out = xr.DataArray(dask_array, coords, dims, name=names[0])

    first_attrs = headers[0]

    if "crs" in first_attrs:
        out.attrs["crs"] = first_attrs["crs"]
    if "nodata" in first_attrs:
        out.attrs["nodata"] = first_attrs["nodata"]

    return out


def _open(path, use_cftime, pattern, header, _read):
    if isinstance(path, pathlib.Path):
        path = str(path)

    if isinstance(path, list):
        paths = path
    else:
        paths = [pathlib.Path(p) for p in glob.glob(path)]

    headers = [header(p, pattern) for p in paths]
    n = len(paths)
    if n == 0:
        raise FileNotFoundError(f"Could not find any files matching {path}")
    return _load(paths, use_cftime, _read, headers)

347 return _load(paths, use_cftime, _read, headers)