Coverage for src/hdmf/data_utils.py: 93%

449 statements  

coverage.py v7.2.5, created at 2023-08-18 20:49 +0000

1import copy 

2import math 

3from abc import ABCMeta, abstractmethod 

4from collections.abc import Iterable 

5from warnings import warn 

6from typing import Tuple, Callable 

7from itertools import product, chain 

8 

9import h5py 

10import numpy as np 

11 

12from .utils import docval, getargs, popargs, docval_macro, get_data_shape 

13 

14 

15def append_data(data, arg): 

16 if isinstance(data, (list, DataIO)): 

17 data.append(arg) 

18 return data 

19 elif isinstance(data, np.ndarray): 

20 return np.append(data, np.expand_dims(arg, axis=0), axis=0) 

21 elif isinstance(data, h5py.Dataset):  # partial branch: 21 ↛ 28 (the condition was never false)

22 shape = list(data.shape) 

23 shape[0] += 1 

24 data.resize(shape) 

25 data[-1] = arg 

26 return data 

27 else: 

28 msg = "Cannot append data to object of type '%s'" % type(data)

29 raise ValueError(msg) 

30 

31 
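A usage sketch (illustrative, not part of the module): append_data dispatches on the container type, appending in place for lists and DataIO, and returning a new, grown array for numpy arrays.

    import numpy as np
    from hdmf.data_utils import append_data

    buf = [1, 2, 3]
    buf = append_data(buf, 4)             # list: appends in place -> [1, 2, 3, 4]

    arr = np.array([[1, 2], [3, 4]])
    arr = append_data(arr, [5, 6])        # ndarray: np.append returns a new array
    print(arr.shape)                      # (3, 2)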

32def extend_data(data, arg): 

33 """Add all the elements of the iterable arg to the end of data. 

34 

35 :param data: The array to extend 

36 :type data: list, DataIO, np.ndarray, h5py.Dataset 

37 """ 

38 if isinstance(data, (list, DataIO)): 

39 data.extend(arg) 

40 return data 

41 elif isinstance(data, np.ndarray): 

42 return np.vstack((data, arg)) 

43 elif isinstance(data, h5py.Dataset):  # partial branch: 43 ↛ 50 (the condition was never false)

44 shape = list(data.shape) 

45 shape[0] += len(arg) 

46 data.resize(shape) 

47 data[-len(arg):] = arg 

48 return data 

49 else: 

50 msg = "Cannot extend object of type '%s' with data" % type(data)

51 raise ValueError(msg) 

52 

53 
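A similar sketch for extend_data, which adds all elements of an iterable at once; for numpy arrays np.vstack is used, so the caller must keep the returned array.

    import numpy as np
    from hdmf.data_utils import extend_data

    buf = [1, 2]
    buf = extend_data(buf, [3, 4])            # list: extends in place -> [1, 2, 3, 4]

    arr = np.array([[1, 2]])
    arr = extend_data(arr, [[3, 4], [5, 6]])  # ndarray: vstack returns a new array
    print(arr.shape)                          # (3, 2)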

54@docval_macro('array_data') 

55class AbstractDataChunkIterator(metaclass=ABCMeta): 

56 """ 

57 Abstract iterator class used to iterate over DataChunks. 

58 

59 Derived classes must ensure that all abstract methods and abstract properties are implemented, in 

60 particular, dtype, maxshape, __iter__, __next__, recommended_chunk_shape, and recommended_data_shape. 

61 

62 Iterating over AbstractContainer objects is not yet supported. 

63 """ 

64 

65 @abstractmethod 

66 def __iter__(self): 

67 """Return the iterator object""" 

68 raise NotImplementedError("__iter__ not implemented for derived class") 

69 

70 @abstractmethod 

71 def __next__(self): 

72 r""" 

73 Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved. 

74 

75 HINT: numpy.s\_ provides a convenient way to generate index tuples using standard array slicing. This 

76 is often useful to define the DataChunk.selection of the current chunk. 

77 

78 :returns: DataChunk object with the data and selection of the current chunk 

79 :rtype: DataChunk 

80 """ 

81 raise NotImplementedError("__next__ not implemented for derived class") 

82 

83 @abstractmethod 

84 def recommended_chunk_shape(self): 

85 """ 

86 Recommend the chunk shape for the data array. 

87 

88 :return: NumPy-style shape tuple describing the recommended shape for the chunks of the target 

89 array or None. This may or may not be the same as the shape of the chunks returned in the 

90 iteration process. 

91 """ 

92 raise NotImplementedError("recommended_chunk_shape not implemented for derived class") 

93 

94 @abstractmethod 

95 def recommended_data_shape(self): 

96 """ 

97 Recommend the initial shape for the data array. 

98 

99 This is useful in particular to avoid repeated resizing of the target array when reading from 

100 this data iterator. This should typically be either the final size of the array or the known 

101 minimal shape of the array. 

102 

103 :return: NumPy-style shape tuple indicating the recommended initial shape for the target array. 

104 This may or may not be the final full shape of the array, i.e., the array is allowed 

105 to grow. This should not be None. 

106 """ 

107 raise NotImplementedError("recommended_data_shape not implemented for derived class") 

108 

109 @property 

110 @abstractmethod 

111 def dtype(self): 

112 """ 

113 Define the data type of the array 

114 

115 :return: NumPy style dtype or otherwise compliant dtype string 

116 """ 

117 raise NotImplementedError("dtype not implemented for derived class") 

118 

119 @property 

120 @abstractmethod 

121 def maxshape(self): 

122 """ 

123 Property describing the maximum shape of the data array that is being iterated over 

124 

125 :return: NumPy-style shape tuple indicating the maximum dimensions up to which the dataset may be 

126 resized. Axes with None are unlimited. 

127 """ 

128 raise NotImplementedError("maxshape not implemented for derived class") 

129 

130 
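To illustrate the contract, here is a minimal hypothetical subclass (the name RowIterator and its internals are illustrative only) that yields one row of an in-memory array per iteration:

    import numpy as np
    from hdmf.data_utils import AbstractDataChunkIterator, DataChunk

    class RowIterator(AbstractDataChunkIterator):
        def __init__(self, array):
            self.array = np.asarray(array)
            self._i = 0

        def __iter__(self):
            return self

        def __next__(self):
            if self._i >= self.array.shape[0]:
                raise StopIteration
            selection = np.s_[self._i:self._i + 1, ...]  # where this chunk lands in the target array
            self._i += 1
            return DataChunk(data=self.array[selection], selection=selection)

        def recommended_chunk_shape(self):
            return None  # no particular chunking preference

        def recommended_data_shape(self):
            return self.array.shape  # final shape is known up front

        @property
        def dtype(self):
            return self.array.dtype

        @property
        def maxshape(self):
            return self.array.shape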

131class GenericDataChunkIterator(AbstractDataChunkIterator): 

132 """DataChunkIterator that lets the user specify chunk and buffer shapes.""" 

133 

134 __docval_init = ( 

135 dict( 

136 name="buffer_gb", 

137 type=(float, int), 

138 doc=( 

139 "If buffer_shape is not specified, it will be inferred as the smallest chunk " 

140 "below the buffer_gb threshold." 

141 "Defaults to 1GB." 

142 ), 

143 default=None, 

144 ), 

145 dict( 

146 name="buffer_shape", 

147 type=tuple, 

148 doc="Manually defined shape of the buffer.", 

149 default=None, 

150 ), 

151 dict( 

152 name="chunk_mb", 

153 type=(float, int), 

154 doc=( 

155 "If chunk_shape is not specified, it will be inferred as the smallest chunk " 

156 "below the chunk_mb threshold.", 

157 "Defaults to 10MB.", 

158 ), 

159 default=None, 

160 ), 

161 dict( 

162 name="chunk_shape", 

163 type=tuple, 

164 doc="Manually defined shape of the chunks.", 

165 default=None, 

166 ), 

167 dict( 

168 name="display_progress", 

169 type=bool, 

170 doc="Display a progress bar with iteration rate and estimated completion time.", 

171 default=False, 

172 ), 

173 dict( 

174 name="progress_bar_options", 

175 type=None, 

176 doc="Dictionary of keyword arguments to be passed directly to tqdm.", 

177 default=None, 

178 ), 

179 ) 

180 

181 @docval(*__docval_init) 

182 def __init__(self, **kwargs): 

183 """ 

184 Break a dataset into buffers containing multiple chunks to be written into an HDF5 dataset. 

185 

186 Basic users should set the buffer_gb argument to as much free RAM space as can be safely allocated. 

187 Advanced users are offered full control over the shape parameters for the buffer and the chunks; however, 

188 the chunk shape must perfectly divide the buffer shape along each axis. 

189 

190 HDF5 recommends chunk size in the range of 2 to 16 MB for optimal cloud performance. 

191 https://youtu.be/rcS5vt-mKok?t=621 

192 """ 

193 buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, progress_bar_options = getargs( 

194 "buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs 

195 ) 

196 self.progress_bar_options = progress_bar_options or dict() 

197 

198 if buffer_gb is None and buffer_shape is None: 

199 buffer_gb = 1.0 

200 if chunk_mb is None and chunk_shape is None: 

201 chunk_mb = 10.0 

202 assert (buffer_gb is not None) != ( 

203 buffer_shape is not None 

204 ), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!" 

205 assert (chunk_mb is not None) != ( 

206 chunk_shape is not None 

207 ), "Only one of 'chunk_mb' or 'chunk_shape' can be specified!" 

208 

209 self._dtype = self._get_dtype() 

210 self._maxshape = tuple(int(x) for x in self._get_maxshape()) 

211 chunk_shape = tuple(int(x) for x in chunk_shape) if chunk_shape else chunk_shape 

212 self.chunk_shape = chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb) 

213 buffer_shape = tuple(int(x) for x in buffer_shape) if buffer_shape else buffer_shape 

214 self.buffer_shape = buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb) 

215 

216 # Shape assertions 

217 assert all( 

218 buffer_axis > 0 for buffer_axis in self.buffer_shape 

219 ), f"Some dimensions of buffer_shape ({self.buffer_shape}) are less than zero!" 

220 assert all( 

221 chunk_axis <= maxshape_axis for chunk_axis, maxshape_axis in zip(self.chunk_shape, self.maxshape) 

222 ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the data dimensions ({self.maxshape})!" 

223 assert all( 

224 buffer_axis <= maxshape_axis for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape) 

225 ), f"Some dimensions of buffer_shape ({self.buffer_shape}) exceed the data dimensions ({self.maxshape})!" 

226 assert all( 

227 (chunk_axis <= buffer_axis for chunk_axis, buffer_axis in zip(self.chunk_shape, self.buffer_shape)) 

228 ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the buffer shape ({self.buffer_shape})!" 

229 assert all( 

230 buffer_axis % chunk_axis == 0 

231 for chunk_axis, buffer_axis, maxshape_axis in zip(self.chunk_shape, self.buffer_shape, self.maxshape) 

232 if buffer_axis != maxshape_axis 

233 ), ( 

234 f"Some dimensions of chunk_shape ({self.chunk_shape}) do not " 

235 f"evenly divide the buffer shape ({self.buffer_shape})!" 

236 ) 

237 

238 self.num_buffers = math.prod( 

239 [ 

240 math.ceil(maxshape_axis / buffer_axis) 

241 for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape) 

242 ], 

243 ) 

244 self.buffer_selection_generator = ( 

245 tuple( 

246 [ 

247 slice(lower_bound, upper_bound) 

248 for lower_bound, upper_bound in zip(lower_bounds, upper_bounds) 

249 ] 

250 ) 

251 for lower_bounds, upper_bounds in zip( 

252 product( 

253 *[ 

254 range(0, max_shape_axis, buffer_shape_axis) 

255 for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape) 

256 ] 

257 ), 

258 product( 

259 *[ 

260 chain(range(buffer_shape_axis, max_shape_axis, buffer_shape_axis), [max_shape_axis]) 

261 for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape) 

262 ] 

263 ), 

264 ) 

265 ) 

266 

267 if self.display_progress: 

268 try: 

269 from tqdm import tqdm 

270 

271 if "total" in self.progress_bar_options: 

272 warn("Option 'total' in 'progress_bar_options' is not allowed to be over-written! Ignoring.") 

273 self.progress_bar_options.pop("total") 

274 

275 self.progress_bar = tqdm(total=self.num_buffers, **self.progress_bar_options) 

276 except ImportError: 

277 warn( 

278 "You must install tqdm to use the progress bar feature (pip install tqdm)! " 

279 "Progress bar is disabled." 

280 ) 

281 self.display_progress = False 

282 

283 @docval( 

284 dict( 

285 name="chunk_mb", 

286 type=(float, int), 

287 doc="Size of the HDF5 chunk in megabytes. Recommended to be less than 1MB.", 

288 default=None, 

289 ) 

290 ) 

291 def _get_default_chunk_shape(self, **kwargs) -> Tuple[int, ...]: 

292 """ 

293 Select chunk shape with size in MB less than the threshold of chunk_mb. 

294 

295 Keeps the dimensional ratios of the original data. 

296 """ 

297 chunk_mb = getargs("chunk_mb", kwargs) 

298 assert chunk_mb > 0, f"chunk_mb ({chunk_mb}) must be greater than zero!" 

299 

300 n_dims = len(self.maxshape) 

301 itemsize = self.dtype.itemsize 

302 chunk_bytes = chunk_mb * 1e6 

303 

304 min_maxshape = min(self.maxshape) 

305 v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in self.maxshape) 

306 prod_v = math.prod(v) 

307 while prod_v * itemsize > chunk_bytes and prod_v != 1: 

308 non_unit_min_v = min(x for x in v if x != 1) 

309 v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v) 

310 prod_v = math.prod(v) 

311 k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)) 

312 return tuple([min(k * x, self.maxshape[dim]) for dim, x in enumerate(v)]) 

313 

314 @docval( 

315 dict( 

316 name="buffer_gb", 

317 type=(float, int), 

318 doc="Size of the data buffer in gigabytes. Recommended to be as much free RAM as safely available.", 

319 default=None, 

320 ) 

321 ) 

322 def _get_default_buffer_shape(self, **kwargs) -> Tuple[int, ...]: 

323 """ 

324 Select buffer shape with size in GB less than the threshold of buffer_gb. 

325 

326 Keeps the dimensional ratios of the original data. 

327 Assumes the chunk_shape has already been set. 

328 """ 

329 buffer_gb = getargs("buffer_gb", kwargs) 

330 assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!" 

331 assert all(chunk_axis > 0 for chunk_axis in self.chunk_shape), ( 

332 f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!" 

333 ) 

334 

335 k = math.floor( 

336 ( 

337 buffer_gb * 1e9 / (math.prod(self.chunk_shape) * self.dtype.itemsize) 

338 ) ** (1 / len(self.chunk_shape)) 

339 ) 

340 return tuple( 

341 [ 

342 min(max(k * x, self.chunk_shape[j]), self.maxshape[j]) 

343 for j, x in enumerate(self.chunk_shape) 

344 ] 

345 ) 

346 

347 def __iter__(self): 

348 return self 

349 

350 def __next__(self): 

351 """ 

352 Retrieve the next DataChunk object from the buffer, refilling the buffer if necessary. 

353 

354 :returns: DataChunk object with the data and selection of the current buffer. 

355 :rtype: DataChunk 

356 """ 

357 if self.display_progress: 

358 self.progress_bar.update(n=1) 

359 try: 

360 buffer_selection = next(self.buffer_selection_generator) 

361 return DataChunk(data=self._get_data(selection=buffer_selection), selection=buffer_selection) 

362 except StopIteration: 

363 if self.display_progress: 

364 self.progress_bar.write("\n") # Allows text to be written to new lines after completion 

365 raise StopIteration 

366 

367 def __reduce__(self) -> Tuple[Callable, Iterable]: 

368 instance_constructor = self._from_dict 

369 initialization_args = (self._to_dict(),) 

370 return (instance_constructor, initialization_args) 

371 

372 @abstractmethod 

373 def _get_data(self, selection: Tuple[slice]) -> np.ndarray: 

374 """ 

375 Retrieve the data specified by the selection using minimal I/O. 

376 

377 The developer of a new implementation of the GenericDataChunkIterator must ensure the data is actually 

378 loaded into memory, and not simply mapped. 

379 

380 :param selection: Tuple of slices, one per axis of maxshape, selecting the span of the full shape to pull into the buffer 

381 :type selection: tuple of slices 

382 

383 :returns: Array of data specified by selection 

384 :rtype: np.ndarray 

385 

386 

387 

388 

389 """ 

390 raise NotImplementedError("The data fetching method has not been built for this DataChunkIterator!") 

391 

392 @abstractmethod 

393 def _get_maxshape(self) -> Tuple[int, ...]: 

394 """Retrieve the maximum bounds of the data shape using minimal I/O.""" 

395 raise NotImplementedError("The maxshape fetching method has not been built for this DataChunkIterator!") 

396 

397 @abstractmethod 

398 def _get_dtype(self) -> np.dtype: 

399 """Retrieve the dtype of the data using minimal I/O.""" 

400 raise NotImplementedError("The dtype fetching method has not been built for this DataChunkIterator!") 

401 

402 def _to_dict(self) -> dict: 

403 """Optional method to add in child classes to enable pickling (required for multiprocessing).""" 

404 raise NotImplementedError( 

405 "The `._to_dict()` method for pickling has not been defined for this DataChunkIterator!" 

406 ) 

407 

408 @staticmethod 

409 def _from_dict(dictionary) -> Callable: 

410 """Optional method to add in child classes to enable pickling (required for multiprocessing).""" 

411 raise NotImplementedError( 

412 "The `._from_dict()` method for pickling has not been defined for this DataChunkIterator!" 

413 ) 

414 

415 def recommended_chunk_shape(self) -> Tuple[int, ...]: 

416 return self.chunk_shape 

417 

418 def recommended_data_shape(self) -> Tuple[int, ...]: 

419 return self.maxshape 

420 

421 @property 

422 def maxshape(self) -> Tuple[int, ...]: 

423 return self._maxshape 

424 @property 

425 def dtype(self) -> np.dtype: 

426 return self._dtype 

427 

428 
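A minimal sketch of a concrete subclass; only the three _get_* hooks are required, and the class name ArrayChunkIterator is illustrative. Note that the wrapped data must be assigned before calling super().__init__, which invokes _get_dtype and _get_maxshape.

    import numpy as np
    from hdmf.data_utils import GenericDataChunkIterator

    class ArrayChunkIterator(GenericDataChunkIterator):
        def __init__(self, array, **kwargs):
            self._array = np.asarray(array)
            super().__init__(**kwargs)  # computes chunk_shape and buffer_shape

        def _get_data(self, selection):
            return self._array[selection]  # already in memory, so nothing is lazily mapped

        def _get_maxshape(self):
            return self._array.shape

        def _get_dtype(self):
            return self._array.dtype

    it = ArrayChunkIterator(np.random.rand(1000, 384), buffer_gb=0.1)
    for chunk in it:
        pass  # each chunk is a DataChunk covering one buffer-sized selection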

429class DataChunkIterator(AbstractDataChunkIterator): 

430 """ 

431 Custom iterator class used to iterate over chunks of data. 

432 

433 This default implementation of AbstractDataChunkIterator accepts any iterable and assumes that we iterate over 

434 a single dimension of the data array (default: the first dimension). DataChunkIterator supports buffered read, 

435 i.e., multiple values from the input iterator can be combined to a single chunk. This is 

436 useful for buffered I/O operations, e.g., to improve performance by accumulating data 

437 in memory and writing larger blocks at once. 

438 

439 .. note:: 

440 

441 DataChunkIterator assumes that the iterator that it wraps returns one element along the 

442 iteration dimension at a time. I.e., the iterator is expected to return chunks that are 

443 one dimension lower than the array itself. For example, when iterating over the first dimension 

444 of a dataset with shape (1000, 10, 10), then the iterator would return 1000 chunks of 

445 shape (10, 10) one-chunk-at-a-time. If this pattern does not match your use-case then 

446 using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` or 

447 :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more appropriate. 

448 """ 

449 

450 __docval_init = ( 

451 {'name': 'data', 'type': None, 'doc': 'The data object used for iteration', 'default': None}, 

452 {'name': 'maxshape', 'type': tuple, 

453 'doc': 'The maximum shape of the full data array. Use None to indicate unlimited dimensions', 

454 'default': None}, 

455 {'name': 'dtype', 'type': np.dtype, 'doc': 'The Numpy data type for the array', 'default': None}, 

456 {'name': 'buffer_size', 'type': int, 'doc': 'Number of values to be buffered in a chunk', 'default': 1}, 

457 {'name': 'iter_axis', 'type': int, 'doc': 'The dimension to iterate over', 'default': 0} 

458 ) 

459 

460 @docval(*__docval_init) 

461 def __init__(self, **kwargs): 

462 """Initialize the DataChunkIterator. 

463 If 'data' is an iterator and 'dtype' is not specified, then next is called on the iterator in order to determine 

464 the dtype of the data. 

465 """ 

466 # Get the user parameters 

467 self.data, self.__maxshape, self.__dtype, self.buffer_size, self.iter_axis = getargs('data', 

468 'maxshape', 

469 'dtype', 

470 'buffer_size', 

471 'iter_axis', 

472 kwargs) 

473 self.chunk_index = 0 

474 # Create an iterator for the data if possible 

475 if isinstance(self.data, Iterable): 

476 if self.iter_axis != 0 and isinstance(self.data, (list, tuple)): 

477 warn('Iterating over an axis other than the first dimension of list or tuple data ' 

478 'involves converting the data object to a numpy ndarray, which may incur a computational ' 

479 'cost.') 

480 self.data = np.asarray(self.data) 

481 if isinstance(self.data, np.ndarray): 

482 # iterate over the given axis by adding a new view on data (iter only works on the first dim) 

483 self.__data_iter = iter(np.moveaxis(self.data, self.iter_axis, 0)) 

484 else: 

485 self.__data_iter = iter(self.data) 

486 else: 

487 self.__data_iter = None 

488 self.__next_chunk = DataChunk(None, None) 

489 self.__next_chunk_start = 0 

490 self.__first_chunk_shape = None 

491 # Determine the shape of the data if possible 

492 if self.__maxshape is None: 

493 # If the self.data object identifies its shape, then use it 

494 if hasattr(self.data, "shape"): 

495 self.__maxshape = self.data.shape 

496 # Avoid the special case of scalar values by making them into a 1D numpy array 

497 if len(self.__maxshape) == 0:  # partial branch: 497 ↛ 498 (the condition was never true)

498 self.data = np.asarray([self.data, ]) 

499 self.__maxshape = self.data.shape 

500 self.__data_iter = iter(self.data) 

501 # Try to get an accurate idea of __maxshape for other Python data structures if possible. 

502 # Don't just call get_data_shape for a generator as that would potentially trigger loading of all the data 

503 elif isinstance(self.data, list) or isinstance(self.data, tuple): 

504 self.__maxshape = get_data_shape(self.data, strict_no_data_load=True) 

505 

506 # If we have a data iterator and do not know the dtype, then read the first chunk 

507 if self.__data_iter is not None and self.__dtype is None: 

508 self._read_next_chunk() 

509 

510 # Determine the type of the data if possible 

511 if self.__next_chunk.data is not None: 

512 self.__dtype = self.__next_chunk.data.dtype 

513 self.__first_chunk_shape = get_data_shape(self.__next_chunk.data) 

514 

515 # This should be done as a last resort only 

516 if self.__first_chunk_shape is None and self.__maxshape is not None: 

517 self.__first_chunk_shape = tuple(1 if i is None else i for i in self.__maxshape) 

518 

519 if self.__dtype is None: 

520 raise Exception('Data type could not be determined. Please specify dtype in DataChunkIterator init.') 

521 

522 @classmethod 

523 @docval(*__docval_init) 

524 def from_iterable(cls, **kwargs): 

525 return cls(**kwargs) 

526 

527 def __iter__(self): 

528 """Return the iterator object""" 

529 return self 

530 

531 def _read_next_chunk(self): 

532 """Read a single chunk from self.__data_iter and store the results in self.__next_chunk 

533 

534 :returns: self.__next_chunk, i.e., the DataChunk object describing the next chunk 

535 """ 

536 # h5py is imported at module level, so the local "from h5py import Dataset" alias is unnecessary 

537 if isinstance(self.data, h5py.Dataset): 

538 start_index = self.chunk_index * self.buffer_size 

539 stop_index = start_index + self.buffer_size 

540 iter_data_bounds = self.data.shape[self.iter_axis] 

541 if start_index >= iter_data_bounds: 

542 self.__next_chunk = DataChunk(None, None) 

543 else: 

544 if stop_index > iter_data_bounds: 

545 stop_index = iter_data_bounds 

546 

547 selection = [slice(None)] * len(self.maxshape) 

548 selection[self.iter_axis] = slice(start_index, stop_index) 

549 selection = tuple(selection) 

550 self.__next_chunk.data = self.data[selection] 

551 self.__next_chunk.selection = selection 

552 elif self.__data_iter is not None: 

553 # the pieces in the buffer - first dimension consists of individual calls to next 

554 iter_pieces = [] 

555 # offset of where data begins - shift the selection of where to place this chunk by this much 

556 curr_chunk_offset = 0 

557 read_next_empty = False 

558 while len(iter_pieces) < self.buffer_size: 

559 try: 

560 dat = next(self.__data_iter) 

561 if dat is None and len(iter_pieces) == 0: 

562 # Skip forward in our chunk until we find data 

563 curr_chunk_offset += 1 

564 elif dat is None and len(iter_pieces) > 0: 

565 # Stop iteration if we hit empty data while constructing our block 

566 # Buffer may not be full. 

567 read_next_empty = True 

568 break 

569 else: 

570 # Add pieces of data to our buffer 

571 iter_pieces.append(np.asarray(dat)) 

572 except StopIteration: 

573 break 

574 

575 if len(iter_pieces) == 0: 

576 self.__next_chunk = DataChunk(None, None) # signal end of iteration 

577 else: 

578 # concatenate all the pieces into the chunk along the iteration axis 

579 piece_shape = list(get_data_shape(iter_pieces[0])) 

580 piece_shape.insert(self.iter_axis, 1) # insert the missing axis 

581 next_chunk_shape = piece_shape.copy() 

582 next_chunk_shape[self.iter_axis] *= len(iter_pieces) 

583 next_chunk_size = next_chunk_shape[self.iter_axis] 

584 

585 # use the piece dtype because the actual dtype may not have been determined yet 

586 # NOTE: this could be problematic if a generator returns e.g. floats first and ints later 

587 # np.stack allocates and fills the chunk in one step, inferring the dtype from the pieces 

588 self.__next_chunk.data = np.stack(iter_pieces, axis=self.iter_axis) 

589 

590 selection = [slice(None)] * len(self.maxshape) 

591 selection[self.iter_axis] = slice(self.__next_chunk_start + curr_chunk_offset, 

592 self.__next_chunk_start + curr_chunk_offset + next_chunk_size) 

593 self.__next_chunk.selection = tuple(selection) 

594 

595 # next chunk should start at self.__next_chunk.selection[self.iter_axis].stop 

596 # but if this chunk stopped because of reading empty data, then this should be adjusted by 1 

597 self.__next_chunk_start = self.__next_chunk.selection[self.iter_axis].stop 

598 if read_next_empty: 

599 self.__next_chunk_start += 1 

600 else: 

601 self.__next_chunk = DataChunk(None, None) 

602 

603 self.chunk_index += 1 

604 return self.__next_chunk 

605 

606 def __next__(self): 

607 """ 

608 Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved. 

609 

610 .. tip:: 

611 

612 :py:attr:`numpy.s_` provides a convenient way to generate index tuples using standard array slicing. This 

613 is often useful to define the DataChunk.selection of the current chunk. 

614 

615 :returns: DataChunk object with the data and selection of the current chunk 

616 :rtype: DataChunk 

617 

618 """ 

619 # If we have not already read the next chunk, then read it now 

620 if self.__next_chunk.data is None: 

621 self._read_next_chunk() 

622 # If we do not have any next chunk 

623 if self.__next_chunk.data is None: 

624 raise StopIteration 

625 # If this is the first time we see a chunk then remember the size of the first chunk 

626 if self.__first_chunk_shape is None:  # partial branch: 626 ↛ 627 (the condition was never true)

627 self.__first_chunk_shape = self.__next_chunk.data.shape 

628 # Keep the next chunk we need to return 

629 curr_chunk = DataChunk(self.__next_chunk.data, 

630 self.__next_chunk.selection) 

631 # Remove the data for the next chunk from our list since we are returning it here. 

632 # This is to allow the GarbageCollector to remove the data when it goes out of scope and avoid 

633 # having 2 full chunks in memory if not necessary 

634 self.__next_chunk.data = None 

635 # Return the current next chunk 

636 return curr_chunk 

637 

638 next = __next__  # alias retained for callers that use the legacy .next() method 

639 

640 @docval(returns='Tuple with the recommended chunk shape or None if no particular shape is recommended.') 

641 def recommended_chunk_shape(self): 

642 """Recommend a chunk shape. 

643 

644 To optimize iterative write, the chunk should be aligned with the common shape of chunks returned by 

645 __next__ or, if those chunks are too large, with a well-aligned subset of those chunks. This may also be 

646 any other value in case one wants to recommend chunk shapes to optimize read rather 

647 than write. The default implementation returns None, indicating no preferential chunking option.""" 

648 return None 

649 

650 @docval(returns='Recommended initial shape for the full data. This should be the shape of the full dataset ' + 

651 'if known beforehand or alternatively the minimum shape of the dataset. Return None if no ' + 

652 'recommendation is available') 

653 def recommended_data_shape(self): 

654 """Recommend an initial shape of the data. This is useful when progressively writing data and 

655 we want to recommend an initial size for the dataset.""" 

656 if self.maxshape is not None: 

657 if np.all([i is not None for i in self.maxshape]): 

658 return self.maxshape 

659 return self.__first_chunk_shape 

660 

661 @property 

662 def maxshape(self): 

663 """ 

664 Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator. 

665 

666 .. note:: 

667 

668 If an iterator is provided and no data has been read yet, then the first chunk will be read 

669 (i.e., next will be called on the iterator) in order to determine the maxshape. The iterator 

670 is expected to return single chunks along the iterator dimension, this means that maxshape will 

671 add an additional dimension along the iteration dimension. E.g., if we iterate over 

672 the first dimension and the iterator returns chunks of shape (10, 10), then the maxshape would 

673 be (None, 10, 10) or (len(self.data), 10, 10), depending on whether the size of the 

674 iteration dimension is known. 

675 

676 :return: Shape tuple. None is used for dimensions where the maximum shape is not known or unlimited. 

677 """ 

678 if self.__maxshape is None: 

679 # If no data has been read from the iterator yet, read the first chunk and use it to determine the maxshape 

680 if self.__data_iter is not None and self.__next_chunk.data is None:  # partial branch: 680 ↛ 681 (the condition was never true)

681 self._read_next_chunk() 

682 

683 # Determine maxshape from self.__next_chunk 

684 if self.__next_chunk.data is None: 

685 return None 

686 data_shape = get_data_shape(self.__next_chunk.data) 

687 self.__maxshape = list(data_shape) 

688 try: 

689 # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a 

690 # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if 

691 # possible. Otherwise, use None to represent an unlimited size 

692 if hasattr(self.data, '__len__') and self.iter_axis == 0: 

693 # special case of 1-D array 

694 self.__maxshape[0] = len(self.data) 

695 else: 

696 self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis] 

697 except AttributeError: # from self.data.shape 

698 self.__maxshape[self.iter_axis] = None 

699 self.__maxshape = tuple(self.__maxshape) 

700 

701 return self.__maxshape 

702 

703 @property 

704 def dtype(self): 

705 """ 

706 Get the value data type 

707 

708 :return: np.dtype object describing the datatype 

709 """ 

710 return self.__dtype 

711 

712 
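A usage sketch: wrapping a generator so data can be written one buffered block at a time; the dtype is inferred by reading the first chunk during construction.

    import numpy as np
    from hdmf.data_utils import DataChunkIterator

    def rows():
        for _ in range(100):
            yield np.random.rand(10)  # one element along the iteration dimension

    dci = DataChunkIterator(data=rows(), buffer_size=20)
    print(dci.dtype)         # float64, inferred from the first chunk
    print(dci.maxshape)      # (None, 10): the generator length is unknown, so axis 0 is unlimited
    chunk = next(dci)
    print(chunk.data.shape)  # (20, 10): buffer_size rows stacked along the iteration axis
    print(chunk.selection)   # (slice(0, 20, None), slice(None, None, None))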

713class DataChunk: 

714 """ 

715 Class used to describe a data chunk. Used in DataChunkIterator. 

716 """ 

717 

718 @docval({'name': 'data', 'type': np.ndarray, 

719 'doc': 'Numpy array with the data value(s) of the chunk', 'default': None}, 

720 {'name': 'selection', 'type': None, 

721 'doc': 'Numpy index tuple describing the location of the chunk', 'default': None}) 

722 def __init__(self, **kwargs): 

723 self.data, self.selection = getargs('data', 'selection', kwargs) 

724 

725 def __len__(self): 

726 """Get the number of values in the data chunk""" 

727 if self.data is not None: 

728 return len(self.data) 

729 else: 

730 return 0 

731 

732 def __getattr__(self, attr): 

733 """Delegate retrieval of attributes to the data in self.data""" 

734 return getattr(self.data, attr) 

735 

736 def __copy__(self): 

737 newobj = DataChunk(data=self.data, 

738 selection=self.selection) 

739 return newobj 

740 

741 def __deepcopy__(self, memo): 

742 result = DataChunk(data=copy.deepcopy(self.data), 

743 selection=copy.deepcopy(self.selection)) 

744 memo[id(self)] = result 

745 return result 

746 

747 def astype(self, dtype): 

748 """Get a new DataChunk with the self.data converted to the given type""" 

749 return DataChunk(data=self.data.astype(dtype), 

750 selection=self.selection) 

751 

752 @property 

753 def dtype(self): 

754 """ 

755 Data type of the values in the chunk 

756 

757 :returns: np.dtype of the values in the DataChunk 

758 """ 

759 return self.data.dtype 

760 

761 def get_min_bounds(self): 

762 """ 

763 Helper function to compute the minimum dataset size required to fit the selection of this chunk. 

764 

765 :raises TypeError: If the selection is not a single int, a slice, or a tuple of slices and/or ints. 

766 

767 :return: Tuple with the minimum shape required to store the selection 

768 """ 

769 if isinstance(self.selection, tuple):  # partial branch: 769 ↛ 772 (the condition was never false)

770 # Determine the minimum array dimensions to fit the chunk selection 

771 max_bounds = tuple([(x.stop or 0) if isinstance(x, slice) else x + 1 for x in self.selection]) 

772 elif isinstance(self.selection, int): 

773 max_bounds = (self.selection+1, ) 

774 elif isinstance(self.selection, slice): 

775 max_bounds = (self.selection.stop or 0, ) 

776 else: 

777 # Note: Technically any numpy index tuple would be allowed, but h5py is not as general and this case 

778 # only implements the selections supported by h5py. We could add more cases to support a 

779 # broader range of valid numpy selection types 

780 msg = ("Chunk selection %s must be a single int, single slice, or tuple of slices " 

781 "and/or integers") % str(self.selection) 

782 raise TypeError(msg) 

783 return max_bounds 

784 

785 
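For instance (a sketch), a chunk destined for rows 10 through 19 of a 2-D target reports the minimum dataset shape needed to hold it:

    import numpy as np
    from hdmf.data_utils import DataChunk

    chunk = DataChunk(data=np.zeros((10, 5)), selection=np.s_[10:20, 0:5])
    print(len(chunk))              # 10, delegated to chunk.data
    print(chunk.get_min_bounds())  # (20, 5): the stops of the selection slices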

786def assertEqualShape(data1, 

787 data2, 

788 axes1=None, 

789 axes2=None, 

790 name1=None, 

791 name2=None, 

792 ignore_undetermined=True): 

793 """ 

794 Ensure that the shape of data1 and data2 match along the given dimensions 

795 

796 :param data1: The first input array 

797 :type data1: List, Tuple, np.ndarray, DataChunkIterator etc. 

798 :param data2: The second input array 

799 :type data2: List, Tuple, np.ndarray, DataChunkIterator etc. 

800 :param name1: Optional string with the name of data1 

801 :param name2: Optional string with the name of data2 

802 :param axes1: The dimensions of data1 that should be matched to the dimensions of data2. Set to None to 

803 compare all axes in order. 

804 :type axes1: int, Tuple of ints, List of ints, or None 

805 :param axes2: The dimensions of data2 that should be matched to the dimensions of data1. Must have 

806 the same length as axes1. Set to None to compare all axes in order. 

807 :type axes2: int, Tuple of ints, List of ints, or None 

808 :param ignore_undetermined: Boolean indicating whether non-matching unlimited dimensions should be ignored, 

809 i.e., if two dimensions don't match because we can't determine the shape of either one, then 

810 should we ignore that case or treat it as no match 

811 

812 :return: ShapeValidatorResult indicating whether the check passed, including a message about the matching process 

813 """ 

814 # Create the base return object 

815 response = ShapeValidatorResult() 

816 # Determine the shape of the datasets 

817 response.shape1 = get_data_shape(data1) 

818 response.shape2 = get_data_shape(data2) 

819 # Determine the number of dimensions of the datasets 

820 num_dims_1 = len(response.shape1) if response.shape1 is not None else None 

821 num_dims_2 = len(response.shape2) if response.shape2 is not None else None 

822 # Determine the string names of the datasets 

823 n1 = name1 if name1 is not None else ("data1 at " + str(hex(id(data1)))) 

824 n2 = name2 if name2 is not None else ("data2 at " + str(hex(id(data2)))) 

825 # Determine the axes we should compare 

826 response.axes1 = list(range(num_dims_1)) if axes1 is None else ([axes1] if isinstance(axes1, int) else axes1) 

827 response.axes2 = list(range(num_dims_2)) if axes2 is None else ([axes2] if isinstance(axes2, int) else axes2) 

828 # Validate the array shape 

829 # 1) Check the number of dimensions of the arrays 

830 if (axes1 is None and axes2 is None) and num_dims_1 != num_dims_2:  # partial branch: 830 ↛ 831 (the condition was never true)

831 response.result = False 

832 response.error = 'NUM_DIMS_ERROR' 

833 response.message = response.SHAPE_ERROR[response.error] 

834 response.message += " %s is %sD and %s is %sD" % (n1, num_dims_1, n2, num_dims_2) 

835 # 2) Check that we have the same number of dimensions to compare on both arrays 

836 elif len(response.axes1) != len(response.axes2): 

837 response.result = False 

838 response.error = 'NUM_AXES_ERROR' 

839 response.message = response.SHAPE_ERROR[response.error] 

840 response.message += " Cannot compare axes %s with %s" % (str(response.axes1), str(response.axes2)) 

841 # 3) Check that the datasets have sufficient number of dimensions 

842 elif np.max(response.axes1) >= num_dims_1 or np.max(response.axes2) >= num_dims_2: 

843 response.result = False 

844 response.error = 'AXIS_OUT_OF_BOUNDS' 

845 response.message = response.SHAPE_ERROR[response.error] 

846 if np.max(response.axes1) >= num_dims_1: 

847 response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \ 

848 (n1, np.max(response.axes1) + 1, num_dims_1) 

849 elif np.max(response.axes2) >= num_dims_2:  # partial branch: 849 ↛ 885 (the condition was never false)

850 response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \ 

851 (n2, np.max(response.axes2) + 1, num_dims_2) 

852 # 4) Compare the length of the dimensions we should validate 

853 else: 

854 unmatched = [] 

855 ignored = [] 

856 for ax in zip(response.axes1, response.axes2): 

857 if response.shape1[ax[0]] != response.shape2[ax[1]]: 

858 if ignore_undetermined and (response.shape1[ax[0]] is None or response.shape2[ax[1]] is None): 

859 ignored.append(ax) 

860 else: 

861 unmatched.append(ax) 

862 response.unmatched = unmatched 

863 response.ignored = ignored 

864 

865 # Check if everything checked out 

866 if len(response.unmatched) == 0: 

867 response.result = True 

868 response.error = None 

869 response.message = response.SHAPE_ERROR[response.error] 

870 if len(response.ignored) > 0: 

871 response.message += " Ignored undetermined axes %s" % str(response.ignored) 

872 else: 

873 response.result = False 

874 response.error = 'AXIS_LEN_ERROR' 

875 response.message = response.SHAPE_ERROR[response.error] 

876 response.message += " Axes %s with sizes %s of %s did not match dimensions %s with sizes %s of %s." % \ 

877 (str([un[0] for un in response.unmatched]), 

878 str([response.shape1[un[0]] for un in response.unmatched]), 

879 n1, 

880 str([un[1] for un in response.unmatched]), 

881 str([response.shape2[un[1]] for un in response.unmatched]), 

882 n2) 

883 if len(response.ignored) > 0:  # partial branch: 883 ↛ 884 (the condition was never true)

884 response.message += " Ignored undetermined axes %s" % str(response.ignored) 

885 return response 

886 

887 
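A quick sketch of the validator, comparing the first axis of one array against the second axis of another:

    import numpy as np
    from hdmf.data_utils import assertEqualShape

    a = np.zeros((10, 3))
    b = np.zeros((5, 10))

    res = assertEqualShape(a, b, axes1=0, axes2=1, name1='a', name2='b')
    print(res.result)    # True: a.shape[0] == b.shape[1] == 10

    res = assertEqualShape(a, b, name1='a', name2='b')  # compare all axes in order
    print(res.result)    # False
    print(res.message)   # "Unequal length of axes. ..." listing the mismatched axes of a and b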

888class ShapeValidatorResult: 

889 """Class for storing results from validating the shape of multi-dimensional arrays. 

890 

891 This class is used to store results generated by ShapeValidator 

892 

893 :ivar result: Boolean indicating whether results matched or not 

894 :type result: bool 

895 :ivar message: Message indicating the result of the matching procedure 

896 :type message: str, None 

897 """ 

898 SHAPE_ERROR = {None: 'All required axes matched', 

899 'NUM_DIMS_ERROR': 'Unequal number of dimensions.', 

900 'NUM_AXES_ERROR': "Unequal number of axes for comparison.", 

901 'AXIS_OUT_OF_BOUNDS': "Axis index for comparison out of bounds.", 

902 'AXIS_LEN_ERROR': "Unequal length of axes."} 

903 """ 

904 Dict where the Keys are the type of errors that may have occurred during shape comparison and the 

905 values are strings with default error messages for the type. 

906 """ 

907 

908 @docval({'name': 'result', 'type': bool, 'doc': 'Result of the shape validation', 'default': False}, 

909 {'name': 'message', 'type': str, 

910 'doc': 'Message describing the result of the shape validation', 'default': None}, 

911 {'name': 'ignored', 'type': tuple, 

912 'doc': 'Axes that have been ignored in the validation process', 'default': tuple(), 'shape': (None,)}, 

913 {'name': 'unmatched', 'type': tuple, 

914 'doc': 'List of axes that did not match during shape validation', 'default': tuple(), 'shape': (None,)}, 

915 {'name': 'error', 'type': str, 'doc': 'Error that may have occurred. One of ERROR_TYPE', 'default': None}, 

916 {'name': 'shape1', 'type': tuple, 

917 'doc': 'Shape of the first array for comparison', 'default': tuple(), 'shape': (None,)}, 

918 {'name': 'shape2', 'type': tuple, 

919 'doc': 'Shape of the second array for comparison', 'default': tuple(), 'shape': (None,)}, 

920 {'name': 'axes1', 'type': tuple, 

921 'doc': 'Axes for the first array that should match', 'default': tuple(), 'shape': (None,)}, 

922 {'name': 'axes2', 'type': tuple, 

923 'doc': 'Axes for the second array that should match', 'default': tuple(), 'shape': (None,)}, 

924 ) 

925 def __init__(self, **kwargs): 

926 self.result, self.message, self.ignored, self.unmatched, \ 

927 self.error, self.shape1, self.shape2, self.axes1, self.axes2 = getargs( 

928 'result', 'message', 'ignored', 'unmatched', 'error', 'shape1', 'shape2', 'axes1', 'axes2', kwargs) 

929 

930 def __setattr__(self, key, value): 

931 """ 

932 Overwrite to ensure that, e.g., error is not set to an illegal value. 

933 """ 

934 if key == 'error': 

935 if value not in self.SHAPE_ERROR.keys(): 

936 raise ValueError("Illegal error type. Error must be one of ShapeValidatorResult.SHAPE_ERROR: %s" 

937 % str(self.SHAPE_ERROR)) 

938 else: 

939 super().__setattr__(key, value) 

940 elif key in ['shape1', 'shape2', 'axes1', 'axes2', 'ignored', 'unmatched']:  # Make sure we store tuples 

941 super().__setattr__(key, tuple(value)) 

942 else: 

943 super().__setattr__(key, value) 

944 

945 def __getattr__(self, item): 

946 """ 

947 Overwrite to allow dynamic retrieval of the default message 

948 """ 

949 if item == 'default_message':  # partial branch: 949 ↛ 951 (the condition was never false)

950 return self.SHAPE_ERROR[self.error] 

951 return self.__getattribute__(item) 

952 

953 
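ShapeValidatorResult is normally produced by assertEqualShape, but a direct construction (a sketch) shows the error-key validation and the default_message lookup:

    from hdmf.data_utils import ShapeValidatorResult

    res = ShapeValidatorResult(result=False, error='AXIS_LEN_ERROR')
    print(res.default_message)  # "Unequal length of axes." looked up from SHAPE_ERROR
    # res.error = 'BAD'         # would raise ValueError: not a key of SHAPE_ERROR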

954@docval_macro('data') 

955class DataIO: 

956 """ 

957 Base class for wrapping data arrays for I/O. Derived classes of DataIO are typically 

958 used to pass dataset-specific I/O parameters to the particular HDMFIO backend. 

959 """ 

960 

961 @docval({'name': 'data', 

962 'type': 'array_data', 

963 'doc': 'the data to be written', 

964 'default': None}, 

965 {'name': 'dtype', 

966 'type': (type, np.dtype), 

967 'doc': 'the data type of the dataset. Not used if data is specified.', 

968 'default': None}, 

969 {'name': 'shape', 

970 'type': tuple, 

971 'doc': 'the shape of the dataset. Not used if data is specified.', 

972 'default': None}) 

973 def __init__(self, **kwargs): 

974 data, dtype, shape = popargs('data', 'dtype', 'shape', kwargs) 

975 if data is None: 

976 if (dtype is None) ^ (shape is None): 

977 raise ValueError("Must specify both 'dtype' and 'shape' if not specifying 'data'") 

978 else: 

979 if dtype is not None: 

980 warn("Argument 'dtype' is ignored when 'data' is specified") 

981 dtype = None 

982 if shape is not None: 

983 warn("Argument 'shape' is ignored when 'data' is specified") 

984 shape = None 

985 self.__data = data 

986 self.__dtype = dtype 

987 self.__shape = shape 

988 

989 def get_io_params(self): 

990 """ 

991 Returns a dict with the I/O parameters specified in this DataIO. 

992 """ 

993 return dict() 

994 

995 @property 

996 def data(self): 

997 """Get the wrapped data object""" 

998 return self.__data 

999 

1000 @data.setter 

1001 def data(self, val): 

1002 """Set the wrapped data object""" 

1003 if self.__data is not None: 

1004 raise ValueError("cannot overwrite 'data' on DataIO") 

1005 if not (self.__dtype is None and self.__shape is None): 

1006 raise ValueError("Setting data when dtype and shape are not None is not supported") 

1007 self.__data = val 

1008 

1009 @property 

1010 def dtype(self): 

1011 """Get the wrapped data object""" 

1012 return self.__dtype or self.__getattr__("dtype") 

1013 

1014 @property 

1015 def shape(self): 

1016 """Get the wrapped data object""" 

1017 return self.__shape or self.__getattr__("shape") 

1018 

1019 def __copy__(self): 

1020 """ 

1021 Define a custom copy method for shallow copy. 

1022 

1023 This is needed due to delegation of __getattr__ to the data to 

1024 ensure proper copy. 

1025 

1026 :return: Shallow copy of self, i.e., a new instance of DataIO wrapping the same self.data object 

1027 """ 

1028 newobj = DataIO(data=self.data) 

1029 return newobj 

1030 

1031 def append(self, arg): 

1032 self.__data = append_data(self.__data, arg) 

1033 

1034 def extend(self, arg): 

1035 self.__data = extend_data(self.__data, arg) 

1036 

1037 def __deepcopy__(self, memo): 

1038 """ 

1039 Define a custom copy method for deep copy. 

1040 

1041 This is needed due to delegation of __getattr__ to the data to 

1042 ensure proper copy. 

1043 

1044 :param memo: 

1045 :return: Deep copy of self, i.e., a new instance of DataIO wrapping a deepcopy of the 

1046 self.data object. 

1047 """ 

1048 result = DataIO(data=copy.deepcopy(self.__data)) 

1049 memo[id(self)] = result 

1050 return result 

1051 

1052 def __len__(self): 

1053 """Number of values in self.data""" 

1054 if self.__shape is not None: 

1055 return self.__shape[0] 

1056 if not self.valid: 

1057 raise InvalidDataIOError("Cannot get length of data. Data is not valid.") 

1058 return len(self.data) 

1059 

1060 def __bool__(self): 

1061 if self.valid:  # partial branch: 1061 ↛ 1065 (the condition was never false)

1062 if isinstance(self.data, AbstractDataChunkIterator):  # partial branch: 1062 ↛ 1063 (the condition was never true)

1063 return True 

1064 return len(self) > 0 

1065 return False 

1066 

1067 def __getattr__(self, attr): 

1068 """Delegate attribute lookup to data object""" 

1069 if attr == '__array_struct__' and not self.valid: 

1070 # np.array() checks __array__ or __array_struct__ attribute dep. on numpy version 

1071 raise InvalidDataIOError("Cannot convert data to array. Data is not valid.") 

1072 if not self.valid: 

1073 raise InvalidDataIOError("Cannot get attribute '%s' of data. Data is not valid." % attr) 

1074 return getattr(self.data, attr) 

1075 

1076 def __getitem__(self, item): 

1077 """Delegate slicing to the data object""" 

1078 if not self.valid:  # partial branch: 1078 ↛ 1079 (the condition was never true)

1079 raise InvalidDataIOError("Cannot get item from data. Data is not valid.") 

1080 return self.data[item] 

1081 

1082 def __array__(self): 

1083 """ 

1084 Support conversion of DataIO.data to a numpy array. This function is 

1085 provided to improve transparent interoperability of DataIO with numpy. 

1086 

1087 :return: An array instance of self.data 

1088 """ 

1089 if not self.valid:  # partial branch: 1089 ↛ 1090 (the condition was never true)

1090 raise InvalidDataIOError("Cannot convert data to array. Data is not valid.") 

1091 if hasattr(self.data, '__array__'): 

1092 return self.data.__array__() 

1093 elif isinstance(self.data, DataChunkIterator): 

1094 raise NotImplementedError("Conversion of DataChunkIterator to array not supported") 

1095 else: 

1096 # NOTE this may result in a copy of the array 

1097 return np.asarray(self.data) 

1098 

1099 def __next__(self): 

1100 """Delegate iteration interface to data object""" 

1101 if not self.valid: 

1102 raise InvalidDataIOError("Cannot iterate on data. Data is not valid.") 

1103 return self.data.__next__() 

1104 

1105 def __iter__(self): 

1106 """Delegate iteration interface to the data object""" 

1107 if not self.valid: 

1108 raise InvalidDataIOError("Cannot iterate on data. Data is not valid.") 

1109 return self.data.__iter__() 

1110 

1111 @property 

1112 def valid(self): 

1113 """bool indicating if the data object is valid""" 

1114 return self.data is not None 

1115 

1116 
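DataIO itself carries no backend options (get_io_params returns an empty dict); subclasses such as hdmf.backends.hdf5.H5DataIO add them. A sketch of the wrapper behavior:

    import numpy as np
    from hdmf.data_utils import DataIO

    wrapped = DataIO(data=np.arange(6).reshape(2, 3))
    print(wrapped.shape)              # (2, 3), delegated to the wrapped array
    print(len(wrapped))               # 2
    wrapped.append([6, 7, 8])         # grows the wrapped ndarray via append_data
    print(np.asarray(wrapped).shape)  # (3, 3)

    placeholder = DataIO()            # empty wrapper; .data can be assigned exactly once
    print(placeholder.valid)          # False
    placeholder.data = np.zeros((2, 2))
    print(placeholder.valid)          # True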

1117class InvalidDataIOError(Exception): 

1118 pass