# src/hdmf/data_utils.py

import copy
import math
from abc import ABCMeta, abstractmethod
from collections.abc import Iterable
from warnings import warn
from typing import Tuple, Callable
from itertools import product, chain

import h5py
import numpy as np

from .utils import docval, getargs, popargs, docval_macro, get_data_shape


def append_data(data, arg):
    """Add a single element to the end of data.

    :param data: The array to append to
    :type data: list, DataIO, np.ndarray, h5py.Dataset
    """
    if isinstance(data, (list, DataIO)):
        data.append(arg)
        return data
    elif type(data).__name__ == 'TermSetWrapper':  # circular import
        data.append(arg)
        return data
    elif isinstance(data, np.ndarray):
        return np.append(data, np.expand_dims(arg, axis=0), axis=0)
    elif isinstance(data, h5py.Dataset):
        shape = list(data.shape)
        shape[0] += 1
        data.resize(shape)
        data[-1] = arg
        return data
    else:
        msg = "Cannot append data to object of type '%s'" % type(data)
        raise ValueError(msg)
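
# Illustrative usage of append_data (a sketch, not part of the module): lists
# and DataIO grow in place, while numpy arrays are copied with the new element
# stacked along the first axis.
#
#     append_data([1, 2, 3], 4)                          # -> [1, 2, 3, 4]
#     append_data(np.zeros((2, 3)), np.ones(3)).shape    # -> (3, 3)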


def extend_data(data, arg):
    """Add all the elements of the iterable arg to the end of data.

    :param data: The array to extend
    :type data: list, DataIO, np.ndarray, h5py.Dataset
    """
    if isinstance(data, (list, DataIO)):
        data.extend(arg)
        return data
    elif type(data).__name__ == 'TermSetWrapper':  # circular import
        data.extend(arg)
        return data
    elif isinstance(data, np.ndarray):
        return np.vstack((data, arg))
    elif isinstance(data, h5py.Dataset):
        shape = list(data.shape)
        shape[0] += len(arg)
        data.resize(shape)
        data[-len(arg):] = arg
        return data
    else:
        msg = "Cannot extend object of type '%s'" % type(data)
        raise ValueError(msg)
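
# Illustrative usage of extend_data (a sketch, not part of the module):
#
#     extend_data([1, 2], [3, 4])                              # -> [1, 2, 3, 4]
#     extend_data(np.zeros((2, 3)), np.ones((2, 3))).shape     # -> (4, 3)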


@docval_macro('array_data')
class AbstractDataChunkIterator(metaclass=ABCMeta):
    """
    Abstract iterator class used to iterate over DataChunks.

    Derived classes must ensure that all abstract methods and abstract properties are implemented, in
    particular, dtype, maxshape, __iter__, __next__, recommended_chunk_shape, and recommended_data_shape.

    Iterating over AbstractContainer objects is not yet supported.
    """

    @abstractmethod
    def __iter__(self):
        """Return the iterator object"""
        raise NotImplementedError("__iter__ not implemented for derived class")

    @abstractmethod
    def __next__(self):
        r"""
        Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.

        HINT: numpy.s\_ provides a convenient way to generate index tuples using standard array slicing. This
        is often useful to define the DataChunk.selection of the current chunk.

        :returns: DataChunk object with the data and selection of the current chunk
        :rtype: DataChunk
        """
        raise NotImplementedError("__next__ not implemented for derived class")

    @abstractmethod
    def recommended_chunk_shape(self):
        """
        Recommend the chunk shape for the data array.

        :return: NumPy-style shape tuple describing the recommended shape for the chunks of the target
                 array or None. This may or may not be the same as the shape of the chunks returned in the
                 iteration process.
        """
        raise NotImplementedError("recommended_chunk_shape not implemented for derived class")

    @abstractmethod
    def recommended_data_shape(self):
        """
        Recommend the initial shape for the data array.

        This is useful in particular to avoid repeated resizing of the target array when reading from
        this data iterator. This should typically be either the final size of the array or the known
        minimal shape of the array.

        :return: NumPy-style shape tuple indicating the recommended initial shape for the target array.
                 This may or may not be the final full shape of the array, i.e., the array is allowed
                 to grow. This should not be None.
        """
        raise NotImplementedError("recommended_data_shape not implemented for derived class")

    @property
    @abstractmethod
    def dtype(self):
        """
        Define the data type of the array.

        :return: NumPy-style dtype or otherwise compliant dtype string
        """
        raise NotImplementedError("dtype not implemented for derived class")

    @property
    @abstractmethod
    def maxshape(self):
        """
        Property describing the maximum shape of the data array that is being iterated over.

        :return: NumPy-style shape tuple indicating the maximum dimensions up to which the dataset may be
                 resized. Axes with None are unlimited.
        """
        raise NotImplementedError("maxshape not implemented for derived class")
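
# A minimal concrete subclass (illustrative sketch, not part of the module):
# it yields one row of a 2D array per iteration. The name `RowIterator` and
# its `array` argument are hypothetical.
#
#     class RowIterator(AbstractDataChunkIterator):
#         def __init__(self, array):
#             self.array = np.asarray(array)
#             self._row = 0
#
#         def __iter__(self):
#             return self
#
#         def __next__(self):
#             if self._row >= self.array.shape[0]:
#                 raise StopIteration
#             selection = np.s_[self._row:self._row + 1, :]
#             self._row += 1
#             return DataChunk(data=self.array[selection], selection=selection)
#
#         def recommended_chunk_shape(self):
#             return None
#
#         def recommended_data_shape(self):
#             return self.array.shape
#
#         @property
#         def dtype(self):
#             return self.array.dtype
#
#         @property
#         def maxshape(self):
#             return self.array.shape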


class GenericDataChunkIterator(AbstractDataChunkIterator):
    """DataChunkIterator that lets the user specify chunk and buffer shapes."""

    __docval_init = (
        dict(
            name="buffer_gb",
            type=(float, int),
            doc=(
                "If buffer_shape is not specified, it will be inferred as the smallest buffer "
                "below the buffer_gb threshold. "
                "Defaults to 1 GB."
            ),
            default=None,
        ),
        dict(
            name="buffer_shape",
            type=tuple,
            doc="Manually defined shape of the buffer.",
            default=None,
        ),
        dict(
            name="chunk_mb",
            type=(float, int),
            doc=(
                "If chunk_shape is not specified, it will be inferred as the smallest chunk "
                "below the chunk_mb threshold. "
                "Defaults to 10 MB."
            ),
            default=None,
        ),
        dict(
            name="chunk_shape",
            type=tuple,
            doc="Manually defined shape of the chunks.",
            default=None,
        ),
        dict(
            name="display_progress",
            type=bool,
            doc="Display a progress bar with iteration rate and estimated completion time.",
            default=False,
        ),
        dict(
            name="progress_bar_options",
            type=None,
            doc="Dictionary of keyword arguments to be passed directly to tqdm.",
            default=None,
        ),
    )

    @docval(*__docval_init)
    def __init__(self, **kwargs):
        """
        Break a dataset into buffers containing multiple chunks to be written into an HDF5 dataset.

        Basic users should set the buffer_gb argument to as much free RAM space as can be safely allocated.
        Advanced users are offered full control over the shape parameters for the buffer and the chunks; however,
        the chunk shape must perfectly divide the buffer shape along each axis.

        HDF5 recommends chunk sizes in the range of 2 to 16 MB for optimal cloud performance.
        https://youtu.be/rcS5vt-mKok?t=621
        """
        buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, progress_bar_options = getargs(
            "buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs
        )
        self.progress_bar_options = progress_bar_options or dict()

        if buffer_gb is None and buffer_shape is None:
            buffer_gb = 1.0
        if chunk_mb is None and chunk_shape is None:
            chunk_mb = 10.0
        assert (buffer_gb is not None) != (
            buffer_shape is not None
        ), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"
        assert (chunk_mb is not None) != (
            chunk_shape is not None
        ), "Only one of 'chunk_mb' or 'chunk_shape' can be specified!"

        self._dtype = self._get_dtype()
        self._maxshape = tuple(int(x) for x in self._get_maxshape())
        chunk_shape = tuple(int(x) for x in chunk_shape) if chunk_shape else chunk_shape
        self.chunk_shape = chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb)
        buffer_shape = tuple(int(x) for x in buffer_shape) if buffer_shape else buffer_shape
        self.buffer_shape = buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb)

        # Shape assertions
        assert all(
            buffer_axis > 0 for buffer_axis in self.buffer_shape
        ), f"Some dimensions of buffer_shape ({self.buffer_shape}) are less than or equal to zero!"
        assert all(
            chunk_axis <= maxshape_axis for chunk_axis, maxshape_axis in zip(self.chunk_shape, self.maxshape)
        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the data dimensions ({self.maxshape})!"
        assert all(
            buffer_axis <= maxshape_axis for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
        ), f"Some dimensions of buffer_shape ({self.buffer_shape}) exceed the data dimensions ({self.maxshape})!"
        assert all(
            chunk_axis <= buffer_axis for chunk_axis, buffer_axis in zip(self.chunk_shape, self.buffer_shape)
        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the buffer shape ({self.buffer_shape})!"
        assert all(
            buffer_axis % chunk_axis == 0
            for chunk_axis, buffer_axis, maxshape_axis in zip(self.chunk_shape, self.buffer_shape, self.maxshape)
            if buffer_axis != maxshape_axis
        ), (
            f"Some dimensions of chunk_shape ({self.chunk_shape}) do not "
            f"evenly divide the buffer shape ({self.buffer_shape})!"
        )

        self.num_buffers = math.prod(
            [
                math.ceil(maxshape_axis / buffer_axis)
                for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
            ],
        )
        self.buffer_selection_generator = (
            tuple(
                [
                    slice(lower_bound, upper_bound)
                    for lower_bound, upper_bound in zip(lower_bounds, upper_bounds)
                ]
            )
            for lower_bounds, upper_bounds in zip(
                product(
                    *[
                        range(0, max_shape_axis, buffer_shape_axis)
                        for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape)
                    ]
                ),
                product(
                    *[
                        chain(range(buffer_shape_axis, max_shape_axis, buffer_shape_axis), [max_shape_axis])
                        for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape)
                    ]
                ),
            )
        )

        if self.display_progress:
            try:
                from tqdm import tqdm

                if "total" in self.progress_bar_options:
                    warn("Option 'total' in 'progress_bar_options' is not allowed to be overwritten! Ignoring.")
                    self.progress_bar_options.pop("total")

                self.progress_bar = tqdm(total=self.num_buffers, **self.progress_bar_options)
            except ImportError:
                warn(
                    "You must install tqdm to use the progress bar feature (pip install tqdm)! "
                    "Progress bar is disabled."
                )
                self.display_progress = False

    @docval(
        dict(
            name="chunk_mb",
            type=(float, int),
            doc="Size of the HDF5 chunk in megabytes. Recommended to be less than 1 MB.",
            default=None,
        )
    )
    def _get_default_chunk_shape(self, **kwargs) -> Tuple[int, ...]:
        """
        Select chunk shape with size in MB less than the threshold of chunk_mb.

        Keeps the dimensional ratios of the original data.
        """
        chunk_mb = getargs("chunk_mb", kwargs)
        assert chunk_mb > 0, f"chunk_mb ({chunk_mb}) must be greater than zero!"

        n_dims = len(self.maxshape)
        itemsize = self.dtype.itemsize
        chunk_bytes = chunk_mb * 1e6

        min_maxshape = min(self.maxshape)
        v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in self.maxshape)
        prod_v = math.prod(v)
        while prod_v * itemsize > chunk_bytes and prod_v != 1:
            non_unit_min_v = min(x for x in v if x != 1)
            v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v)
            prod_v = math.prod(v)
        k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))
        return tuple([min(k * x, self.maxshape[dim]) for dim, x in enumerate(v)])
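
    # Worked example (illustrative, assuming float64 data with maxshape
    # (2000, 64, 64) and the default chunk_mb=10):
    #   v = (31, 1, 1) and prod_v * itemsize = 248 bytes <= 10e6, so the
    #   while loop is skipped; k = floor((10e6 / 248) ** (1/3)) = 34 and the
    #   chunk shape is (min(34*31, 2000), min(34, 64), min(34, 64))
    #   = (1054, 34, 34), i.e. about 9.7 MB of float64 values.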

    @docval(
        dict(
            name="buffer_gb",
            type=(float, int),
            doc="Size of the data buffer in gigabytes. Recommended to be as much free RAM as safely available.",
            default=None,
        )
    )
    def _get_default_buffer_shape(self, **kwargs) -> Tuple[int, ...]:
        """
        Select buffer shape with size in GB less than the threshold of buffer_gb.

        Keeps the dimensional ratios of the original data.
        Assumes the chunk_shape has already been set.
        """
        buffer_gb = getargs("buffer_gb", kwargs)
        assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!"
        assert all(chunk_axis > 0 for chunk_axis in self.chunk_shape), (
            f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than or equal to zero!"
        )

        k = math.floor(
            (
                buffer_gb * 1e9 / (math.prod(self.chunk_shape) * self.dtype.itemsize)
            ) ** (1 / len(self.chunk_shape))
        )
        return tuple(
            [
                min(max(k * x, self.chunk_shape[j]), self.maxshape[j])
                for j, x in enumerate(self.chunk_shape)
            ]
        )
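
    # Continuing the worked example above (illustrative): with chunk_shape
    # (1054, 34, 34), float64 data, and the default buffer_gb=1,
    #   k = floor((1e9 / (1054 * 34 * 34 * 8)) ** (1/3)) = 4,
    # so the buffer shape is (min(max(4*1054, 1054), 2000),
    # min(max(4*34, 34), 64), min(max(4*34, 34), 64)) = (2000, 64, 64),
    # i.e. the whole ~65 MB array fits in a single buffer here.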

    def __iter__(self):
        return self

    def __next__(self):
        """
        Retrieve the next DataChunk object from the buffer, refilling the buffer if necessary.

        :returns: DataChunk object with the data and selection of the current buffer.
        :rtype: DataChunk
        """
        if self.display_progress:
            self.progress_bar.update(n=1)
        try:
            buffer_selection = next(self.buffer_selection_generator)
            return DataChunk(data=self._get_data(selection=buffer_selection), selection=buffer_selection)
        except StopIteration:
            if self.display_progress:
                self.progress_bar.write("\n")  # Allows text to be written to new lines after completion
            raise StopIteration

    def __reduce__(self) -> Tuple[Callable, Iterable]:
        instance_constructor = self._from_dict
        initialization_args = (self._to_dict(),)
        return (instance_constructor, initialization_args)

    @abstractmethod
    def _get_data(self, selection: Tuple[slice]) -> np.ndarray:
        """
        Retrieve the data specified by the selection using minimal I/O.

        The developer of a new implementation of the GenericDataChunkIterator must ensure the data is actually
        loaded into memory, and not simply mapped.

        :param selection: Tuple of slices, each indicating the selection indexed with respect to maxshape for that
            axis. Each axis of the tuple is a slice of the full shape from which to pull data into the buffer.
        :type selection: tuple of slices

        :returns: Array of data specified by selection
        :rtype: np.ndarray
        """
        raise NotImplementedError("The data fetching method has not been built for this DataChunkIterator!")

    @abstractmethod
    def _get_maxshape(self) -> Tuple[int, ...]:
        """Retrieve the maximum bounds of the data shape using minimal I/O."""
        raise NotImplementedError("The setter for the maxshape property has not been built for this DataChunkIterator!")

    @abstractmethod
    def _get_dtype(self) -> np.dtype:
        """Retrieve the dtype of the data using minimal I/O."""
        raise NotImplementedError("The setter for the internal dtype has not been built for this DataChunkIterator!")

    def _to_dict(self) -> dict:
        """Optional method to override in child classes to enable pickling (required for multiprocessing)."""
        raise NotImplementedError(
            "The `._to_dict()` method for pickling has not been defined for this DataChunkIterator!"
        )

    @staticmethod
    def _from_dict(dictionary: dict) -> Callable:
        """Optional method to override in child classes to enable pickling (required for multiprocessing)."""
        raise NotImplementedError(
            "The `._from_dict()` method for pickling has not been defined for this DataChunkIterator!"
        )

    def recommended_chunk_shape(self) -> Tuple[int, ...]:
        return self.chunk_shape

    def recommended_data_shape(self) -> Tuple[int, ...]:
        return self.maxshape

    @property
    def maxshape(self) -> Tuple[int, ...]:
        return self._maxshape

    @property
    def dtype(self) -> np.dtype:
        return self._dtype
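
# A minimal concrete subclass (illustrative sketch, not part of the module):
# the class name and the `array` argument are hypothetical.
#
#     class ArrayChunkIterator(GenericDataChunkIterator):
#         def __init__(self, array, **kwargs):
#             self._array = array  # set before super().__init__, which calls the getters
#             super().__init__(**kwargs)
#
#         def _get_data(self, selection):
#             return self._array[selection]  # must load values, not return a lazy view
#
#         def _get_maxshape(self):
#             return self._array.shape
#
#         def _get_dtype(self):
#             return self._array.dtype
#
#     # for chunk in ArrayChunkIterator(np.random.rand(2000, 64, 64)):
#     #     ...  # each chunk is a DataChunk covering one buffer's selection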


class DataChunkIterator(AbstractDataChunkIterator):
    """
    Custom iterator class used to iterate over chunks of data.

    This default implementation of AbstractDataChunkIterator accepts any iterable and assumes that we iterate over
    a single dimension of the data array (default: the first dimension). DataChunkIterator supports buffered read,
    i.e., multiple values from the input iterator can be combined to a single chunk. This is
    useful for buffered I/O operations, e.g., to improve performance by accumulating data
    in memory and writing larger blocks at once.

    .. note::

        DataChunkIterator assumes that the iterator that it wraps returns one element along the
        iteration dimension at a time. I.e., the iterator is expected to return chunks that are
        one dimension lower than the array itself. For example, when iterating over the first dimension
        of a dataset with shape (1000, 10, 10), the iterator would return 1000 chunks of
        shape (10, 10) one chunk at a time. If this pattern does not match your use case, then
        using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` or
        :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more appropriate.
    """

    __docval_init = (
        {'name': 'data', 'type': None, 'doc': 'The data object used for iteration', 'default': None},
        {'name': 'maxshape', 'type': tuple,
         'doc': 'The maximum shape of the full data array. Use None to indicate unlimited dimensions',
         'default': None},
        {'name': 'dtype', 'type': np.dtype, 'doc': 'The NumPy data type for the array', 'default': None},
        {'name': 'buffer_size', 'type': int, 'doc': 'Number of values to be buffered in a chunk', 'default': 1},
        {'name': 'iter_axis', 'type': int, 'doc': 'The dimension to iterate over', 'default': 0}
    )

    @docval(*__docval_init)
    def __init__(self, **kwargs):
        """Initialize the DataChunkIterator.

        If 'data' is an iterator and 'dtype' is not specified, then next is called on the iterator in order to
        determine the dtype of the data.
        """
        # Get the user parameters
        self.data, self.__maxshape, self.__dtype, self.buffer_size, self.iter_axis = getargs(
            'data', 'maxshape', 'dtype', 'buffer_size', 'iter_axis', kwargs)
        self.chunk_index = 0
        # Create an iterator for the data if possible
        if isinstance(self.data, Iterable):
            if self.iter_axis != 0 and isinstance(self.data, (list, tuple)):
                warn('Iterating over an axis other than the first dimension of list or tuple data '
                     'involves converting the data object to a numpy ndarray, which may incur a computational '
                     'cost.')
                self.data = np.asarray(self.data)
            if isinstance(self.data, np.ndarray):
                # iterate over the given axis by adding a new view on data (iter only works on the first dim)
                self.__data_iter = iter(np.moveaxis(self.data, self.iter_axis, 0))
            else:
                self.__data_iter = iter(self.data)
        else:
            self.__data_iter = None
        self.__next_chunk = DataChunk(None, None)
        self.__next_chunk_start = 0
        self.__first_chunk_shape = None
        # Determine the shape of the data if possible
        if self.__maxshape is None:
            # If the self.data object identifies its shape, then use it
            if hasattr(self.data, "shape"):
                self.__maxshape = self.data.shape
                # Avoid the special case of scalar values by making them into a 1D numpy array
                if len(self.__maxshape) == 0:
                    self.data = np.asarray([self.data, ])
                    self.__maxshape = self.data.shape
                    self.__data_iter = iter(self.data)
            # Try to get an accurate idea of __maxshape for other Python data structures if possible.
            # Don't just call get_data_shape for a generator as that would potentially trigger loading of all the data
            elif isinstance(self.data, list) or isinstance(self.data, tuple):
                self.__maxshape = get_data_shape(self.data, strict_no_data_load=True)

        # If we have a data iterator and do not know the dtype, then read the first chunk
        if self.__data_iter is not None and self.__dtype is None:
            self._read_next_chunk()

        # Determine the type of the data if possible
        if self.__next_chunk.data is not None:
            self.__dtype = self.__next_chunk.data.dtype
            self.__first_chunk_shape = get_data_shape(self.__next_chunk.data)

        # This should be done as a last resort only
        if self.__first_chunk_shape is None and self.__maxshape is not None:
            self.__first_chunk_shape = tuple(1 if i is None else i for i in self.__maxshape)

        if self.__dtype is None:
            raise Exception('Data type could not be determined. Please specify dtype in DataChunkIterator init.')

    @classmethod
    @docval(*__docval_init)
    def from_iterable(cls, **kwargs):
        """Factory method for creating a DataChunkIterator from an iterable; equivalent to the constructor."""
        return cls(**kwargs)

    def __iter__(self):
        """Return the iterator object"""
        return self

    def _read_next_chunk(self):
        """Read a single chunk from self.__data_iter and store the results in self.__next_chunk

        :returns: self.__next_chunk, i.e., the DataChunk object describing the next chunk
        """
        from h5py import Dataset as H5Dataset
        if isinstance(self.data, H5Dataset):
            start_index = self.chunk_index * self.buffer_size
            stop_index = start_index + self.buffer_size
            iter_data_bounds = self.data.shape[self.iter_axis]
            if start_index >= iter_data_bounds:
                self.__next_chunk = DataChunk(None, None)
            else:
                if stop_index > iter_data_bounds:
                    stop_index = iter_data_bounds

                selection = [slice(None)] * len(self.maxshape)
                selection[self.iter_axis] = slice(start_index, stop_index)
                selection = tuple(selection)
                self.__next_chunk.data = self.data[selection]
                self.__next_chunk.selection = selection
        elif self.__data_iter is not None:
            # the pieces in the buffer - first dimension consists of individual calls to next
            iter_pieces = []
            # offset of where data begins - shift the selection of where to place this chunk by this much
            curr_chunk_offset = 0
            read_next_empty = False
            while len(iter_pieces) < self.buffer_size:
                try:
                    dat = next(self.__data_iter)
                    if dat is None and len(iter_pieces) == 0:
                        # Skip forward in our chunk until we find data
                        curr_chunk_offset += 1
                    elif dat is None and len(iter_pieces) > 0:
                        # Stop iteration if we hit empty data while constructing our block
                        # Buffer may not be full.
                        read_next_empty = True
                        break
                    else:
                        # Add pieces of data to our buffer
                        iter_pieces.append(np.asarray(dat))
                except StopIteration:
                    break

            if len(iter_pieces) == 0:
                self.__next_chunk = DataChunk(None, None)  # signal end of iteration
            else:
                # concatenate all the pieces into the chunk along the iteration axis
                piece_shape = list(get_data_shape(iter_pieces[0]))
                piece_shape.insert(self.iter_axis, 1)  # insert the missing axis
                next_chunk_shape = piece_shape.copy()
                next_chunk_shape[self.iter_axis] *= len(iter_pieces)
                next_chunk_size = next_chunk_shape[self.iter_axis]

                # stack with the piece dtype because the actual dtype may not have been determined yet
                # NOTE: this could be problematic if a generator returns e.g. floats first and ints later
                self.__next_chunk.data = np.stack(iter_pieces, axis=self.iter_axis)

                selection = [slice(None)] * len(self.maxshape)
                selection[self.iter_axis] = slice(self.__next_chunk_start + curr_chunk_offset,
                                                  self.__next_chunk_start + curr_chunk_offset + next_chunk_size)
                self.__next_chunk.selection = tuple(selection)

                # next chunk should start at self.__next_chunk.selection[self.iter_axis].stop
                # but if this chunk stopped because of reading empty data, then this should be adjusted by 1
                self.__next_chunk_start = self.__next_chunk.selection[self.iter_axis].stop
                if read_next_empty:
                    self.__next_chunk_start += 1
        else:
            self.__next_chunk = DataChunk(None, None)

        self.chunk_index += 1
        return self.__next_chunk

    def __next__(self):
        """
        Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.

        .. tip::

            :py:attr:`numpy.s_` provides a convenient way to generate index tuples using standard array slicing. This
            is often useful to define the DataChunk.selection of the current chunk.

        :returns: DataChunk object with the data and selection of the current chunk
        :rtype: DataChunk
        """
        # If we have not already read the next chunk, then read it now
        if self.__next_chunk.data is None:
            self._read_next_chunk()
        # If we do not have any next chunk
        if self.__next_chunk.data is None:
            raise StopIteration
        # If this is the first time we see a chunk then remember the size of the first chunk
        if self.__first_chunk_shape is None:
            self.__first_chunk_shape = self.__next_chunk.data.shape
        # Keep the next chunk we need to return
        curr_chunk = DataChunk(self.__next_chunk.data,
                               self.__next_chunk.selection)
        # Remove the data for the next chunk from our list since we are returning it here.
        # This is to allow the garbage collector to remove the data when it goes out of scope and avoid
        # having 2 full chunks in memory if not necessary
        self.__next_chunk.data = None
        # Return the current next chunk
        return curr_chunk

    next = __next__

    @docval(returns='Tuple with the recommended chunk shape or None if no particular shape is recommended.')
    def recommended_chunk_shape(self):
        """Recommend a chunk shape.

        To optimize iterative write, the chunk should be aligned with the common shape of chunks returned by
        __next__, or if those chunks are too large, then a well-aligned subset of those chunks. This may also be
        any other value in case one wants to recommend chunk shapes to optimize read rather
        than write. The default implementation returns None, indicating no preferential chunking option.
        """
        return None

    @docval(returns='Recommended initial shape for the full data. This should be the shape of the full dataset '
                    'if known beforehand or alternatively the minimum shape of the dataset. Return None if no '
                    'recommendation is available')
    def recommended_data_shape(self):
        """Recommend an initial shape of the data. This is useful when progressively writing data and
        we want to recommend an initial size for the dataset."""
        if self.maxshape is not None:
            if np.all([i is not None for i in self.maxshape]):
                return self.maxshape
        return self.__first_chunk_shape

    @property
    def maxshape(self):
        """
        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator.

        .. note::

            If an iterator is provided and no data has been read yet, then the first chunk will be read
            (i.e., next will be called on the iterator) in order to determine the maxshape. The iterator
            is expected to return single chunks along the iteration dimension, which means that maxshape will
            add an additional dimension along the iteration dimension. E.g., if we iterate over
            the first dimension and the iterator returns chunks of shape (10, 10), then the maxshape would
            be (None, 10, 10) or (len(self.data), 10, 10), depending on whether the size of the
            iteration dimension is known.

        :return: Shape tuple. None is used for dimensions where the maximum shape is not known or unlimited.
        """
        if self.__maxshape is None:
            # If no data has been read from the iterator yet, read the first chunk and use it to determine the maxshape
            if self.__data_iter is not None and self.__next_chunk.data is None:
                self._read_next_chunk()

            # Determine maxshape from self.__next_chunk
            if self.__next_chunk.data is None:
                return None
            data_shape = get_data_shape(self.__next_chunk.data)
            self.__maxshape = list(data_shape)
            try:
                # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just
                # a chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data
                # if possible. Otherwise, use None to represent an unlimited size
                if hasattr(self.data, '__len__') and self.iter_axis == 0:
                    # special case of 1-D array
                    self.__maxshape[0] = len(self.data)
                else:
                    self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
            except AttributeError:  # from self.data.shape
                self.__maxshape[self.iter_axis] = None
            self.__maxshape = tuple(self.__maxshape)

        return self.__maxshape

    @property
    def dtype(self):
        """
        Get the value data type.

        :return: np.dtype object describing the datatype
        """
        return self.__dtype
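
# Illustrative usage (a sketch, not part of the module): wrapping a generator
# that yields one row at a time; maxshape and dtype are inferred from the
# first chunk read.
#
#     dci = DataChunkIterator(data=(np.arange(10) * i for i in range(100)),
#                             buffer_size=20)
#     dci.maxshape   # -> (None, 10): the generator's length is unknown
#     dci.dtype      # -> dtype('int64') on most platforms
#     # for chunk in dci:
#     #     chunk.data.shape  # -> (20, 10); chunk.selection places it in the array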


class DataChunk:
    """
    Class used to describe a data chunk. Used in DataChunkIterator.
    """

    @docval({'name': 'data', 'type': np.ndarray,
             'doc': 'Numpy array with the data value(s) of the chunk', 'default': None},
            {'name': 'selection', 'type': None,
             'doc': 'Numpy index tuple describing the location of the chunk', 'default': None})
    def __init__(self, **kwargs):
        self.data, self.selection = getargs('data', 'selection', kwargs)

    def __len__(self):
        """Get the number of values in the data chunk"""
        if self.data is not None:
            return len(self.data)
        else:
            return 0

    def __getattr__(self, attr):
        """Delegate retrieval of attributes to the data in self.data"""
        return getattr(self.data, attr)

    def __copy__(self):
        newobj = DataChunk(data=self.data,
                           selection=self.selection)
        return newobj

    def __deepcopy__(self, memo):
        result = DataChunk(data=copy.deepcopy(self.data),
                           selection=copy.deepcopy(self.selection))
        memo[id(self)] = result
        return result

    def astype(self, dtype):
        """Get a new DataChunk with the self.data converted to the given type"""
        return DataChunk(data=self.data.astype(dtype),
                         selection=self.selection)

    @property
    def dtype(self):
        """
        Data type of the values in the chunk.

        :returns: np.dtype of the values in the DataChunk
        """
        return self.data.dtype

    def get_min_bounds(self):
        """
        Helper function to compute the minimum dataset size required to fit the selection of this chunk.

        :raises TypeError: If the selection is not a single int, slice, or tuple of slices.

        :return: Tuple with the minimum shape required to store the selection
        """
        if isinstance(self.selection, tuple):
            # Determine the minimum array dimensions to fit the chunk selection
            max_bounds = tuple([x.stop or 0 if isinstance(x, slice) else x + 1 for x in self.selection])
        elif isinstance(self.selection, int):
            max_bounds = (self.selection + 1, )
        elif isinstance(self.selection, slice):
            max_bounds = (self.selection.stop or 0, )
        else:
            # Note: Technically any numpy index tuple would be allowed, but h5py is not as general and this case
            #       only implements the selections supported by h5py. We could add more cases to support a
            #       broader range of valid numpy selection types
            msg = ("Chunk selection %s must be a single int, single slice, or tuple of slices "
                   "and/or integers") % str(self.selection)
            raise TypeError(msg)
        return max_bounds
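
# Illustrative usage of get_min_bounds (a sketch, not part of the module):
#
#     chunk = DataChunk(data=np.ones((2, 10)), selection=np.s_[3:5, 0:10])
#     chunk.get_min_bounds()  # -> (5, 10): the dataset must be at least 5 x 10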


def assertEqualShape(data1,
                     data2,
                     axes1=None,
                     axes2=None,
                     name1=None,
                     name2=None,
                     ignore_undetermined=True):
    """
    Ensure that the shapes of data1 and data2 match along the given dimensions.

    :param data1: The first input array
    :type data1: List, Tuple, np.ndarray, DataChunkIterator, etc.
    :param data2: The second input array
    :type data2: List, Tuple, np.ndarray, DataChunkIterator, etc.
    :param name1: Optional string with the name of data1
    :param name2: Optional string with the name of data2
    :param axes1: The dimensions of data1 that should be matched to the dimensions of data2. Set to None to
                  compare all axes in order.
    :type axes1: int, Tuple of ints, List of ints, or None
    :param axes2: The dimensions of data2 that should be matched to the dimensions of data1. Must have
                  the same length as axes1. Set to None to compare all axes in order.
    :type axes2: int, Tuple of ints, List of ints, or None
    :param ignore_undetermined: Boolean indicating whether non-matching unlimited dimensions should be ignored,
                                i.e., if two dimensions don't match because we can't determine the shape of
                                either one, then should we ignore that case or treat it as no match

    :return: Bool indicating whether the check passed and a string with a message about the matching process
    """
    # Create the base return object
    response = ShapeValidatorResult()
    # Determine the shape of the datasets
    response.shape1 = get_data_shape(data1)
    response.shape2 = get_data_shape(data2)
    # Determine the number of dimensions of the datasets
    num_dims_1 = len(response.shape1) if response.shape1 is not None else None
    num_dims_2 = len(response.shape2) if response.shape2 is not None else None
    # Determine the string names of the datasets
    n1 = name1 if name1 is not None else ("data1 at " + str(hex(id(data1))))
    n2 = name2 if name2 is not None else ("data2 at " + str(hex(id(data2))))
    # Determine the axes we should compare
    response.axes1 = list(range(num_dims_1)) if axes1 is None else ([axes1] if isinstance(axes1, int) else axes1)
    response.axes2 = list(range(num_dims_2)) if axes2 is None else ([axes2] if isinstance(axes2, int) else axes2)
    # Validate the array shape
    # 1) Check the number of dimensions of the arrays
    if (response.axes1 is None and response.axes2 is None) and num_dims_1 != num_dims_2:
        response.result = False
        response.error = 'NUM_DIMS_ERROR'
        response.message = response.SHAPE_ERROR[response.error]
        response.message += " %s is %sD and %s is %sD" % (n1, num_dims_1, n2, num_dims_2)
    # 2) Check that we have the same number of dimensions to compare on both arrays
    elif len(response.axes1) != len(response.axes2):
        response.result = False
        response.error = 'NUM_AXES_ERROR'
        response.message = response.SHAPE_ERROR[response.error]
        response.message += " Cannot compare axes %s with %s" % (str(response.axes1), str(response.axes2))
    # 3) Check that the datasets have a sufficient number of dimensions
    elif np.max(response.axes1) >= num_dims_1 or np.max(response.axes2) >= num_dims_2:
        response.result = False
        response.error = 'AXIS_OUT_OF_BOUNDS'
        response.message = response.SHAPE_ERROR[response.error]
        if np.max(response.axes1) >= num_dims_1:
            response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \
                                (n1, np.max(response.axes1) + 1, num_dims_1)
        elif np.max(response.axes2) >= num_dims_2:
            response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \
                                (n2, np.max(response.axes2) + 1, num_dims_2)
    # 4) Compare the length of the dimensions we should validate
    else:
        unmatched = []
        ignored = []
        for ax in zip(response.axes1, response.axes2):
            if response.shape1[ax[0]] != response.shape2[ax[1]]:
                if ignore_undetermined and (response.shape1[ax[0]] is None or response.shape2[ax[1]] is None):
                    ignored.append(ax)
                else:
                    unmatched.append(ax)
        response.unmatched = unmatched
        response.ignored = ignored

        # Check if everything checked out
        if len(response.unmatched) == 0:
            response.result = True
            response.error = None
            response.message = response.SHAPE_ERROR[response.error]
            if len(response.ignored) > 0:
                response.message += " Ignored undetermined axes %s" % str(response.ignored)
        else:
            response.result = False
            response.error = 'AXIS_LEN_ERROR'
            response.message = response.SHAPE_ERROR[response.error]
            response.message += " Axes %s with sizes %s of %s did not match dimensions %s with sizes %s of %s." % \
                                (str([un[0] for un in response.unmatched]),
                                 str([response.shape1[un[0]] for un in response.unmatched]),
                                 n1,
                                 str([un[1] for un in response.unmatched]),
                                 str([response.shape2[un[1]] for un in response.unmatched]),
                                 n2)
            if len(response.ignored) > 0:
                response.message += " Ignored undetermined axes %s" % str(response.ignored)
    return response
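
# Illustrative usage (a sketch, not part of the module): compare the first
# axis of a data matrix against a timestamps vector.
#
#     res = assertEqualShape(np.zeros((10, 3)), np.zeros(10), axes1=0, axes2=0)
#     res.result   # -> True
#     res.message  # -> 'All required axes matched'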


class ShapeValidatorResult:
    """Class for storing results from validating the shape of multi-dimensional arrays.

    This class is used to store results generated by ShapeValidator

    :ivar result: Boolean indicating whether results matched or not
    :type result: bool
    :ivar message: Message indicating the result of the matching procedure
    :type message: str, None
    """
    SHAPE_ERROR = {None: 'All required axes matched',
                   'NUM_DIMS_ERROR': 'Unequal number of dimensions.',
                   'NUM_AXES_ERROR': "Unequal number of axes for comparison.",
                   'AXIS_OUT_OF_BOUNDS': "Axis index for comparison out of bounds.",
                   'AXIS_LEN_ERROR': "Unequal length of axes."}
    """
    Dict where the keys are the types of errors that may have occurred during shape comparison and the
    values are strings with default error messages for the type.
    """

    @docval({'name': 'result', 'type': bool, 'doc': 'Result of the shape validation', 'default': False},
            {'name': 'message', 'type': str,
             'doc': 'Message describing the result of the shape validation', 'default': None},
            {'name': 'ignored', 'type': tuple,
             'doc': 'Axes that have been ignored in the validation process', 'default': tuple(), 'shape': (None,)},
            {'name': 'unmatched', 'type': tuple,
             'doc': 'List of axes that did not match during shape validation', 'default': tuple(), 'shape': (None,)},
            {'name': 'error', 'type': str, 'doc': 'Error that may have occurred. One of SHAPE_ERROR', 'default': None},
            {'name': 'shape1', 'type': tuple,
             'doc': 'Shape of the first array for comparison', 'default': tuple(), 'shape': (None,)},
            {'name': 'shape2', 'type': tuple,
             'doc': 'Shape of the second array for comparison', 'default': tuple(), 'shape': (None,)},
            {'name': 'axes1', 'type': tuple,
             'doc': 'Axes for the first array that should match', 'default': tuple(), 'shape': (None,)},
            {'name': 'axes2', 'type': tuple,
             'doc': 'Axes for the second array that should match', 'default': tuple(), 'shape': (None,)},
            )
    def __init__(self, **kwargs):
        self.result, self.message, self.ignored, self.unmatched, \
            self.error, self.shape1, self.shape2, self.axes1, self.axes2 = getargs(
                'result', 'message', 'ignored', 'unmatched', 'error', 'shape1', 'shape2', 'axes1', 'axes2', kwargs)

    def __setattr__(self, key, value):
        """
        Overwrite to ensure that, e.g., error_message is not set to an illegal value.
        """
        if key == 'error':
            if value not in self.SHAPE_ERROR.keys():
                raise ValueError("Illegal error type. Error must be one of ShapeValidatorResult.SHAPE_ERROR: %s"
                                 % str(self.SHAPE_ERROR))
            else:
                super().__setattr__(key, value)
        elif key in ['shape1', 'shape2', 'axes1', 'axes2', 'ignored', 'unmatched']:  # Make sure we store tuples
            super().__setattr__(key, tuple(value))
        else:
            super().__setattr__(key, value)

    def __getattr__(self, item):
        """
        Overwrite to allow dynamic retrieval of the default message.
        """
        if item == 'default_message':
            return self.SHAPE_ERROR[self.error]
        return self.__getattribute__(item)


@docval_macro('data')
class DataIO:
    """
    Base class for wrapping data arrays for I/O. Derived classes of DataIO are typically
    used to pass dataset-specific I/O parameters to the particular HDMFIO backend.
    """

    @docval({'name': 'data',
             'type': 'array_data',
             'doc': 'the data to be written',
             'default': None},
            {'name': 'dtype',
             'type': (type, np.dtype),
             'doc': 'the data type of the dataset. Not used if data is specified.',
             'default': None},
            {'name': 'shape',
             'type': tuple,
             'doc': 'the shape of the dataset. Not used if data is specified.',
             'default': None})
    def __init__(self, **kwargs):
        data, dtype, shape = popargs('data', 'dtype', 'shape', kwargs)
        if data is None:
            if (dtype is None) ^ (shape is None):
                raise ValueError("Must specify 'dtype' and 'shape' if not specifying 'data'")
        else:
            if dtype is not None:
                warn("Argument 'dtype' is ignored when 'data' is specified")
                dtype = None
            if shape is not None:
                warn("Argument 'shape' is ignored when 'data' is specified")
                shape = None
        self.__data = data
        self.__dtype = dtype
        self.__shape = shape

    def get_io_params(self):
        """
        Returns a dict with the I/O parameters specified in this DataIO.
        """
        return dict()

    @property
    def data(self):
        """Get the wrapped data object"""
        return self.__data

    @data.setter
    def data(self, val):
        """Set the wrapped data object"""
        if self.__data is not None:
            raise ValueError("cannot overwrite 'data' on DataIO")
        if not (self.__dtype is None and self.__shape is None):
            raise ValueError("Setting data when dtype and shape are not None is not supported")
        self.__data = val

    @property
    def dtype(self):
        """Get the dtype of the wrapped data object"""
        return self.__dtype or self.__getattr__("dtype")

    @property
    def shape(self):
        """Get the shape of the wrapped data object"""
        return self.__shape or self.__getattr__("shape")

    def __copy__(self):
        """
        Define a custom copy method for shallow copy.

        This is needed due to delegation of __getattr__ to the data to
        ensure proper copy.

        :return: Shallow copy of self, i.e., a new instance of DataIO wrapping the same self.data object
        """
        newobj = DataIO(data=self.data)
        return newobj

    def append(self, arg):
        """Append a single element to the wrapped data object."""
        self.__data = append_data(self.__data, arg)

    def extend(self, arg):
        """Extend the wrapped data object with the elements of the iterable arg."""
        self.__data = extend_data(self.__data, arg)

    def __deepcopy__(self, memo):
        """
        Define a custom copy method for deep copy.

        This is needed due to delegation of __getattr__ to the data to
        ensure proper copy.

        :param memo:
        :return: Deep copy of self, i.e., a new instance of DataIO wrapping a deepcopy of the
                 self.data object.
        """
        result = DataIO(data=copy.deepcopy(self.__data))
        memo[id(self)] = result
        return result

    def __len__(self):
        """Number of values in self.data"""
        if self.__shape is not None:
            return self.__shape[0]
        if not self.valid:
            raise InvalidDataIOError("Cannot get length of data. Data is not valid.")
        return len(self.data)

    def __bool__(self):
        if self.valid:
            if isinstance(self.data, AbstractDataChunkIterator):
                return True
            return len(self) > 0
        return False

    def __getattr__(self, attr):
        """Delegate attribute lookup to the data object"""
        if attr == '__array_struct__' and not self.valid:
            # np.array() checks __array__ or __array_struct__ attribute dep. on numpy version
            raise InvalidDataIOError("Cannot convert data to array. Data is not valid.")
        if not self.valid:
            raise InvalidDataIOError("Cannot get attribute '%s' of data. Data is not valid." % attr)
        return getattr(self.data, attr)

    def __getitem__(self, item):
        """Delegate slicing to the data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot get item from data. Data is not valid.")
        return self.data[item]

    def __array__(self):
        """
        Support conversion of DataIO.data to a numpy array. This function is
        provided to improve transparent interoperability of DataIO with numpy.

        :return: An array instance of self.data
        """
        if not self.valid:
            raise InvalidDataIOError("Cannot convert data to array. Data is not valid.")
        if hasattr(self.data, '__array__'):
            return self.data.__array__()
        elif isinstance(self.data, DataChunkIterator):
            raise NotImplementedError("Conversion of DataChunkIterator to array not supported")
        else:
            # NOTE: this may result in a copy of the array
            return np.asarray(self.data)

    def __next__(self):
        """Delegate iteration interface to the data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot iterate on data. Data is not valid.")
        return self.data.__next__()

    def __iter__(self):
        """Delegate iteration interface to the data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot iterate on data. Data is not valid.")
        return self.data.__iter__()

    @property
    def valid(self):
        """bool indicating if the data object is valid"""
        return self.data is not None
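
# Illustrative usage (a sketch, not part of the module): wrap an array so
# backends can attach I/O options, while reads pass through transparently.
#
#     wrapped = DataIO(data=np.arange(6))
#     wrapped[2:4]     # -> array([2, 3]): slicing is delegated to .data
#     len(wrapped)     # -> 6
#     wrapped.valid    # -> True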


class InvalidDataIOError(Exception):
    pass