Coverage for src/hdmf/data_utils.py: 92%

441 statements  


import copy
import math
from abc import ABCMeta, abstractmethod
from collections.abc import Iterable
from warnings import warn
from typing import Tuple
from itertools import product, chain

import h5py
import numpy as np

from .utils import docval, getargs, popargs, docval_macro, get_data_shape


def append_data(data, arg):
    if isinstance(data, (list, DataIO)):
        data.append(arg)
        return data
    elif isinstance(data, np.ndarray):
        return np.append(data, np.expand_dims(arg, axis=0), axis=0)
    elif isinstance(data, h5py.Dataset):
        shape = list(data.shape)
        shape[0] += 1
        data.resize(shape)
        data[-1] = arg
        return data
    else:
        msg = "Cannot append data to object of type '%s'" % type(data)
        raise ValueError(msg)


def extend_data(data, arg):
    """Add all the elements of the iterable arg to the end of data.

    :param data: The array to extend
    :type data: list, DataIO, np.ndarray, h5py.Dataset
    """
    if isinstance(data, (list, DataIO)):
        data.extend(arg)
        return data
    elif isinstance(data, np.ndarray):
        return np.vstack((data, arg))
    elif isinstance(data, h5py.Dataset):
        shape = list(data.shape)
        shape[0] += len(arg)
        data.resize(shape)
        data[-len(arg):] = arg
        return data
    else:
        msg = "Cannot extend object of type '%s'" % type(data)
        raise ValueError(msg)

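# A brief usage sketch of the two helpers above (hypothetical values): lists delegate to
# list.append/list.extend and are modified in place, while numpy arrays grow along the
# first axis and a new array is returned.
#
# >>> append_data([1, 2], 3)
# [1, 2, 3]
# >>> extend_data(np.zeros((2, 3)), np.ones((1, 3))).shape
# (3, 3)
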
@docval_macro('array_data')
class AbstractDataChunkIterator(metaclass=ABCMeta):
    """
    Abstract iterator class used to iterate over DataChunks.

    Derived classes must ensure that all abstract methods and abstract properties are implemented, in
    particular, dtype, maxshape, __iter__, __next__, recommended_chunk_shape, and recommended_data_shape.

    Iterating over AbstractContainer objects is not yet supported.
    """

    @abstractmethod
    def __iter__(self):
        """Return the iterator object"""
        raise NotImplementedError("__iter__ not implemented for derived class")

    @abstractmethod
    def __next__(self):
        r"""
        Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.

        HINT: numpy.s\_ provides a convenient way to generate index tuples using standard array slicing. This
        is often useful to define the DataChunk.selection of the current chunk.

        :returns: DataChunk object with the data and selection of the current chunk
        :rtype: DataChunk
        """
        raise NotImplementedError("__next__ not implemented for derived class")

    @abstractmethod
    def recommended_chunk_shape(self):
        """
        Recommend the chunk shape for the data array.

        :return: NumPy-style shape tuple describing the recommended shape for the chunks of the target
                 array or None. This may or may not be the same as the shape of the chunks returned in the
                 iteration process.
        """
        raise NotImplementedError("recommended_chunk_shape not implemented for derived class")

    @abstractmethod
    def recommended_data_shape(self):
        """
        Recommend the initial shape for the data array.

        This is useful in particular to avoid repeated resizing of the target array when reading from
        this data iterator. This should typically be either the final size of the array or the known
        minimal shape of the array.

        :return: NumPy-style shape tuple indicating the recommended initial shape for the target array.
                 This may or may not be the final full shape of the array, i.e., the array is allowed
                 to grow. This should not be None.
        """
        raise NotImplementedError("recommended_data_shape not implemented for derived class")

    @property
    @abstractmethod
    def dtype(self):
        """
        Define the data type of the array.

        :return: NumPy-style dtype or otherwise compliant dtype string
        """
        raise NotImplementedError("dtype not implemented for derived class")

    @property
    @abstractmethod
    def maxshape(self):
        """
        Property describing the maximum shape of the data array that is being iterated over.

        :return: NumPy-style shape tuple indicating the maximum dimensions up to which the dataset may be
                 resized. Axes with None are unlimited.
        """
        raise NotImplementedError("maxshape not implemented for derived class")

class GenericDataChunkIterator(AbstractDataChunkIterator):
    """DataChunkIterator that lets the user specify chunk and buffer shapes."""

    __docval_init = (
        dict(
            name="buffer_gb",
            type=(float, int),
            doc=(
                "If buffer_shape is not specified, it will be inferred as the smallest chunk "
                "below the buffer_gb threshold. "
                "Defaults to 1 GB."
            ),
            default=None,
        ),
        dict(
            name="buffer_shape",
            type=tuple,
            doc="Manually defined shape of the buffer.",
            default=None,
        ),
        dict(
            name="chunk_mb",
            type=(float, int),
            doc=(
                "If chunk_shape is not specified, it will be inferred as the smallest chunk "
                "below the chunk_mb threshold. "
                "Defaults to 1 MB."
            ),
            default=None,
        ),
        dict(
            name="chunk_shape",
            type=tuple,
            doc="Manually defined shape of the chunks.",
            default=None,
        ),
        dict(
            name="display_progress",
            type=bool,
            doc="Display a progress bar with iteration rate and estimated completion time.",
            default=False,
        ),
        dict(
            name="progress_bar_options",
            type=None,
            doc="Dictionary of keyword arguments to be passed directly to tqdm.",
            default=None,
        ),
    )

    @docval(*__docval_init)
    def __init__(self, **kwargs):
        """
        Break a dataset into buffers containing multiple chunks to be written into an HDF5 dataset.

        Basic users should set the buffer_gb argument to as much free RAM space as can be safely allocated.
        Advanced users are offered full control over the shape parameters for the buffer and the chunks; however,
        the chunk shape must perfectly divide the buffer shape along each axis.

        HDF5 recommends setting chunk_mb to no more than 1 MB for optimal caching speeds.
        See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf
        for more details.
        """
        buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs(
            "buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs
        )

        if buffer_gb is None and buffer_shape is None:
            buffer_gb = 1.0
        if chunk_mb is None and chunk_shape is None:
            chunk_mb = 1.0
        assert (buffer_gb is not None) != (
            buffer_shape is not None
        ), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"
        assert (chunk_mb is not None) != (
            chunk_shape is not None
        ), "Only one of 'chunk_mb' or 'chunk_shape' can be specified!"

        self._dtype = self._get_dtype()
        self._maxshape = tuple(int(x) for x in self._get_maxshape())
        chunk_shape = tuple(int(x) for x in chunk_shape) if chunk_shape else chunk_shape
        self.chunk_shape = chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb)
        buffer_shape = tuple(int(x) for x in buffer_shape) if buffer_shape else buffer_shape
        self.buffer_shape = buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb)

        # Shape assertions
        assert all(
            buffer_axis > 0 for buffer_axis in self.buffer_shape
        ), f"Some dimensions of buffer_shape ({self.buffer_shape}) are less than or equal to zero!"
        assert all(
            chunk_axis <= maxshape_axis for chunk_axis, maxshape_axis in zip(self.chunk_shape, self.maxshape)
        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the data dimensions ({self.maxshape})!"
        assert all(
            buffer_axis <= maxshape_axis for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
        ), f"Some dimensions of buffer_shape ({self.buffer_shape}) exceed the data dimensions ({self.maxshape})!"
        assert all(
            chunk_axis <= buffer_axis for chunk_axis, buffer_axis in zip(self.chunk_shape, self.buffer_shape)
        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the buffer shape ({self.buffer_shape})!"
        assert all(
            buffer_axis % chunk_axis == 0
            for chunk_axis, buffer_axis, maxshape_axis in zip(self.chunk_shape, self.buffer_shape, self.maxshape)
            if buffer_axis != maxshape_axis
        ), (
            f"Some dimensions of chunk_shape ({self.chunk_shape}) do not "
            f"evenly divide the buffer shape ({self.buffer_shape})!"
        )

        self.num_buffers = math.prod(
            [
                math.ceil(maxshape_axis / buffer_axis)
                for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
            ],
        )
        self.buffer_selection_generator = (
            tuple(
                [
                    slice(lower_bound, upper_bound)
                    for lower_bound, upper_bound in zip(lower_bounds, upper_bounds)
                ]
            )
            for lower_bounds, upper_bounds in zip(
                product(
                    *[
                        range(0, max_shape_axis, buffer_shape_axis)
                        for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape)
                    ]
                ),
                product(
                    *[
                        chain(range(buffer_shape_axis, max_shape_axis, buffer_shape_axis), [max_shape_axis])
                        for max_shape_axis, buffer_shape_axis in zip(self.maxshape, self.buffer_shape)
                    ]
                ),
            )
        )

        if self.display_progress:
            if self.progress_bar_options is None:
                self.progress_bar_options = dict()

            try:
                from tqdm import tqdm

                if "total" in self.progress_bar_options:
                    warn("Option 'total' in 'progress_bar_options' is not allowed to be over-written! Ignoring.")
                    self.progress_bar_options.pop("total")
                self.progress_bar = tqdm(total=self.num_buffers, **self.progress_bar_options)
            except ImportError:
                warn(
                    "You must install tqdm to use the progress bar feature (pip install tqdm)! "
                    "Progress bar is disabled."
                )
                self.display_progress = False

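    # Worked example of the buffer partitioning set up above (hypothetical values): for
    # maxshape (5,) and buffer_shape (2,), the lower bounds iterate over range(0, 5, 2)
    # and the upper bounds over chain(range(2, 5, 2), [5]), so the generator yields the
    # selections (slice(0, 2),), (slice(2, 4),), and (slice(4, 5),). num_buffers is then
    # math.ceil(5 / 2) == 3, and the trailing buffer is truncated at the data bounds.
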
    @docval(
        dict(
            name="chunk_mb",
            type=(float, int),
            doc="Size of the HDF5 chunk in megabytes. Recommended to be less than 1 MB.",
            default=None,
        )
    )
    def _get_default_chunk_shape(self, **kwargs) -> Tuple[int, ...]:
        """
        Select chunk shape with size in MB less than the threshold of chunk_mb.

        Keeps the dimensional ratios of the original data.
        """
        chunk_mb = getargs("chunk_mb", kwargs)
        assert chunk_mb > 0, f"chunk_mb ({chunk_mb}) must be greater than zero!"

        n_dims = len(self.maxshape)
        itemsize = self.dtype.itemsize
        chunk_bytes = chunk_mb * 1e6

        min_maxshape = min(self.maxshape)
        v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in self.maxshape)
        prod_v = math.prod(v)
        while prod_v * itemsize > chunk_bytes and prod_v != 1:
            non_unit_min_v = min(x for x in v if x != 1)
            v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v)
            prod_v = math.prod(v)
        k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))
        return tuple([min(k * x, self.maxshape[dim]) for dim, x in enumerate(v)])

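    # Worked example of the heuristic above (hypothetical values): for maxshape
    # (2000, 384) and float64 data (itemsize 8) with chunk_mb=1, the aspect-ratio
    # vector is v = (5, 1) and prod_v * itemsize = 40 bytes, so the while loop is
    # skipped and k = floor(sqrt(1e6 / 40)) = 158. The resulting chunk_shape is
    # (min(158 * 5, 2000), min(158 * 1, 384)) = (790, 158), i.e.
    # 790 * 158 * 8 = 998,560 bytes, just under the 1 MB threshold.
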
    @docval(
        dict(
            name="buffer_gb",
            type=(float, int),
            doc="Size of the data buffer in gigabytes. Recommended to be as much free RAM as safely available.",
            default=None,
        )
    )
    def _get_default_buffer_shape(self, **kwargs) -> Tuple[int, ...]:
        """
        Select buffer shape with size in GB less than the threshold of buffer_gb.

        Keeps the dimensional ratios of the original data.
        Assumes the chunk_shape has already been set.
        """
        buffer_gb = getargs("buffer_gb", kwargs)
        assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!"
        assert all(chunk_axis > 0 for chunk_axis in self.chunk_shape), (
            f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than or equal to zero!"
        )

        k = math.floor(
            (
                buffer_gb * 1e9 / (math.prod(self.chunk_shape) * self.dtype.itemsize)
            ) ** (1 / len(self.chunk_shape))
        )
        return tuple(
            [
                min(max(k * x, self.chunk_shape[j]), self.maxshape[j])
                for j, x in enumerate(self.chunk_shape)
            ]
        )

    def recommended_chunk_shape(self) -> Tuple[int, ...]:
        return self.chunk_shape

    def recommended_data_shape(self) -> Tuple[int, ...]:
        return self.maxshape

    def __iter__(self):
        return self

    def __next__(self):
        """
        Retrieve the next DataChunk object from the buffer, refilling the buffer if necessary.

        :returns: DataChunk object with the data and selection of the current buffer.
        :rtype: DataChunk
        """
        if self.display_progress:
            self.progress_bar.update(n=1)
        try:
            buffer_selection = next(self.buffer_selection_generator)
            return DataChunk(data=self._get_data(selection=buffer_selection), selection=buffer_selection)
        except StopIteration:
            if self.display_progress:
                self.progress_bar.write("\n")  # Allows text to be written to new lines after completion
            raise StopIteration

    @abstractmethod
    def _get_data(self, selection: Tuple[slice]) -> np.ndarray:
        """
        Retrieve the data specified by the selection using minimal I/O.

        The developer of a new implementation of the GenericDataChunkIterator must ensure the data is actually
        loaded into memory, and not simply mapped.

        :param selection: tuple of slices, one per axis of maxshape, indicating the selection from which to pull
                          data into the buffer
        :type selection: tuple of slices

        :returns: Array of data specified by selection
        :rtype: np.ndarray
        """
        raise NotImplementedError("The data fetching method has not been built for this DataChunkIterator!")

    @property
    def maxshape(self) -> Tuple[int, ...]:
        return self._maxshape

    @abstractmethod
    def _get_maxshape(self) -> Tuple[int, ...]:
        """Retrieve the maximum bounds of the data shape using minimal I/O."""
        raise NotImplementedError(
            "The method for retrieving the maxshape has not been built for this DataChunkIterator!")

    @property
    def dtype(self) -> np.dtype:
        return self._dtype

    @abstractmethod
    def _get_dtype(self) -> np.dtype:
        """Retrieve the dtype of the data using minimal I/O."""
        raise NotImplementedError("The method for retrieving the dtype has not been built for this DataChunkIterator!")

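# A minimal sketch of a concrete GenericDataChunkIterator, assuming an in-memory numpy
# array as the data source (hypothetical class, not part of this module). The three
# _get_* hooks above are the only methods a subclass must implement; the attribute must
# be set before calling super().__init__(), which invokes the hooks.
#
# >>> class NumpyChunkIterator(GenericDataChunkIterator):
# ...     def __init__(self, array, **kwargs):
# ...         self._array = array
# ...         super().__init__(**kwargs)
# ...     def _get_data(self, selection):
# ...         return self._array[selection]
# ...     def _get_maxshape(self):
# ...         return self._array.shape
# ...     def _get_dtype(self):
# ...         return self._array.dtype
# >>> it = NumpyChunkIterator(np.zeros((2000, 384)), buffer_gb=0.1)
# >>> for chunk in it:
# ...     pass  # each chunk is a DataChunk with .data and .selection
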
class DataChunkIterator(AbstractDataChunkIterator):
    """
    Custom iterator class used to iterate over chunks of data.

    This default implementation of AbstractDataChunkIterator accepts any iterable and assumes that we iterate over
    a single dimension of the data array (default: the first dimension). DataChunkIterator supports buffered read,
    i.e., multiple values from the input iterator can be combined into a single chunk. This is
    useful for buffered I/O operations, e.g., to improve performance by accumulating data
    in memory and writing larger blocks at once.

    .. note::

        DataChunkIterator assumes that the iterator that it wraps returns one element along the
        iteration dimension at a time. I.e., the iterator is expected to return chunks that are
        one dimension lower than the array itself. For example, when iterating over the first dimension
        of a dataset with shape (1000, 10, 10), the iterator would return 1000 chunks of
        shape (10, 10), one chunk at a time. If this pattern does not match your use case, then
        using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` or
        :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more appropriate.
    """

    __docval_init = (
        {'name': 'data', 'type': None, 'doc': 'The data object used for iteration', 'default': None},
        {'name': 'maxshape', 'type': tuple,
         'doc': 'The maximum shape of the full data array. Use None to indicate unlimited dimensions',
         'default': None},
        {'name': 'dtype', 'type': np.dtype, 'doc': 'The Numpy data type for the array', 'default': None},
        {'name': 'buffer_size', 'type': int, 'doc': 'Number of values to be buffered in a chunk', 'default': 1},
        {'name': 'iter_axis', 'type': int, 'doc': 'The dimension to iterate over', 'default': 0}
    )

    @docval(*__docval_init)
    def __init__(self, **kwargs):
        """Initialize the DataChunkIterator.

        If 'data' is an iterator and 'dtype' is not specified, then next is called on the iterator in order to
        determine the dtype of the data.
        """
        # Get the user parameters
        self.data, self.__maxshape, self.__dtype, self.buffer_size, self.iter_axis = getargs(
            'data', 'maxshape', 'dtype', 'buffer_size', 'iter_axis', kwargs)
        self.chunk_index = 0
        # Create an iterator for the data if possible
        if isinstance(self.data, Iterable):
            if self.iter_axis != 0 and isinstance(self.data, (list, tuple)):
                warn('Iterating over an axis other than the first dimension of list or tuple data '
                     'involves converting the data object to a numpy ndarray, which may incur a computational '
                     'cost.')
                self.data = np.asarray(self.data)
            if isinstance(self.data, np.ndarray):
                # iterate over the given axis by adding a new view on data (iter only works on the first dim)
                self.__data_iter = iter(np.moveaxis(self.data, self.iter_axis, 0))
            else:
                self.__data_iter = iter(self.data)
        else:
            self.__data_iter = None
        self.__next_chunk = DataChunk(None, None)
        self.__next_chunk_start = 0
        self.__first_chunk_shape = None
        # Determine the shape of the data if possible
        if self.__maxshape is None:
            # If the self.data object identifies its shape, then use it
            if hasattr(self.data, "shape"):
                self.__maxshape = self.data.shape
                # Avoid the special case of scalar values by making them into a 1D numpy array
                if len(self.__maxshape) == 0:
                    self.data = np.asarray([self.data, ])
                    self.__maxshape = self.data.shape
                    self.__data_iter = iter(self.data)
            # Try to get an accurate idea of __maxshape for other Python data structures if possible.
            # Don't just call get_data_shape for a generator as that would potentially trigger loading of all the data
            elif isinstance(self.data, (list, tuple)):
                self.__maxshape = get_data_shape(self.data, strict_no_data_load=True)

        # If we have a data iterator and do not know the dtype, then read the first chunk
        if self.__data_iter is not None and self.__dtype is None:
            self._read_next_chunk()

        # Determine the type of the data if possible
        if self.__next_chunk.data is not None:
            self.__dtype = self.__next_chunk.data.dtype
            self.__first_chunk_shape = get_data_shape(self.__next_chunk.data)

        # This should be done as a last resort only
        if self.__first_chunk_shape is None and self.__maxshape is not None:
            self.__first_chunk_shape = tuple(1 if i is None else i for i in self.__maxshape)

        if self.__dtype is None:
            raise Exception('Data type could not be determined. Please specify dtype in DataChunkIterator init.')

    @classmethod
    @docval(*__docval_init)
    def from_iterable(cls, **kwargs):
        return cls(**kwargs)

    def __iter__(self):
        """Return the iterator object"""
        return self

    def _read_next_chunk(self):
        """Read a single chunk from self.__data_iter and store the results in self.__next_chunk

        :returns: self.__next_chunk, i.e., the DataChunk object describing the next chunk
        """
        from h5py import Dataset as H5Dataset
        if isinstance(self.data, H5Dataset):
            start_index = self.chunk_index * self.buffer_size
            stop_index = start_index + self.buffer_size
            iter_data_bounds = self.data.shape[self.iter_axis]
            if start_index >= iter_data_bounds:
                self.__next_chunk = DataChunk(None, None)
            else:
                if stop_index > iter_data_bounds:
                    stop_index = iter_data_bounds

                selection = [slice(None)] * len(self.maxshape)
                selection[self.iter_axis] = slice(start_index, stop_index)
                selection = tuple(selection)
                self.__next_chunk.data = self.data[selection]
                self.__next_chunk.selection = selection
        elif self.__data_iter is not None:
            # the pieces in the buffer - first dimension consists of individual calls to next
            iter_pieces = []
            # offset of where data begins - shift the selection of where to place this chunk by this much
            curr_chunk_offset = 0
            read_next_empty = False
            while len(iter_pieces) < self.buffer_size:
                try:
                    dat = next(self.__data_iter)
                    if dat is None and len(iter_pieces) == 0:
                        # Skip forward in our chunk until we find data
                        curr_chunk_offset += 1
                    elif dat is None and len(iter_pieces) > 0:
                        # Stop iteration if we hit empty data while constructing our block
                        # Buffer may not be full.
                        read_next_empty = True
                        break
                    else:
                        # Add pieces of data to our buffer
                        iter_pieces.append(np.asarray(dat))
                except StopIteration:
                    break

            if len(iter_pieces) == 0:
                self.__next_chunk = DataChunk(None, None)  # signal end of iteration
            else:
                # concatenate all the pieces into the chunk along the iteration axis
                piece_shape = list(get_data_shape(iter_pieces[0]))
                piece_shape.insert(self.iter_axis, 1)  # insert the missing axis
                next_chunk_shape = piece_shape.copy()
                next_chunk_shape[self.iter_axis] *= len(iter_pieces)
                next_chunk_size = next_chunk_shape[self.iter_axis]

                # NOTE: np.stack uses the common dtype of the pieces, which could be problematic
                # if a generator returns e.g. floats first and ints later
                self.__next_chunk.data = np.stack(iter_pieces, axis=self.iter_axis)

                selection = [slice(None)] * len(self.maxshape)
                selection[self.iter_axis] = slice(self.__next_chunk_start + curr_chunk_offset,
                                                  self.__next_chunk_start + curr_chunk_offset + next_chunk_size)
                self.__next_chunk.selection = tuple(selection)

                # next chunk should start at self.__next_chunk.selection[self.iter_axis].stop
                # but if this chunk stopped because of reading empty data, then this should be adjusted by 1
                self.__next_chunk_start = self.__next_chunk.selection[self.iter_axis].stop
                if read_next_empty:
                    self.__next_chunk_start += 1
        else:
            self.__next_chunk = DataChunk(None, None)

        self.chunk_index += 1
        return self.__next_chunk

    def __next__(self):
        """
        Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.

        .. tip::

            :py:attr:`numpy.s_` provides a convenient way to generate index tuples using standard array slicing.
            This is often useful to define the DataChunk.selection of the current chunk.

        :returns: DataChunk object with the data and selection of the current chunk
        :rtype: DataChunk
        """
        # If we have not already read the next chunk, then read it now
        if self.__next_chunk.data is None:
            self._read_next_chunk()
        # If we do not have any next chunk
        if self.__next_chunk.data is None:
            raise StopIteration
        # If this is the first time we see a chunk then remember the size of the first chunk
        if self.__first_chunk_shape is None:
            self.__first_chunk_shape = self.__next_chunk.data.shape
        # Keep the next chunk we need to return
        curr_chunk = DataChunk(self.__next_chunk.data,
                               self.__next_chunk.selection)
        # Remove the data for the next chunk from our list since we are returning it here.
        # This is to allow the GarbageCollector to remove the data when it goes out of scope and avoid
        # having 2 full chunks in memory if not necessary
        self.__next_chunk.data = None
        # Return the current next chunk
        return curr_chunk

    next = __next__

    @docval(returns='Tuple with the recommended chunk shape or None if no particular shape is recommended.')
    def recommended_chunk_shape(self):
        """Recommend a chunk shape.

        To optimize iterative write, the chunk should be aligned with the common shape of chunks returned by
        __next__ or, if those chunks are too large, a well-aligned subset of those chunks. This may also be
        any other value in case one wants to recommend chunk shapes to optimize read rather
        than write. The default implementation returns None, indicating no preferential chunking option."""
        return None

    @docval(returns='Recommended initial shape for the full data. This should be the shape of the full dataset '
                    'if known beforehand or alternatively the minimum shape of the dataset. Return None if no '
                    'recommendation is available')
    def recommended_data_shape(self):
        """Recommend an initial shape of the data. This is useful when progressively writing data and
        we want to recommend an initial size for the dataset"""
        if self.maxshape is not None:
            if np.all([i is not None for i in self.maxshape]):
                return self.maxshape
        return self.__first_chunk_shape

    @property
    def maxshape(self):
        """
        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator.

        .. note::

            If an iterator is provided and no data has been read yet, then the first chunk will be read
            (i.e., next will be called on the iterator) in order to determine the maxshape. The iterator
            is expected to return single chunks along the iteration dimension, which means that maxshape will
            add an additional dimension along the iteration dimension. E.g., if we iterate over
            the first dimension and the iterator returns chunks of shape (10, 10), then the maxshape would
            be (None, 10, 10) or (len(self.data), 10, 10), depending on whether the size of the
            iteration dimension is known.

        :return: Shape tuple. None is used for dimensions where the maximum shape is not known or unlimited.
        """
        if self.__maxshape is None:
            # If no data has been read from the iterator yet, read the first chunk and use it to determine the
            # maxshape
            if self.__data_iter is not None and self.__next_chunk.data is None:
                self._read_next_chunk()

            # Determine maxshape from self.__next_chunk
            if self.__next_chunk.data is None:
                return None
            data_shape = get_data_shape(self.__next_chunk.data)
            self.__maxshape = list(data_shape)
            try:
                # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is
                # just a chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of
                # self.data if possible. Otherwise, use None to represent an unlimited size
                if hasattr(self.data, '__len__') and self.iter_axis == 0:
                    # special case of 1-D array
                    self.__maxshape[0] = len(self.data)
                else:
                    self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
            except AttributeError:  # from self.data.shape
                self.__maxshape[self.iter_axis] = None
            self.__maxshape = tuple(self.__maxshape)

        return self.__maxshape

    @property
    def dtype(self):
        """
        Get the value data type.

        :return: np.dtype object describing the datatype
        """
        return self.__dtype

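# A minimal usage sketch (hypothetical values): wrap a generator whose length is unknown;
# maxshape becomes (None, 10) because the iteration dimension cannot be determined, and
# with buffer_size=2 each DataChunk stacks two rows from the generator.
#
# >>> dci = DataChunkIterator(data=(np.arange(10) for _ in range(5)), buffer_size=2)
# >>> dci.maxshape
# (None, 10)
# >>> chunk = next(dci)
# >>> chunk.data.shape, chunk.selection
# ((2, 10), (slice(0, 2, None), slice(None, None, None)))
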
class DataChunk:
    """
    Class used to describe a data chunk. Used in DataChunkIterator.
    """

    @docval({'name': 'data', 'type': np.ndarray,
             'doc': 'Numpy array with the data value(s) of the chunk', 'default': None},
            {'name': 'selection', 'type': None,
             'doc': 'Numpy index tuple describing the location of the chunk', 'default': None})
    def __init__(self, **kwargs):
        self.data, self.selection = getargs('data', 'selection', kwargs)

    def __len__(self):
        """Get the number of values in the data chunk"""
        if self.data is not None:
            return len(self.data)
        else:
            return 0

    def __getattr__(self, attr):
        """Delegate retrieval of attributes to the data in self.data"""
        return getattr(self.data, attr)

    def __copy__(self):
        newobj = DataChunk(data=self.data,
                           selection=self.selection)
        return newobj

    def __deepcopy__(self, memo):
        result = DataChunk(data=copy.deepcopy(self.data),
                           selection=copy.deepcopy(self.selection))
        memo[id(self)] = result
        return result

    def astype(self, dtype):
        """Get a new DataChunk with the self.data converted to the given type"""
        return DataChunk(data=self.data.astype(dtype),
                         selection=self.selection)

    @property
    def dtype(self):
        """
        Data type of the values in the chunk

        :returns: np.dtype of the values in the DataChunk
        """
        return self.data.dtype

    def get_min_bounds(self):
        """
        Helper function to compute the minimum dataset size required to fit the selection of this chunk.

        :raises TypeError: If the selection is not a single int, slice, or tuple of slices.

        :return: Tuple with the minimum shape required to store the selection
        """
        if isinstance(self.selection, tuple):
            # Determine the minimum array dimensions to fit the chunk selection
            max_bounds = tuple([x.stop or 0 if isinstance(x, slice) else x + 1 for x in self.selection])
        elif isinstance(self.selection, int):
            max_bounds = (self.selection + 1, )
        elif isinstance(self.selection, slice):
            max_bounds = (self.selection.stop or 0, )
        else:
            # Note: Technically any numpy index tuple would be allowed, but h5py is not as general and this case
            #       only implements the selections supported by h5py. We could add more cases to support a
            #       broader range of valid numpy selection types
            msg = ("Chunk selection %s must be a single int, single slice, or tuple of slices "
                   "and/or integers") % str(self.selection)
            raise TypeError(msg)
        return max_bounds

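# A short sketch of get_min_bounds (hypothetical values): a chunk destined for rows 5-10
# of a 2-D array needs a dataset of at least shape (10, 20) to fit its selection.
#
# >>> chunk = DataChunk(data=np.zeros((5, 20)), selection=np.s_[5:10, 0:20])
# >>> chunk.get_min_bounds()
# (10, 20)
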
def assertEqualShape(data1,
                     data2,
                     axes1=None,
                     axes2=None,
                     name1=None,
                     name2=None,
                     ignore_undetermined=True):
    """
    Ensure that the shape of data1 and data2 match along the given dimensions

    :param data1: The first input array
    :type data1: List, Tuple, np.ndarray, DataChunkIterator, etc.
    :param data2: The second input array
    :type data2: List, Tuple, np.ndarray, DataChunkIterator, etc.
    :param name1: Optional string with the name of data1
    :param name2: Optional string with the name of data2
    :param axes1: The dimensions of data1 that should be matched to the dimensions of data2. Set to None to
                  compare all axes in order.
    :type axes1: int, Tuple of ints, List of ints, or None
    :param axes2: The dimensions of data2 that should be matched to the dimensions of data1. Must have
                  the same length as axes1. Set to None to compare all axes in order.
    :type axes2: int, Tuple of ints, List of ints, or None
    :param ignore_undetermined: Boolean indicating whether non-matching unlimited dimensions should be ignored,
                                i.e., if two dimensions don't match because we can't determine the shape of
                                either one, then should we ignore that case or treat it as no match

    :return: Bool indicating whether the check passed and a string with a message about the matching process
    """
    # Create the base return object
    response = ShapeValidatorResult()
    # Determine the shape of the datasets
    response.shape1 = get_data_shape(data1)
    response.shape2 = get_data_shape(data2)
    # Determine the number of dimensions of the datasets
    num_dims_1 = len(response.shape1) if response.shape1 is not None else None
    num_dims_2 = len(response.shape2) if response.shape2 is not None else None
    # Determine the string names of the datasets
    n1 = name1 if name1 is not None else ("data1 at " + str(hex(id(data1))))
    n2 = name2 if name2 is not None else ("data2 at " + str(hex(id(data2))))
    # Determine the axes we should compare
    response.axes1 = list(range(num_dims_1)) if axes1 is None else ([axes1] if isinstance(axes1, int) else axes1)
    response.axes2 = list(range(num_dims_2)) if axes2 is None else ([axes2] if isinstance(axes2, int) else axes2)
    # Validate the array shape
    # 1) Check the number of dimensions of the arrays
    if (axes1 is None and axes2 is None) and num_dims_1 != num_dims_2:
        response.result = False
        response.error = 'NUM_DIMS_ERROR'
        response.message = response.SHAPE_ERROR[response.error]
        response.message += " %s is %sD and %s is %sD" % (n1, num_dims_1, n2, num_dims_2)
    # 2) Check that we have the same number of dimensions to compare on both arrays
    elif len(response.axes1) != len(response.axes2):
        response.result = False
        response.error = 'NUM_AXES_ERROR'
        response.message = response.SHAPE_ERROR[response.error]
        response.message += " Cannot compare axes %s with %s" % (str(response.axes1), str(response.axes2))
    # 3) Check that the datasets have a sufficient number of dimensions
    elif np.max(response.axes1) >= num_dims_1 or np.max(response.axes2) >= num_dims_2:
        response.result = False
        response.error = 'AXIS_OUT_OF_BOUNDS'
        response.message = response.SHAPE_ERROR[response.error]
        if np.max(response.axes1) >= num_dims_1:
            response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \
                                (n1, np.max(response.axes1) + 1, num_dims_1)
        elif np.max(response.axes2) >= num_dims_2:
            response.message += " Insufficient number of dimensions for %s -- Expected %i found %i" % \
                                (n2, np.max(response.axes2) + 1, num_dims_2)
    # 4) Compare the length of the dimensions we should validate
    else:
        unmatched = []
        ignored = []
        for ax in zip(response.axes1, response.axes2):
            if response.shape1[ax[0]] != response.shape2[ax[1]]:
                if ignore_undetermined and (response.shape1[ax[0]] is None or response.shape2[ax[1]] is None):
                    ignored.append(ax)
                else:
                    unmatched.append(ax)
        response.unmatched = unmatched
        response.ignored = ignored

        # Check if everything checked out
        if len(response.unmatched) == 0:
            response.result = True
            response.error = None
            response.message = response.SHAPE_ERROR[response.error]
            if len(response.ignored) > 0:
                response.message += " Ignored undetermined axes %s" % str(response.ignored)
        else:
            response.result = False
            response.error = 'AXIS_LEN_ERROR'
            response.message = response.SHAPE_ERROR[response.error]
            response.message += " Axes %s with sizes %s of %s did not match axes %s with sizes %s of %s." % \
                                (str([un[0] for un in response.unmatched]),
                                 str([response.shape1[un[0]] for un in response.unmatched]),
                                 n1,
                                 str([un[1] for un in response.unmatched]),
                                 str([response.shape2[un[1]] for un in response.unmatched]),
                                 n2)
            if len(response.ignored) > 0:
                response.message += " Ignored undetermined axes %s" % str(response.ignored)
    return response

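# A quick sketch of shape validation (hypothetical arrays): match the second axis of one
# array against the first axis of another, as in checking that a matrix product is valid.
#
# >>> result = assertEqualShape(np.zeros((4, 3)), np.zeros((3, 5)), axes1=1, axes2=0)
# >>> result.result
# True
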
class ShapeValidatorResult:
    """Class for storing results from validating the shape of multi-dimensional arrays.

    This class is used to store results generated by ShapeValidator

    :ivar result: Boolean indicating whether results matched or not
    :type result: bool
    :ivar message: Message indicating the result of the matching procedure
    :type message: str, None
    """
    SHAPE_ERROR = {None: 'All required axes matched',
                   'NUM_DIMS_ERROR': 'Unequal number of dimensions.',
                   'NUM_AXES_ERROR': "Unequal number of axes for comparison.",
                   'AXIS_OUT_OF_BOUNDS': "Axis index for comparison out of bounds.",
                   'AXIS_LEN_ERROR': "Unequal length of axes."}
    """
    Dict where the keys are the types of errors that may have occurred during shape comparison and the
    values are strings with default error messages for each type.
    """

    @docval({'name': 'result', 'type': bool, 'doc': 'Result of the shape validation', 'default': False},
            {'name': 'message', 'type': str,
             'doc': 'Message describing the result of the shape validation', 'default': None},
            {'name': 'ignored', 'type': tuple,
             'doc': 'Axes that have been ignored in the validation process', 'default': tuple(), 'shape': (None,)},
            {'name': 'unmatched', 'type': tuple,
             'doc': 'List of axes that did not match during shape validation', 'default': tuple(), 'shape': (None,)},
            {'name': 'error', 'type': str, 'doc': 'Error that may have occurred. One of ERROR_TYPE', 'default': None},
            {'name': 'shape1', 'type': tuple,
             'doc': 'Shape of the first array for comparison', 'default': tuple(), 'shape': (None,)},
            {'name': 'shape2', 'type': tuple,
             'doc': 'Shape of the second array for comparison', 'default': tuple(), 'shape': (None,)},
            {'name': 'axes1', 'type': tuple,
             'doc': 'Axes for the first array that should match', 'default': tuple(), 'shape': (None,)},
            {'name': 'axes2', 'type': tuple,
             'doc': 'Axes for the second array that should match', 'default': tuple(), 'shape': (None,)},
            )
    def __init__(self, **kwargs):
        self.result, self.message, self.ignored, self.unmatched, \
            self.error, self.shape1, self.shape2, self.axes1, self.axes2 = getargs(
                'result', 'message', 'ignored', 'unmatched', 'error', 'shape1', 'shape2', 'axes1', 'axes2', kwargs)

    def __setattr__(self, key, value):
        """
        Overwrite to ensure that, e.g., error_message is not set to an illegal value.
        """
        if key == 'error':
            if value not in self.SHAPE_ERROR.keys():
                raise ValueError("Illegal error type. Error must be one of ShapeValidatorResult.SHAPE_ERROR: %s"
                                 % str(self.SHAPE_ERROR))
            else:
                super().__setattr__(key, value)
        elif key in ['shape1', 'shape2', 'axes1', 'axes2', 'ignored', 'unmatched']:  # Make sure we store tuples
            super().__setattr__(key, tuple(value))
        else:
            super().__setattr__(key, value)

    def __getattr__(self, item):
        """
        Overwrite to allow dynamic retrieval of the default message
        """
        if item == 'default_message':
            return self.SHAPE_ERROR[self.error]
        return self.__getattribute__(item)

@docval_macro('data')
class DataIO:
    """
    Base class for wrapping data arrays for I/O. Derived classes of DataIO are typically
    used to pass dataset-specific I/O parameters to the particular HDMFIO backend.
    """

    @docval({'name': 'data',
             'type': 'array_data',
             'doc': 'the data to be written',
             'default': None},
            {'name': 'dtype',
             'type': (type, np.dtype),
             'doc': 'the data type of the dataset. Not used if data is specified.',
             'default': None},
            {'name': 'shape',
             'type': tuple,
             'doc': 'the shape of the dataset. Not used if data is specified.',
             'default': None})
    def __init__(self, **kwargs):
        data, dtype, shape = popargs('data', 'dtype', 'shape', kwargs)
        if data is None:
            if (dtype is None) ^ (shape is None):
                raise ValueError("Must specify both 'dtype' and 'shape' if not specifying 'data'")
        else:
            if dtype is not None:
                warn("Argument 'dtype' is ignored when 'data' is specified")
                dtype = None
            if shape is not None:
                warn("Argument 'shape' is ignored when 'data' is specified")
                shape = None
        self.__data = data
        self.__dtype = dtype
        self.__shape = shape

    def get_io_params(self):
        """
        Returns a dict with the I/O parameters specified in this DataIO.
        """
        return dict()

    @property
    def data(self):
        """Get the wrapped data object"""
        return self.__data

    @data.setter
    def data(self, val):
        """Set the wrapped data object"""
        if self.__data is not None:
            raise ValueError("cannot overwrite 'data' on DataIO")
        if not (self.__dtype is None and self.__shape is None):
            raise ValueError("Setting data when dtype and shape are not None is not supported")
        self.__data = val

    @property
    def dtype(self):
        """Get the dtype of the wrapped data object"""
        return self.__dtype or self.__getattr__("dtype")

    @property
    def shape(self):
        """Get the shape of the wrapped data object"""
        return self.__shape or self.__getattr__("shape")

    def __copy__(self):
        """
        Define a custom copy method for shallow copy.

        This is needed due to delegation of __getattr__ to the data to
        ensure proper copy.

        :return: Shallow copy of self, i.e., a new instance of DataIO wrapping the same self.data object
        """
        newobj = DataIO(data=self.data)
        return newobj

    def append(self, arg):
        self.__data = append_data(self.__data, arg)

    def extend(self, arg):
        self.__data = extend_data(self.__data, arg)

    def __deepcopy__(self, memo):
        """
        Define a custom copy method for deep copy.

        This is needed due to delegation of __getattr__ to the data to
        ensure proper copy.

        :param memo:
        :return: Deep copy of self, i.e., a new instance of DataIO wrapping a deepcopy of the
                 self.data object.
        """
        result = DataIO(data=copy.deepcopy(self.__data))
        memo[id(self)] = result
        return result

    def __len__(self):
        """Number of values in self.data"""
        if self.__shape is not None:
            return self.__shape[0]
        if not self.valid:
            raise InvalidDataIOError("Cannot get length of data. Data is not valid.")
        return len(self.data)

    def __bool__(self):
        if self.valid:
            if isinstance(self.data, AbstractDataChunkIterator):
                return True
            return len(self) > 0
        return False

    def __getattr__(self, attr):
        """Delegate attribute lookup to data object"""
        if attr == '__array_struct__' and not self.valid:
            # np.array() checks __array__ or __array_struct__ attribute dep. on numpy version
            raise InvalidDataIOError("Cannot convert data to array. Data is not valid.")
        if not self.valid:
            raise InvalidDataIOError("Cannot get attribute '%s' of data. Data is not valid." % attr)
        return getattr(self.data, attr)

    def __getitem__(self, item):
        """Delegate slicing to the data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot get item from data. Data is not valid.")
        return self.data[item]

    def __array__(self):
        """
        Support conversion of DataIO.data to a numpy array. This function is
        provided to improve transparent interoperability of DataIO with numpy.

        :return: An array instance of self.data
        """
        if not self.valid:
            raise InvalidDataIOError("Cannot convert data to array. Data is not valid.")
        if hasattr(self.data, '__array__'):
            return self.data.__array__()
        elif isinstance(self.data, DataChunkIterator):
            raise NotImplementedError("Conversion of DataChunkIterator to array not supported")
        else:
            # NOTE: this may result in a copy of the array
            return np.asarray(self.data)

    def __next__(self):
        """Delegate iteration interface to data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot iterate on data. Data is not valid.")
        return self.data.__next__()

    def __iter__(self):
        """Delegate iteration interface to the data object"""
        if not self.valid:
            raise InvalidDataIOError("Cannot iterate on data. Data is not valid.")
        return self.data.__iter__()

    @property
    def valid(self):
        """bool indicating if the data object is valid"""
        return self.data is not None


class InvalidDataIOError(Exception):
    pass
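

# A brief usage sketch of DataIO (hypothetical values): attribute access and slicing are
# delegated to the wrapped object, and append/extend route through append_data/extend_data.
#
# >>> dio = DataIO(data=[1, 2, 3])
# >>> dio.append(4)
# >>> dio.extend([5, 6])
# >>> len(dio), dio[0]
# (6, 1)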