Coverage for src/hdmf/backends/hdf5/h5tools.py: 87%

906 statements  

coverage.py v7.2.5, created at 2023-08-18 20:49 +0000

1import logging 

2import os.path 

3import warnings 

4from collections import deque 

5from functools import partial 

6from pathlib import Path, PurePosixPath as pp 

7 

8import numpy as np 

9import h5py 

10from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype 

11 

12from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO, 

13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue) 

14from ..io import HDMFIO 

15from ..errors import UnsupportedOperation 

16from ..warnings import BrokenLinkWarning 

17from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder, 

18 ReferenceBuilder, TypeMap, ObjectMapper) 

19from ...container import Container 

20from ...data_utils import AbstractDataChunkIterator 

21from ...spec import RefSpec, DtypeSpec, NamespaceCatalog 

22from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset 

23from ..utils import NamespaceToBuilderHelper, WriteStatusTracker 

24 

25ROOT_NAME = 'root' 

26SPEC_LOC_ATTR = '.specloc' 

27H5_TEXT = special_dtype(vlen=str) 

28H5_BINARY = special_dtype(vlen=bytes) 

29H5_REF = special_dtype(ref=Reference) 

30H5_REGREF = special_dtype(ref=RegionReference) 

31 

32RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB 

33 

34H5PY_3 = h5py.__version__.startswith('3') 

35 

36 

37class HDF5IO(HDMFIO): 

38 

39 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group 

40 

41 @staticmethod 

42 def can_read(path): 

43 """Determines whether a given path is readable by the HDF5IO class""" 

44 if not os.path.isfile(path): 

45 return False 

46 try: 

47 with h5py.File(path, "r"): 

48 return True 

49 except IOError: 

50 return False 

51 
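# Illustrative usage of can_read (editor's sketch, not part of the original source;
# the file name "example.h5" is a placeholder):
#
#     if HDF5IO.can_read("example.h5"):
#         print("file is readable by HDF5IO")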

52 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

53 {'name': 'mode', 'type': str, 

54 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). ' 

55 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for ' 

56 'more details.'), 

57 'default': 'r'}, 

58 {'name': 'manager', 'type': (TypeMap, BuildManager), 

59 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None}, 

60 {'name': 'comm', 'type': 'Intracomm', 

61 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, 

62 {'name': 'file', 'type': [File, "S3File"], 'doc': 'a pre-existing h5py.File object', 'default': None}, 

63 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

64 {'name': 'herd_path', 'type': str, 

65 'doc': 'The path to the HERD', 'default': None},) 

66 def __init__(self, **kwargs): 

67 """Open an HDF5 file for IO. 

68 """ 

69 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) 

70 path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode', 

71 'comm', 'file', 'driver', 

72 'herd_path', 

73 kwargs) 

74 

75 self.__open_links = [] # keep track of other files opened from links in this file 

76 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close 

77 

78 path = self.__check_path_file_obj(path, file_obj) 

79 

80 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3': 

81 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode) 

82 raise UnsupportedOperation(msg) 

83 

84 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'): 

85 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode) 

86 raise UnsupportedOperation(msg) 

87 

88 if manager is None: 

89 manager = BuildManager(TypeMap(NamespaceCatalog())) 

90 elif isinstance(manager, TypeMap): 

91 manager = BuildManager(manager) 

92 self.__driver = driver 

93 self.__comm = comm 

94 self.__mode = mode 

95 self.__file = file_obj 

96 super().__init__(manager, source=path, herd_path=herd_path) 

97 # NOTE: source is not set if path is None and file_obj is passed 

98 self.__built = dict() # keep track of each builder for each dataset/group/link for each file 

99 self.__read = dict() # keep track of which files have been read. Key is the file object; value is the root builder 

100 self.__ref_queue = deque() # a queue of the references that need to be added 

101 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted 

102 ObjectMapper.no_convert(Dataset) 

103 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object 

104 

105 @property 

106 def comm(self): 

107 """The MPI communicator to use for parallel I/O.""" 

108 return self.__comm 

109 

110 @property 

111 def _file(self): 

112 return self.__file 

113 

114 @property 

115 def driver(self): 

116 return self.__driver 

117 

118 @classmethod 

119 def __check_path_file_obj(cls, path, file_obj): 

120 if isinstance(path, Path): 

121 path = str(path) 

122 

123 if path is None and file_obj is None: 

124 raise ValueError("Either the 'path' or 'file' argument must be supplied.") 

125 

126 if path is not None and file_obj is not None: # consistency check 

127 if os.path.abspath(file_obj.filename) != os.path.abspath(path): 

128 msg = ("You argued '%s' as this object's path, but supplied a file with filename: %s" 

129 % (path, file_obj.filename)) 

130 raise ValueError(msg) 

131 

132 return path 

133 

134 @classmethod 

135 def __resolve_file_obj(cls, path, file_obj, driver): 

136 path = cls.__check_path_file_obj(path, file_obj) 

137 

138 if file_obj is None: 

139 file_kwargs = dict() 

140 if driver is not None: 

141 file_kwargs.update(driver=driver) 

142 file_obj = File(path, 'r', **file_kwargs) 

143 return file_obj 

144 

145 @classmethod 

146 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap), 

147 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, 

148 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

149 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}, 

150 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

151 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

152 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and " 

153 "the included data types"), 

154 rtype=dict) 

155 def load_namespaces(cls, **kwargs): 

156 """Load cached namespaces from a file. 

157 

158 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the 

159 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

160 the given File object will be read from and not closed. 

161 

162 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

163 """ 

164 namespace_catalog, path, namespaces, file_obj, driver = popargs( 

165 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs) 

166 

167 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

168 if file_obj is None: # need to close the file object that we just opened 

169 with open_file_obj: 

170 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

171 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

172 
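# Illustrative usage of load_namespaces (editor's sketch, not part of the original
# source; "example.h5" and the empty NamespaceCatalog are placeholders):
#
#     from hdmf.spec import NamespaceCatalog
#     catalog = NamespaceCatalog()
#     loaded = HDF5IO.load_namespaces(catalog, path="example.h5")
#     # 'loaded' maps each loaded namespace name to its included namespaces and data types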

173 @classmethod 

174 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj): 

175 d = {} 

176 

177 if not cls.__check_specloc(file_obj): 

178 return d 

179 

180 namespace_versions = cls.__get_namespaces(file_obj) 

181 

182 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

183 if namespaces is None: 

184 namespaces = list(spec_group.keys()) 

185 

186 readers = dict() 

187 deps = dict() 

188 for ns in namespaces: 

189 latest_version = namespace_versions[ns] 

190 ns_group = spec_group[ns][latest_version] 

191 reader = H5SpecReader(ns_group) 

192 readers[ns] = reader 

193 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies) 

194 for spec_ns in reader.read_namespace(cls.__ns_spec_path): 

195 deps[ns] = list() 

196 for s in spec_ns['schema']: 

197 dep = s.get('namespace') 

198 if dep is not None: 

199 deps[ns].append(dep) 

200 

201 order = cls._order_deps(deps) 

202 for ns in order: 

203 reader = readers[ns] 

204 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader)) 

205 

206 return d 

207 

208 @classmethod 

209 def __check_specloc(cls, file_obj): 

210 if SPEC_LOC_ATTR not in file_obj.attrs: 

211 # this occurs in legacy files 

212 msg = "No cached namespaces found in %s" % file_obj.filename 

213 warnings.warn(msg) 

214 return False 

215 return True 

216 

217 @classmethod 

218 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

219 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

220 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

221 returns="dict mapping names to versions of the namespaces in the file", rtype=dict) 

222 def get_namespaces(cls, **kwargs): 

223 """Get the names and versions of the cached namespaces from a file. 

224 

225 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the 

226 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

227 the given File object will be read from and not closed. 

228 

229 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

230 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...). 

231 

232 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

233 """ 

234 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs) 

235 

236 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

237 if file_obj is None: # need to close the file object that we just opened 

238 with open_file_obj: 

239 return cls.__get_namespaces(open_file_obj) 

240 return cls.__get_namespaces(open_file_obj) 

241 
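# Illustrative usage of get_namespaces (editor's sketch; the file name is a placeholder):
#
#     versions = HDF5IO.get_namespaces(path="example.h5")
#     # returns a dict mapping cached namespace names to their latest cached version strings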

242 @classmethod 

243 def __get_namespaces(cls, file_obj): 

244 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file. 

245 

246 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

247 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``. 

248 """ 

249 used_version_names = dict() 

250 if not cls.__check_specloc(file_obj): 

251 return used_version_names 

252 

253 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

254 namespaces = list(spec_group.keys()) 

255 for ns in namespaces: 

256 ns_group = spec_group[ns] 

257 # NOTE: by default, objects within groups are iterated in alphanumeric order 

258 version_names = list(ns_group.keys()) 

259 if len(version_names) > 1: 

260 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned" 

261 # make sure that if there is another group representing a newer version, that is read instead 

262 if 'unversioned' in version_names: 

263 version_names.remove('unversioned') 

264 if len(version_names) > 1: 

265 # as of HDMF 1.6.1, extensions without a version are written under the group name "None" 

266 # make sure that if there is another group representing a newer version, that is read instead 

267 if 'None' in version_names: 

268 version_names.remove('None') 

269 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order 

270 

271 return used_version_names 

272 

273 @classmethod 

274 def _order_deps(cls, deps): 

275 """ 

276 Order namespaces according to dependency for loading into a NamespaceCatalog 

277 

278 Args: 

279 deps (dict): a dictionary that maps a namespace name to a list of name of 

280 the namespaces on which the namespace is directly dependent 

281 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []} 

282 Expected output: ['d', 'b', 'c', 'a'] 

283 """ 

284 order = list() 

285 keys = list(deps.keys()) 

286 deps = dict(deps) 

287 for k in keys: 

288 if k in deps: 

289 cls.__order_deps_aux(order, deps, k) 

290 return order 

291 
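# The docstring example above, written as a call (editor's illustration):
#
#     HDF5IO._order_deps({'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []})
#     # -> ['d', 'b', 'c', 'a']  (dependencies are ordered before their dependents)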

292 @classmethod 

293 def __order_deps_aux(cls, order, deps, key): 

294 """ 

295 A recursive helper function for _order_deps 

296 """ 

297 if key not in deps: 

298 return 

299 subdeps = deps.pop(key) 

300 for subk in subdeps: 

301 cls.__order_deps_aux(order, deps, subk) 

302 order.append(key) 

303 

304 @classmethod 

305 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'}, 

306 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'}, 

307 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True}, 

308 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference', 

309 'default': False}, 

310 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False} 

311 ) 

312 def copy_file(self, **kwargs): 

313 """ 

314 Convenience function to copy an HDF5 file while allowing external links to be resolved. 

315 

316 .. warning:: 

317 

318 As of HDMF 2.0, this method is no longer supported and may be removed in a future version. 

319 Please use the export method or h5py.File.copy method instead. 

320 

321 .. note:: 

322 

323 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode 

324 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is 

325 not opened already when calling this function. 

326 

327 """ 

328 

329 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of " 

330 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning) 

331 

332 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename', 

333 'dest_filename', 

334 'expand_external', 

335 'expand_refs', 

336 'expand_soft', 

337 kwargs) 

338 source_file = File(source_filename, 'r') 

339 dest_file = File(dest_filename, 'w') 

340 for objname in source_file["/"].keys(): 

341 source_file.copy(source=objname, 

342 dest=dest_file, 

343 name=objname, 

344 expand_external=expand_external, 

345 expand_refs=expand_refs, 

346 expand_soft=expand_soft, 

347 shallow=False, 

348 without_attrs=False, 

349 ) 

350 for objname in source_file['/'].attrs: 

351 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname] 

352 source_file.close() 

353 dest_file.close() 

354 
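# Illustrative usage of the deprecated copy_file (editor's sketch; file names are
# placeholders). As the docstring above notes, prefer export() or h5py.File.copy:
#
#     HDF5IO.copy_file(source_filename="old.h5", dest_filename="copy.h5",
#                      expand_external=True)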

355 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, 

356 {'name': 'cache_spec', 'type': bool, 

357 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache ' 

358 'specification to file. The appropriate specification will then need to be loaded prior to ' 

359 'reading the file.'), 

360 'default': True}, 

361 {'name': 'link_data', 'type': bool, 

362 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.', 

363 'default': True}, 

364 {'name': 'exhaust_dci', 'type': bool, 

365 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.', 

366 'default': True}) 

367 def write(self, **kwargs): 

368 """Write the container to an HDF5 file.""" 

369 if self.__mode == 'r': 

370 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. " 

371 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'") 

372 % (self.source, self.__mode)) 

373 

374 cache_spec = popargs('cache_spec', kwargs) 

375 super().write(**kwargs) 

376 if cache_spec: 

377 self.__cache_spec() 

378 
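# Illustrative usage of write (editor's sketch; 'manager' and 'container' are assumed
# to be an existing BuildManager and Container, and the file name is a placeholder):
#
#     with HDF5IO("example.h5", manager=manager, mode="w") as io:
#         io.write(container, cache_spec=True)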

379 def __cache_spec(self): 

380 ref = self.__file.attrs.get(SPEC_LOC_ATTR) 

381 spec_group = None 

382 if ref is not None: 

383 spec_group = self.__file[ref] 

384 else: 

385 path = 'specifications' # default location in the file for cached specifications 

386 spec_group = self.__file.require_group(path) 

387 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref 

388 ns_catalog = self.manager.namespace_catalog 

389 for ns_name in ns_catalog.namespaces: 

390 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) 

391 namespace = ns_catalog.get_namespace(ns_name) 

392 group_name = '%s/%s' % (ns_name, namespace.version) 

393 if group_name in spec_group: 

394 continue 

395 ns_group = spec_group.create_group(group_name) 

396 writer = H5SpecWriter(ns_group) 

397 ns_builder.export(self.__ns_spec_path, writer=writer) 

398 

399 _export_args = ( 

400 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'}, 

401 {'name': 'container', 'type': Container, 

402 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be ' 

403 'exported'), 

404 'default': None}, 

405 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`', 

406 'default': None}, 

407 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 

408 'default': True} 

409 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage 

410 # so it is not available on HDF5IO 

411 ) 

412 

413 @docval(*_export_args) 

414 def export(self, **kwargs): 

415 """Export data read from a file from any backend to HDF5. 

416 

417 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. 

418 """ 

419 if self.__mode != 'w': 

420 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." 

421 % (self.source, self.__mode)) 

422 

423 src_io = getargs('src_io', kwargs) 

424 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) 

425 if write_args is None: 

426 write_args = dict() 

427 

428 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True): 

429 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument " 

430 "link_data=True." % src_io.__class__.__name__) 

431 

432 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None 

433 ckwargs = kwargs.copy() 

434 ckwargs['write_args'] = write_args 

435 if not write_args.get('link_data', True): 

436 ckwargs['clear_cache'] = True 

437 super().export(**ckwargs) 

438 if cache_spec: 

439 # add any namespaces from the src_io that have not yet been loaded 

440 for namespace in src_io.manager.namespace_catalog.namespaces: 

441 if namespace not in self.manager.namespace_catalog.namespaces: 

442 self.manager.namespace_catalog.add_namespace( 

443 name=namespace, 

444 namespace=src_io.manager.namespace_catalog.get_namespace(namespace) 

445 ) 

446 self.__cache_spec() 

447 
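# Illustrative usage of export (editor's sketch; file names and 'manager' are
# placeholders). The destination IO must be opened in mode 'w', as enforced above:
#
#     with HDF5IO("data.h5", manager=manager, mode="r") as read_io:
#         with HDF5IO("export.h5", mode="w") as export_io:
#             export_io.export(src_io=read_io)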

448 @classmethod 

449 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'}, 

450 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O', 

451 'default': None}, 

452 *_export_args) # NOTE: src_io is required and is the second positional argument 

453 def export_io(self, **kwargs): 

454 """Export from one backend to HDF5 (class method). 

455 

456 Convenience function for :py:meth:`export` where you do not need to 

457 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given 

458 arguments. 

459 

460 Example usage: 

461 

462 .. code-block:: python 

463 

464 old_io = HDF5IO('old.h5', 'r') 

465 HDF5IO.export_io(path='new_copy.h5', src_io=old_io) 

466 

467 See :py:meth:`export` for more details. 

468 """ 

469 path, comm = popargs('path', 'comm', kwargs) 

470 

471 with HDF5IO(path=path, comm=comm, mode='w') as write_io: 

472 write_io.export(**kwargs) 

473 

474 def read(self, **kwargs): 

475 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x': 

476 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'." 

477 % (self.source, self.__mode)) 

478 try: 

479 return super().read(**kwargs) 

480 except UnsupportedOperation as e: 

481 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover 

482 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values." 

483 % (self.source, self.__mode)) 

484 
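# Illustrative usage of read (editor's sketch; 'manager' and the file name are placeholders):
#
#     with HDF5IO("example.h5", manager=manager, mode="r") as io:
#         container = io.read()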

485 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder') 

486 def read_builder(self): 

487 """ 

488 Read data and return the GroupBuilder representing it. 

489 

490 NOTE: On read, Builder.source is usually not set on the Builders. 

491 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export) 

492 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects. 

493 """ 

494 if not self.__file: 

495 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source) 

496 f_builder = self.__read.get(self.__file) 

497 # ignore cached specs when reading builder 

498 ignore = set() 

499 specloc = self.__file.attrs.get(SPEC_LOC_ATTR) 

500 if specloc is not None: 

501 ignore.add(self.__file[specloc].name) 

502 if f_builder is None: 

503 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore) 

504 self.__read[self.__file] = f_builder 

505 return f_builder 

506 

507 def __set_written(self, builder): 

508 """ 

509 Helper function used to set the written status for builders 

510 

511 :param builder: Builder object to be marked as written 

512 :type builder: Builder 

513 """ 

514 self._written_builders.set_written(builder) 

515 

516 def get_written(self, builder): 

517 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise. 

518 

519 :param builder: Builder object to get the written flag for 

520 :type builder: Builder 

521 

522 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise 

523 """ 

524 return self._written_builders.get_written(builder) 

525 

526 def __set_built(self, fpath, id, builder): 

527 """ 

528 Update self.__built to cache the given builder for the given file and id. 

529 

530 :param fpath: Path to the HDF5 file containing the object 

531 :type fpath: str 

532 :param id: ID of the HDF5 object in the path 

533 :type id: h5py GroupID object 

534 :param builder: The builder to be cached 

535 """ 

536 self.__built.setdefault(fpath, dict()).setdefault(id, builder) 

537 

538 def __get_built(self, fpath, id): 

539 """ 

540 Look up a builder for the given file and id in self.__built cache 

541 

542 :param fpath: Path to the HDF5 file containing the object 

543 :type fpath: str 

544 :param id: ID of the HDF5 object in the path 

545 :type id: h5py GroupID object 

546 

547 :return: Builder in the self.__built cache or None 

548 """ 

549 

550 fdict = self.__built.get(fpath) 

551 if fdict: 

552 return fdict.get(id) 

553 else: 

554 return None 

555 

556 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

557 'doc': 'the HDF5 object to get the corresponding Builder object for'}) 

558 def get_builder(self, **kwargs): 

559 """ 

560 Get the builder for the corresponding h5py Group or Dataset 

561 

562 :raises ValueError: When no builder has been constructed yet for the given h5py object 

563 """ 

564 h5obj = getargs('h5obj', kwargs) 

565 fpath = h5obj.file.filename 

566 builder = self.__get_built(fpath, h5obj.id) 

567 if builder is None: 

568 msg = '%s:%s has not been built' % (fpath, h5obj.name) 

569 raise ValueError(msg) 

570 return builder 

571 

572 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

573 'doc': 'the HDF5 object to get the corresponding Container/Data object for'}) 

574 def get_container(self, **kwargs): 

575 """ 

576 Get the container for the corresponding h5py Group or Dataset 

577 

578 :raises ValueError: When no builder has been constructed yet for the given h5py object 

579 """ 

580 h5obj = getargs('h5obj', kwargs) 

581 builder = self.get_builder(h5obj) 

582 container = self.manager.construct(builder) 

583 return container 

584 
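# Illustrative usage of get_builder/get_container after a read (editor's sketch;
# the group path "/my_group" is a placeholder):
#
#     container = io.read()
#     h5_group = io._file["/my_group"]
#     builder = io.get_builder(h5_group)
#     obj = io.get_container(h5_group)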

585 def __read_group(self, h5obj, name=None, ignore=set()): 

586 kwargs = { 

587 "attributes": self.__read_attrs(h5obj), 

588 "groups": dict(), 

589 "datasets": dict(), 

590 "links": dict() 

591 } 

592 

593 for key, val in kwargs['attributes'].items(): 

594 if isinstance(val, bytes): 

595 kwargs['attributes'][key] = val.decode('UTF-8') 

596 

597 if name is None: 

598 name = str(os.path.basename(h5obj.name)) 

599 for k in h5obj: 

600 sub_h5obj = h5obj.get(k) 

601 if sub_h5obj is not None: 

602 if sub_h5obj.name in ignore: 

603 continue 

604 link_type = h5obj.get(k, getlink=True) 

605 if isinstance(link_type, (SoftLink, ExternalLink)): 

606 # Reading links might be better suited to its own function 

607 # get path of link (the key used for tracking what's been built) 

608 target_path = link_type.path 

609 target_obj = sub_h5obj.file[target_path] 

610 builder_name = os.path.basename(target_path) 

611 # get builder if already read, else build it 

612 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id) 

613 if builder is None: 

614 # NOTE: all links must have absolute paths 

615 if isinstance(target_obj, Dataset): 

616 builder = self.__read_dataset(target_obj, builder_name) 

617 else: 

618 builder = self.__read_group(target_obj, builder_name, ignore=ignore) 

619 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder) 

620 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename)) 

621 link_builder.location = h5obj.name 

622 self.__set_written(link_builder) 

623 kwargs['links'][builder_name] = link_builder 

624 if isinstance(link_type, ExternalLink): 

625 self.__open_links.append(sub_h5obj) 

626 else: 

627 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id) 

628 obj_type = None 

629 read_method = None 

630 if isinstance(sub_h5obj, Dataset): 

631 read_method = self.__read_dataset 

632 obj_type = kwargs['datasets'] 

633 else: 

634 read_method = partial(self.__read_group, ignore=ignore) 

635 obj_type = kwargs['groups'] 

636 if builder is None: 

637 builder = read_method(sub_h5obj) 

638 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder) 

639 obj_type[builder.name] = builder 

640 else: 

641 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning) 

642 kwargs['datasets'][k] = None 

643 continue 

644 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

645 ret = GroupBuilder(name, **kwargs) 

646 ret.location = os.path.dirname(h5obj.name) 

647 self.__set_written(ret) 

648 return ret 

649 

650 def __read_dataset(self, h5obj, name=None): 

651 kwargs = { 

652 "attributes": self.__read_attrs(h5obj), 

653 "dtype": h5obj.dtype, 

654 "maxshape": h5obj.maxshape 

655 } 

656 for key, val in kwargs['attributes'].items(): 

657 if isinstance(val, bytes): 

658 kwargs['attributes'][key] = val.decode('UTF-8') 

659 

660 if name is None: 

661 name = str(os.path.basename(h5obj.name)) 

662 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

663 ndims = len(h5obj.shape) 

664 if ndims == 0: # read scalar 

665 scalar = h5obj[()] 

666 if isinstance(scalar, bytes): 

667 scalar = scalar.decode('UTF-8') 

668 

669 if isinstance(scalar, Reference): 

670 # TODO (AJTRITT): This should call __read_ref to support Group references 

671 target = h5obj.file[scalar] 

672 target_builder = self.__read_dataset(target) 

673 self.__set_built(target.file.filename, target.id, target_builder) 

674 if isinstance(scalar, RegionReference): 

675 d = RegionBuilder(scalar, target_builder) 

676 else: 

677 d = ReferenceBuilder(target_builder) 

678 kwargs['data'] = d 

679 kwargs['dtype'] = d.dtype 

680 else: 

681 kwargs["data"] = scalar 

682 else: 

683 d = None 

684 if h5obj.dtype.kind == 'O' and len(h5obj) > 0: 

685 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])] 

686 if isinstance(elem1, (str, bytes)): 

687 d = self._check_str_dtype(h5obj) 

688 elif isinstance(elem1, RegionReference): # read list of references 

689 d = BuilderH5RegionDataset(h5obj, self) 

690 kwargs['dtype'] = d.dtype 

691 elif isinstance(elem1, Reference): 

692 d = BuilderH5ReferenceDataset(h5obj, self) 

693 kwargs['dtype'] = d.dtype 

694 elif h5obj.dtype.kind == 'V': # table / compound data type 

695 cpd_dt = h5obj.dtype 

696 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))] 

697 d = BuilderH5TableDataset(h5obj, self, ref_cols) 

698 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype) 

699 else: 

700 d = h5obj 

701 kwargs["data"] = d 

702 ret = DatasetBuilder(name, **kwargs) 

703 ret.location = os.path.dirname(h5obj.name) 

704 self.__set_written(ret) 

705 return ret 

706 

707 def _check_str_dtype(self, h5obj): 

708 dtype = h5obj.dtype 

709 if dtype.kind == 'O': 

710 if dtype.metadata.get('vlen') == str and H5PY_3: 

711 return StrDataset(h5obj, None) 

712 return h5obj 

713 

714 @classmethod 

715 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype): 

716 ret = [] 

717 for name, dtype in zip(h5obj_dtype.fields, dset_dtype): 

718 ret.append({'name': name, 'dtype': dtype}) 

719 return ret 

720 

721 def __read_attrs(self, h5obj): 

722 ret = dict() 

723 for k, v in h5obj.attrs.items(): 

724 if k == SPEC_LOC_ATTR: # ignore cached spec 

725 continue 

726 if isinstance(v, RegionReference): 

727 raise ValueError("cannot read region reference attributes yet") 

728 elif isinstance(v, Reference): 

729 ret[k] = self.__read_ref(h5obj.file[v]) 

730 else: 

731 ret[k] = v 

732 return ret 

733 

734 def __read_ref(self, h5obj): 

735 ret = None 

736 ret = self.__get_built(h5obj.file.filename, h5obj.id) 

737 if ret is None: 

738 if isinstance(h5obj, Dataset): 

739 ret = self.__read_dataset(h5obj) 

740 elif isinstance(h5obj, Group): 

741 ret = self.__read_group(h5obj) 

742 else: 

743 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj)) 

744 self.__set_built(h5obj.file.filename, h5obj.id, ret) 

745 return ret 

746 

747 def open(self): 

748 if self.__file is None: 

749 open_flag = self.__mode 

750 kwargs = dict(rdcc_nbytes=RDCC_NBYTES) 

751 if self.comm: 

752 kwargs.update(driver='mpio', comm=self.comm) 

753 

754 if self.driver is not None: 

755 kwargs.update(driver=self.driver) 

756 

757 self.__file = File(self.source, open_flag, **kwargs) 

758 

759 def close(self, close_links=True): 

760 """Close this file and any files linked to from this file. 

761 

762 :param close_links: Whether to close all files linked to from this file. (default: True) 

763 :type close_links: bool 

764 """ 

765 if close_links: 

766 self.close_linked_files() 

767 try: 

768 if self.__file is not None: 

769 self.__file.close() 

770 except AttributeError: 

771 # Do nothing if self.__file does not exist. This 

772 # may happen if an error occurs before HDF5IO has been fully 

773 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises 

774 # an error before self.__file has been created 

775 self.__file = None 

776 
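# Illustrative usage of close (editor's sketch):
#
#     io.close()                    # closes this file and any files opened through links
#     io.close(close_links=False)   # closes this file but leaves linked files open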

777 def close_linked_files(self): 

778 """Close all opened, linked-to files. 

779 

780 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does 

781 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened, 

782 linked-to files. 

783 """ 

784 # Make sure all files opened through links are closed, even if errors occur 

785 try: 

786 for obj in self.__open_links: 

787 if obj: 

788 obj.file.close() 

789 except AttributeError: 

790 # Do nothing if self.__open_links does not exist. This 

791 # may happen if an error occurs before HDF5IO has been fully 

792 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises 

793 # an error before self.__open_links has been created. 

794 pass 

795 finally: 

796 self.__open_links = [] 

797 

798 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'}, 

799 {'name': 'link_data', 'type': bool, 

800 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

801 {'name': 'exhaust_dci', 'type': bool, 

802 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

803 'default': True}, 

804 {'name': 'export_source', 'type': str, 

805 'doc': 'The source of the builders when exporting', 'default': None}) 

806 def write_builder(self, **kwargs): 

807 f_builder = popargs('builder', kwargs) 

808 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

809 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" 

810 % (f_builder.name, self.source, kwargs)) 

811 for name, gbldr in f_builder.groups.items(): 

812 self.write_group(self.__file, gbldr, **kwargs) 

813 for name, dbldr in f_builder.datasets.items(): 

814 self.write_dataset(self.__file, dbldr, **kwargs) 

815 for name, lbldr in f_builder.links.items(): 

816 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source")) 

817 self.set_attributes(self.__file, f_builder.attributes) 

818 self.__add_refs() 

819 self.__dci_queue.exhaust_queue() 

820 self.__set_written(f_builder) 

821 self.logger.debug("Done writing %s '%s' to path '%s'" % 

822 (f_builder.__class__.__qualname__, f_builder.name, self.source)) 

823 

824 def __add_refs(self): 

825 ''' 

826 Add all references in the file. 

827 

828 References get queued to be added at the end of write. This is because 

829 the current traversal algorithm (i.e. iterating over GroupBuilder items) 

830 does not happen in a guaranteed order. We need to figure out what objects 

831 will be references, and then write them after we write everything else. 

832 ''' 

833 failed = set() 

834 while len(self.__ref_queue) > 0: 

835 call = self.__ref_queue.popleft() 

836 self.logger.debug("Adding reference with call id %d from queue (length %d)" 

837 % (id(call), len(self.__ref_queue))) 

838 try: 

839 call() 

840 except KeyError: 

841 if id(call) in failed: 

842 raise RuntimeError('Unable to resolve reference') 

843 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call)) 

844 failed.add(id(call)) 

845 self.__ref_queue.append(call) 

846 

847 @classmethod 

848 def get_type(cls, data): 

849 if isinstance(data, str): 

850 return H5_TEXT 

851 elif isinstance(data, bytes): 

852 return H5_BINARY 

853 elif isinstance(data, Container): 

854 return H5_REF 

855 elif not hasattr(data, '__len__'): 

856 return type(data) 

857 else: 

858 if len(data) == 0: 

859 if hasattr(data, 'dtype'): 

860 return data.dtype 

861 else: 

862 raise ValueError('cannot determine type for empty data') 

863 return cls.get_type(data[0]) 

864 

865 __dtypes = { 

866 "float": np.float32, 

867 "float32": np.float32, 

868 "double": np.float64, 

869 "float64": np.float64, 

870 "long": np.int64, 

871 "int64": np.int64, 

872 "int": np.int32, 

873 "int32": np.int32, 

874 "short": np.int16, 

875 "int16": np.int16, 

876 "int8": np.int8, 

877 "uint64": np.uint64, 

878 "uint": np.uint32, 

879 "uint32": np.uint32, 

880 "uint16": np.uint16, 

881 "uint8": np.uint8, 

882 "bool": np.bool_, 

883 "text": H5_TEXT, 

884 "utf": H5_TEXT, 

885 "utf8": H5_TEXT, 

886 "utf-8": H5_TEXT, 

887 "ascii": H5_BINARY, 

888 "bytes": H5_BINARY, 

889 "ref": H5_REF, 

890 "reference": H5_REF, 

891 "object": H5_REF, 

892 "region": H5_REGREF, 

893 "isodatetime": H5_TEXT, 

894 "datetime": H5_TEXT, 

895 } 

896 

897 @classmethod 

898 def __resolve_dtype__(cls, dtype, data): 

899 # TODO: These values exist, but I haven't solved them yet 

900 # binary 

901 # number 

902 dtype = cls.__resolve_dtype_helper__(dtype) 

903 if dtype is None: 

904 dtype = cls.get_type(data) 

905 return dtype 

906 

907 @classmethod 

908 def __resolve_dtype_helper__(cls, dtype): 

909 if dtype is None: 

910 return None 

911 elif isinstance(dtype, str): 

912 return cls.__dtypes.get(dtype) 

913 elif isinstance(dtype, dict): 

914 return cls.__dtypes.get(dtype['reftype']) 

915 elif isinstance(dtype, np.dtype): 

916 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types 

917 return dtype 

918 else: 

919 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) 

920 

921 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'}, 

922 {'name': 'attributes', 

923 'type': dict, 

924 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}) 

925 def set_attributes(self, **kwargs): 

926 obj, attributes = getargs('obj', 'attributes', kwargs) 

927 for key, value in attributes.items(): 

928 try: 

929 if isinstance(value, (set, list, tuple)): 

930 tmp = tuple(value) 

931 if len(tmp) > 0: 

932 if isinstance(tmp[0], (str, bytes)): 

933 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0]))) 

934 elif isinstance(tmp[0], Container): # a list of references 

935 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp)) 

936 else: 

937 value = np.array(value) 

938 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

939 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

940 obj.attrs[key] = value 

941 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference 

942 self.__queue_ref(self._make_attr_ref_filler(obj, key, value)) 

943 else: 

944 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

945 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

946 if isinstance(value, np.ndarray) and value.dtype.kind == 'U': 

947 value = np.array(value, dtype=H5_TEXT) 

948 obj.attrs[key] = value # a regular scalar 

949 except Exception as e: 

950 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name) 

951 raise RuntimeError(msg) from e 

952 

953 def _make_attr_ref_filler(self, obj, key, value): 

954 ''' 

955 Make the callable for setting references to attributes 

956 ''' 

957 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s" 

958 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

959 if isinstance(value, (tuple, list)): 

960 def _filler(): 

961 ret = list() 

962 for item in value: 

963 ret.append(self.__get_ref(item)) 

964 obj.attrs[key] = ret 

965 else: 

966 def _filler(): 

967 obj.attrs[key] = self.__get_ref(value) 

968 return _filler 

969 

970 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

971 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, 

972 {'name': 'link_data', 'type': bool, 

973 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

974 {'name': 'exhaust_dci', 'type': bool, 

975 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

976 'default': True}, 

977 {'name': 'export_source', 'type': str, 

978 'doc': 'The source of the builders when exporting', 'default': None}, 

979 returns='the Group that was created', rtype='Group') 

980 def write_group(self, **kwargs): 

981 parent, builder = popargs('parent', 'builder', kwargs) 

982 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

983 if self.get_written(builder): 

984 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name) 

985 group = parent[builder.name] 

986 else: 

987 self.logger.debug(" Creating group '%s'" % builder.name) 

988 group = parent.create_group(builder.name) 

989 # write all groups 

990 subgroups = builder.groups 

991 if subgroups: 

992 for subgroup_name, sub_builder in subgroups.items(): 

993 # do not create an empty group without attributes or links 

994 self.write_group(group, sub_builder, **kwargs) 

995 # write all datasets 

996 datasets = builder.datasets 

997 if datasets: 

998 for dset_name, sub_builder in datasets.items(): 

999 self.write_dataset(group, sub_builder, **kwargs) 

1000 # write all links 

1001 links = builder.links 

1002 if links: 

1003 for link_name, sub_builder in links.items(): 

1004 self.write_link(group, sub_builder, export_source=kwargs.get("export_source")) 

1005 attributes = builder.attributes 

1006 self.set_attributes(group, attributes) 

1007 self.__set_written(builder) 

1008 return group 

1009 

1010 def __get_path(self, builder): 

1011 """Get the path to the builder. 

1012 

1013 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored. 

1014 If builder.location is set then it is used as the path, otherwise the function 

1015 determines the path by constructing it iteratively from the parents of the 

1016 builder. 

1017 """ 

1018 if builder.location is not None: 

1019 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/") 

1020 else: 

1021 curr = builder 

1022 names = list() 

1023 while curr.parent is not None: 

1024 names.append(curr.name) 

1025 curr = curr.parent 

1026 delim = "/" 

1027 path = "%s%s" % (delim, delim.join(reversed(names))) 

1028 return path 

1029 

1030 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

1031 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'}, 

1032 {'name': 'export_source', 'type': str, 

1033 'doc': 'The source of the builders when exporting', 'default': None}, 

1034 returns='the Link that was created', rtype='Link') 

1035 def write_link(self, **kwargs): 

1036 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs) 

1037 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1038 if self.get_written(builder): 

1039 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name) 

1040 return None 

1041 name = builder.name 

1042 target_builder = builder.builder 

1043 path = self.__get_path(target_builder) 

1044 # source will indicate target_builder's location 

1045 if export_source is None: 

1046 write_source = builder.source 

1047 else: 

1048 write_source = export_source 

1049 

1050 parent_filename = os.path.abspath(parent.file.filename) 

1051 if target_builder.source in (write_source, parent_filename): 

1052 link_obj = SoftLink(path) 

1053 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1054 % (parent.name, name, link_obj.path)) 

1055 elif target_builder.source is not None: 

1056 target_filename = os.path.abspath(target_builder.source) 

1057 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename)) 

1058 if target_builder.location is not None: 

1059 path = target_builder.location + "/" + target_builder.name 

1060 link_obj = ExternalLink(relative_path, path) 

1061 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1062 % (parent.name, name, link_obj.filename, link_obj.path)) 

1063 else: 

1064 msg = 'cannot create external link to %s' % path 

1065 raise ValueError(msg) 

1066 parent[name] = link_obj 

1067 self.__set_written(builder) 

1068 return link_obj 

1069 

1070 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901 

1071 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, 

1072 {'name': 'link_data', 'type': bool, 

1073 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

1074 {'name': 'exhaust_dci', 'type': bool, 

1075 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

1076 'default': True}, 

1077 {'name': 'export_source', 'type': str, 

1078 'doc': 'The source of the builders when exporting', 'default': None}, 

1079 returns='the Dataset that was created', rtype=Dataset) 

1080 def write_dataset(self, **kwargs): # noqa: C901 

1081 """ Write a dataset to HDF5 

1082 

1083 The function uses other dataset-dependent write functions, e.g., 

1084 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data. 

1085 """ 

1086 parent, builder = popargs('parent', 'builder', kwargs) 

1087 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

1088 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1089 if self.get_written(builder): 

1090 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name) 

1091 return None 

1092 name = builder.name 

1093 data = builder.data 

1094 dataio = None 

1095 options = dict() # dict with additional options for creating the dataset (e.g., dtype, io_settings) 

1096 if isinstance(data, H5DataIO): 

1097 options['io_settings'] = data.io_settings 

1098 dataio = data 

1099 link_data = data.link_data 

1100 data = data.data 

1101 else: 

1102 options['io_settings'] = {} 

1103 attributes = builder.attributes 

1104 options['dtype'] = builder.dtype 

1105 dset = None 

1106 link = None 

1107 

1108 # The user provided an existing h5py dataset as input and asked to create a link to the dataset 

1109 if isinstance(data, Dataset): 

1110 data_filename = os.path.abspath(data.file.filename) 

1111 if link_data: 

1112 if export_source is None: # not exporting 

1113 parent_filename = os.path.abspath(parent.file.filename) 

1114 if data_filename != parent_filename: # create external link to data 

1115 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1116 link = ExternalLink(relative_path, data.name) 

1117 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1118 % (parent.name, name, link.filename, link.path)) 

1119 else: # create soft link to dataset already in this file -- possible if mode == 'r+' 

1120 link = SoftLink(data.name) 

1121 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1122 % (parent.name, name, link.path)) 

1123 parent[name] = link 

1124 else: # exporting 

1125 export_source = os.path.abspath(export_source) 

1126 parent_filename = os.path.abspath(parent.file.filename) 

1127 if data_filename != export_source: # dataset is in different file than export source 

1128 # possible if user adds a link to a dataset in a different file after reading export source 

1129 # to memory 

1130 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1131 link = ExternalLink(relative_path, data.name) 

1132 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1133 % (parent.name, name, link.filename, link.path)) 

1134 parent[name] = link 

1135 elif parent.name != data.parent.name: # dataset is in export source and has different path 

1136 # so create a soft link to the dataset in this file 

1137 # possible if user adds a link to a dataset in export source after reading to memory 

1138 # TODO check that there is/will be still a dataset at data.name -- if the dataset has 

1139 # been removed, then this link will be broken 

1140 link = SoftLink(data.name) 

1141 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1142 % (parent.name, name, link.path)) 

1143 parent[name] = link 

1144 else: # dataset is in export source and has same path as the builder, so copy the dataset 

1145 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1146 % (data.file.filename, data.name, parent.name, name)) 

1147 parent.copy(source=data, 

1148 dest=parent, 

1149 name=name, 

1150 expand_soft=False, 

1151 expand_external=False, 

1152 expand_refs=False, 

1153 without_attrs=True) 

1154 dset = parent[name] 

1155 else: 

1156 # TODO add option for case where there are multiple links to the same dataset within a file: 

1157 # instead of copying the dset N times, copy it once and create soft links to it within the file 

1158 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1159 % (data.file.filename, data.name, parent.name, name)) 

1160 parent.copy(source=data, 

1161 dest=parent, 

1162 name=name, 

1163 expand_soft=False, 

1164 expand_external=False, 

1165 expand_refs=False, 

1166 without_attrs=True) 

1167 dset = parent[name] 

1168 

1169 # Write a compound dataset, i.e, a dataset with compound data type 

1170 elif isinstance(options['dtype'], list): 

1171 # figure out which columns of the compound data type are references 

1172 refs = list() 

1173 for i, dts in enumerate(options['dtype']): 

1174 if self.__is_ref(dts): 

1175 refs.append(i) 

1176 # If one or more of the parts of the compound data type are references then we need to deal with those 

1177 if len(refs) > 0: 

1178 try: 

1179 _dtype = self.__resolve_dtype__(options['dtype'], data) 

1180 except Exception as exc: 

1181 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1182 raise Exception(msg) from exc 

1183 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1184 self.__set_written(builder) 

1185 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1186 "object references. attributes: %s" 

1187 % (name, list(attributes.keys()))) 

1188 

1189 @self.__queue_ref 

1190 def _filler(): 

1191 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1192 "containing attributes: %s" 

1193 % (name, list(attributes.keys()))) 

1194 ret = list() 

1195 for item in data: 

1196 new_item = list(item) 

1197 for i in refs: 

1198 new_item[i] = self.__get_ref(item[i]) 

1199 ret.append(tuple(new_item)) 

1200 dset = parent[name] 

1201 dset[:] = ret 

1202 self.set_attributes(dset, attributes) 

1203 

1204 return 

1205 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual 

1206 else: 

1207 dset = self.__list_fill__(parent, name, data, options) 

1208 # Write a dataset containing references, i.e., a region or object reference. 

1209 # NOTE: we can ignore options['io_settings'] for scalar data 

1210 elif self.__is_ref(options['dtype']): 

1211 _dtype = self.__dtypes.get(options['dtype']) 

1212 # Write a scalar data region reference dataset 

1213 if isinstance(data, RegionBuilder): 

1214 dset = parent.require_dataset(name, shape=(), dtype=_dtype) 

1215 self.__set_written(builder) 

1216 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a " 

1217 "region reference. attributes: %s" 

1218 % (name, list(attributes.keys()))) 

1219 

1220 @self.__queue_ref 

1221 def _filler(): 

1222 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' " 

1223 "containing attributes: %s" 

1224 % (name, list(attributes.keys()))) 

1225 ref = self.__get_ref(data.builder, data.region) 

1226 dset = parent[name] 

1227 dset[()] = ref 

1228 self.set_attributes(dset, attributes) 

1229 # Write a scalar object reference dataset 

1230 elif isinstance(data, ReferenceBuilder): 

1231 dset = parent.require_dataset(name, dtype=_dtype, shape=()) 

1232 self.__set_written(builder) 

1233 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an " 

1234 "object reference. attributes: %s" 

1235 % (name, list(attributes.keys()))) 

1236 

1237 @self.__queue_ref 

1238 def _filler(): 

1239 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' " 

1240 "containing attributes: %s" 

1241 % (name, list(attributes.keys()))) 

1242 ref = self.__get_ref(data.builder) 

1243 dset = parent[name] 

1244 dset[()] = ref 

1245 self.set_attributes(dset, attributes) 

1246 # Write an array dataset of references 

1247 else: 

1248 # Write an array of region references 

1249 if options['dtype'] == 'region': 

1250 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings']) 

1251 self.__set_written(builder) 

1252 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1253 "region references. attributes: %s" 

1254 % (name, list(attributes.keys()))) 

1255 

1256 @self.__queue_ref 

1257 def _filler(): 

1258 self.logger.debug("Resolving region references and setting attribute on dataset '%s' " 

1259 "containing attributes: %s" 

1260 % (name, list(attributes.keys()))) 

1261 refs = list() 

1262 for item in data: 

1263 refs.append(self.__get_ref(item.builder, item.region)) 

1264 dset = parent[name] 

1265 dset[()] = refs 

1266 self.set_attributes(dset, attributes) 

1267 # Write array of object references 

1268 else: 

1269 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1270 self.__set_written(builder) 

1271 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1272 "object references. attributes: %s" 

1273 % (name, list(attributes.keys()))) 

1274 

1275 @self.__queue_ref 

1276 def _filler(): 

1277 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1278 "containing attributes: %s" 

1279 % (name, list(attributes.keys()))) 

1280 refs = list() 

1281 for item in data: 

1282 refs.append(self.__get_ref(item)) 

1283 dset = parent[name] 

1284 dset[()] = refs 

1285 self.set_attributes(dset, attributes) 

1286 return 

1287 # write a "regular" dataset 

1288 else: 

1289 # Create an empty dataset 

1290 if data is None: 

1291 dset = self.__setup_empty_dset__(parent, name, options['io_settings']) 

1292 dataio.dataset = dset 

1293 # Write a scalar dataset containing a single string 

1294 elif isinstance(data, (str, bytes)): 

1295 dset = self.__scalar_fill__(parent, name, data, options) 

1296 # Iterative write of a data chunk iterator 

1297 elif isinstance(data, AbstractDataChunkIterator): 

1298 dset = self.__setup_chunked_dset__(parent, name, data, options) 

1299 self.__dci_queue.append(dataset=dset, data=data) 

1300 # Write a regular in-memory array (e.g., numpy array, list, etc.)

1301 elif hasattr(data, '__len__'): 

1302 dset = self.__list_fill__(parent, name, data, options) 

1303 # Write a regular scalar dataset 

1304 else: 

1305 dset = self.__scalar_fill__(parent, name, data, options) 

1306 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link 

1307 if link is None: 

1308 self.set_attributes(dset, attributes) 

1309 # Attributes on a linked dataset are not written here; validating them is currently a no-op

1310 elif len(attributes) > 0: 

1311 pass 

1312 self.__set_written(builder) 

1313 if exhaust_dci: 1313 ↛ exit (line 1313 didn't return from function 'write_dataset' because the condition on line 1313 was never false)

1314 self.__dci_queue.exhaust_queue() 

1315 
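Every reference branch in write_dataset above follows the same deferred-write pattern: the dataset is created up front, the builder is marked written, and a _filler closure is queued with __queue_ref so the actual h5py references are resolved only after all of their targets exist in the file. A minimal standalone sketch of that pattern, using plain h5py and a deque in place of the module's queue (names such as ref_queue and make_filler are illustrative, not part of HDMF):

import h5py
from collections import deque

# In-memory file so the sketch leaves nothing on disk.
with h5py.File("refs_sketch.h5", "w", driver="core", backing_store=False) as f:
    target = f.create_dataset("target", data=[1, 2, 3])
    ref_dtype = h5py.special_dtype(ref=h5py.Reference)   # same as H5_REF above
    refs = f.create_dataset("refs", shape=(1,), dtype=ref_dtype)

    ref_queue = deque()                                   # stands in for self.__ref_queue

    def make_filler(dset, tgt):
        def _filler():
            dset[0] = tgt.ref                             # resolve the reference only when called
        return _filler

    ref_queue.append(make_filler(refs, target))           # stands in for @self.__queue_ref

    while ref_queue:                                      # stands in for exhausting the queue
        ref_queue.popleft()()

    assert f[refs[0]].name == "/target"                   # the stored reference dereferences to /target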

1316 @classmethod 

1317 def __scalar_fill__(cls, parent, name, data, options=None): 

1318 dtype = None 

1319 io_settings = {} 

1320 if options is not None: 1320 ↛ 1323 (line 1320 didn't jump to line 1323 because the condition on line 1320 was never false)

1321 dtype = options.get('dtype') 

1322 io_settings = options.get('io_settings') 

1323 if not isinstance(dtype, type): 1323 ↛ 1329 (line 1323 didn't jump to line 1329 because the condition on line 1323 was never false)

1324 try: 

1325 dtype = cls.__resolve_dtype__(dtype, data) 

1326 except Exception as exc: 

1327 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1328 raise Exception(msg) from exc 

1329 try: 

1330 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings) 

1331 except Exception as exc: 

1332 msg = "Could not create scalar dataset %s in %s" % (name, parent.name) 

1333 raise Exception(msg) from exc 

1334 return dset 

1335 
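For example, when the data is a single Python string, __scalar_fill__ ends up creating a scalar (shape ()) dataset with h5py's variable-length string dtype (H5_TEXT at the top of this module). A small sketch of that outcome with plain h5py; the file and dataset names are made up:

import h5py

with h5py.File("scalar_sketch.h5", "w", driver="core", backing_store=False) as f:
    text_dtype = h5py.special_dtype(vlen=str)            # same as H5_TEXT above
    dset = f.create_dataset("description", data="a note", shape=None, dtype=text_dtype)
    assert dset.shape == ()                               # scalar dataset
    value = dset[()]                                      # bytes under h5py 3.x, str under 2.x
    assert value in (b"a note", "a note")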

1336 @classmethod 

1337 def __setup_chunked_dset__(cls, parent, name, data, options=None): 

1338 """ 

1339 Set up a dataset for writing one chunk at a time based on the given DataChunkIterator

1340 

1341 :param parent: The parent object to which the dataset should be added 

1342 :type parent: h5py.Group, h5py.File 

1343 :param name: The name of the dataset 

1344 :type name: str 

1345 :param data: The data to be written. 

1346 :type data: DataChunkIterator 

1347 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'.

1348 :type options: dict 

1349 

1350 """ 

1351 io_settings = {} 

1352 if options is not None: 

1353 if 'io_settings' in options: 1353 ↛ 1356 (line 1353 didn't jump to line 1356 because the condition on line 1353 was never false)

1354 io_settings = options.get('io_settings') 

1355 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. 

1356 if 'chunks' not in io_settings: 

1357 recommended_chunks = data.recommended_chunk_shape() 

1358 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks 

1359 # Define the shape of the data if not provided by the user 

1360 if 'shape' not in io_settings: 1360 ↛ 1363 (line 1360 didn't jump to line 1363 because the condition on line 1360 was never false)

1361 io_settings['shape'] = data.recommended_data_shape() 

1362 # Define the maxshape of the data if not provided by the user 

1363 if 'maxshape' not in io_settings: 

1364 io_settings['maxshape'] = data.maxshape 

1365 if 'dtype' not in io_settings: 1365 ↛ 1373 (line 1365 didn't jump to line 1373 because the condition on line 1365 was never false)

1366 if (options is not None) and ('dtype' in options): 

1367 io_settings['dtype'] = options['dtype'] 

1368 else: 

1369 io_settings['dtype'] = data.dtype 

1370 if isinstance(io_settings['dtype'], str): 1370 ↛ 1372 (line 1370 didn't jump to line 1372 because the condition on line 1370 was never true)

1371 # map to real dtype if we were given a string 

1372 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1373 try: 

1374 dset = parent.create_dataset(name, **io_settings) 

1375 except Exception as exc: 

1376 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1377 return dset 

1378 
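As a concrete illustration of how those io_settings are assembled, the sketch below builds the same dict by hand from a DataChunkIterator (the concrete iterator in hdmf.data_utils) and hands it to h5py; the file name, dataset name, and buffer size are arbitrary, and the recommended shapes depend on the iterator:

import h5py
import numpy as np
from hdmf.data_utils import DataChunkIterator

# Iterate over the rows of a 10x10 array, buffering two rows at a time.
data = DataChunkIterator(data=iter(np.arange(100.0).reshape(10, 10)), buffer_size=2)

io_settings = {
    "chunks": data.recommended_chunk_shape() or True,   # let h5py pick chunks if None
    "shape": data.recommended_data_shape(),             # initial shape; may grow later
    "maxshape": data.maxshape,                          # None entries mean "unlimited"
    "dtype": data.dtype,
}

with h5py.File("chunked_sketch.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("timeseries", **io_settings)
    print(dset.shape, dset.maxshape, dset.chunks)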

1379 @classmethod 

1380 def __setup_empty_dset__(cls, parent, name, io_settings): 

1381 """ 

1382 Set up an empty dataset so that its contents can be written after the dataset is created

1383 

1384 :param parent: The parent object to which the dataset should be added 

1385 :type parent: h5py.Group, h5py.File 

1386 :param name: The name of the dataset 

1387 :type name: str 

1388 :param io_settings: Dict of h5py dataset creation settings; the 'shape' and 

1389 'dtype' keys are required ('dtype' may be given as a string and will be 

1390 mapped to the corresponding h5py/numpy dtype) 

1391 :type io_settings: dict 

1392 

1393 """ 

1394 # Define the shape of the data if not provided by the user 

1395 if 'shape' not in io_settings: 

1396 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without shape") 

1397 if 'dtype' not in io_settings: 

1398 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without dtype") 

1399 if isinstance(io_settings['dtype'], str): 

1400 # map to real dtype if we were given a string 

1401 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1402 try: 

1403 dset = parent.create_dataset(name, **io_settings) 

1404 except Exception as exc: 

1405 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1406 return dset 

1407 
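The usual way to reach this branch from user code is to wrap no data at all in an H5DataIO, which is what the data is None branch of write_dataset above relies on. A hedged sketch, assuming H5DataIO accepts data=None together with explicit shape and dtype arguments (the values here are arbitrary):

from hdmf.backends.hdf5 import H5DataIO

# Declares an empty dataset; __setup_empty_dset__ later creates it from the
# resulting io_settings, and write_dataset stores the h5py.Dataset on
# dataio.dataset so data can be appended after the file has been written.
empty = H5DataIO(data=None, shape=(0, 3), dtype="float64")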

1408 @classmethod 

1409 def __chunked_iter_fill__(cls, parent, name, data, options=None): 

1410 """ 

1411 Write data to a dataset one chunk at a time based on the given DataChunkIterator

1412 

1413 :param parent: The parent object to which the dataset should be added 

1414 :type parent: h5py.Group, h5py.File 

1415 :param name: The name of the dataset 

1416 :type name: str 

1417 :param data: The data to be written. 

1418 :type data: DataChunkIterator 

1419 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'.

1420 :type options: dict 

1421 

1422 """ 

1423 dset = cls.__setup_chunked_dset__(parent, name, data, options=options) 

1424 read = True 

1425 while read: 

1426 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data) 

1427 return dset 

1428 
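The per-chunk write performed by HDF5IODataChunkIteratorQueue._write_chunk is not reproduced here, but the resize-and-assign loop it drives looks roughly like the following plain-h5py sketch (chunk values and names are made up):

import h5py
import numpy as np

chunks = (np.arange(4), np.arange(4, 8), np.arange(8, 10))   # stand-in chunk stream

with h5py.File("iter_fill_sketch.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("x", shape=(0,), maxshape=(None,), dtype="i8", chunks=True)
    offset = 0
    for chunk in chunks:
        dset.resize((offset + len(chunk),))                   # grow along the unlimited axis
        dset[offset:offset + len(chunk)] = chunk
        offset += len(chunk)
    assert dset.shape == (10,)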

1429 @classmethod 

1430 def __list_fill__(cls, parent, name, data, options=None): 

1431 # define the io settings and data type if necessary 

1432 io_settings = {} 

1433 dtype = None 

1434 if options is not None: 

1435 dtype = options.get('dtype') 

1436 io_settings = options.get('io_settings') 

1437 if not isinstance(dtype, type): 

1438 try: 

1439 dtype = cls.__resolve_dtype__(dtype, data) 

1440 except Exception as exc: 

1441 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1442 raise Exception(msg) from exc 

1443 # define the data shape 

1444 if 'shape' in io_settings: 1444 ↛ 1445 (line 1444 didn't jump to line 1445 because the condition on line 1444 was never true)

1445 data_shape = io_settings.pop('shape') 

1446 elif hasattr(data, 'shape'): 

1447 data_shape = data.shape 

1448 elif isinstance(dtype, np.dtype): 

1449 data_shape = (len(data),) 

1450 else: 

1451 data_shape = get_data_shape(data) 

1452 

1453 # Create the dataset 

1454 try: 

1455 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings) 

1456 except Exception as exc: 

1457 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \ 

1458 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc)) 

1459 raise Exception(msg) from exc 

1460 # Write the data 

1461 if len(data) > dset.shape[0]: 1461 ↛ 1462 (line 1461 didn't jump to line 1462 because the condition on line 1461 was never true)

1462 new_shape = list(dset.shape) 

1463 new_shape[0] = len(data) 

1464 dset.resize(new_shape) 

1465 try: 

1466 dset[:] = data 

1467 except Exception as exc: 

1468 raise Exception("Could not write data to dataset %s in %s" % (name, parent.name)) from exc 

1469 return dset 

1470 
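A compact sketch of the shape and dtype resolution __list_fill__ performs for an in-memory nested list, using the get_data_shape helper imported at the top of this module (numpy is used here only as a stand-in for __resolve_dtype__; names are made up):

import h5py
import numpy as np
from hdmf.utils import get_data_shape

data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
data_shape = get_data_shape(data)           # (3, 2) for this nested list
dtype = np.asarray(data).dtype              # stand-in for __resolve_dtype__

with h5py.File("list_fill_sketch.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("table", shape=data_shape, dtype=dtype)
    dset[:] = data                           # same assignment as above
    assert dset.shape == (3, 2)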

1471 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference', 

1472 'default': None}, 

1473 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object', 

1474 'default': None}, 

1475 returns='the reference', rtype=Reference) 

1476 def __get_ref(self, **kwargs): 

1477 container, region = getargs('container', 'region', kwargs) 

1478 if container is None: 1478 ↛ 1479 (line 1478 didn't jump to line 1479 because the condition on line 1478 was never true)

1479 return None 

1480 if isinstance(container, Builder): 

1481 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1482 if isinstance(container, LinkBuilder): 1482 ↛ 1483 (line 1482 didn't jump to line 1483 because the condition on line 1482 was never true)

1483 builder = container.target_builder 

1484 else: 

1485 builder = container 

1486 elif isinstance(container, ReferenceBuilder): 

1487 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name)) 

1488 builder = container.builder 

1489 else: 

1490 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1491 builder = self.manager.build(container) 

1492 path = self.__get_path(builder) 

1493 self.logger.debug("Getting reference at path '%s'" % path) 

1494 if isinstance(container, RegionBuilder): 1494 ↛ 1495 (line 1494 didn't jump to line 1495 because the condition on line 1494 was never true)

1495 region = container.region 

1496 if region is not None: 1496 ↛ 1497 (line 1496 didn't jump to line 1497 because the condition on line 1496 was never true)

1497 dset = self.__file[path] 

1498 if not isinstance(dset, Dataset): 

1499 raise ValueError('cannot create region reference without Dataset') 

1500 return self.__file[path].regionref[region] 

1501 else: 

1502 return self.__file[path].ref 

1503 
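The two kinds of h5py references produced above can be exercised directly: .ref is what __get_ref returns when no region is given, and .regionref[...] is what it returns for a region. A short sketch with an in-memory file and made-up names:

import h5py
import numpy as np

with h5py.File("get_ref_sketch.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("data", data=np.arange(10))
    obj_ref = dset.ref                      # object reference
    reg_ref = dset.regionref[2:5]           # region reference over a slice
    assert f[obj_ref].name == "/data"       # dereference the object reference
    assert np.array_equal(dset[reg_ref], np.arange(2, 5))   # read through the region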

1504 def __is_ref(self, dtype): 

1505 if isinstance(dtype, DtypeSpec): 

1506 return self.__is_ref(dtype.dtype) 

1507 if isinstance(dtype, RefSpec): 

1508 return True 

1509 if isinstance(dtype, dict): # may be dict from reading a compound dataset 

1510 return self.__is_ref(dtype['dtype']) 

1511 if isinstance(dtype, str): 

1512 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE 

1513 return False 

1514 
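For reference, these are the kinds of dtype descriptions the branches above accept, built with the RefSpec and DtypeSpec classes imported at the top of this module (the type and column names are hypothetical):

from hdmf.spec import RefSpec, DtypeSpec

ref_spec = RefSpec(target_type="MyContainer", reftype="object")              # RefSpec branch -> True
compound = DtypeSpec(name="col", doc="a reference column", dtype=ref_spec)   # DtypeSpec branch -> True
read_back = {"name": "col", "dtype": "object"}                               # dict branch -> True
plain = "float64"                                                            # string branch -> False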

1515 def __queue_ref(self, func): 

1516 '''Queue a function that fills a dataset with references. 

1517 

1518 The queued function is called after all other data has been written, 

1519 once the builders it references have been written and can be resolved. 

1520 Args: 

1521 func: a zero-argument callable that resolves the references it needs, 

1522 writes them into the target dataset, and sets the dataset's 

1523 attributes 

1524 

1525 ''' 

1526 # TODO: come up with more intelligent way of 

1527 # queueing reference resolution, based on reference 

1528 # dependency 

1529 self.__ref_queue.append(func) 

1530 

1531 def __rec_get_ref(self, ref_list): 

1532 ret = list() 

1533 for elem in ref_list: 

1534 if isinstance(elem, (list, tuple)): 

1535 ret.append(self.__rec_get_ref(elem)) 

1536 elif isinstance(elem, (Builder, Container)): 

1537 ret.append(self.__get_ref(elem)) 

1538 else: 

1539 ret.append(elem) 

1540 return ret 

1541 

1542 @property 

1543 def mode(self): 

1544 """ 

1545 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x"). 

1546 """ 

1547 return self.__mode 

1548 

1549 @classmethod 

1550 @docval(*get_docval(H5DataIO.__init__)) 

1551 def set_dataio(cls, **kwargs): 

1552 """ 

1553 Wrap the given Data object with an H5DataIO. 

1554 

1555 This method is provided merely for convenience. It is the equivalent 

1556 of the following: 

1557 

1558 .. code-block:: python 

1559 

1560 from hdmf.backends.hdf5 import H5DataIO 

1561 data = ... 

1562 data = H5DataIO(data) 

1563 """ 

1564 return H5DataIO(**kwargs)
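In practice the wrapped data is usually given HDF5 filter settings at the same time; a typical, illustrative call using standard H5DataIO arguments:

import numpy as np
from hdmf.backends.hdf5 import H5DataIO

wrapped = H5DataIO(data=np.arange(100000), compression="gzip", compression_opts=4, chunks=True)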