Coverage for src/hdmf/backends/hdf5/h5tools.py: 87%

905 statements  

coverage.py v7.2.5, created at 2023-07-21 22:12 +0000

1import logging 

2import os.path 

3import warnings 

4from collections import deque 

5from functools import partial 

6from pathlib import Path, PurePosixPath as pp 

7 

8import numpy as np 

9import h5py 

10from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype 

11 

12from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO, 

13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue) 

14from ..io import HDMFIO 

15from ..errors import UnsupportedOperation 

16from ..warnings import BrokenLinkWarning 

17from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder, 

18 ReferenceBuilder, TypeMap, ObjectMapper) 

19from ...container import Container 

20from ...data_utils import AbstractDataChunkIterator 

21from ...spec import RefSpec, DtypeSpec, NamespaceCatalog 

22from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset 

23from ..utils import NamespaceToBuilderHelper, WriteStatusTracker 

24 

25ROOT_NAME = 'root' 

26SPEC_LOC_ATTR = '.specloc' 

27H5_TEXT = special_dtype(vlen=str) 

28H5_BINARY = special_dtype(vlen=bytes) 

29H5_REF = special_dtype(ref=Reference) 

30H5_REGREF = special_dtype(ref=RegionReference) 

31 

32H5PY_3 = h5py.__version__.startswith('3') 

33 

34 

35class HDF5IO(HDMFIO): 

36 

37 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group 

38 

39 @staticmethod 

40 def can_read(path): 

41 """Determines whether a given path is readable by the HDF5IO class""" 

42 if not os.path.isfile(path): 

43 return False 

44 try: 

45 with h5py.File(path, "r"): 

46 return True 

47 except IOError: 

48 return False 

49 
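A minimal usage sketch for can_read (editorial example, not part of the source file; "data.h5" is a placeholder path):

    from hdmf.backends.hdf5 import HDF5IO

    if HDF5IO.can_read("data.h5"):
        print("data.h5 is an HDF5 file readable by HDF5IO")
    else:
        print("data.h5 is missing or not a readable HDF5 file")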

50 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

51 {'name': 'mode', 'type': str, 

52 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). ' 

53 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for ' 

54 'more details.'), 

55 'default': 'r'}, 

56 {'name': 'manager', 'type': (TypeMap, BuildManager), 

57 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None}, 

58 {'name': 'comm', 'type': 'Intracomm', 

59 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, 

60 {'name': 'file', 'type': [File, "S3File"], 'doc': 'a pre-existing h5py.File object', 'default': None}, 

61 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

62 {'name': 'external_resources_path', 'type': str, 

63 'doc': 'The path to the ExternalResources', 'default': None},) 

64 def __init__(self, **kwargs): 

65 """Open an HDF5 file for IO. 

66 """ 

67 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) 

68 path, manager, mode, comm, file_obj, driver, external_resources_path = popargs('path', 'manager', 'mode', 

69 'comm', 'file', 'driver', 

70 'external_resources_path', 

71 kwargs) 

72 

73 self.__open_links = [] # keep track of other files opened from links in this file 

74 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close 

75 

76 path = self.__check_path_file_obj(path, file_obj) 

77 

78 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3': 

79 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode) 

80 raise UnsupportedOperation(msg) 

81 

82 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'): 

83 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode) 

84 raise UnsupportedOperation(msg) 

85 

86 if manager is None: 

87 manager = BuildManager(TypeMap(NamespaceCatalog())) 

88 elif isinstance(manager, TypeMap): 88 ↛ 89: line 88 didn't jump to line 89, because the condition on line 88 was never true

89 manager = BuildManager(manager) 

90 self.__driver = driver 

91 self.__comm = comm 

92 self.__mode = mode 

93 self.__file = file_obj 

94 super().__init__(manager, source=path, external_resources_path=external_resources_path) 

95 # NOTE: source is not set if path is None and file_obj is passed 

96 self.__built = dict() # keep track of each builder for each dataset/group/link for each file 

97 self.__read = dict() # keep track of which files have been read; key is the filename, value is the builder 

98 self.__ref_queue = deque() # a queue of the references that need to be added 

99 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted 

100 ObjectMapper.no_convert(Dataset) 

101 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object 

102 
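A hedged open-read-close sketch built from the constructor arguments above (assumes an existing HDMF-written file "data.h5" and a BuildManager named manager whose TypeMap covers the file's data types):

    io = HDF5IO(path="data.h5", mode="r", manager=manager)
    try:
        container = io.read()   # returns the root Container
    finally:
        io.close()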

103 @property 

104 def comm(self): 

105 """The MPI communicator to use for parallel I/O.""" 

106 return self.__comm 

107 

108 @property 

109 def _file(self): 

110 return self.__file 

111 

112 @property 

113 def driver(self): 

114 return self.__driver 

115 

116 @classmethod 

117 def __check_path_file_obj(cls, path, file_obj): 

118 if isinstance(path, Path): 

119 path = str(path) 

120 

121 if path is None and file_obj is None: 

122 raise ValueError("Either the 'path' or 'file' argument must be supplied.") 

123 

124 if path is not None and file_obj is not None: # consistency check 

125 if os.path.abspath(file_obj.filename) != os.path.abspath(path): 

126 msg = ("You supplied '%s' as this object's path, but also supplied a file with filename: %s" 

127 % (path, file_obj.filename)) 

128 raise ValueError(msg) 

129 

130 return path 

131 

132 @classmethod 

133 def __resolve_file_obj(cls, path, file_obj, driver): 

134 path = cls.__check_path_file_obj(path, file_obj) 

135 

136 if file_obj is None: 

137 file_kwargs = dict() 

138 if driver is not None: 138 ↛ 139: line 138 didn't jump to line 139, because the condition on line 138 was never true

139 file_kwargs.update(driver=driver) 

140 file_obj = File(path, 'r', **file_kwargs) 

141 return file_obj 

142 

143 @classmethod 

144 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap), 

145 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, 

146 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

147 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}, 

148 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

149 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

150 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and " 

151 "the included data types"), 

152 rtype=dict) 

153 def load_namespaces(cls, **kwargs): 

154 """Load cached namespaces from a file. 

155 

156 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the 

157 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

158 the given File object will be read from and not closed. 

159 

160 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

161 """ 

162 namespace_catalog, path, namespaces, file_obj, driver = popargs( 

163 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs) 

164 

165 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

166 if file_obj is None: # need to close the file object that we just opened 

167 with open_file_obj: 

168 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

169 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

170 
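Typical use of load_namespaces before reading a file with cached specifications (a sketch; "data.h5" is a placeholder path):

    from hdmf.build import BuildManager, TypeMap
    from hdmf.spec import NamespaceCatalog

    type_map = TypeMap(NamespaceCatalog())
    HDF5IO.load_namespaces(type_map, path="data.h5")   # populate the TypeMap from cached specs
    manager = BuildManager(type_map)                   # then pass manager to HDF5IO(...)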

171 @classmethod 

172 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj): 

173 d = {} 

174 

175 if not cls.__check_specloc(file_obj): 

176 return d 

177 

178 namespace_versions = cls.__get_namespaces(file_obj) 

179 

180 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

181 if namespaces is None: 181 ↛ 184: line 181 didn't jump to line 184, because the condition on line 181 was never false

182 namespaces = list(spec_group.keys()) 

183 

184 readers = dict() 

185 deps = dict() 

186 for ns in namespaces: 

187 latest_version = namespace_versions[ns] 

188 ns_group = spec_group[ns][latest_version] 

189 reader = H5SpecReader(ns_group) 

190 readers[ns] = reader 

191 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies) 

192 for spec_ns in reader.read_namespace(cls.__ns_spec_path): 

193 deps[ns] = list() 

194 for s in spec_ns['schema']: 

195 dep = s.get('namespace') 

196 if dep is not None: 

197 deps[ns].append(dep) 

198 

199 order = cls._order_deps(deps) 

200 for ns in order: 

201 reader = readers[ns] 

202 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader)) 

203 

204 return d 

205 

206 @classmethod 

207 def __check_specloc(cls, file_obj): 

208 if SPEC_LOC_ATTR not in file_obj.attrs: 

209 # this occurs in legacy files 

210 msg = "No cached namespaces found in %s" % file_obj.filename 

211 warnings.warn(msg) 

212 return False 

213 return True 

214 

215 @classmethod 

216 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

217 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

218 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

219 returns="dict mapping names to versions of the namespaces in the file", rtype=dict) 

220 def get_namespaces(cls, **kwargs): 

221 """Get the names and versions of the cached namespaces from a file. 

222 

223 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the 

224 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

225 the given File object will be read from and not closed. 

226 

227 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

228 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...). 

229 

230 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

231 """ 

232 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs) 

233 

234 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

235 if file_obj is None: # need to close the file object that we just opened 

236 with open_file_obj: 

237 return cls.__get_namespaces(open_file_obj) 

238 return cls.__get_namespaces(open_file_obj) 

239 
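Sketch of inspecting cached namespace versions without setting up a full read (the returned mapping shown in the comment is illustrative):

    versions = HDF5IO.get_namespaces(path="data.h5")
    # e.g., {'hdmf-common': '1.5.1', 'my-lab-extension': '0.1.0'}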

240 @classmethod 

241 def __get_namespaces(cls, file_obj): 

242 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file. 

243 

244 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

245 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``. 

246 """ 

247 used_version_names = dict() 

248 if not cls.__check_specloc(file_obj): 

249 return used_version_names 

250 

251 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

252 namespaces = list(spec_group.keys()) 

253 for ns in namespaces: 

254 ns_group = spec_group[ns] 

255 # NOTE: by default, objects within groups are iterated in alphanumeric order 

256 version_names = list(ns_group.keys()) 

257 if len(version_names) > 1: 

258 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned" 

259 # make sure that if there is another group representing a newer version, that is read instead 

260 if 'unversioned' in version_names: 

261 version_names.remove('unversioned') 

262 if len(version_names) > 1: 

263 # as of HDMF 1.6.1, extensions without a version are written under the group name "None" 

264 # make sure that if there is another group representing a newer version, that is read instead 

265 if 'None' in version_names: 

266 version_names.remove('None') 

267 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order 

268 

269 return used_version_names 

270 

271 @classmethod 

272 def _order_deps(cls, deps): 

273 """ 

274 Order namespaces according to dependency for loading into a NamespaceCatalog 

275 

276 Args: 

277 deps (dict): a dictionary that maps a namespace name to a list of name of 

278 the namespaces on which the namespace is directly dependent 

279 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []} 

280 Expected output: ['d', 'b', 'c', 'a'] 

281 """ 

282 order = list() 

283 keys = list(deps.keys()) 

284 deps = dict(deps) 

285 for k in keys: 

286 if k in deps: 

287 cls.__order_deps_aux(order, deps, k) 

288 return order 

289 
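The docstring example above, run through _order_deps (illustrative only; _order_deps is an internal helper and dict insertion order is assumed):

    deps = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
    assert HDF5IO._order_deps(deps) == ['d', 'b', 'c', 'a']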

290 @classmethod 

291 def __order_deps_aux(cls, order, deps, key): 

292 """ 

293 A recursive helper function for _order_deps 

294 """ 

295 if key not in deps: 

296 return 

297 subdeps = deps.pop(key) 

298 for subk in subdeps: 

299 cls.__order_deps_aux(order, deps, subk) 

300 order.append(key) 

301 

302 @classmethod 

303 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'}, 

304 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'}, 

305 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True}, 

306 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference', 

307 'default': False}, 

308 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False} 

309 ) 

310 def copy_file(self, **kwargs): 

311 """ 

312 Convenience function to copy an HDF5 file while allowing external links to be resolved. 

313 

314 .. warning:: 

315 

316 As of HDMF 2.0, this method is no longer supported and may be removed in a future version. 

317 Please use the export method or h5py.File.copy method instead. 

318 

319 .. note:: 

320 

321 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode 

322 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is 

323 not opened already when calling this function. 

324 

325 """ 

326 

327 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of " 

328 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning) 

329 

330 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename', 

331 'dest_filename', 

332 'expand_external', 

333 'expand_refs', 

334 'expand_soft', 

335 kwargs) 

336 source_file = File(source_filename, 'r') 

337 dest_file = File(dest_filename, 'w') 

338 for objname in source_file["/"].keys(): 

339 source_file.copy(source=objname, 

340 dest=dest_file, 

341 name=objname, 

342 expand_external=expand_external, 

343 expand_refs=expand_refs, 

344 expand_soft=expand_soft, 

345 shallow=False, 

346 without_attrs=False, 

347 ) 

348 for objname in source_file['/'].attrs: 

349 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname] 

350 source_file.close() 

351 dest_file.close() 

352 
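Because copy_file is deprecated, a rough equivalent using the recommended export path might look like this (a sketch; assumes "old.h5" was written by HDMF and manager can resolve its types):

    with HDF5IO('old.h5', mode='r', manager=manager) as src_io:
        HDF5IO.export_io(path='copy.h5', src_io=src_io)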

353 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, 

354 {'name': 'cache_spec', 'type': bool, 

355 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache ' 

356 'specification to file. The appropriate specification will then need to be loaded prior to ' 

357 'reading the file.'), 

358 'default': True}, 

359 {'name': 'link_data', 'type': bool, 

360 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.', 

361 'default': True}, 

362 {'name': 'exhaust_dci', 'type': bool, 

363 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.', 

364 'default': True}) 

365 def write(self, **kwargs): 

366 """Write the container to an HDF5 file.""" 

367 if self.__mode == 'r': 

368 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. " 

369 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'") 

370 % (self.source, self.__mode)) 

371 

372 cache_spec = popargs('cache_spec', kwargs) 

373 super().write(**kwargs) 

374 if cache_spec: 

375 self.__cache_spec() 

376 
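Typical write usage (sketch; container is assumed to be an HDMF Container whose type is registered with manager):

    with HDF5IO("out.h5", mode="w", manager=manager) as io:
        io.write(container)          # cache_spec=True by default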

377 def __cache_spec(self): 

378 ref = self.__file.attrs.get(SPEC_LOC_ATTR) 

379 spec_group = None 

380 if ref is not None: 

381 spec_group = self.__file[ref] 

382 else: 

383 path = 'specifications' # do something to figure out where the specifications should go 

384 spec_group = self.__file.require_group(path) 

385 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref 

386 ns_catalog = self.manager.namespace_catalog 

387 for ns_name in ns_catalog.namespaces: 

388 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) 

389 namespace = ns_catalog.get_namespace(ns_name) 

390 group_name = '%s/%s' % (ns_name, namespace.version) 

391 if group_name in spec_group: 

392 continue 

393 ns_group = spec_group.create_group(group_name) 

394 writer = H5SpecWriter(ns_group) 

395 ns_builder.export(self.__ns_spec_path, writer=writer) 

396 

397 _export_args = ( 

398 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'}, 

399 {'name': 'container', 'type': Container, 

400 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be ' 

401 'exported'), 

402 'default': None}, 

403 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`', 

404 'default': None}, 

405 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 

406 'default': True} 

407 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage 

408 # so it is not available on HDF5IO 

409 ) 

410 

411 @docval(*_export_args) 

412 def export(self, **kwargs): 

413 """Export data read from a file from any backend to HDF5. 

414 

415 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. 

416 """ 

417 if self.__mode != 'w': 

418 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." 

419 % (self.source, self.__mode)) 

420 

421 src_io = getargs('src_io', kwargs) 

422 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) 

423 if write_args is None: 

424 write_args = dict() 

425 

426 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True): 

427 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument " 

428 "link_data=True." % src_io.__class__.__name__) 

429 

430 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None 

431 ckwargs = kwargs.copy() 

432 ckwargs['write_args'] = write_args 

433 if not write_args.get('link_data', True): 

434 ckwargs['clear_cache'] = True 

435 super().export(**ckwargs) 

436 if cache_spec: 

437 # add any namespaces from the src_io that have not yet been loaded 

438 for namespace in src_io.manager.namespace_catalog.namespaces: 

439 if namespace not in self.manager.namespace_catalog.namespaces: 439 ↛ 438: line 439 didn't jump to line 438, because the condition on line 439 was never false

440 self.manager.namespace_catalog.add_namespace( 

441 name=namespace, 

442 namespace=src_io.manager.namespace_catalog.get_namespace(namespace) 

443 ) 

444 self.__cache_spec() 

445 
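An export sketch that severs links by copying data into the new file (hedged; file names are placeholders and manager must match the source file):

    with HDF5IO("src.h5", mode="r", manager=manager) as src_io:
        with HDF5IO("exported.h5", mode="w") as dst_io:
            dst_io.export(src_io=src_io, write_args={"link_data": False})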

446 @classmethod 

447 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'}, 

448 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O', 

449 'default': None}, 

450 *_export_args) # NOTE: src_io is required and is the second positional argument 

451 def export_io(self, **kwargs): 

452 """Export from one backend to HDF5 (class method). 

453 

454 Convenience function for :py:meth:`export` where you do not need to 

455 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given 

456 arguments. 

457 

458 Example usage: 

459 

460 .. code-block:: python 

461 

462 old_io = HDF5IO('old.h5', 'r') 

463 HDF5IO.export_io(path='new_copy.h5', src_io=old_io) 

464 

465 See :py:meth:`export` for more details. 

466 """ 

467 path, comm = popargs('path', 'comm', kwargs) 

468 

469 with HDF5IO(path=path, comm=comm, mode='w') as write_io: 

470 write_io.export(**kwargs) 

471 

472 def read(self, **kwargs): 

473 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x': 

474 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'." 

475 % (self.source, self.__mode)) 

476 try: 

477 return super().read(**kwargs) 

478 except UnsupportedOperation as e: 

479 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover 

480 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values." 

481 % (self.source, self.__mode)) 

482 

483 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder') 

484 def read_builder(self): 

485 """ 

486 Read data and return the GroupBuilder representing it. 

487 

488 NOTE: On read, the Builder.source will usually not be set on the Builders. 

489 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export) 

490 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects. 

491 """ 

492 if not self.__file: 

493 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source) 

494 f_builder = self.__read.get(self.__file) 

495 # ignore cached specs when reading builder 

496 ignore = set() 

497 specloc = self.__file.attrs.get(SPEC_LOC_ATTR) 

498 if specloc is not None: 

499 ignore.add(self.__file[specloc].name) 

500 if f_builder is None: 

501 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore) 

502 self.__read[self.__file] = f_builder 

503 return f_builder 

504 
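A sketch of working with the returned builder tree directly (normally read() is used instead; io is an HDF5IO opened for reading):

    f_builder = io.read_builder()            # GroupBuilder for the file root
    print(list(f_builder.groups))            # names of top-level groups
    print(list(f_builder.datasets))          # names of top-level datasets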

505 def __set_written(self, builder): 

506 """ 

507 Helper function used to set the written status for builders 

508 

509 :param builder: Builder object to be marked as written 

510 :type builder: Builder 

511 """ 

512 self._written_builders.set_written(builder) 

513 

514 def get_written(self, builder): 

515 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise. 

516 

517 :param builder: Builder object to get the written flag for 

518 :type builder: Builder 

519 

520 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise 

521 """ 

522 return self._written_builders.get_written(builder) 

523 

524 def __set_built(self, fpath, id, builder): 

525 """ 

526 Update self.__built to cache the given builder for the given file and id. 

527 

528 :param fpath: Path to the HDF5 file containing the object 

529 :type fpath: str 

530 :param id: ID of the HDF5 object in the path 

531 :type id: h5py GroupID object 

532 :param builder: The builder to be cached 

533 """ 

534 self.__built.setdefault(fpath, dict()).setdefault(id, builder) 

535 

536 def __get_built(self, fpath, id): 

537 """ 

538 Look up a builder for the given file and id in self.__built cache 

539 

540 :param fpath: Path to the HDF5 file containing the object 

541 :type fpath: str 

542 :param id: ID of the HDF5 object in the path 

543 :type id: h5py GroupID object 

544 

545 :return: Builder in the self.__built cache or None 

546 """ 

547 

548 fdict = self.__built.get(fpath) 

549 if fdict: 

550 return fdict.get(id) 

551 else: 

552 return None 

553 

554 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

555 'doc': 'the HDF5 object to get the corresponding Builder object for'}) 

556 def get_builder(self, **kwargs): 

557 """ 

558 Get the builder for the corresponding h5py Group or Dataset 

559 

560 :raises ValueError: When no builder has been constructed yet for the given h5py object 

561 """ 

562 h5obj = getargs('h5obj', kwargs) 

563 fpath = h5obj.file.filename 

564 builder = self.__get_built(fpath, h5obj.id) 

565 if builder is None: 565 ↛ 566: line 565 didn't jump to line 566, because the condition on line 565 was never true

566 msg = '%s:%s has not been built' % (fpath, h5obj.name) 

567 raise ValueError(msg) 

568 return builder 

569 

570 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

571 'doc': 'the HDF5 object to get the corresponding Container/Data object for'}) 

572 def get_container(self, **kwargs): 

573 """ 

574 Get the container for the corresponding h5py Group or Dataset 

575 

576 :raises ValueError: When no builder has been constructed yet for the given h5py object 

577 """ 

578 h5obj = getargs('h5obj', kwargs) 

579 builder = self.get_builder(h5obj) 

580 container = self.manager.construct(builder) 

581 return container 

582 
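Sketch of mapping an h5py object back to its builder and Container after a read (the object path 'my_group/my_data' is hypothetical):

    h5_dataset = io._file['my_group/my_data']    # any h5py Group or Dataset from this file
    builder = io.get_builder(h5_dataset)
    container = io.get_container(h5_dataset)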

583 def __read_group(self, h5obj, name=None, ignore=set()): 

584 kwargs = { 

585 "attributes": self.__read_attrs(h5obj), 

586 "groups": dict(), 

587 "datasets": dict(), 

588 "links": dict() 

589 } 

590 

591 for key, val in kwargs['attributes'].items(): 

592 if isinstance(val, bytes): 592 ↛ 593: line 592 didn't jump to line 593, because the condition on line 592 was never true

593 kwargs['attributes'][key] = val.decode('UTF-8') 

594 

595 if name is None: 

596 name = str(os.path.basename(h5obj.name)) 

597 for k in h5obj: 

598 sub_h5obj = h5obj.get(k) 

599 if sub_h5obj is not None: 

600 if sub_h5obj.name in ignore: 

601 continue 

602 link_type = h5obj.get(k, getlink=True) 

603 if isinstance(link_type, (SoftLink, ExternalLink)): 

604 # Reading links might be better suited to its own function 

605 # get path of link (the key used for tracking what's been built) 

606 target_path = link_type.path 

607 target_obj = sub_h5obj.file[target_path] 

608 builder_name = os.path.basename(target_path) 

609 # get builder if already read, else build it 

610 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id) 

611 if builder is None: 

612 # NOTE: all links must have absolute paths 

613 if isinstance(target_obj, Dataset): 

614 builder = self.__read_dataset(target_obj, builder_name) 

615 else: 

616 builder = self.__read_group(target_obj, builder_name, ignore=ignore) 

617 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder) 

618 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename)) 

619 link_builder.location = h5obj.name 

620 self.__set_written(link_builder) 

621 kwargs['links'][builder_name] = link_builder 

622 if isinstance(link_type, ExternalLink): 

623 self.__open_links.append(sub_h5obj) 

624 else: 

625 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id) 

626 obj_type = None 

627 read_method = None 

628 if isinstance(sub_h5obj, Dataset): 

629 read_method = self.__read_dataset 

630 obj_type = kwargs['datasets'] 

631 else: 

632 read_method = partial(self.__read_group, ignore=ignore) 

633 obj_type = kwargs['groups'] 

634 if builder is None: 

635 builder = read_method(sub_h5obj) 

636 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder) 

637 obj_type[builder.name] = builder 

638 else: 

639 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning) 

640 kwargs['datasets'][k] = None 

641 continue 

642 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

643 ret = GroupBuilder(name, **kwargs) 

644 ret.location = os.path.dirname(h5obj.name) 

645 self.__set_written(ret) 

646 return ret 

647 

648 def __read_dataset(self, h5obj, name=None): 

649 kwargs = { 

650 "attributes": self.__read_attrs(h5obj), 

651 "dtype": h5obj.dtype, 

652 "maxshape": h5obj.maxshape 

653 } 

654 for key, val in kwargs['attributes'].items(): 

655 if isinstance(val, bytes): 655 ↛ 656: line 655 didn't jump to line 656, because the condition on line 655 was never true

656 kwargs['attributes'][key] = val.decode('UTF-8') 

657 

658 if name is None: 

659 name = str(os.path.basename(h5obj.name)) 

660 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

661 ndims = len(h5obj.shape) 

662 if ndims == 0: # read scalar 

663 scalar = h5obj[()] 

664 if isinstance(scalar, bytes): 664 ↛ 665: line 664 didn't jump to line 665, because the condition on line 664 was never true

665 scalar = scalar.decode('UTF-8') 

666 

667 if isinstance(scalar, Reference): 667 ↛ 669: line 667 didn't jump to line 669, because the condition on line 667 was never true

668 # TODO (AJTRITT): This should call __read_ref to support Group references 

669 target = h5obj.file[scalar] 

670 target_builder = self.__read_dataset(target) 

671 self.__set_built(target.file.filename, target.id, target_builder) 

672 if isinstance(scalar, RegionReference): 

673 d = RegionBuilder(scalar, target_builder) 

674 else: 

675 d = ReferenceBuilder(target_builder) 

676 kwargs['data'] = d 

677 kwargs['dtype'] = d.dtype 

678 else: 

679 kwargs["data"] = scalar 

680 else: 

681 d = None 

682 if h5obj.dtype.kind == 'O' and len(h5obj) > 0: 

683 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])] 

684 if isinstance(elem1, (str, bytes)): 

685 d = self._check_str_dtype(h5obj) 

686 elif isinstance(elem1, RegionReference): # read list of references 686 ↛ 687: line 686 didn't jump to line 687, because the condition on line 686 was never true

687 d = BuilderH5RegionDataset(h5obj, self) 

688 kwargs['dtype'] = d.dtype 

689 elif isinstance(elem1, Reference): 689 ↛ 699: line 689 didn't jump to line 699, because the condition on line 689 was never false

690 d = BuilderH5ReferenceDataset(h5obj, self) 

691 kwargs['dtype'] = d.dtype 

692 elif h5obj.dtype.kind == 'V': # table / compound data type 

693 cpd_dt = h5obj.dtype 

694 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))] 

695 d = BuilderH5TableDataset(h5obj, self, ref_cols) 

696 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype) 

697 else: 

698 d = h5obj 

699 kwargs["data"] = d 

700 ret = DatasetBuilder(name, **kwargs) 

701 ret.location = os.path.dirname(h5obj.name) 

702 self.__set_written(ret) 

703 return ret 

704 

705 def _check_str_dtype(self, h5obj): 

706 dtype = h5obj.dtype 

707 if dtype.kind == 'O': 707 ↛ 710: line 707 didn't jump to line 710, because the condition on line 707 was never false

708 if dtype.metadata.get('vlen') == str and H5PY_3: 708 ↛ 710: line 708 didn't jump to line 710, because the condition on line 708 was never false

709 return StrDataset(h5obj, None) 

710 return h5obj 

711 

712 @classmethod 

713 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype): 

714 ret = [] 

715 for name, dtype in zip(h5obj_dtype.fields, dset_dtype): 

716 ret.append({'name': name, 'dtype': dtype}) 

717 return ret 

718 

719 def __read_attrs(self, h5obj): 

720 ret = dict() 

721 for k, v in h5obj.attrs.items(): 

722 if k == SPEC_LOC_ATTR: # ignore cached spec 

723 continue 

724 if isinstance(v, RegionReference): 724 ↛ 725: line 724 didn't jump to line 725, because the condition on line 724 was never true

725 raise ValueError("cannot read region reference attributes yet") 

726 elif isinstance(v, Reference): 

727 ret[k] = self.__read_ref(h5obj.file[v]) 

728 else: 

729 ret[k] = v 

730 return ret 

731 

732 def __read_ref(self, h5obj): 

733 ret = None 

734 ret = self.__get_built(h5obj.file.filename, h5obj.id) 

735 if ret is None: 

736 if isinstance(h5obj, Dataset): 

737 ret = self.__read_dataset(h5obj) 

738 elif isinstance(h5obj, Group): 738 ↛ 741: line 738 didn't jump to line 741, because the condition on line 738 was never false

739 ret = self.__read_group(h5obj) 

740 else: 

741 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj)) 

742 self.__set_built(h5obj.file.filename, h5obj.id, ret) 

743 return ret 

744 

745 def open(self): 

746 if self.__file is None: 

747 open_flag = self.__mode 

748 kwargs = dict() 

749 if self.comm: 749 ↛ 750: line 749 didn't jump to line 750, because the condition on line 749 was never true

750 kwargs.update(driver='mpio', comm=self.comm) 

751 

752 if self.driver is not None: 752 ↛ 753: line 752 didn't jump to line 753, because the condition on line 752 was never true

753 kwargs.update(driver=self.driver) 

754 

755 self.__file = File(self.source, open_flag, **kwargs) 

756 

757 def close(self, close_links=True): 

758 """Close this file and any files linked to from this file. 

759 

760 :param close_links: Whether to close all files linked to from this file. (default: True) 

761 :type close_links: bool 

762 """ 

763 if close_links: 

764 self.close_linked_files() 

765 try: 

766 if self.__file is not None: 

767 self.__file.close() 

768 except AttributeError: 

769 # Do not do anything in case that self._file does not exist. This 

770 # may happen in case that an error occurs before HDF5IO has been fully 

771 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises 

772 # an error before self.__file has been created 

773 self.__file = None 

774 

775 def close_linked_files(self): 

776 """Close all opened, linked-to files. 

777 

778 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does 

779 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened, 

780 linked-to files. 

781 """ 

782 # Make sure the list of tracked open links is always reset, even if an error occurs while closing them 

783 try: 

784 for obj in self.__open_links: 

785 if obj: 

786 obj.file.close() 

787 except AttributeError: 

788 # Do not do anything in case that self.__open_links does not exist. This 

789 # may happen in case that an error occurs before HDF5IO has been fully 

790 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises 

791 # an error before self.__open_links has been created. 

792 pass 

793 finally: 

794 self.__open_links = [] 

795 
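Explicit cleanup is only needed when the IO object is not used as a context manager; a sketch:

    io = HDF5IO("data.h5", mode="r", manager=manager)
    try:
        container = io.read()
    finally:
        io.close(close_links=True)   # also closes files opened through external links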

796 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'}, 

797 {'name': 'link_data', 'type': bool, 

798 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

799 {'name': 'exhaust_dci', 'type': bool, 

800 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

801 'default': True}, 

802 {'name': 'export_source', 'type': str, 

803 'doc': 'The source of the builders when exporting', 'default': None}) 

804 def write_builder(self, **kwargs): 

805 f_builder = popargs('builder', kwargs) 

806 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

807 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" 

808 % (f_builder.name, self.source, kwargs)) 

809 for name, gbldr in f_builder.groups.items(): 

810 self.write_group(self.__file, gbldr, **kwargs) 

811 for name, dbldr in f_builder.datasets.items(): 

812 self.write_dataset(self.__file, dbldr, **kwargs) 

813 for name, lbldr in f_builder.links.items(): 

814 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source")) 

815 self.set_attributes(self.__file, f_builder.attributes) 

816 self.__add_refs() 

817 self.__dci_queue.exhaust_queue() 

818 self.__set_written(f_builder) 

819 self.logger.debug("Done writing %s '%s' to path '%s'" % 

820 (f_builder.__class__.__qualname__, f_builder.name, self.source)) 

821 

822 def __add_refs(self): 

823 ''' 

824 Add all references in the file. 

825 

826 References get queued to be added at the end of write. This is because 

827 the current traversal algorithm (i.e. iterating over GroupBuilder items) 

828 does not happen in a guaranteed order. We need to figure out what objects 

829 will be references, and then write them after we write everything else. 

830 ''' 

831 failed = set() 

832 while len(self.__ref_queue) > 0: 

833 call = self.__ref_queue.popleft() 

834 self.logger.debug("Adding reference with call id %d from queue (length %d)" 

835 % (id(call), len(self.__ref_queue))) 

836 try: 

837 call() 

838 except KeyError: 

839 if id(call) in failed: 

840 raise RuntimeError('Unable to resolve reference') 

841 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call)) 

842 failed.add(id(call)) 

843 self.__ref_queue.append(call) 

844 

845 @classmethod 

846 def get_type(cls, data): 

847 if isinstance(data, str): 

848 return H5_TEXT 

849 elif isinstance(data, bytes): 849 ↛ 850: line 849 didn't jump to line 850, because the condition on line 849 was never true

850 return H5_BINARY 

851 elif isinstance(data, Container): 851 ↛ 852: line 851 didn't jump to line 852, because the condition on line 851 was never true

852 return H5_REF 

853 elif not hasattr(data, '__len__'): 

854 return type(data) 

855 else: 

856 if len(data) == 0: 

857 if hasattr(data, 'dtype'): 857 ↛ 858: line 857 didn't jump to line 858, because the condition on line 857 was never true

858 return data.dtype 

859 else: 

860 raise ValueError('cannot determine type for empty data') 

861 return cls.get_type(data[0]) 

862 
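Illustration of get_type resolving write dtypes from Python values (a sketch; H5_TEXT is the variable-length string dtype defined at the top of this module):

    HDF5IO.get_type("a string")      # -> H5_TEXT
    HDF5IO.get_type([1, 2, 3])       # -> int (type of the first element)
    HDF5IO.get_type([[1.0], [2.0]])  # -> float (recurses into nested sequences)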

863 __dtypes = { 

864 "float": np.float32, 

865 "float32": np.float32, 

866 "double": np.float64, 

867 "float64": np.float64, 

868 "long": np.int64, 

869 "int64": np.int64, 

870 "int": np.int32, 

871 "int32": np.int32, 

872 "short": np.int16, 

873 "int16": np.int16, 

874 "int8": np.int8, 

875 "uint64": np.uint64, 

876 "uint": np.uint32, 

877 "uint32": np.uint32, 

878 "uint16": np.uint16, 

879 "uint8": np.uint8, 

880 "bool": np.bool_, 

881 "text": H5_TEXT, 

882 "utf": H5_TEXT, 

883 "utf8": H5_TEXT, 

884 "utf-8": H5_TEXT, 

885 "ascii": H5_BINARY, 

886 "bytes": H5_BINARY, 

887 "ref": H5_REF, 

888 "reference": H5_REF, 

889 "object": H5_REF, 

890 "region": H5_REGREF, 

891 "isodatetime": H5_TEXT, 

892 "datetime": H5_TEXT, 

893 } 

894 

895 @classmethod 

896 def __resolve_dtype__(cls, dtype, data): 

897 # TODO: the following spec dtypes exist but are not yet resolved here: 

898 # binary 

899 # number 

900 dtype = cls.__resolve_dtype_helper__(dtype) 

901 if dtype is None: 

902 dtype = cls.get_type(data) 

903 return dtype 

904 

905 @classmethod 

906 def __resolve_dtype_helper__(cls, dtype): 

907 if dtype is None: 

908 return None 

909 elif isinstance(dtype, str): 

910 return cls.__dtypes.get(dtype) 

911 elif isinstance(dtype, dict): 

912 return cls.__dtypes.get(dtype['reftype']) 

913 elif isinstance(dtype, np.dtype): 

914 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types 

915 return dtype 

916 else: 

917 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) 

918 

919 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'}, 

920 {'name': 'attributes', 

921 'type': dict, 

922 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}) 

923 def set_attributes(self, **kwargs): 

924 obj, attributes = getargs('obj', 'attributes', kwargs) 

925 for key, value in attributes.items(): 

926 try: 

927 if isinstance(value, (set, list, tuple)): 

928 tmp = tuple(value) 

929 if len(tmp) > 0: 

930 if isinstance(tmp[0], (str, bytes)): 930 ↛ 932: line 930 didn't jump to line 932, because the condition on line 930 was never false

931 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0]))) 

932 elif isinstance(tmp[0], Container): # a list of references 

933 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp)) 

934 else: 

935 value = np.array(value) 

936 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

937 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

938 obj.attrs[key] = value 

939 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference 

940 self.__queue_ref(self._make_attr_ref_filler(obj, key, value)) 

941 else: 

942 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

943 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

944 if isinstance(value, np.ndarray) and value.dtype.kind == 'U': 944 ↛ 945: line 944 didn't jump to line 945, because the condition on line 944 was never true

945 value = np.array(value, dtype=H5_TEXT) 

946 obj.attrs[key] = value # a regular scalar 

947 except Exception as e: 

948 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name) 

949 raise RuntimeError(msg) from e 

950 

951 def _make_attr_ref_filler(self, obj, key, value): 

952 ''' 

953 Make the callable for setting references to attributes 

954 ''' 

955 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s" 

956 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

957 if isinstance(value, (tuple, list)): 957 ↛ 958: line 957 didn't jump to line 958, because the condition on line 957 was never true

958 def _filler(): 

959 ret = list() 

960 for item in value: 

961 ret.append(self.__get_ref(item)) 

962 obj.attrs[key] = ret 

963 else: 

964 def _filler(): 

965 obj.attrs[key] = self.__get_ref(value) 

966 return _filler 

967 

968 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

969 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, 

970 {'name': 'link_data', 'type': bool, 

971 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

972 {'name': 'exhaust_dci', 'type': bool, 

973 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

974 'default': True}, 

975 {'name': 'export_source', 'type': str, 

976 'doc': 'The source of the builders when exporting', 'default': None}, 

977 returns='the Group that was created', rtype='Group') 

978 def write_group(self, **kwargs): 

979 parent, builder = popargs('parent', 'builder', kwargs) 

980 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

981 if self.get_written(builder): 

982 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name) 

983 group = parent[builder.name] 

984 else: 

985 self.logger.debug(" Creating group '%s'" % builder.name) 

986 group = parent.create_group(builder.name) 

987 # write all groups 

988 subgroups = builder.groups 

989 if subgroups: 

990 for subgroup_name, sub_builder in subgroups.items(): 

991 # do not create an empty group without attributes or links 

992 self.write_group(group, sub_builder, **kwargs) 

993 # write all datasets 

994 datasets = builder.datasets 

995 if datasets: 

996 for dset_name, sub_builder in datasets.items(): 

997 self.write_dataset(group, sub_builder, **kwargs) 

998 # write all links 

999 links = builder.links 

1000 if links: 

1001 for link_name, sub_builder in links.items(): 

1002 self.write_link(group, sub_builder, export_source=kwargs.get("export_source")) 

1003 attributes = builder.attributes 

1004 self.set_attributes(group, attributes) 

1005 self.__set_written(builder) 

1006 return group 

1007 

1008 def __get_path(self, builder): 

1009 """Get the path to the builder. 

1010 

1011 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored. 

1012 If builder.location is set then it is used as the path, otherwise the function 

1013 determines the path by constructing it iteratively from the parents of the 

1014 builder. 

1015 """ 

1016 if builder.location is not None: 

1017 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/") 

1018 else: 

1019 curr = builder 

1020 names = list() 

1021 while curr.parent is not None: 

1022 names.append(curr.name) 

1023 curr = curr.parent 

1024 delim = "/" 

1025 path = "%s%s" % (delim, delim.join(reversed(names))) 

1026 return path 

1027 

1028 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

1029 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'}, 

1030 {'name': 'export_source', 'type': str, 

1031 'doc': 'The source of the builders when exporting', 'default': None}, 

1032 returns='the Link that was created', rtype='Link') 

1033 def write_link(self, **kwargs): 

1034 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs) 

1035 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1036 if self.get_written(builder): 1036 ↛ 1037: line 1036 didn't jump to line 1037, because the condition on line 1036 was never true

1037 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name) 

1038 return None 

1039 name = builder.name 

1040 target_builder = builder.builder 

1041 path = self.__get_path(target_builder) 

1042 # source will indicate target_builder's location 

1043 if export_source is None: 

1044 write_source = builder.source 

1045 else: 

1046 write_source = export_source 

1047 

1048 parent_filename = os.path.abspath(parent.file.filename) 

1049 if target_builder.source in (write_source, parent_filename): 

1050 link_obj = SoftLink(path) 

1051 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1052 % (parent.name, name, link_obj.path)) 

1053 elif target_builder.source is not None: 1053 ↛ 1062: line 1053 didn't jump to line 1062, because the condition on line 1053 was never false

1054 target_filename = os.path.abspath(target_builder.source) 

1055 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename)) 

1056 if target_builder.location is not None: 

1057 path = target_builder.location + "/" + target_builder.name 

1058 link_obj = ExternalLink(relative_path, path) 

1059 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1060 % (parent.name, name, link_obj.filename, link_obj.path)) 

1061 else: 

1062 msg = 'cannot create external link to %s' % path 

1063 raise ValueError(msg) 

1064 parent[name] = link_obj 

1065 self.__set_written(builder) 

1066 return link_obj 

1067 

1068 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901 

1069 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, 

1070 {'name': 'link_data', 'type': bool, 

1071 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

1072 {'name': 'exhaust_dci', 'type': bool, 

1073 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

1074 'default': True}, 

1075 {'name': 'export_source', 'type': str, 

1076 'doc': 'The source of the builders when exporting', 'default': None}, 

1077 returns='the Dataset that was created', rtype=Dataset) 

1078 def write_dataset(self, **kwargs): # noqa: C901 

1079 """ Write a dataset to HDF5 

1080 

1081 The function uses other dataset-dependent write functions, e.g., 

1082 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data. 

1083 """ 

1084 parent, builder = popargs('parent', 'builder', kwargs) 

1085 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

1086 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1087 if self.get_written(builder): 

1088 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name) 

1089 return None 

1090 name = builder.name 

1091 data = builder.data 

1092 dataio = None 

1093 options = dict() # dict with additional options for dataset creation, e.g., 'dtype' and 'io_settings' 

1094 if isinstance(data, H5DataIO): 

1095 options['io_settings'] = data.io_settings 

1096 dataio = data 

1097 link_data = data.link_data 

1098 data = data.data 

1099 else: 

1100 options['io_settings'] = {} 

1101 attributes = builder.attributes 

1102 options['dtype'] = builder.dtype 

1103 dset = None 

1104 link = None 

1105 

1106 # The user provided an existing h5py dataset as input and asked to create a link to the dataset 

1107 if isinstance(data, Dataset): 

1108 data_filename = os.path.abspath(data.file.filename) 

1109 if link_data: 

1110 if export_source is None: # not exporting 

1111 parent_filename = os.path.abspath(parent.file.filename) 

1112 if data_filename != parent_filename: # create external link to data 

1113 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1114 link = ExternalLink(relative_path, data.name) 

1115 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1116 % (parent.name, name, link.filename, link.path)) 

1117 else: # create soft link to dataset already in this file -- possible if mode == 'r+' 

1118 link = SoftLink(data.name) 

1119 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1120 % (parent.name, name, link.path)) 

1121 parent[name] = link 

1122 else: # exporting 

1123 export_source = os.path.abspath(export_source) 

1124 parent_filename = os.path.abspath(parent.file.filename) 

1125 if data_filename != export_source: # dataset is in different file than export source 

1126 # possible if user adds a link to a dataset in a different file after reading export source 

1127 # to memory 

1128 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1129 link = ExternalLink(relative_path, data.name) 

1130 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1131 % (parent.name, name, link.filename, link.path)) 

1132 parent[name] = link 

1133 elif parent.name != data.parent.name: # dataset is in export source and has different path 

1134 # so create a soft link to the dataset in this file 

1135 # possible if user adds a link to a dataset in export source after reading to memory 

1136 # TODO check that there is/will be still a dataset at data.name -- if the dataset has 

1137 # been removed, then this link will be broken 

1138 link = SoftLink(data.name) 

1139 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1140 % (parent.name, name, link.path)) 

1141 parent[name] = link 

1142 else: # dataset is in export source and has same path as the builder, so copy the dataset 

1143 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1144 % (data.file.filename, data.name, parent.name, name)) 

1145 parent.copy(source=data, 

1146 dest=parent, 

1147 name=name, 

1148 expand_soft=False, 

1149 expand_external=False, 

1150 expand_refs=False, 

1151 without_attrs=True) 

1152 dset = parent[name] 

1153 else: 

1154 # TODO add option for case where there are multiple links to the same dataset within a file: 

1155 # instead of copying the dset N times, copy it once and create soft links to it within the file 

1156 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1157 % (data.file.filename, data.name, parent.name, name)) 

1158 parent.copy(source=data, 

1159 dest=parent, 

1160 name=name, 

1161 expand_soft=False, 

1162 expand_external=False, 

1163 expand_refs=False, 

1164 without_attrs=True) 

1165 dset = parent[name] 

1166 

1167 # Write a compound dataset, i.e, a dataset with compound data type 

1168 elif isinstance(options['dtype'], list): 

1169 # determine which parts of the compound data type are references 

1170 refs = list() 

1171 for i, dts in enumerate(options['dtype']): 

1172 if self.__is_ref(dts): 

1173 refs.append(i) 

1174 # If one or more of the parts of the compound data type are references then we need to deal with those 

1175 if len(refs) > 0: 

1176 try: 

1177 _dtype = self.__resolve_dtype__(options['dtype'], data) 

1178 except Exception as exc: 

1179 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1180 raise Exception(msg) from exc 

1181 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1182 self.__set_written(builder) 

1183 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1184 "object references. attributes: %s" 

1185 % (name, list(attributes.keys()))) 

1186 

1187 @self.__queue_ref 

1188 def _filler(): 

1189 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1190 "containing attributes: %s" 

1191 % (name, list(attributes.keys()))) 

1192 ret = list() 

1193 for item in data: 

1194 new_item = list(item) 

1195 for i in refs: 

1196 new_item[i] = self.__get_ref(item[i]) 

1197 ret.append(tuple(new_item)) 

1198 dset = parent[name] 

1199 dset[:] = ret 

1200 self.set_attributes(dset, attributes) 

1201 

1202 return 

1203 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual 

1204 else: 

1205 dset = self.__list_fill__(parent, name, data, options) 

1206 # Write a dataset containing references, i.e., a region or object reference. 

1207 # NOTE: we can ignore options['io_settings'] for scalar data 

1208 elif self.__is_ref(options['dtype']): 

1209 _dtype = self.__dtypes.get(options['dtype']) 

1210 # Write a scalar data region reference dataset 

1211 if isinstance(data, RegionBuilder): 1211 ↛ 1212: line 1211 didn't jump to line 1212, because the condition on line 1211 was never true

1212 dset = parent.require_dataset(name, shape=(), dtype=_dtype) 

1213 self.__set_written(builder) 

1214 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a " 

1215 "region reference. attributes: %s" 

1216 % (name, list(attributes.keys()))) 

1217 

1218 @self.__queue_ref 

1219 def _filler(): 

1220 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' " 

1221 "containing attributes: %s" 

1222 % (name, list(attributes.keys()))) 

1223 ref = self.__get_ref(data.builder, data.region) 

1224 dset = parent[name] 

1225 dset[()] = ref 

1226 self.set_attributes(dset, attributes) 

1227 # Write a scalar object reference dataset 

1228 elif isinstance(data, ReferenceBuilder): 1228 ↛ 1229: line 1228 didn't jump to line 1229, because the condition on line 1228 was never true

1229 dset = parent.require_dataset(name, dtype=_dtype, shape=()) 

1230 self.__set_written(builder) 

1231 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an " 

1232 "object reference. attributes: %s" 

1233 % (name, list(attributes.keys()))) 

1234 

1235 @self.__queue_ref 

1236 def _filler(): 

1237 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' " 

1238 "containing attributes: %s" 

1239 % (name, list(attributes.keys()))) 

1240 ref = self.__get_ref(data.builder) 

1241 dset = parent[name] 

1242 dset[()] = ref 

1243 self.set_attributes(dset, attributes) 

1244 # Write an array dataset of references 

1245 else: 

1246 # Write an array of region references 

1247 if options['dtype'] == 'region': 1247 ↛ 1248: line 1247 didn't jump to line 1248, because the condition on line 1247 was never true

1248 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings']) 

1249 self.__set_written(builder) 

1250 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1251 "region references. attributes: %s" 

1252 % (name, list(attributes.keys()))) 

1253 

1254 @self.__queue_ref 

1255 def _filler(): 

1256 self.logger.debug("Resolving region references and setting attribute on dataset '%s' " 

1257 "containing attributes: %s" 

1258 % (name, list(attributes.keys()))) 

1259 refs = list() 

1260 for item in data: 

1261 refs.append(self.__get_ref(item.builder, item.region)) 

1262 dset = parent[name] 

1263 dset[()] = refs 

1264 self.set_attributes(dset, attributes) 

1265 # Write array of object references 

1266 else: 

1267 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1268 self.__set_written(builder) 

1269 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1270 "object references. attributes: %s" 

1271 % (name, list(attributes.keys()))) 

1272 

1273 @self.__queue_ref 

1274 def _filler(): 

1275 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1276 "containing attributes: %s" 

1277 % (name, list(attributes.keys()))) 

1278 refs = list() 

1279 for item in data: 

1280 refs.append(self.__get_ref(item)) 

1281 dset = parent[name] 

1282 dset[()] = refs 

1283 self.set_attributes(dset, attributes) 

1284 return 

1285 # write a "regular" dataset 

1286 else: 

1287 # Create an empty dataset 

1288 if data is None: 

1289 dset = self.__setup_empty_dset__(parent, name, options['io_settings']) 

1290 dataio.dataset = dset 

1291 # Write a scalar dataset containing a single string 

1292 elif isinstance(data, (str, bytes)): 

1293 dset = self.__scalar_fill__(parent, name, data, options) 

1294 # Iterative write of a data chunk iterator 

1295 elif isinstance(data, AbstractDataChunkIterator): 

1296 dset = self.__setup_chunked_dset__(parent, name, data, options) 

1297 self.__dci_queue.append(dataset=dset, data=data) 

1298 # Write a regular in memory array (e.g., numpy array, list etc.) 

1299 elif hasattr(data, '__len__'): 

1300 dset = self.__list_fill__(parent, name, data, options) 

1301 # Write a regular scalar dataset 

1302 else: 

1303 dset = self.__scalar_fill__(parent, name, data, options) 

1304 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link 

1305 if link is None: 

1306 self.set_attributes(dset, attributes) 

1307 # Validate the attributes on the linked dataset 

1308 elif len(attributes) > 0: 

1309 pass 

1310 self.__set_written(builder) 

1311 if exhaust_dci: 1311 ↛ exit: line 1311 didn't return from function 'write_dataset', because the condition on line 1311 was never false

1312 self.__dci_queue.exhaust_queue() 

1313 
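The reference branches above never write references at creation time: each one creates its dataset, then registers a ``_filler`` closure via ``__queue_ref`` so the references are resolved only after every target builder has been written. Below is a minimal sketch of that deferred-write pattern in plain h5py; the file and dataset names are hypothetical and the queue is an ordinary list rather than the class's private queue.

.. code-block:: python

    import h5py

    ref_queue = []

    def queue_ref(func):
        # stand-in for the decorator: remember the filler so it can run later
        ref_queue.append(func)
        return func

    with h5py.File("refs_sketch.h5", "w") as f:        # hypothetical file name
        target = f.create_dataset("target_data", data=[1, 2, 3])
        ref_dtype = h5py.special_dtype(ref=h5py.Reference)
        pointer = f.create_dataset("pointer", shape=(), dtype=ref_dtype)

        @queue_ref
        def _filler():
            pointer[()] = target.ref                   # resolve the reference lazily

        for fill in ref_queue:                         # analogous to draining the queue at the end of write
            fill()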

1314 @classmethod 

1315 def __scalar_fill__(cls, parent, name, data, options=None): 

1316 dtype = None 

1317 io_settings = {} 

1318 if options is not None: 1318 ↛ 1321: line 1318 didn't jump to line 1321, because the condition on line 1318 was never false

1319 dtype = options.get('dtype') 

1320 io_settings = options.get('io_settings') 

1321 if not isinstance(dtype, type): 1321 ↛ 1327: line 1321 didn't jump to line 1327, because the condition on line 1321 was never false

1322 try: 

1323 dtype = cls.__resolve_dtype__(dtype, data) 

1324 except Exception as exc: 

1325 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1326 raise Exception(msg) from exc 

1327 try: 

1328 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings) 

1329 except Exception as exc: 

1330 msg = "Could not create scalar dataset %s in %s" % (name, parent.name) 

1331 raise Exception(msg) from exc 

1332 return dset 

1333 
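``__scalar_fill__`` first resolves a concrete dtype and then creates the dataset with ``shape=None``, so h5py stores a single scalar value instead of a one-element array. A standalone sketch of that call, with made-up file and dataset names:

.. code-block:: python

    import h5py

    with h5py.File("scalar_sketch.h5", "w") as f:      # hypothetical file name
        str_dtype = h5py.special_dtype(vlen=str)       # same variable-length string dtype as H5_TEXT
        dset = f.create_dataset("description", data="an example value", shape=None, dtype=str_dtype)
        print(dset.shape)                              # () -> scalar dataspace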

1334 @classmethod 

1335 def __setup_chunked_dset__(cls, parent, name, data, options=None): 

1336 """ 

1337 Set up a dataset for writing one chunk at a time based on the given DataChunkIterator

1338 

1339 :param parent: The parent object to which the dataset should be added 

1340 :type parent: h5py.Group, h5py.File 

1341 :param name: The name of the dataset 

1342 :type name: str 

1343 :param data: The data to be written. 

1344 :type data: DataChunkIterator 

1345 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'

1346 :type options: dict 

1347 

1348 """ 

1349 io_settings = {} 

1350 if options is not None: 

1351 if 'io_settings' in options: 1351 ↛ 1354: line 1351 didn't jump to line 1354, because the condition on line 1351 was never false

1352 io_settings = options.get('io_settings') 

1353 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. 

1354 if 'chunks' not in io_settings: 

1355 recommended_chunks = data.recommended_chunk_shape() 

1356 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks 

1357 # Define the shape of the data if not provided by the user 

1358 if 'shape' not in io_settings: 1358 ↛ 1361: line 1358 didn't jump to line 1361, because the condition on line 1358 was never false

1359 io_settings['shape'] = data.recommended_data_shape() 

1360 # Define the maxshape of the data if not provided by the user 

1361 if 'maxshape' not in io_settings: 

1362 io_settings['maxshape'] = data.maxshape 

1363 if 'dtype' not in io_settings: 1363 ↛ 1371: line 1363 didn't jump to line 1371, because the condition on line 1363 was never false

1364 if (options is not None) and ('dtype' in options): 

1365 io_settings['dtype'] = options['dtype'] 

1366 else: 

1367 io_settings['dtype'] = data.dtype 

1368 if isinstance(io_settings['dtype'], str): 1368 ↛ 1370: line 1368 didn't jump to line 1370, because the condition on line 1368 was never true

1369 # map to real dtype if we were given a string 

1370 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1371 try: 

1372 dset = parent.create_dataset(name, **io_settings) 

1373 except Exception as exc: 

1374 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1375 return dset 

1376 
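The settings assembled above all come from the iterator itself: its recommended chunk shape, recommended data shape, maxshape, and dtype. The sketch below repeats that derivation with hdmf's ``DataChunkIterator``; the example array, buffer size, and file/dataset names are assumptions for illustration.

.. code-block:: python

    import h5py
    import numpy as np
    from hdmf.data_utils import DataChunkIterator

    data = DataChunkIterator(data=np.arange(100.).reshape(20, 5), buffer_size=4)

    io_settings = {
        "chunks": data.recommended_chunk_shape() or True,   # fall back to h5py auto-chunking
        "shape": data.recommended_data_shape(),
        "maxshape": data.maxshape,
        "dtype": data.dtype,
    }

    with h5py.File("chunked_sketch.h5", "w") as f:           # hypothetical file name
        dset = f.create_dataset("timeseries", **io_settings)
        print(dset.chunks, dset.maxshape)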

1377 @classmethod 

1378 def __setup_empty_dset__(cls, parent, name, io_settings): 

1379 """ 

1380 Set up an empty dataset, i.e., create the dataset and reserve its space without writing any data to it

1381 

1382 :param parent: The parent object to which the dataset should be added 

1383 :type parent: h5py.Group, h5py.File 

1384 :param name: The name of the dataset 

1385 :type name: str 

1386 :param io_settings: Dict of dataset creation settings to pass to h5py; must include 'shape' and 'dtype' 

1387 :type io_settings: dict 

1388 

1389 :raises ValueError: If 'shape' or 'dtype' is missing from io_settings 

1390 

1391 """ 

1392 # Define the shape of the data if not provided by the user 

1393 if 'shape' not in io_settings: 

1394 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without shape") 

1395 if 'dtype' not in io_settings: 

1396 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without dtype") 

1397 if isinstance(io_settings['dtype'], str): 

1398 # map to real dtype if we were given a string 

1399 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1400 try: 

1401 dset = parent.create_dataset(name, **io_settings) 

1402 except Exception as exc: 

1403 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1404 return dset 

1405 
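Callers usually reach this path by wrapping ``data=None`` in an ``H5DataIO`` together with an explicit shape and dtype, which is what ends up in the ``io_settings`` checked above. A hedged sketch of that usage follows; the shape and dtype values are illustrative and the exact keyword forms accepted by ``H5DataIO`` should be checked against its docval.

.. code-block:: python

    from hdmf.backends.hdf5 import H5DataIO

    # Hypothetical values: reserve a 5 x 3 float64 dataset with no data written yet.
    empty = H5DataIO(data=None, shape=(5, 3), dtype="float64")
    # After the write, the created h5py.Dataset is attached back onto the wrapper
    # (dataio.dataset is set above), so values can be filled in afterwards.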

1406 @classmethod 

1407 def __chunked_iter_fill__(cls, parent, name, data, options=None): 

1408 """ 

1409 Write data to a dataset one chunk at a time based on the given DataChunkIterator

1410 

1411 :param parent: The parent object to which the dataset should be added 

1412 :type parent: h5py.Group, h5py.File 

1413 :param name: The name of the dataset 

1414 :type name: str 

1415 :param data: The data to be written. 

1416 :type data: DataChunkIterator 

1417 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'

1418 :type options: dict 

1419 

1420 """ 

1421 dset = cls.__setup_chunked_dset__(parent, name, data, options=options) 

1422 read = True 

1423 while read: 

1424 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data) 

1425 return dset 

1426 
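The loop above hands each chunk to ``HDF5IODataChunkIteratorQueue._write_chunk``. The same idea in plain h5py is to grow a resizable, chunked dataset as each chunk arrives; in this sketch the generator, shapes, and file name are illustrative rather than the private helper's actual logic.

.. code-block:: python

    import h5py
    import numpy as np

    chunks = (np.full((10, 5), i, dtype="float64") for i in range(3))   # stand-in for a DataChunkIterator

    with h5py.File("iter_fill_sketch.h5", "w") as f:                    # hypothetical file name
        dset = f.create_dataset("series", shape=(0, 5), maxshape=(None, 5),
                                chunks=True, dtype="float64")
        for chunk in chunks:
            start = dset.shape[0]
            dset.resize(start + chunk.shape[0], axis=0)                 # grow along the first axis
            dset[start:] = chunk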

1427 @classmethod 

1428 def __list_fill__(cls, parent, name, data, options=None): 

1429 # define the io settings and data type if necessary 

1430 io_settings = {} 

1431 dtype = None 

1432 if options is not None: 

1433 dtype = options.get('dtype') 

1434 io_settings = options.get('io_settings') 

1435 if not isinstance(dtype, type): 

1436 try: 

1437 dtype = cls.__resolve_dtype__(dtype, data) 

1438 except Exception as exc: 

1439 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1440 raise Exception(msg) from exc 

1441 # define the data shape 

1442 if 'shape' in io_settings: 1442 ↛ 1443: line 1442 didn't jump to line 1443, because the condition on line 1442 was never true

1443 data_shape = io_settings.pop('shape') 

1444 elif hasattr(data, 'shape'): 

1445 data_shape = data.shape 

1446 elif isinstance(dtype, np.dtype): 

1447 data_shape = (len(data),) 

1448 else: 

1449 data_shape = get_data_shape(data) 

1450 

1451 # Create the dataset 

1452 try: 

1453 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings) 

1454 except Exception as exc: 

1455 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \ 

1456 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc)) 

1457 raise Exception(msg) from exc 

1458 # Write the data 

1459 if len(data) > dset.shape[0]: 1459 ↛ 1460: line 1459 didn't jump to line 1460, because the condition on line 1459 was never true

1460 new_shape = list(dset.shape) 

1461 new_shape[0] = len(data) 

1462 dset.resize(new_shape) 

1463 try: 

1464 dset[:] = data 

1465 except Exception as e: 

1466 raise e 

1467 return dset 

1468 
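For plain Python sequences the shape passed to ``create_dataset`` is computed by hdmf's ``get_data_shape`` helper, and the values are then written in a single bulk assignment. A small sketch under those assumptions, with hypothetical file and dataset names:

.. code-block:: python

    import h5py
    from hdmf.utils import get_data_shape

    data = [[1, 2, 3], [4, 5, 6]]
    shape = get_data_shape(data)                       # (2, 3)

    with h5py.File("list_fill_sketch.h5", "w") as f:   # hypothetical file name
        dset = f.create_dataset("table", shape=shape, dtype="int64")
        dset[:] = data                                 # bulk write, as in __list_fill__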

1469 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference', 

1470 'default': None}, 

1471 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object', 

1472 'default': None}, 

1473 returns='the reference', rtype=Reference) 

1474 def __get_ref(self, **kwargs): 

1475 container, region = getargs('container', 'region', kwargs) 

1476 if container is None: 1476 ↛ 1477: line 1476 didn't jump to line 1477, because the condition on line 1476 was never true

1477 return None 

1478 if isinstance(container, Builder): 

1479 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1480 if isinstance(container, LinkBuilder): 1480 ↛ 1481: line 1480 didn't jump to line 1481, because the condition on line 1480 was never true

1481 builder = container.target_builder 

1482 else: 

1483 builder = container 

1484 elif isinstance(container, ReferenceBuilder): 

1485 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name)) 

1486 builder = container.builder 

1487 else: 

1488 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1489 builder = self.manager.build(container) 

1490 path = self.__get_path(builder) 

1491 self.logger.debug("Getting reference at path '%s'" % path) 

1492 if isinstance(container, RegionBuilder): 1492 ↛ 1493: line 1492 didn't jump to line 1493, because the condition on line 1492 was never true

1493 region = container.region 

1494 if region is not None: 1494 ↛ 1495: line 1494 didn't jump to line 1495, because the condition on line 1494 was never true

1495 dset = self.__file[path] 

1496 if not isinstance(dset, Dataset): 

1497 raise ValueError('cannot create region reference without Dataset') 

1498 return self.__file[path].regionref[region] 

1499 else: 

1500 return self.__file[path].ref 

1501 
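Both return paths rely on h5py's reference primitives: ``.regionref[region]`` for region references and ``.ref`` for object references, taken from the dataset at the builder's path in the open file. A standalone sketch of those two primitives with illustrative names:

.. code-block:: python

    import h5py

    with h5py.File("ref_primitives_sketch.h5", "w") as f:   # hypothetical file name
        dset = f.create_dataset("data", data=list(range(10)))
        obj_ref = dset.ref                                   # object reference (the else branch)
        reg_ref = dset.regionref[2:5]                        # region reference (the region branch)
        print(f[obj_ref][()])                                # dereference the whole dataset
        print(f[reg_ref][reg_ref])                           # read only the referenced region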

1502 def __is_ref(self, dtype): 

1503 if isinstance(dtype, DtypeSpec): 

1504 return self.__is_ref(dtype.dtype) 

1505 if isinstance(dtype, RefSpec): 

1506 return True 

1507 if isinstance(dtype, dict): # may be dict from reading a compound dataset 

1508 return self.__is_ref(dtype['dtype']) 

1509 if isinstance(dtype, str): 

1510 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE 

1511 return False 

1512 
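The dtype forms treated as references are a ``RefSpec``, a ``DtypeSpec`` or dict whose inner dtype is itself a reference, and the builder-level string constants for object and region references. A short sketch constructing such specs; the target type name is a hypothetical example.

.. code-block:: python

    from hdmf.spec import RefSpec, DtypeSpec
    from hdmf.build import DatasetBuilder

    ref_spec = RefSpec(target_type="ElectrodeGroup", reftype="object")   # hypothetical target type
    compound_field = DtypeSpec(name="group", doc="referenced group", dtype=ref_spec)

    # the string forms checked in the last branch
    print(DatasetBuilder.OBJECT_REF_TYPE, DatasetBuilder.REGION_REF_TYPE)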

1513 def __queue_ref(self, func): 

1514 '''Queue a function that fills a dataset with references once all reference targets exist

1515 

1516 The decorated function is appended to the reference queue and called later, after all

1517 builders have been written, so that the references it resolves are guaranteed to be valid.

1518 

1519 Args:

1520 func: a function that, when called, resolves the needed references and writes

1521 them into the target dataset

1522 

1523 '''

1524 # TODO: come up with more intelligent way of 

1525 # queueing reference resolution, based on reference 

1526 # dependency 

1527 self.__ref_queue.append(func) 

1528 

1529 def __rec_get_ref(self, ref_list): 

1530 ret = list() 

1531 for elem in ref_list: 

1532 if isinstance(elem, (list, tuple)): 

1533 ret.append(self.__rec_get_ref(elem)) 

1534 elif isinstance(elem, (Builder, Container)): 

1535 ret.append(self.__get_ref(elem)) 

1536 else: 

1537 ret.append(elem) 

1538 return ret 

1539 

1540 @property 

1541 def mode(self): 

1542 """ 

1543 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x"). 

1544 """ 

1545 return self.__mode 

1546 

1547 @classmethod 

1548 @docval(*get_docval(H5DataIO.__init__)) 

1549 def set_dataio(cls, **kwargs): 

1550 """ 

1551 Wrap the given Data object with an H5DataIO. 

1552 

1553 This method is provided merely for convenience. It is the equivalent 

1554 of the following: 

1555 

1556 .. code-block:: python 

1557 

1558 from hdmf.backends.hdf5 import H5DataIO 

1559 data = ... 

1560 data = H5DataIO(data) 

1561 """ 

1562 return H5DataIO.__init__(**kwargs)
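As the docstring says, this is only a convenience around constructing an ``H5DataIO`` directly; in practice the wrapper is most often used to attach HDF5 filter settings to the data. A sketch of the direct equivalent with gzip compression and auto-chunking, using an arbitrary example array:

.. code-block:: python

    import numpy as np
    from hdmf.backends.hdf5 import H5DataIO

    data = H5DataIO(np.arange(1000), compression="gzip", compression_opts=4, chunks=True)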