Coverage for src/hdmf/backends/hdf5/h5tools.py: 87%

905 statements  

coverage.py v7.3.2, created at 2023-10-04 02:57 +0000

1 import logging

2 import os.path

3 import warnings

4 from collections import deque

5 from functools import partial

6 from pathlib import Path, PurePosixPath as pp

7

8 import numpy as np

9 import h5py

10 from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype

11

12 from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO,

13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue)

14 from ..io import HDMFIO

15 from ..errors import UnsupportedOperation

16 from ..warnings import BrokenLinkWarning

17 from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder,

18 ReferenceBuilder, TypeMap, ObjectMapper)

19 from ...container import Container

20 from ...term_set import TermSetWrapper

21 from ...data_utils import AbstractDataChunkIterator

22 from ...spec import RefSpec, DtypeSpec, NamespaceCatalog

23 from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset

24 from ..utils import NamespaceToBuilderHelper, WriteStatusTracker

25

26 ROOT_NAME = 'root'

27 SPEC_LOC_ATTR = '.specloc'

28 H5_TEXT = special_dtype(vlen=str)

29 H5_BINARY = special_dtype(vlen=bytes)

30 H5_REF = special_dtype(ref=Reference)

31 H5_REGREF = special_dtype(ref=RegionReference)

32

33 RDCC_NBYTES = 32*2**20  # set raw data chunk cache size = 32 MiB

34

35 H5PY_3 = h5py.__version__.startswith('3')

36

37

38 class HDF5IO(HDMFIO):

39 

40 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group 

41 

42 @staticmethod 

43 def can_read(path): 

44 """Determines whether a given path is readable by the HDF5IO class""" 

45 if not os.path.isfile(path): 

46 return False 

47 try: 

48 with h5py.File(path, "r"): 

49 return True 

50 except IOError: 

51 return False 

52 
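# Usage sketch (not part of h5tools.py): HDF5IO.can_read checks whether a path is an
# existing, openable HDF5 file. The filename below is hypothetical.
from hdmf.backends.hdf5.h5tools import HDF5IO

if HDF5IO.can_read("example.h5"):
    print("example.h5 can be read with HDF5IO")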

53 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

54 {'name': 'mode', 'type': str, 

55 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). ' 

56 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for ' 

57 'more details.'), 

58 'default': 'r'}, 

59 {'name': 'manager', 'type': (TypeMap, BuildManager), 

60 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None}, 

61 {'name': 'comm', 'type': 'Intracomm', 

62 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, 

63 {'name': 'file', 'type': [File, "S3File", "RemFile"], 

64 'doc': 'a pre-existing h5py.File, S3File, or RemFile object', 'default': None}, 

65 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

66 {'name': 'herd_path', 'type': str, 

67 'doc': 'The path to read/write the HERD file', 'default': None},) 

68 def __init__(self, **kwargs): 

69 """Open an HDF5 file for IO. 

70 """ 

71 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) 

72 path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode', 

73 'comm', 'file', 'driver', 

74 'herd_path', 

75 kwargs) 

76 

77 self.__open_links = [] # keep track of other files opened from links in this file 

78 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close 

79 

80 path = self.__check_path_file_obj(path, file_obj) 

81 

82 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3': 

83 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode) 

84 raise UnsupportedOperation(msg) 

85 

86 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'): 

87 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode) 

88 raise UnsupportedOperation(msg) 

89 

90 if manager is None: 

91 manager = BuildManager(TypeMap(NamespaceCatalog())) 

92 elif isinstance(manager, TypeMap):  92 ↛ 93 [line 92 didn't jump to line 93, because the condition on line 92 was never true]

93 manager = BuildManager(manager) 

94 self.__driver = driver 

95 self.__comm = comm 

96 self.__mode = mode 

97 self.__file = file_obj 

98 super().__init__(manager, source=path, herd_path=herd_path) 

99 # NOTE: source is not set if path is None and file_obj is passed 

100 self.__built = dict() # keep track of each builder for each dataset/group/link for each file 

101 self.__read = dict() # keep track of which files have been read. Key is the filename value is the builder 

102 self.__ref_queue = deque() # a queue of the references that need to be added 

103 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted 

104 ObjectMapper.no_convert(Dataset) 

105 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object 

106 

107 @property 

108 def comm(self): 

109 """The MPI communicator to use for parallel I/O.""" 

110 return self.__comm 

111 

112 @property 

113 def _file(self): 

114 return self.__file 

115 

116 @property 

117 def driver(self): 

118 return self.__driver 

119 

120 @classmethod 

121 def __check_path_file_obj(cls, path, file_obj): 

122 if isinstance(path, Path): 

123 path = str(path) 

124 

125 if path is None and file_obj is None: 

126 raise ValueError("Either the 'path' or 'file' argument must be supplied.") 

127 

128 if path is not None and file_obj is not None: # consistency check 

129 if os.path.abspath(file_obj.filename) != os.path.abspath(path): 

130 msg = ("You argued '%s' as this object's path, but supplied a file with filename: %s" 

131 % (path, file_obj.filename)) 

132 raise ValueError(msg) 

133 

134 return path 

135 

136 @classmethod 

137 def __resolve_file_obj(cls, path, file_obj, driver): 

138 path = cls.__check_path_file_obj(path, file_obj) 

139 

140 if file_obj is None: 

141 file_kwargs = dict() 

142 if driver is not None:  142 ↛ 143 [line 142 didn't jump to line 143, because the condition on line 142 was never true]

143 file_kwargs.update(driver=driver) 

144 file_obj = File(path, 'r', **file_kwargs) 

145 return file_obj 

146 

147 @classmethod 

148 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap), 

149 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, 

150 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

151 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}, 

152 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

153 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

154 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and " 

155 "the included data types"), 

156 rtype=dict) 

157 def load_namespaces(cls, **kwargs): 

158 """Load cached namespaces from a file. 

159 

160 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the 

161 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

162 the given File object will be read from and not closed. 

163 

164 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

165 """ 

166 namespace_catalog, path, namespaces, file_obj, driver = popargs( 

167 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs) 

168 

169 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

170 if file_obj is None: # need to close the file object that we just opened 

171 with open_file_obj: 

172 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

173 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) 

174 
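# Usage sketch (not part of h5tools.py): load namespaces that were cached in a file into a
# fresh NamespaceCatalog. The filename is hypothetical; the return value maps each loaded
# namespace name to its included namespaces and data types (see the docstring above).
from hdmf.spec import NamespaceCatalog
from hdmf.backends.hdf5.h5tools import HDF5IO

catalog = NamespaceCatalog()
loaded = HDF5IO.load_namespaces(namespace_catalog=catalog, path="data_with_cached_specs.h5")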

175 @classmethod 

176 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj): 

177 d = {} 

178 

179 if not cls.__check_specloc(file_obj): 

180 return d 

181 

182 namespace_versions = cls.__get_namespaces(file_obj) 

183 

184 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

185 if namespaces is None:  185 ↛ 188 [line 185 didn't jump to line 188, because the condition on line 185 was never false]

186 namespaces = list(spec_group.keys()) 

187 

188 readers = dict() 

189 deps = dict() 

190 for ns in namespaces: 

191 latest_version = namespace_versions[ns] 

192 ns_group = spec_group[ns][latest_version] 

193 reader = H5SpecReader(ns_group) 

194 readers[ns] = reader 

195 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies) 

196 for spec_ns in reader.read_namespace(cls.__ns_spec_path): 

197 deps[ns] = list() 

198 for s in spec_ns['schema']: 

199 dep = s.get('namespace') 

200 if dep is not None: 

201 deps[ns].append(dep) 

202 

203 order = cls._order_deps(deps) 

204 for ns in order: 

205 reader = readers[ns] 

206 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader)) 

207 

208 return d 

209 

210 @classmethod 

211 def __check_specloc(cls, file_obj): 

212 return SPEC_LOC_ATTR in file_obj.attrs 

213 

214 @classmethod 

215 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, 

216 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, 

217 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, 

218 returns="dict mapping names to versions of the namespaces in the file", rtype=dict) 

219 def get_namespaces(cls, **kwargs): 

220 """Get the names and versions of the cached namespaces from a file. 

221 

222 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the 

223 namespaces will be read, and the File object will be closed. If `file` is supplied, then 

224 the given File object will be read from and not closed. 

225 

226 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

227 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...). 

228 

229 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 

230 """ 

231 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs) 

232 

233 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) 

234 if file_obj is None: # need to close the file object that we just opened 

235 with open_file_obj: 

236 return cls.__get_namespaces(open_file_obj) 

237 return cls.__get_namespaces(open_file_obj) 

238 
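# Usage sketch (not part of h5tools.py): list the cached namespaces and the version that
# load_namespaces would pick for each (latest in alphanumeric order). Filename is hypothetical.
from hdmf.backends.hdf5.h5tools import HDF5IO

versions = HDF5IO.get_namespaces(path="data_with_cached_specs.h5")
# e.g. {'my-namespace': '0.2.0'} (illustrative values)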

239 @classmethod 

240 def __get_namespaces(cls, file_obj): 

241 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file. 

242 

243 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric 

244 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``. 

245 """ 

246 used_version_names = dict() 

247 if not cls.__check_specloc(file_obj): 

248 return used_version_names 

249 

250 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]] 

251 namespaces = list(spec_group.keys()) 

252 for ns in namespaces: 

253 ns_group = spec_group[ns] 

254 # NOTE: by default, objects within groups are iterated in alphanumeric order 

255 version_names = list(ns_group.keys()) 

256 if len(version_names) > 1: 

257 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned" 

258 # make sure that if there is another group representing a newer version, that is read instead 

259 if 'unversioned' in version_names: 

260 version_names.remove('unversioned') 

261 if len(version_names) > 1: 

262 # as of HDMF 1.6.1, extensions without a version are written under the group name "None" 

263 # make sure that if there is another group representing a newer version, that is read instead 

264 if 'None' in version_names: 

265 version_names.remove('None') 

266 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order 

267 

268 return used_version_names 

269 

270 @classmethod 

271 def _order_deps(cls, deps): 

272 """ 

273 Order namespaces according to dependency for loading into a NamespaceCatalog 

274 

275 Args: 

276 deps (dict): a dictionary that maps a namespace name to a list of name of 

277 the namespaces on which the namespace is directly dependent 

278 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []} 

279 Expected output: ['d', 'b', 'c', 'a'] 

280 """ 

281 order = list() 

282 keys = list(deps.keys()) 

283 deps = dict(deps) 

284 for k in keys: 

285 if k in deps: 

286 cls.__order_deps_aux(order, deps, k) 

287 return order 

288 

289 @classmethod 

290 def __order_deps_aux(cls, order, deps, key): 

291 """ 

292 A recursive helper function for _order_deps 

293 """ 

294 if key not in deps: 

295 return 

296 subdeps = deps.pop(key) 

297 for subk in subdeps: 

298 cls.__order_deps_aux(order, deps, subk) 

299 order.append(key) 

300 
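# Worked example for _order_deps (taken from its docstring): namespaces are emitted so that
# every namespace appears after the namespaces it depends on.
from hdmf.backends.hdf5.h5tools import HDF5IO

deps = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
HDF5IO._order_deps(deps)  # returns ['d', 'b', 'c', 'a']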

301 @classmethod 

302 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'}, 

303 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'}, 

304 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True}, 

305 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference', 

306 'default': False}, 

307 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False} 

308 ) 

309 def copy_file(self, **kwargs): 

310 """ 

311 Convenience function to copy an HDF5 file while allowing external links to be resolved. 

312 

313 .. warning:: 

314 

315 As of HDMF 2.0, this method is no longer supported and may be removed in a future version. 

316 Please use the export method or h5py.File.copy method instead. 

317 

318 .. note:: 

319 

320 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode 

321 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is 

322 not opened already when calling this function. 

323 

324 """ 

325 

326 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of " 

327 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning) 

328 

329 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename', 

330 'dest_filename', 

331 'expand_external', 

332 'expand_refs', 

333 'expand_soft', 

334 kwargs) 

335 source_file = File(source_filename, 'r') 

336 dest_file = File(dest_filename, 'w') 

337 for objname in source_file["/"].keys(): 

338 source_file.copy(source=objname, 

339 dest=dest_file, 

340 name=objname, 

341 expand_external=expand_external, 

342 expand_refs=expand_refs, 

343 expand_soft=expand_soft, 

344 shallow=False, 

345 without_attrs=False, 

346 ) 

347 for objname in source_file['/'].attrs: 

348 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname] 

349 source_file.close() 

350 dest_file.close() 

351 
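# Sketch of the alternative recommended by the deprecation warning above (not part of
# h5tools.py): copy a file with h5py directly. Filenames are hypothetical.
import h5py

with h5py.File("source.h5", "r") as src, h5py.File("dest.h5", "w") as dest:
    for objname in src["/"]:
        src.copy(objname, dest, name=objname, expand_external=True)
    for attrname, attrvalue in src["/"].attrs.items():
        dest["/"].attrs[attrname] = attrvalue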

352 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, 

353 {'name': 'cache_spec', 'type': bool, 

354 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache ' 

355 'specification to file. The appropriate specification will then need to be loaded prior to ' 

356 'reading the file.'), 

357 'default': True}, 

358 {'name': 'link_data', 'type': bool, 

359 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.', 

360 'default': True}, 

361 {'name': 'exhaust_dci', 'type': bool, 

362 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.', 

363 'default': True}, 

364 {'name': 'herd', 'type': 'HERD', 

365 'doc': 'A HERD object to populate with references.', 

366 'default': None}) 

367 def write(self, **kwargs): 

368 """Write the container to an HDF5 file.""" 

369 if self.__mode == 'r': 

370 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. " 

371 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'") 

372 % (self.source, self.__mode)) 

373 

374 cache_spec = popargs('cache_spec', kwargs) 

375 super().write(**kwargs) 

376 if cache_spec: 

377 self.__cache_spec() 

378 
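# Usage sketch (not part of h5tools.py): a typical write. `manager` (a BuildManager for your
# type map) and `container` are assumed to already exist; the filename is hypothetical.
from hdmf.backends.hdf5.h5tools import HDF5IO

with HDF5IO("out.h5", manager=manager, mode="w") as io:
    io.write(container)  # cache_spec=True by default, so the specification is cached in the file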

379 def __cache_spec(self): 

380 ref = self.__file.attrs.get(SPEC_LOC_ATTR) 

381 spec_group = None 

382 if ref is not None: 

383 spec_group = self.__file[ref] 

384 else: 

385 path = 'specifications' # do something to figure out where the specifications should go 

386 spec_group = self.__file.require_group(path) 

387 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref 

388 ns_catalog = self.manager.namespace_catalog 

389 for ns_name in ns_catalog.namespaces: 

390 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) 

391 namespace = ns_catalog.get_namespace(ns_name) 

392 group_name = '%s/%s' % (ns_name, namespace.version) 

393 if group_name in spec_group: 

394 continue 

395 ns_group = spec_group.create_group(group_name) 

396 writer = H5SpecWriter(ns_group) 

397 ns_builder.export(self.__ns_spec_path, writer=writer) 

398 

399 _export_args = ( 

400 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'}, 

401 {'name': 'container', 'type': Container, 

402 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be ' 

403 'exported'), 

404 'default': None}, 

405 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`', 

406 'default': None}, 

407 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 

408 'default': True} 

409 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage 

410 # so it is not available on HDF5IO 

411 ) 

412 

413 @docval(*_export_args) 

414 def export(self, **kwargs): 

415 """Export data read from a file from any backend to HDF5. 

416 

417 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. 

418 """ 

419 if self.__mode != 'w': 

420 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." 

421 % (self.source, self.__mode)) 

422 

423 src_io = getargs('src_io', kwargs) 

424 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) 

425 if write_args is None: 

426 write_args = dict() 

427 

428 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True): 

429 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument " 

430 "link_data=True." % src_io.__class__.__name__) 

431 

432 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None 

433 ckwargs = kwargs.copy() 

434 ckwargs['write_args'] = write_args 

435 if not write_args.get('link_data', True): 

436 ckwargs['clear_cache'] = True 

437 super().export(**ckwargs) 

438 if cache_spec: 

439 # add any namespaces from the src_io that have not yet been loaded 

440 for namespace in src_io.manager.namespace_catalog.namespaces: 

441 if namespace not in self.manager.namespace_catalog.namespaces:  441 ↛ 440 [line 441 didn't jump to line 440, because the condition on line 441 was never false]

442 self.manager.namespace_catalog.add_namespace( 

443 name=namespace, 

444 namespace=src_io.manager.namespace_catalog.get_namespace(namespace) 

445 ) 

446 self.__cache_spec() 

447 

448 @classmethod 

449 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'}, 

450 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O', 

451 'default': None}, 

452 *_export_args) # NOTE: src_io is required and is the second positional argument 

453 def export_io(self, **kwargs): 

454 """Export from one backend to HDF5 (class method). 

455 

456 Convenience function for :py:meth:`export` where you do not need to 

457 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given 

458 arguments. 

459 

460 Example usage: 

461 

462 .. code-block:: python 

463 

464 old_io = HDF5IO('old.h5', 'r') 

465 HDF5IO.export_io(path='new_copy.h5', src_io=old_io) 

466 

467 See :py:meth:`export` for more details. 

468 """ 

469 path, comm = popargs('path', 'comm', kwargs) 

470 

471 with HDF5IO(path=path, comm=comm, mode='w') as write_io: 

472 write_io.export(**kwargs) 

473 

474 def read(self, **kwargs): 

475 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x': 

476 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'." 

477 % (self.source, self.__mode)) 

478 try: 

479 return super().read(**kwargs) 

480 except UnsupportedOperation as e: 

481 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover 

482 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values." 

483 % (self.source, self.__mode)) 

484 

485 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder') 

486 def read_builder(self): 

487 """ 

488 Read data and return the GroupBuilder representing it. 

489 

490 NOTE: On read, the Builder.source will usually not be set on the Builders.

491 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export) 

492 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects. 

493 """ 

494 if not self.__file: 

495 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source) 

496 f_builder = self.__read.get(self.__file) 

497 # ignore cached specs when reading builder 

498 ignore = set() 

499 specloc = self.__file.attrs.get(SPEC_LOC_ATTR) 

500 if specloc is not None: 

501 ignore.add(self.__file[specloc].name) 

502 if f_builder is None: 

503 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore) 

504 self.__read[self.__file] = f_builder 

505 return f_builder 

506 

507 def __set_written(self, builder): 

508 """ 

509 Helper function used to set the written status for builders 

510 

511 :param builder: Builder object to be marked as written 

512 :type builder: Builder 

513 """ 

514 self._written_builders.set_written(builder) 

515 

516 def get_written(self, builder): 

517 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise. 

518 

519 :param builder: Builder object to get the written flag for 

520 :type builder: Builder 

521 

522 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise 

523 """ 

524 return self._written_builders.get_written(builder) 

525 

526 def __set_built(self, fpath, id, builder): 

527 """ 

528 Update self.__built to cache the given builder for the given file and id. 

529 

530 :param fpath: Path to the HDF5 file containing the object 

531 :type fpath: str 

532 :param id: ID of the HDF5 object in the path 

533 :type id: h5py GroupID object 

534 :param builder: The builder to be cached 

535 """ 

536 self.__built.setdefault(fpath, dict()).setdefault(id, builder) 

537 

538 def __get_built(self, fpath, id): 

539 """ 

540 Look up a builder for the given file and id in self.__built cache 

541 

542 :param fpath: Path to the HDF5 file containing the object 

543 :type fpath: str 

544 :param id: ID of the HDF5 object in the path 

545 :type id: h5py GroupID object 

546 

547 :return: Builder in the self.__built cache or None 

548 """ 

549 

550 fdict = self.__built.get(fpath) 

551 if fdict: 

552 return fdict.get(id) 

553 else: 

554 return None 

555 

556 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

557 'doc': 'the HDF5 object to get the corresponding Builder object for'})

558 def get_builder(self, **kwargs): 

559 """ 

560 Get the builder for the corresponding h5py Group or Dataset 

561 

562 :raises ValueError: When no builder has been constructed yet for the given h5py object 

563 """ 

564 h5obj = getargs('h5obj', kwargs) 

565 fpath = h5obj.file.filename 

566 builder = self.__get_built(fpath, h5obj.id) 

567 if builder is None:  567 ↛ 568 [line 567 didn't jump to line 568, because the condition on line 567 was never true]

568 msg = '%s:%s has not been built' % (fpath, h5obj.name) 

569 raise ValueError(msg) 

570 return builder 

571 

572 @docval({'name': 'h5obj', 'type': (Dataset, Group), 

573 'doc': 'the HDF5 object to get the corresponding Container/Data object for'})

574 def get_container(self, **kwargs): 

575 """ 

576 Get the container for the corresponding h5py Group or Dataset 

577 

578 :raises ValueError: When no builder has been constructed yet for the given h5py object 

579 """ 

580 h5obj = getargs('h5obj', kwargs) 

581 builder = self.get_builder(h5obj) 

582 container = self.manager.construct(builder) 

583 return container 

584 
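# Usage sketch (not part of h5tools.py): after reading a file with an HDF5IO object, an h5py
# Group or Dataset can be mapped back to its Builder/Container. Filename, path, and `manager`
# are hypothetical/assumed.
from hdmf.backends.hdf5.h5tools import HDF5IO

io = HDF5IO("data.h5", manager=manager, mode="r")
io.read()
h5_group = io._file["/some/group"]
builder = io.get_builder(h5_group)      # raises ValueError if nothing was built for this object
container = io.get_container(h5_group)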

585 def __read_group(self, h5obj, name=None, ignore=set()): 

586 kwargs = { 

587 "attributes": self.__read_attrs(h5obj), 

588 "groups": dict(), 

589 "datasets": dict(), 

590 "links": dict() 

591 } 

592 

593 for key, val in kwargs['attributes'].items(): 

594 if isinstance(val, bytes):  594 ↛ 595 [line 594 didn't jump to line 595, because the condition on line 594 was never true]

595 kwargs['attributes'][key] = val.decode('UTF-8') 

596 

597 if name is None: 

598 name = str(os.path.basename(h5obj.name)) 

599 for k in h5obj: 

600 sub_h5obj = h5obj.get(k) 

601 if sub_h5obj is not None: 

602 if sub_h5obj.name in ignore: 

603 continue 

604 link_type = h5obj.get(k, getlink=True) 

605 if isinstance(link_type, (SoftLink, ExternalLink)): 

606 # Reading links might be better suited in its own function 

607 # get path of link (the key used for tracking what's been built) 

608 target_path = link_type.path 

609 target_obj = sub_h5obj.file[target_path] 

610 builder_name = os.path.basename(target_path) 

611 # get builder if already read, else build it 

612 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id) 

613 if builder is None: 

614 # NOTE: all links must have absolute paths 

615 if isinstance(target_obj, Dataset): 

616 builder = self.__read_dataset(target_obj, builder_name) 

617 else: 

618 builder = self.__read_group(target_obj, builder_name, ignore=ignore) 

619 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder) 

620 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename)) 

621 link_builder.location = h5obj.name 

622 self.__set_written(link_builder) 

623 kwargs['links'][builder_name] = link_builder 

624 if isinstance(link_type, ExternalLink): 

625 self.__open_links.append(sub_h5obj) 

626 else: 

627 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id) 

628 obj_type = None 

629 read_method = None 

630 if isinstance(sub_h5obj, Dataset): 

631 read_method = self.__read_dataset 

632 obj_type = kwargs['datasets'] 

633 else: 

634 read_method = partial(self.__read_group, ignore=ignore) 

635 obj_type = kwargs['groups'] 

636 if builder is None: 

637 builder = read_method(sub_h5obj) 

638 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder) 

639 obj_type[builder.name] = builder 

640 else: 

641 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning) 

642 kwargs['datasets'][k] = None 

643 continue 

644 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

645 ret = GroupBuilder(name, **kwargs) 

646 ret.location = os.path.dirname(h5obj.name) 

647 self.__set_written(ret) 

648 return ret 

649 

650 def __read_dataset(self, h5obj, name=None): 

651 kwargs = { 

652 "attributes": self.__read_attrs(h5obj), 

653 "dtype": h5obj.dtype, 

654 "maxshape": h5obj.maxshape 

655 } 

656 for key, val in kwargs['attributes'].items(): 

657 if isinstance(val, bytes):  657 ↛ 658 [line 657 didn't jump to line 658, because the condition on line 657 was never true]

658 kwargs['attributes'][key] = val.decode('UTF-8') 

659 

660 if name is None: 

661 name = str(os.path.basename(h5obj.name)) 

662 kwargs['source'] = os.path.abspath(h5obj.file.filename) 

663 ndims = len(h5obj.shape) 

664 if ndims == 0: # read scalar 

665 scalar = h5obj[()] 

666 if isinstance(scalar, bytes):  666 ↛ 667 [line 666 didn't jump to line 667, because the condition on line 666 was never true]

667 scalar = scalar.decode('UTF-8') 

668 

669 if isinstance(scalar, Reference):  669 ↛ 671 [line 669 didn't jump to line 671, because the condition on line 669 was never true]

670 # TODO (AJTRITT): This should call __read_ref to support Group references 

671 target = h5obj.file[scalar] 

672 target_builder = self.__read_dataset(target) 

673 self.__set_built(target.file.filename, target.id, target_builder) 

674 if isinstance(scalar, RegionReference): 

675 d = RegionBuilder(scalar, target_builder) 

676 else: 

677 d = ReferenceBuilder(target_builder) 

678 kwargs['data'] = d 

679 kwargs['dtype'] = d.dtype 

680 else: 

681 kwargs["data"] = scalar 

682 else: 

683 d = None 

684 if h5obj.dtype.kind == 'O' and len(h5obj) > 0: 

685 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])] 

686 if isinstance(elem1, (str, bytes)): 

687 d = self._check_str_dtype(h5obj) 

688 elif isinstance(elem1, RegionReference): # read list of references  688 ↛ 689 [line 688 didn't jump to line 689, because the condition on line 688 was never true]

689 d = BuilderH5RegionDataset(h5obj, self) 

690 kwargs['dtype'] = d.dtype 

691 elif isinstance(elem1, Reference):  691 ↛ 701 [line 691 didn't jump to line 701, because the condition on line 691 was never false]

692 d = BuilderH5ReferenceDataset(h5obj, self) 

693 kwargs['dtype'] = d.dtype 

694 elif h5obj.dtype.kind == 'V': # table / compound data type 

695 cpd_dt = h5obj.dtype 

696 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))] 

697 d = BuilderH5TableDataset(h5obj, self, ref_cols) 

698 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype) 

699 else: 

700 d = h5obj 

701 kwargs["data"] = d 

702 ret = DatasetBuilder(name, **kwargs) 

703 ret.location = os.path.dirname(h5obj.name) 

704 self.__set_written(ret) 

705 return ret 

706 

707 def _check_str_dtype(self, h5obj): 

708 dtype = h5obj.dtype 

709 if dtype.kind == 'O':  709 ↛ 712 [line 709 didn't jump to line 712, because the condition on line 709 was never false]

710 if dtype.metadata.get('vlen') == str and H5PY_3:  710 ↛ 712 [line 710 didn't jump to line 712, because the condition on line 710 was never false]

711 return StrDataset(h5obj, None) 

712 return h5obj 

713 

714 @classmethod 

715 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype): 

716 ret = [] 

717 for name, dtype in zip(h5obj_dtype.fields, dset_dtype): 

718 ret.append({'name': name, 'dtype': dtype}) 

719 return ret 

720 

721 def __read_attrs(self, h5obj): 

722 ret = dict() 

723 for k, v in h5obj.attrs.items(): 

724 if k == SPEC_LOC_ATTR: # ignore cached spec 

725 continue 

726 if isinstance(v, RegionReference):  726 ↛ 727 [line 726 didn't jump to line 727, because the condition on line 726 was never true]

727 raise ValueError("cannot read region reference attributes yet") 

728 elif isinstance(v, Reference): 

729 ret[k] = self.__read_ref(h5obj.file[v]) 

730 else: 

731 ret[k] = v 

732 return ret 

733 

734 def __read_ref(self, h5obj): 

735 ret = None 

736 ret = self.__get_built(h5obj.file.filename, h5obj.id) 

737 if ret is None: 

738 if isinstance(h5obj, Dataset): 

739 ret = self.__read_dataset(h5obj) 

740 elif isinstance(h5obj, Group):  740 ↛ 743 [line 740 didn't jump to line 743, because the condition on line 740 was never false]

741 ret = self.__read_group(h5obj) 

742 else: 

743 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj)) 

744 self.__set_built(h5obj.file.filename, h5obj.id, ret) 

745 return ret 

746 

747 def open(self): 

748 if self.__file is None: 

749 open_flag = self.__mode 

750 kwargs = dict(rdcc_nbytes=RDCC_NBYTES) 

751 if self.comm:  751 ↛ 752 [line 751 didn't jump to line 752, because the condition on line 751 was never true]

752 kwargs.update(driver='mpio', comm=self.comm) 

753 

754 if self.driver is not None:  754 ↛ 755 [line 754 didn't jump to line 755, because the condition on line 754 was never true]

755 kwargs.update(driver=self.driver) 

756 

757 self.__file = File(self.source, open_flag, **kwargs) 

758 

759 def close(self, close_links=True): 

760 """Close this file and any files linked to from this file. 

761 

762 :param close_links: Whether to close all files linked to from this file. (default: True) 

763 :type close_links: bool 

764 """ 

765 if close_links: 

766 self.close_linked_files() 

767 try: 

768 if self.__file is not None: 

769 self.__file.close() 

770 except AttributeError: 

771 # Do not do anything in case that self._file does not exist. This 

772 # may happen in case that an error occurs before HDF5IO has been fully 

773 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises

774 # an error before self.__file has been created 

775 self.__file = None 

776 

777 def close_linked_files(self): 

778 """Close all opened, linked-to files. 

779 

780 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does 

781 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened, 

782 linked-to files. 

783 """ 

784 # Make sure all files that were opened for external links are closed, even if errors occur

785 try: 

786 for obj in self.__open_links: 

787 if obj: 

788 obj.file.close() 

789 except AttributeError: 

790 # Do not do anything in case that self.__open_links does not exist. This 

791 # may happen in case that an error occurs before HDF5IO has been fully 

792 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises

793 # an error before self.__open_links has been created. 

794 pass 

795 finally: 

796 self.__open_links = [] 

797 

798 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'}, 

799 {'name': 'link_data', 'type': bool, 

800 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

801 {'name': 'exhaust_dci', 'type': bool, 

802 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

803 'default': True}, 

804 {'name': 'export_source', 'type': str, 

805 'doc': 'The source of the builders when exporting', 'default': None}) 

806 def write_builder(self, **kwargs): 

807 f_builder = popargs('builder', kwargs) 

808 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

809 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" 

810 % (f_builder.name, self.source, kwargs)) 

811 for name, gbldr in f_builder.groups.items(): 

812 self.write_group(self.__file, gbldr, **kwargs) 

813 for name, dbldr in f_builder.datasets.items(): 

814 self.write_dataset(self.__file, dbldr, **kwargs) 

815 for name, lbldr in f_builder.links.items(): 

816 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source")) 

817 self.set_attributes(self.__file, f_builder.attributes) 

818 self.__add_refs() 

819 self.__dci_queue.exhaust_queue() 

820 self.__set_written(f_builder) 

821 self.logger.debug("Done writing %s '%s' to path '%s'" % 

822 (f_builder.__class__.__qualname__, f_builder.name, self.source)) 

823 

824 def __add_refs(self): 

825 ''' 

826 Add all references in the file. 

827 

828 References get queued to be added at the end of write. This is because 

829 the current traversal algorithm (i.e. iterating over GroupBuilder items) 

830 does not happen in a guaranteed order. We need to figure out what objects 

831 will be references, and then write them after we write everything else. 

832 ''' 

833 failed = set() 

834 while len(self.__ref_queue) > 0: 

835 call = self.__ref_queue.popleft() 

836 self.logger.debug("Adding reference with call id %d from queue (length %d)" 

837 % (id(call), len(self.__ref_queue))) 

838 try: 

839 call() 

840 except KeyError: 

841 if id(call) in failed: 

842 raise RuntimeError('Unable to resolve reference') 

843 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call)) 

844 failed.add(id(call)) 

845 self.__ref_queue.append(call) 

846 

847 @classmethod 

848 def get_type(cls, data): 

849 if isinstance(data, str): 

850 return H5_TEXT 

851 elif isinstance(data, bytes):  851 ↛ 852 [line 851 didn't jump to line 852, because the condition on line 851 was never true]

852 return H5_BINARY 

853 elif isinstance(data, Container):  853 ↛ 854 [line 853 didn't jump to line 854, because the condition on line 853 was never true]

854 return H5_REF 

855 elif not hasattr(data, '__len__'): 

856 return type(data) 

857 else: 

858 if len(data) == 0: 

859 if hasattr(data, 'dtype'):  859 ↛ 860 [line 859 didn't jump to line 860, because the condition on line 859 was never true]

860 return data.dtype 

861 else: 

862 raise ValueError('cannot determine type for empty data') 

863 return cls.get_type(data[0]) 

864 
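# Behavior sketch for get_type (not part of h5tools.py): it maps in-memory values to the
# h5py/NumPy types used for writing, recursing into the first element of sequences.
from hdmf.backends.hdf5.h5tools import HDF5IO, H5_TEXT, H5_BINARY

assert HDF5IO.get_type("some text") is H5_TEXT     # variable-length str dtype
assert HDF5IO.get_type(b"raw bytes") is H5_BINARY  # variable-length bytes dtype
assert HDF5IO.get_type([1.0, 2.0]) is float        # type of the first element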

865 __dtypes = { 

866 "float": np.float32, 

867 "float32": np.float32, 

868 "double": np.float64, 

869 "float64": np.float64, 

870 "long": np.int64, 

871 "int64": np.int64, 

872 "int": np.int32, 

873 "int32": np.int32, 

874 "short": np.int16, 

875 "int16": np.int16, 

876 "int8": np.int8, 

877 "uint64": np.uint64, 

878 "uint": np.uint32, 

879 "uint32": np.uint32, 

880 "uint16": np.uint16, 

881 "uint8": np.uint8, 

882 "bool": np.bool_, 

883 "text": H5_TEXT, 

884 "utf": H5_TEXT, 

885 "utf8": H5_TEXT, 

886 "utf-8": H5_TEXT, 

887 "ascii": H5_BINARY, 

888 "bytes": H5_BINARY, 

889 "ref": H5_REF, 

890 "reference": H5_REF, 

891 "object": H5_REF, 

892 "region": H5_REGREF, 

893 "isodatetime": H5_TEXT, 

894 "datetime": H5_TEXT, 

895 } 

896 

897 @classmethod 

898 def __resolve_dtype__(cls, dtype, data): 

899 # TODO: These values exist, but I haven't solved them yet 

900 # binary 

901 # number 

902 dtype = cls.__resolve_dtype_helper__(dtype) 

903 if dtype is None: 

904 dtype = cls.get_type(data) 

905 return dtype 

906 

907 @classmethod 

908 def __resolve_dtype_helper__(cls, dtype): 

909 if dtype is None: 

910 return None 

911 elif isinstance(dtype, str): 

912 return cls.__dtypes.get(dtype) 

913 elif isinstance(dtype, dict): 

914 return cls.__dtypes.get(dtype['reftype']) 

915 elif isinstance(dtype, np.dtype): 

916 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types 

917 return dtype 

918 else: 

919 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) 

920 

921 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'}, 

922 {'name': 'attributes', 

923 'type': dict, 

924 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}) 

925 def set_attributes(self, **kwargs): 

926 obj, attributes = getargs('obj', 'attributes', kwargs) 

927 for key, value in attributes.items(): 

928 try: 

929 if isinstance(value, (set, list, tuple)): 

930 tmp = tuple(value) 

931 if len(tmp) > 0: 

932 if isinstance(tmp[0], (str, bytes)):  932 ↛ 934 [line 932 didn't jump to line 934, because the condition on line 932 was never false]

933 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0]))) 

934 elif isinstance(tmp[0], Container): # a list of references 

935 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp)) 

936 else: 

937 value = np.array(value) 

938 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

939 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

940 obj.attrs[key] = value 

941 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference 

942 self.__queue_ref(self._make_attr_ref_filler(obj, key, value)) 

943 else: 

944 self.logger.debug("Setting %s '%s' attribute '%s' to %s" 

945 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

946 if isinstance(value, np.ndarray) and value.dtype.kind == 'U':  946 ↛ 947 [line 946 didn't jump to line 947, because the condition on line 946 was never true]

947 value = np.array(value, dtype=H5_TEXT) 

948 obj.attrs[key] = value # a regular scalar 

949 except Exception as e: 

950 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name) 

951 raise RuntimeError(msg) from e 

952 

953 def _make_attr_ref_filler(self, obj, key, value): 

954 ''' 

955 Make the callable for setting references to attributes 

956 ''' 

957 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s" 

958 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) 

959 if isinstance(value, (tuple, list)):  959 ↛ 960 [line 959 didn't jump to line 960, because the condition on line 959 was never true]

960 def _filler(): 

961 ret = list() 

962 for item in value: 

963 ret.append(self.__get_ref(item)) 

964 obj.attrs[key] = ret 

965 else: 

966 def _filler(): 

967 obj.attrs[key] = self.__get_ref(value) 

968 return _filler 

969 

970 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

971 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, 

972 {'name': 'link_data', 'type': bool, 

973 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

974 {'name': 'exhaust_dci', 'type': bool, 

975 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

976 'default': True}, 

977 {'name': 'export_source', 'type': str, 

978 'doc': 'The source of the builders when exporting', 'default': None}, 

979 returns='the Group that was created', rtype='Group') 

980 def write_group(self, **kwargs): 

981 parent, builder = popargs('parent', 'builder', kwargs) 

982 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

983 if self.get_written(builder): 

984 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name) 

985 group = parent[builder.name] 

986 else: 

987 self.logger.debug(" Creating group '%s'" % builder.name) 

988 group = parent.create_group(builder.name) 

989 # write all groups 

990 subgroups = builder.groups 

991 if subgroups: 

992 for subgroup_name, sub_builder in subgroups.items(): 

993 # do not create an empty group without attributes or links 

994 self.write_group(group, sub_builder, **kwargs) 

995 # write all datasets 

996 datasets = builder.datasets 

997 if datasets: 

998 for dset_name, sub_builder in datasets.items(): 

999 self.write_dataset(group, sub_builder, **kwargs) 

1000 # write all links 

1001 links = builder.links 

1002 if links: 

1003 for link_name, sub_builder in links.items(): 

1004 self.write_link(group, sub_builder, export_source=kwargs.get("export_source")) 

1005 attributes = builder.attributes 

1006 self.set_attributes(group, attributes) 

1007 self.__set_written(builder) 

1008 return group 

1009 

1010 def __get_path(self, builder): 

1011 """Get the path to the builder. 

1012 

1013 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored. 

1014 If builder.location is set then it is used as the path, otherwise the function 

1015 determines the path by constructing it iteratively from the parents of the 

1016 builder. 

1017 """ 

1018 if builder.location is not None: 

1019 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/") 

1020 else: 

1021 curr = builder 

1022 names = list() 

1023 while curr.parent is not None: 

1024 names.append(curr.name) 

1025 curr = curr.parent 

1026 delim = "/" 

1027 path = "%s%s" % (delim, delim.join(reversed(names))) 

1028 return path 

1029 

1030 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, 

1031 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'}, 

1032 {'name': 'export_source', 'type': str, 

1033 'doc': 'The source of the builders when exporting', 'default': None}, 

1034 returns='the Link that was created', rtype='Link') 

1035 def write_link(self, **kwargs): 

1036 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs) 

1037 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1038 if self.get_written(builder):  1038 ↛ 1039 [line 1038 didn't jump to line 1039, because the condition on line 1038 was never true]

1039 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name) 

1040 return None 

1041 name = builder.name 

1042 target_builder = builder.builder 

1043 path = self.__get_path(target_builder) 

1044 # source will indicate target_builder's location 

1045 if export_source is None: 

1046 write_source = builder.source 

1047 else: 

1048 write_source = export_source 

1049 

1050 parent_filename = os.path.abspath(parent.file.filename) 

1051 if target_builder.source in (write_source, parent_filename): 

1052 link_obj = SoftLink(path) 

1053 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1054 % (parent.name, name, link_obj.path)) 

1055 elif target_builder.source is not None:  1055 ↛ 1064 [line 1055 didn't jump to line 1064, because the condition on line 1055 was never false]

1056 target_filename = os.path.abspath(target_builder.source) 

1057 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename)) 

1058 if target_builder.location is not None: 

1059 path = target_builder.location + "/" + target_builder.name 

1060 link_obj = ExternalLink(relative_path, path) 

1061 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1062 % (parent.name, name, link_obj.filename, link_obj.path)) 

1063 else: 

1064 msg = 'cannot create external link to %s' % path 

1065 raise ValueError(msg) 

1066 parent[name] = link_obj 

1067 self.__set_written(builder) 

1068 return link_obj 

1069 

1070 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901 

1071 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, 

1072 {'name': 'link_data', 'type': bool, 

1073 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True}, 

1074 {'name': 'exhaust_dci', 'type': bool, 

1075 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently', 

1076 'default': True}, 

1077 {'name': 'export_source', 'type': str, 

1078 'doc': 'The source of the builders when exporting', 'default': None}, 

1079 returns='the Dataset that was created', rtype=Dataset) 

1080 def write_dataset(self, **kwargs): # noqa: C901 

1081 """ Write a dataset to HDF5 

1082 

1083 The function uses other dataset-dependent write functions, e.g.,

1084 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data. 

1085 """ 

1086 parent, builder = popargs('parent', 'builder', kwargs) 

1087 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) 

1088 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) 

1089 if self.get_written(builder): 

1090 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name) 

1091 return None 

1092 name = builder.name 

1093 data = builder.data 

1094 dataio = None 

1095 options = dict()  # dict with additional options for dataset creation (e.g., 'dtype', 'io_settings')

1096 if isinstance(data, H5DataIO): 

1097 options['io_settings'] = data.io_settings 

1098 dataio = data 

1099 link_data = data.link_data 

1100 data = data.data 

1101 else: 

1102 options['io_settings'] = {} 

1103 if isinstance(data, TermSetWrapper): 

1104 # This is for when the wrapped item is a dataset 

1105 # (refer to objectmapper.py for wrapped attributes) 

1106 data = data.value 

1107 attributes = builder.attributes 

1108 options['dtype'] = builder.dtype 

1109 dset = None 

1110 link = None 

1111 

1112 # The user provided an existing h5py dataset as input and asked to create a link to the dataset 

1113 if isinstance(data, Dataset): 

1114 data_filename = os.path.abspath(data.file.filename) 

1115 if link_data: 

1116 if export_source is None: # not exporting 

1117 parent_filename = os.path.abspath(parent.file.filename) 

1118 if data_filename != parent_filename: # create external link to data 

1119 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1120 link = ExternalLink(relative_path, data.name) 

1121 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1122 % (parent.name, name, link.filename, link.path)) 

1123 else: # create soft link to dataset already in this file -- possible if mode == 'r+' 

1124 link = SoftLink(data.name) 

1125 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1126 % (parent.name, name, link.path)) 

1127 parent[name] = link 

1128 else: # exporting 

1129 export_source = os.path.abspath(export_source) 

1130 parent_filename = os.path.abspath(parent.file.filename) 

1131 if data_filename != export_source: # dataset is in different file than export source 

1132 # possible if user adds a link to a dataset in a different file after reading export source 

1133 # to memory 

1134 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename)) 

1135 link = ExternalLink(relative_path, data.name) 

1136 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'" 

1137 % (parent.name, name, link.filename, link.path)) 

1138 parent[name] = link 

1139 elif parent.name != data.parent.name: # dataset is in export source and has different path 

1140 # so create a soft link to the dataset in this file 

1141 # possible if user adds a link to a dataset in export source after reading to memory 

1142 # TODO check that there is/will be still a dataset at data.name -- if the dataset has 

1143 # been removed, then this link will be broken 

1144 link = SoftLink(data.name) 

1145 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'" 

1146 % (parent.name, name, link.path)) 

1147 parent[name] = link 

1148 else: # dataset is in export source and has same path as the builder, so copy the dataset 

1149 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1150 % (data.file.filename, data.name, parent.name, name)) 

1151 parent.copy(source=data, 

1152 dest=parent, 

1153 name=name, 

1154 expand_soft=False, 

1155 expand_external=False, 

1156 expand_refs=False, 

1157 without_attrs=True) 

1158 dset = parent[name] 

1159 else: 

1160 # TODO add option for case where there are multiple links to the same dataset within a file: 

1161 # instead of copying the dset N times, copy it once and create soft links to it within the file 

1162 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'" 

1163 % (data.file.filename, data.name, parent.name, name)) 

1164 parent.copy(source=data, 

1165 dest=parent, 

1166 name=name, 

1167 expand_soft=False, 

1168 expand_external=False, 

1169 expand_refs=False, 

1170 without_attrs=True) 

1171 dset = parent[name] 

1172 

1173 # Write a compound dataset, i.e, a dataset with compound data type 

1174 elif isinstance(options['dtype'], list): 

1175 # do some stuff to figure out what data is a reference 

1176 refs = list() 

1177 for i, dts in enumerate(options['dtype']): 

1178 if self.__is_ref(dts): 

1179 refs.append(i) 

1180 # If one or more of the parts of the compound data type are references then we need to deal with those 

1181 if len(refs) > 0: 

1182 try: 

1183 _dtype = self.__resolve_dtype__(options['dtype'], data) 

1184 except Exception as exc: 

1185 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1186 raise Exception(msg) from exc 

1187 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1188 self.__set_written(builder) 

1189 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1190 "object references. attributes: %s" 

1191 % (name, list(attributes.keys()))) 

1192 

1193 @self.__queue_ref 

1194 def _filler(): 

1195 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1196 "containing attributes: %s" 

1197 % (name, list(attributes.keys()))) 

1198 ret = list() 

1199 for item in data: 

1200 new_item = list(item) 

1201 for i in refs: 

1202 new_item[i] = self.__get_ref(item[i]) 

1203 ret.append(tuple(new_item)) 

1204 dset = parent[name] 

1205 dset[:] = ret 

1206 self.set_attributes(dset, attributes) 

1207 

1208 return 

1209 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual 

1210 else: 

1211 dset = self.__list_fill__(parent, name, data, options) 

1212 # Write a dataset containing references, i.e., a region or object reference. 

1213 # NOTE: we can ignore options['io_settings'] for scalar data 

1214 elif self.__is_ref(options['dtype']): 

1215 _dtype = self.__dtypes.get(options['dtype']) 

1216 # Write a scalar data region reference dataset 

1217 if isinstance(data, RegionBuilder):  1217 ↛ 1218 [line 1217 didn't jump to line 1218, because the condition on line 1217 was never true]

1218 dset = parent.require_dataset(name, shape=(), dtype=_dtype) 

1219 self.__set_written(builder) 

1220 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a " 

1221 "region reference. attributes: %s" 

1222 % (name, list(attributes.keys()))) 

1223 

1224 @self.__queue_ref 

1225 def _filler(): 

1226 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' " 

1227 "containing attributes: %s" 

1228 % (name, list(attributes.keys()))) 

1229 ref = self.__get_ref(data.builder, data.region) 

1230 dset = parent[name] 

1231 dset[()] = ref 

1232 self.set_attributes(dset, attributes) 

1233 # Write a scalar object reference dataset 

1234 elif isinstance(data, ReferenceBuilder):  1234 ↛ 1235 [line 1234 didn't jump to line 1235, because the condition on line 1234 was never true]

1235 dset = parent.require_dataset(name, dtype=_dtype, shape=()) 

1236 self.__set_written(builder) 

1237 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an " 

1238 "object reference. attributes: %s" 

1239 % (name, list(attributes.keys()))) 

1240 

1241 @self.__queue_ref 

1242 def _filler(): 

1243 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' " 

1244 "containing attributes: %s" 

1245 % (name, list(attributes.keys()))) 

1246 ref = self.__get_ref(data.builder) 

1247 dset = parent[name] 

1248 dset[()] = ref 

1249 self.set_attributes(dset, attributes) 

1250 # Write an array dataset of references 

1251 else: 

1252 # Write an array of region references

1253 if options['dtype'] == 'region':  1253 ↛ 1254 [line 1253 didn't jump to line 1254, because the condition on line 1253 was never true]

1254 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings']) 

1255 self.__set_written(builder) 

1256 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1257 "region references. attributes: %s" 

1258 % (name, list(attributes.keys()))) 

1259 

1260 @self.__queue_ref 

1261 def _filler(): 

1262 self.logger.debug("Resolving region references and setting attribute on dataset '%s' " 

1263 "containing attributes: %s" 

1264 % (name, list(attributes.keys()))) 

1265 refs = list() 

1266 for item in data: 

1267 refs.append(self.__get_ref(item.builder, item.region)) 

1268 dset = parent[name] 

1269 dset[()] = refs 

1270 self.set_attributes(dset, attributes) 

1271 # Write array of object references 

1272 else: 

1273 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings']) 

1274 self.__set_written(builder) 

1275 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing " 

1276 "object references. attributes: %s" 

1277 % (name, list(attributes.keys()))) 

1278 

1279 @self.__queue_ref 

1280 def _filler(): 

1281 self.logger.debug("Resolving object references and setting attribute on dataset '%s' " 

1282 "containing attributes: %s" 

1283 % (name, list(attributes.keys()))) 

1284 refs = list() 

1285 for item in data: 

1286 refs.append(self.__get_ref(item)) 

1287 dset = parent[name] 

1288 dset[()] = refs 

1289 self.set_attributes(dset, attributes) 

1290 return 

1291 # write a "regular" dataset 

1292 else: 

1293 # Create an empty dataset 

1294 if data is None: 

1295 dset = self.__setup_empty_dset__(parent, name, options['io_settings']) 

1296 dataio.dataset = dset 

1297 # Write a scalar dataset containing a single string 

1298 elif isinstance(data, (str, bytes)): 

1299 dset = self.__scalar_fill__(parent, name, data, options) 

1300 # Iterative write of a data chunk iterator 

1301 elif isinstance(data, AbstractDataChunkIterator): 

1302 dset = self.__setup_chunked_dset__(parent, name, data, options) 

1303 self.__dci_queue.append(dataset=dset, data=data) 

1304 # Write a regular in-memory array (e.g., numpy array, list, etc.)

1305 elif hasattr(data, '__len__'): 

1306 dset = self.__list_fill__(parent, name, data, options) 

1307 # Write a regular scalar dataset 

1308 else: 

1309 dset = self.__scalar_fill__(parent, name, data, options) 

1310 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link 

1311 if link is None: 

1312 self.set_attributes(dset, attributes) 

1313 # Validate the attributes on the linked dataset (currently a no-op)

1314 elif len(attributes) > 0: 

1315 pass 

1316 self.__set_written(builder) 

1317 if exhaust_dci: 1317 ↛ exit: line 1317 didn't return from function 'write_dataset', because the condition on line 1317 was never false

1318 self.__dci_queue.exhaust_queue() 

1319 
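All of the reference-writing branches above share one pattern: the dataset is created immediately, but its values are filled in by a `_filler` closure that is queued with `__queue_ref` and run only after the referenced builders have been written, so every target already has an HDF5 path. A minimal, self-contained sketch of that deferral pattern (illustrative names only, not HDMF's actual API):

    import h5py

    ref_queue = []                      # plays the role of self.__ref_queue

    def queue_ref(func):
        """Register a closure that will fill a dataset with references later."""
        ref_queue.append(func)
        return func

    def write_ref_dataset(f, name, target_paths):
        # Create the dataset now, exactly as write_dataset does ...
        dset = f.create_dataset(name, shape=(len(target_paths),),
                                dtype=h5py.special_dtype(ref=h5py.Reference))

        # ... but defer filling it until all targets exist in the file.
        @queue_ref
        def _filler():
            dset[()] = [f[path].ref for path in target_paths]

    def resolve_queued_refs():
        while ref_queue:
            ref_queue.pop(0)()

    with h5py.File("ref_queue_sketch.h5", "w") as f:
        write_ref_dataset(f, "pointers", ["a", "b"])
        f.create_dataset("a", data=[1, 2, 3])   # targets can be written after the pointer dataset
        f.create_dataset("b", data=[4, 5, 6])
        resolve_queued_refs()                   # now every target path resolves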

1320 @classmethod 

1321 def __scalar_fill__(cls, parent, name, data, options=None): 

1322 dtype = None 

1323 io_settings = {} 

1324 if options is not None: 1324 ↛ 1327: line 1324 didn't jump to line 1327, because the condition on line 1324 was never false

1325 dtype = options.get('dtype') 

1326 io_settings = options.get('io_settings') 

1327 if not isinstance(dtype, type): 1327 ↛ 1333: line 1327 didn't jump to line 1333, because the condition on line 1327 was never false

1328 try: 

1329 dtype = cls.__resolve_dtype__(dtype, data) 

1330 except Exception as exc: 

1331 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1332 raise Exception(msg) from exc 

1333 try: 

1334 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings) 

1335 except Exception as exc: 

1336 msg = "Could not create scalar dataset %s in %s" % (name, parent.name) 

1337 raise Exception(msg) from exc 

1338 return dset 

1339 
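`__scalar_fill__` reduces to a single `create_dataset` call with `shape=None`, which makes h5py store the value as a zero-dimensional (scalar) dataset. A small stand-alone illustration (the file name and values are invented; the string dtype matches the `H5_TEXT` definition used by this module):

    import h5py

    with h5py.File("scalar_sketch.h5", "w") as f:
        # Variable-length string scalar
        f.create_dataset("session_id", data="trial-001", shape=None,
                         dtype=h5py.special_dtype(vlen=str))
        # Numeric scalar; in __scalar_fill__ the dtype comes from __resolve_dtype__
        f.create_dataset("rate", data=30.0, shape=None, dtype="float64")
        print(f["rate"][()], f["rate"].shape)   # prints: 30.0 ()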

1340 @classmethod 

1341 def __setup_chunked_dset__(cls, parent, name, data, options=None): 

1342 """ 

1343 Set up a dataset for writing one chunk at a time from the given DataChunkIterator

1344 

1345 :param parent: The parent object to which the dataset should be added 

1346 :type parent: h5py.Group, h5py.File 

1347 :param name: The name of the dataset 

1348 :type name: str 

1349 :param data: The data to be written. 

1350 :type data: DataChunkIterator 

1351 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'

1352 :type options: dict 

1353 

1354 """ 

1355 io_settings = {} 

1356 if options is not None: 

1357 if 'io_settings' in options: 1357 ↛ 1360: line 1357 didn't jump to line 1360, because the condition on line 1357 was never false

1358 io_settings = options.get('io_settings') 

1359 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. 

1360 if 'chunks' not in io_settings: 

1361 recommended_chunks = data.recommended_chunk_shape() 

1362 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks 

1363 # Define the shape of the data if not provided by the user 

1364 if 'shape' not in io_settings: 1364 ↛ 1367: line 1364 didn't jump to line 1367, because the condition on line 1364 was never false

1365 io_settings['shape'] = data.recommended_data_shape() 

1366 # Define the maxshape of the data if not provided by the user 

1367 if 'maxshape' not in io_settings: 

1368 io_settings['maxshape'] = data.maxshape 

1369 if 'dtype' not in io_settings: 1369 ↛ 1377: line 1369 didn't jump to line 1377, because the condition on line 1369 was never false

1370 if (options is not None) and ('dtype' in options): 

1371 io_settings['dtype'] = options['dtype'] 

1372 else: 

1373 io_settings['dtype'] = data.dtype 

1374 if isinstance(io_settings['dtype'], str): 1374 ↛ 1376: line 1374 didn't jump to line 1376, because the condition on line 1374 was never true

1375 # map to real dtype if we were given a string 

1376 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1377 try: 

1378 dset = parent.create_dataset(name, **io_settings) 

1379 except Exception as exc: 

1380 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1381 return dset 

1382 
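So for an `AbstractDataChunkIterator`, chunking is always enabled (h5py needs a chunked layout to resize), the initial shape comes from `recommended_data_shape()`, and `maxshape` comes from the iterator so the dataset can grow as chunks arrive. A hedged sketch of how a caller might supply those settings using HDMF's `DataChunkIterator` and `H5DataIO` wrappers (the generator and parameter values are made up):

    import numpy as np
    from hdmf.data_utils import DataChunkIterator
    from hdmf.backends.hdf5 import H5DataIO

    def stream_pieces():
        # Total length is unknown up front; the iterator reports an extendable maxshape.
        for start in range(0, 100, 10):
            yield np.arange(start, start + 10, dtype="float64")

    iterator = DataChunkIterator(data=stream_pieces(), buffer_size=2)

    # Anything set here ends up in options['io_settings'] and overrides the
    # defaults that __setup_chunked_dset__ would otherwise derive from the iterator.
    wrapped = H5DataIO(data=iterator, chunks=(20,), compression="gzip")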

1383 @classmethod 

1384 def __setup_empty_dset__(cls, parent, name, io_settings): 

1385 """ 

1386 Set up an empty dataset whose 'shape' and 'dtype' are given in io_settings; the values are written later

1387 

1388 :param parent: The parent object to which the dataset should be added 

1389 :type parent: h5py.Group, h5py.File 

1390 :param name: The name of the dataset 

1391 :type name: str 

1392 :param io_settings: Dict of h5py dataset creation settings passed to ``create_dataset``

1393 :type io_settings: dict

1394 :raises ValueError: if 'shape' or 'dtype' is missing from io_settings

1395 

1396 

1397 """ 

1398 # Define the shape of the data if not provided by the user 

1399 if 'shape' not in io_settings: 

1400 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without shape") 

1401 if 'dtype' not in io_settings: 

1402 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without dtype") 

1403 if isinstance(io_settings['dtype'], str): 

1404 # map to real dtype if we were given a string 

1405 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) 

1406 try: 

1407 dset = parent.create_dataset(name, **io_settings) 

1408 except Exception as exc: 

1409 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc 

1410 return dset 

1411 
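`__setup_empty_dset__` serves the `data is None` branch of `write_dataset`: both `shape` and `dtype` must be present in the dataset's `io_settings`, and the dataset is allocated without writing any values (write_dataset then hands the created dataset back via `dataio.dataset`). The equivalent allocation done directly with h5py looks roughly like this (names, sizes, and the later resize are only illustrative):

    import h5py

    with h5py.File("empty_sketch.h5", "w") as f:
        # Allocate storage now; values are written by later code.
        dset = f.create_dataset("timestamps", shape=(0,), dtype="float64",
                                maxshape=(None,), chunks=True)
        dset.resize((3,))
        dset[:] = [0.0, 0.1, 0.2]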

1412 @classmethod 

1413 def __chunked_iter_fill__(cls, parent, name, data, options=None): 

1414 """ 

1415 Write data to a dataset one chunk at a time from the given DataChunkIterator

1416 

1417 :param parent: The parent object to which the dataset should be added 

1418 :type parent: h5py.Group, h5py.File 

1419 :param name: The name of the dataset 

1420 :type name: str 

1421 :param data: The data to be written. 

1422 :type data: DataChunkIterator 

1423 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'

1424 :type options: dict 

1425 

1426 """ 

1427 dset = cls.__setup_chunked_dset__(parent, name, data, options=options) 

1428 read = True 

1429 while read: 

1430 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data) 

1431 return dset 

1432 
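`__chunked_iter_fill__` simply drains the iterator synchronously, letting `HDF5IODataChunkIteratorQueue._write_chunk` pull one chunk at a time until the iterator is exhausted. The same drain loop written against a plain h5py dataset, with no HDMF types involved (a rough sketch):

    import h5py
    import numpy as np

    def fill_one_chunk_at_a_time(dset, pieces):
        """Append each piece along axis 0, growing the dataset as needed."""
        for piece in pieces:
            piece = np.asarray(piece)
            start = dset.shape[0]
            dset.resize(start + len(piece), axis=0)
            dset[start:start + len(piece)] = piece

    with h5py.File("chunk_fill_sketch.h5", "w") as f:
        dset = f.create_dataset("data", shape=(0,), maxshape=(None,),
                                dtype="int64", chunks=True)
        fill_one_chunk_at_a_time(dset, [[1, 2, 3], [4, 5], [6]])
        print(dset[:])   # [1 2 3 4 5 6]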

1433 @classmethod 

1434 def __list_fill__(cls, parent, name, data, options=None): 

1435 # define the io settings and data type if necessary 

1436 io_settings = {} 

1437 dtype = None 

1438 if options is not None: 

1439 dtype = options.get('dtype') 

1440 io_settings = options.get('io_settings') 

1441 if not isinstance(dtype, type): 

1442 try: 

1443 dtype = cls.__resolve_dtype__(dtype, data) 

1444 except Exception as exc: 

1445 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) 

1446 raise Exception(msg) from exc 

1447 # define the data shape 

1448 if 'shape' in io_settings: 1448 ↛ 1449: line 1448 didn't jump to line 1449, because the condition on line 1448 was never true

1449 data_shape = io_settings.pop('shape') 

1450 elif hasattr(data, 'shape'): 

1451 data_shape = data.shape 

1452 elif isinstance(dtype, np.dtype): 

1453 data_shape = (len(data),) 

1454 else: 

1455 data_shape = get_data_shape(data) 

1456 

1457 # Create the dataset 

1458 try: 

1459 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings) 

1460 except Exception as exc: 

1461 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \ 

1462 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc)) 

1463 raise Exception(msg) from exc 

1464 # Write the data 

1465 if len(data) > dset.shape[0]: 1465 ↛ 1466: line 1465 didn't jump to line 1466, because the condition on line 1465 was never true

1466 new_shape = list(dset.shape) 

1467 new_shape[0] = len(data) 

1468 dset.resize(new_shape) 

1469 try: 

1470 dset[:] = data 

1471 except Exception as e: 

1472 raise e 

1473 return dset 

1474 
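`__list_fill__` therefore boils down to: resolve a dtype, work out the shape, create the dataset, and write everything in one slice assignment (resizing first only if the data turned out to be longer than the declared shape). Stripped of the error handling, the core is just the following (the dtype and shape values are stand-ins for what `__resolve_dtype__` and `get_data_shape` would return):

    import h5py
    import numpy as np

    data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
    dtype = np.dtype("float64")             # stand-in for __resolve_dtype__(dtype, data)
    data_shape = (len(data), len(data[0]))  # stand-in for get_data_shape(data)

    with h5py.File("list_fill_sketch.h5", "w") as f:
        dset = f.create_dataset("matrix", shape=data_shape, dtype=dtype)
        dset[:] = data                      # single bulk write, as in __list_fill__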

1475 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference', 

1476 'default': None}, 

1477 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object', 

1478 'default': None}, 

1479 returns='the reference', rtype=Reference) 

1480 def __get_ref(self, **kwargs): 

1481 container, region = getargs('container', 'region', kwargs) 

1482 if container is None: 1482 ↛ 1483: line 1482 didn't jump to line 1483, because the condition on line 1482 was never true

1483 return None 

1484 if isinstance(container, Builder): 

1485 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1486 if isinstance(container, LinkBuilder): 1486 ↛ 1487: line 1486 didn't jump to line 1487, because the condition on line 1486 was never true

1487 builder = container.target_builder 

1488 else: 

1489 builder = container 

1490 elif isinstance(container, ReferenceBuilder): 

1491 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name)) 

1492 builder = container.builder 

1493 else: 

1494 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name)) 

1495 builder = self.manager.build(container) 

1496 path = self.__get_path(builder) 

1497 self.logger.debug("Getting reference at path '%s'" % path) 

1498 if isinstance(container, RegionBuilder): 1498 ↛ 1499: line 1498 didn't jump to line 1499, because the condition on line 1498 was never true

1499 region = container.region 

1500 if region is not None: 1500 ↛ 1501: line 1500 didn't jump to line 1501, because the condition on line 1500 was never true

1501 dset = self.__file[path] 

1502 if not isinstance(dset, Dataset): 

1503 raise ValueError('cannot create region reference without Dataset') 

1504 return self.__file[path].regionref[region] 

1505 else: 

1506 return self.__file[path].ref 

1507 
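Once a builder's HDF5 path is known, `__get_ref` only needs h5py's built-in reference machinery: `.ref` for an object reference and `.regionref[region]` for a region reference. The raw h5py calls it relies on look like this (dataset names and the region slice are invented):

    import h5py

    with h5py.File("ref_sketch.h5", "w") as f:
        data = f.create_dataset("electrodes", data=[10, 11, 12, 13])

        obj_ref = data.ref                # object reference to the whole dataset
        reg_ref = data.regionref[0:2]     # region reference to a slice of it

        obj_dset = f.create_dataset("obj_ptr", shape=(),
                                    dtype=h5py.special_dtype(ref=h5py.Reference))
        obj_dset[()] = obj_ref
        reg_dset = f.create_dataset("reg_ptr", shape=(),
                                    dtype=h5py.special_dtype(ref=h5py.RegionReference))
        reg_dset[()] = reg_ref

        # Dereferencing goes back through the file handle.
        print(f[f["obj_ptr"][()]][:])     # [10 11 12 13]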

1508 def __is_ref(self, dtype): 

1509 if isinstance(dtype, DtypeSpec): 

1510 return self.__is_ref(dtype.dtype) 

1511 if isinstance(dtype, RefSpec): 

1512 return True 

1513 if isinstance(dtype, dict): # may be dict from reading a compound dataset 

1514 return self.__is_ref(dtype['dtype']) 

1515 if isinstance(dtype, str): 

1516 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE 

1517 return False 

1518 
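`__is_ref` accepts the dtype in any of the forms it may take by the time `write_dataset` sees it; for illustration, all four of the following would be recognized as reference dtypes (the values themselves are made up):

    from hdmf.spec import RefSpec, DtypeSpec

    as_refspec = RefSpec(target_type="ElectrodeGroup", reftype="object")
    as_compound_field = DtypeSpec(name="group", doc="referenced group", dtype=as_refspec)
    as_dict = {"name": "group", "dtype": "object"}   # dict form seen when re-reading compound data
    as_str = "object"                                # DatasetBuilder.OBJECT_REF_TYPE ('region' also matches)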

1519 def __queue_ref(self, func): 

1520 '''Queue a function that fills a dataset with resolved references.

1521 

1522 The queued callable performs the deferred write, e.g. ``dset[()] = refs``.

1523 

1524 Args: 

1525 func: a function that resolves the required references and writes them 

1526 into the target dataset once the referenced builders have been 

1527 written; see the ``_filler`` closures in ``write_dataset`` above 

1528 

1529 ''' 

1530 # TODO: come up with more intelligent way of 

1531 # queueing reference resolution, based on reference 

1532 # dependency 

1533 self.__ref_queue.append(func) 

1534 

1535 def __rec_get_ref(self, ref_list): 

1536 ret = list() 

1537 for elem in ref_list: 

1538 if isinstance(elem, (list, tuple)): 

1539 ret.append(self.__rec_get_ref(elem)) 

1540 elif isinstance(elem, (Builder, Container)): 

1541 ret.append(self.__get_ref(elem)) 

1542 else: 

1543 ret.append(elem) 

1544 return ret 

1545 

1546 @property 

1547 def mode(self): 

1548 """ 

1549 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x"). 

1550 """ 

1551 return self.__mode 

1552 

1553 @classmethod 

1554 @docval(*get_docval(H5DataIO.__init__)) 

1555 def set_dataio(cls, **kwargs): 

1556 """ 

1557 Wrap the given Data object with an H5DataIO. 

1558 

1559 This method is provided merely for convenience. It is the equivalent 

1560 of the following: 

1561 

1562 .. code-block:: python 

1563 

1564 from hdmf.backends.hdf5 import H5DataIO 

1565 data = ... 

1566 data = H5DataIO(data) 

1567 """ 

1568 return H5DataIO(**kwargs)
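In practice, `set_dataio` is just a convenience around constructing the wrapper yourself; a typical direct use of `H5DataIO` with write options looks like this (all parameter values are merely examples):

    import numpy as np
    from hdmf.backends.hdf5 import H5DataIO

    wrapped = H5DataIO(
        data=np.arange(1000).reshape(100, 10),
        chunks=(10, 10),         # ends up in options['io_settings'] at write time
        compression="gzip",
        compression_opts=4,
        maxshape=(None, 10),     # allow appending more rows later
    )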