Coverage for src/hdmf/backends/hdf5/h5tools.py: 87%
905 statements
coverage.py v7.2.5, created at 2023-07-21 22:12 +0000
1import logging
2import os.path
3import warnings
4from collections import deque
5from functools import partial
6from pathlib import Path, PurePosixPath as pp
8import numpy as np
9import h5py
10from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype
12from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO,
13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue)
14from ..io import HDMFIO
15from ..errors import UnsupportedOperation
16from ..warnings import BrokenLinkWarning
17from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder,
18 ReferenceBuilder, TypeMap, ObjectMapper)
19from ...container import Container
20from ...data_utils import AbstractDataChunkIterator
21from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
22from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset
23from ..utils import NamespaceToBuilderHelper, WriteStatusTracker
25ROOT_NAME = 'root'
26SPEC_LOC_ATTR = '.specloc'
27H5_TEXT = special_dtype(vlen=str)
28H5_BINARY = special_dtype(vlen=bytes)
29H5_REF = special_dtype(ref=Reference)
30H5_REGREF = special_dtype(ref=RegionReference)
32H5PY_3 = h5py.__version__.startswith('3')
35class HDF5IO(HDMFIO):
37 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group
39 @staticmethod
40 def can_read(path):
41 """Determines whether a given path is readable by the HDF5IO class"""
42 if not os.path.isfile(path):
43 return False
44 try:
45 with h5py.File(path, "r"):
46 return True
47 except IOError:
48 return False
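# Standalone usage sketch: check whether a file is readable by this backend before opening it.
# The path 'example.h5' is a hypothetical placeholder for a real HDF5 file on disk.
from hdmf.backends.hdf5 import HDF5IO

if HDF5IO.can_read('example.h5'):  # hypothetical file
    print('example.h5 is readable by the HDF5 backend')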
50 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
51 {'name': 'mode', 'type': str,
52 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). '
53 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for '
54 'more details.'),
55 'default': 'r'},
56 {'name': 'manager', 'type': (TypeMap, BuildManager),
57 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None},
58 {'name': 'comm', 'type': 'Intracomm',
59 'doc': 'the MPI communicator to use for parallel I/O', 'default': None},
60 {'name': 'file', 'type': [File, "S3File"], 'doc': 'a pre-existing h5py.File object', 'default': None},
61 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
62 {'name': 'external_resources_path', 'type': str,
63 'doc': 'The path to the ExternalResources', 'default': None},)
64 def __init__(self, **kwargs):
65 """Open an HDF5 file for IO.
66 """
67 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
68 path, manager, mode, comm, file_obj, driver, external_resources_path = popargs('path', 'manager', 'mode',
69 'comm', 'file', 'driver',
70 'external_resources_path',
71 kwargs)
73 self.__open_links = [] # keep track of other files opened from links in this file
74 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close
76 path = self.__check_path_file_obj(path, file_obj)
78 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3':
79 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode)
80 raise UnsupportedOperation(msg)
82 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'):
83 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode)
84 raise UnsupportedOperation(msg)
86 if manager is None:
87 manager = BuildManager(TypeMap(NamespaceCatalog()))
88 elif isinstance(manager, TypeMap):  # coverage: 88 ↛ 89, condition never true
89 manager = BuildManager(manager)
90 self.__driver = driver
91 self.__comm = comm
92 self.__mode = mode
93 self.__file = file_obj
94 super().__init__(manager, source=path, external_resources_path=external_resources_path)
95 # NOTE: source is not set if path is None and file_obj is passed
96 self.__built = dict() # keep track of each builder for each dataset/group/link for each file
97 self.__read = dict() # keep track of which files have been read. Key is the filename value is the builder
98 self.__ref_queue = deque() # a queue of the references that need to be added
99 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted
100 ObjectMapper.no_convert(Dataset)
101 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object
103 @property
104 def comm(self):
105 """The MPI communicator to use for parallel I/O."""
106 return self.__comm
108 @property
109 def _file(self):
110 return self.__file
112 @property
113 def driver(self):
114 return self.__driver
116 @classmethod
117 def __check_path_file_obj(cls, path, file_obj):
118 if isinstance(path, Path):
119 path = str(path)
121 if path is None and file_obj is None:
122 raise ValueError("Either the 'path' or 'file' argument must be supplied.")
124 if path is not None and file_obj is not None: # consistency check
125 if os.path.abspath(file_obj.filename) != os.path.abspath(path):
126 msg = ("You argued '%s' as this object's path, but supplied a file with filename: %s"
127 % (path, file_obj.filename))
128 raise ValueError(msg)
130 return path
132 @classmethod
133 def __resolve_file_obj(cls, path, file_obj, driver):
134 path = cls.__check_path_file_obj(path, file_obj)
136 if file_obj is None:
137 file_kwargs = dict()
138 if driver is not None:  # coverage: 138 ↛ 139, condition never true
139 file_kwargs.update(driver=driver)
140 file_obj = File(path, 'r', **file_kwargs)
141 return file_obj
143 @classmethod
144 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap),
145 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'},
146 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
147 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None},
148 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
149 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
150 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and "
151 "the included data types"),
152 rtype=dict)
153 def load_namespaces(cls, **kwargs):
154 """Load cached namespaces from a file.
156 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the
157 namespaces will be read, and the File object will be closed. If `file` is supplied, then
158 the given File object will be read from and not closed.
160 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
161 """
162 namespace_catalog, path, namespaces, file_obj, driver = popargs(
163 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs)
165 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
166 if file_obj is None: # need to close the file object that we just opened
167 with open_file_obj:
168 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
169 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
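# Standalone usage sketch: load the namespaces cached in a file into a fresh NamespaceCatalog.
# 'example.h5' is a hypothetical file that was written with cache_spec=True.
from hdmf.spec import NamespaceCatalog
from hdmf.backends.hdf5 import HDF5IO

catalog = NamespaceCatalog()
loaded = HDF5IO.load_namespaces(catalog, path='example.h5')
print(loaded)  # dict mapping each loaded namespace to its included namespaces and data types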
171 @classmethod
172 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj):
173 d = {}
175 if not cls.__check_specloc(file_obj):
176 return d
178 namespace_versions = cls.__get_namespaces(file_obj)
180 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
181 if namespaces is None:  # coverage: 181 ↛ 184, condition never false
182 namespaces = list(spec_group.keys())
184 readers = dict()
185 deps = dict()
186 for ns in namespaces:
187 latest_version = namespace_versions[ns]
188 ns_group = spec_group[ns][latest_version]
189 reader = H5SpecReader(ns_group)
190 readers[ns] = reader
191 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies)
192 for spec_ns in reader.read_namespace(cls.__ns_spec_path):
193 deps[ns] = list()
194 for s in spec_ns['schema']:
195 dep = s.get('namespace')
196 if dep is not None:
197 deps[ns].append(dep)
199 order = cls._order_deps(deps)
200 for ns in order:
201 reader = readers[ns]
202 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader))
204 return d
206 @classmethod
207 def __check_specloc(cls, file_obj):
208 if SPEC_LOC_ATTR not in file_obj.attrs:
209 # this occurs in legacy files
210 msg = "No cached namespaces found in %s" % file_obj.filename
211 warnings.warn(msg)
212 return False
213 return True
215 @classmethod
216 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
217 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
218 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
219 returns="dict mapping names to versions of the namespaces in the file", rtype=dict)
220 def get_namespaces(cls, **kwargs):
221 """Get the names and versions of the cached namespaces from a file.
223 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the
224 namespaces will be read, and the File object will be closed. If `file` is supplied, then
225 the given File object will be read from and not closed.
227 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
228 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...).
230 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
231 """
232 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs)
234 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
235 if file_obj is None: # need to close the file object that we just opened
236 with open_file_obj:
237 return cls.__get_namespaces(open_file_obj)
238 return cls.__get_namespaces(open_file_obj)
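# Standalone usage sketch: list the cached namespaces and their versions without loading them.
# 'example.h5' is a hypothetical file with cached specifications.
from hdmf.backends.hdf5 import HDF5IO

versions = HDF5IO.get_namespaces(path='example.h5')
print(versions)  # e.g. {'hdmf-common': '1.5.0'} -- names and versions depend on the file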
240 @classmethod
241 def __get_namespaces(cls, file_obj):
242 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file.
244 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
245 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``.
246 """
247 used_version_names = dict()
248 if not cls.__check_specloc(file_obj):
249 return used_version_names
251 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
252 namespaces = list(spec_group.keys())
253 for ns in namespaces:
254 ns_group = spec_group[ns]
255 # NOTE: by default, objects within groups are iterated in alphanumeric order
256 version_names = list(ns_group.keys())
257 if len(version_names) > 1:
258 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned"
259 # make sure that if there is another group representing a newer version, that is read instead
260 if 'unversioned' in version_names:
261 version_names.remove('unversioned')
262 if len(version_names) > 1:
263 # as of HDMF 1.6.1, extensions without a version are written under the group name "None"
264 # make sure that if there is another group representing a newer version, that is read instead
265 if 'None' in version_names:
266 version_names.remove('None')
267 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order
269 return used_version_names
271 @classmethod
272 def _order_deps(cls, deps):
273 """
274 Order namespaces according to dependency for loading into a NamespaceCatalog
276 Args:
277 deps (dict): a dictionary that maps a namespace name to a list of name of
278 the namespaces on which the namespace is directly dependent
279 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
280 Expected output: ['d', 'b', 'c', 'a']
281 """
282 order = list()
283 keys = list(deps.keys())
284 deps = dict(deps)
285 for k in keys:
286 if k in deps:
287 cls.__order_deps_aux(order, deps, k)
288 return order
290 @classmethod
291 def __order_deps_aux(cls, order, deps, key):
292 """
293 A recursive helper function for _order_deps
294 """
295 if key not in deps:
296 return
297 subdeps = deps.pop(key)
298 for subk in subdeps:
299 cls.__order_deps_aux(order, deps, subk)
300 order.append(key)
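# Standalone worked example of the dependency ordering described in the _order_deps docstring
# above: each namespace's dependencies are emitted before the namespace that needs them.
from hdmf.backends.hdf5 import HDF5IO

deps = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
print(HDF5IO._order_deps(deps))  # ['d', 'b', 'c', 'a']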
302 @classmethod
303 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'},
304 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'},
305 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True},
306 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference',
307 'default': False},
308 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False}
309 )
310 def copy_file(self, **kwargs):
311 """
312 Convenience function to copy an HDF5 file while allowing external links to be resolved.
314 .. warning::
316 As of HDMF 2.0, this method is no longer supported and may be removed in a future version.
317 Please use the export method or h5py.File.copy method instead.
319 .. note::
321 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode
322 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is
323 not opened already when calling this function.
325 """
327 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of "
328 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning)
330 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename',
331 'dest_filename',
332 'expand_external',
333 'expand_refs',
334 'expand_soft',
335 kwargs)
336 source_file = File(source_filename, 'r')
337 dest_file = File(dest_filename, 'w')
338 for objname in source_file["/"].keys():
339 source_file.copy(source=objname,
340 dest=dest_file,
341 name=objname,
342 expand_external=expand_external,
343 expand_refs=expand_refs,
344 expand_soft=expand_soft,
345 shallow=False,
346 without_attrs=False,
347 )
348 for objname in source_file['/'].attrs:
349 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname]
350 source_file.close()
351 dest_file.close()
353 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'},
354 {'name': 'cache_spec', 'type': bool,
355 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache '
356 'specification to file. The appropriate specification will then need to be loaded prior to '
357 'reading the file.'),
358 'default': True},
359 {'name': 'link_data', 'type': bool,
360 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.',
361 'default': True},
362 {'name': 'exhaust_dci', 'type': bool,
363 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.',
364 'default': True})
365 def write(self, **kwargs):
366 """Write the container to an HDF5 file."""
367 if self.__mode == 'r':
368 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. "
369 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'")
370 % (self.source, self.__mode))
372 cache_spec = popargs('cache_spec', kwargs)
373 super().write(**kwargs)
374 if cache_spec:
375 self.__cache_spec()
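# Standalone usage sketch: write a simple container from the hdmf-common namespace.
# The file name 'example.h5' is a hypothetical placeholder.
from hdmf.common import DynamicTable, VectorData, get_manager
from hdmf.backends.hdf5 import HDF5IO

table = DynamicTable(name='example_table', description='a small table',
                     columns=[VectorData(name='col1', description='a column', data=[1, 2, 3])])
with HDF5IO('example.h5', mode='w', manager=get_manager()) as io:
    io.write(table)  # cache_spec=True by default, so the namespace specs are cached in the file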
377 def __cache_spec(self):
378 ref = self.__file.attrs.get(SPEC_LOC_ATTR)
379 spec_group = None
380 if ref is not None:
381 spec_group = self.__file[ref]
382 else:
383 path = 'specifications' # default location within the file for the cached specifications
384 spec_group = self.__file.require_group(path)
385 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref
386 ns_catalog = self.manager.namespace_catalog
387 for ns_name in ns_catalog.namespaces:
388 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name)
389 namespace = ns_catalog.get_namespace(ns_name)
390 group_name = '%s/%s' % (ns_name, namespace.version)
391 if group_name in spec_group:
392 continue
393 ns_group = spec_group.create_group(group_name)
394 writer = H5SpecWriter(ns_group)
395 ns_builder.export(self.__ns_spec_path, writer=writer)
397 _export_args = (
398 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'},
399 {'name': 'container', 'type': Container,
400 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be '
401 'exported'),
402 'default': None},
403 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`',
404 'default': None},
405 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file',
406 'default': True}
407 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage
408 # so it is not available on HDF5IO
409 )
411 @docval(*_export_args)
412 def export(self, **kwargs):
413 """Export data read from a file from any backend to HDF5.
415 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
416 """
417 if self.__mode != 'w':
418 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
419 % (self.source, self.__mode))
421 src_io = getargs('src_io', kwargs)
422 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
423 if write_args is None:
424 write_args = dict()
426 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True):
427 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument "
428 "link_data=True." % src_io.__class__.__name__)
430 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None
431 ckwargs = kwargs.copy()
432 ckwargs['write_args'] = write_args
433 if not write_args.get('link_data', True):
434 ckwargs['clear_cache'] = True
435 super().export(**ckwargs)
436 if cache_spec:
437 # add any namespaces from the src_io that have not yet been loaded
438 for namespace in src_io.manager.namespace_catalog.namespaces:
439 if namespace not in self.manager.namespace_catalog.namespaces:  # coverage: 439 ↛ 438, condition never false
440 self.manager.namespace_catalog.add_namespace(
441 name=namespace,
442 namespace=src_io.manager.namespace_catalog.get_namespace(namespace)
443 )
444 self.__cache_spec()
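# Standalone usage sketch: export the contents of one file to a new file, copying rather than
# externally linking datasets. File names are hypothetical; the read side assumes the data uses
# the hdmf-common namespace.
from hdmf.common import get_manager
from hdmf.backends.hdf5 import HDF5IO

with HDF5IO('source.h5', mode='r', manager=get_manager()) as read_io:
    with HDF5IO('export.h5', mode='w') as export_io:
        export_io.export(src_io=read_io, write_args={'link_data': False})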
446 @classmethod
447 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'},
448 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O',
449 'default': None},
450 *_export_args) # NOTE: src_io is required and is the second positional argument
451 def export_io(self, **kwargs):
452 """Export from one backend to HDF5 (class method).
454 Convenience function for :py:meth:`export` where you do not need to
455 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given
456 arguments.
458 Example usage:
460 .. code-block:: python
462 old_io = HDF5IO('old.h5', 'r')
463 HDF5IO.export_io(path='new_copy.h5', src_io=old_io)
465 See :py:meth:`export` for more details.
466 """
467 path, comm = popargs('path', 'comm', kwargs)
469 with HDF5IO(path=path, comm=comm, mode='w') as write_io:
470 write_io.export(**kwargs)
472 def read(self, **kwargs):
473 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x':
474 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'."
475 % (self.source, self.__mode))
476 try:
477 return super().read(**kwargs)
478 except UnsupportedOperation as e:
479 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover
480 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values."
481 % (self.source, self.__mode))
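# Standalone usage sketch: read a previously written file back into containers.
# 'example.h5' is a hypothetical file written with the hdmf-common type map.
from hdmf.common import get_manager
from hdmf.backends.hdf5 import HDF5IO

with HDF5IO('example.h5', mode='r', manager=get_manager()) as io:
    container = io.read()  # returns the root container built from the file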
483 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder')
484 def read_builder(self):
485 """
486 Read data and return the GroupBuilder representing it.
488 NOTE: On read, Builder.source will usually not be set on the Builders.
489 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export)
490 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects.
491 """
492 if not self.__file:
493 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source)
494 f_builder = self.__read.get(self.__file)
495 # ignore cached specs when reading builder
496 ignore = set()
497 specloc = self.__file.attrs.get(SPEC_LOC_ATTR)
498 if specloc is not None:
499 ignore.add(self.__file[specloc].name)
500 if f_builder is None:
501 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore)
502 self.__read[self.__file] = f_builder
503 return f_builder
505 def __set_written(self, builder):
506 """
507 Helper function used to set the written status for builders
509 :param builder: Builder object to be marked as written
510 :type builder: Builder
511 """
512 self._written_builders.set_written(builder)
514 def get_written(self, builder):
515 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise.
517 :param builder: Builder object to get the written flag for
518 :type builder: Builder
520 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise
521 """
522 return self._written_builders.get_written(builder)
524 def __set_built(self, fpath, id, builder):
525 """
526 Update self.__built to cache the given builder for the given file and id.
528 :param fpath: Path to the HDF5 file containing the object
529 :type fpath: str
530 :param id: ID of the HDF5 object in the path
531 :type id: h5py GroupID object
532 :param builder: The builder to be cached
533 """
534 self.__built.setdefault(fpath, dict()).setdefault(id, builder)
536 def __get_built(self, fpath, id):
537 """
538 Look up a builder for the given file and id in self.__built cache
540 :param fpath: Path to the HDF5 file containing the object
541 :type fpath: str
542 :param id: ID of the HDF5 object in the path
543 :type id: h5py GroupID object
545 :return: Builder in the self.__built cache or None
546 """
548 fdict = self.__built.get(fpath)
549 if fdict:
550 return fdict.get(id)
551 else:
552 return None
554 @docval({'name': 'h5obj', 'type': (Dataset, Group),
555 'doc': 'the HDF5 object to get the corresponding Builder object for'})
556 def get_builder(self, **kwargs):
557 """
558 Get the builder for the corresponding h5py Group or Dataset
560 :raises ValueError: When no builder has been constructed yet for the given h5py object
561 """
562 h5obj = getargs('h5obj', kwargs)
563 fpath = h5obj.file.filename
564 builder = self.__get_built(fpath, h5obj.id)
565 if builder is None:  # coverage: 565 ↛ 566, condition never true
566 msg = '%s:%s has not been built' % (fpath, h5obj.name)
567 raise ValueError(msg)
568 return builder
570 @docval({'name': 'h5obj', 'type': (Dataset, Group),
571 'doc': 'the HDF5 object to get the corresponding Container/Data object for'})
572 def get_container(self, **kwargs):
573 """
574 Get the container for the corresponding h5py Group or Dataset
576 :raises ValueError: When no builder has been constructed yet for the given h5py object
577 """
578 h5obj = getargs('h5obj', kwargs)
579 builder = self.get_builder(h5obj)
580 container = self.manager.construct(builder)
581 return container
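# Standalone usage sketch: map an h5py object back to its builder/container after a read.
# The group path 'example_table' and the file name are hypothetical.
from hdmf.common import get_manager
from hdmf.backends.hdf5 import HDF5IO

io = HDF5IO('example.h5', mode='r', manager=get_manager())
root = io.read()
group = io._file['example_table']        # underlying h5py.Group for an object read above
print(io.get_builder(group).name)        # the GroupBuilder created during read
print(type(io.get_container(group)))     # the Container constructed from that builder
io.close()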
583 def __read_group(self, h5obj, name=None, ignore=set()):
584 kwargs = {
585 "attributes": self.__read_attrs(h5obj),
586 "groups": dict(),
587 "datasets": dict(),
588 "links": dict()
589 }
591 for key, val in kwargs['attributes'].items():
592 if isinstance(val, bytes):  # coverage: 592 ↛ 593, condition never true
593 kwargs['attributes'][key] = val.decode('UTF-8')
595 if name is None:
596 name = str(os.path.basename(h5obj.name))
597 for k in h5obj:
598 sub_h5obj = h5obj.get(k)
599 if sub_h5obj is not None:
600 if sub_h5obj.name in ignore:
601 continue
602 link_type = h5obj.get(k, getlink=True)
603 if isinstance(link_type, (SoftLink, ExternalLink)):
604 # Reading links might be better suited in its own function
605 # get path of link (the key used for tracking what's been built)
606 target_path = link_type.path
607 target_obj = sub_h5obj.file[target_path]
608 builder_name = os.path.basename(target_path)
609 # get builder if already read, else build it
610 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id)
611 if builder is None:
612 # NOTE: all links must have absolute paths
613 if isinstance(target_obj, Dataset):
614 builder = self.__read_dataset(target_obj, builder_name)
615 else:
616 builder = self.__read_group(target_obj, builder_name, ignore=ignore)
617 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder)
618 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename))
619 link_builder.location = h5obj.name
620 self.__set_written(link_builder)
621 kwargs['links'][builder_name] = link_builder
622 if isinstance(link_type, ExternalLink):
623 self.__open_links.append(sub_h5obj)
624 else:
625 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id)
626 obj_type = None
627 read_method = None
628 if isinstance(sub_h5obj, Dataset):
629 read_method = self.__read_dataset
630 obj_type = kwargs['datasets']
631 else:
632 read_method = partial(self.__read_group, ignore=ignore)
633 obj_type = kwargs['groups']
634 if builder is None:
635 builder = read_method(sub_h5obj)
636 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder)
637 obj_type[builder.name] = builder
638 else:
639 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning)
640 kwargs['datasets'][k] = None
641 continue
642 kwargs['source'] = os.path.abspath(h5obj.file.filename)
643 ret = GroupBuilder(name, **kwargs)
644 ret.location = os.path.dirname(h5obj.name)
645 self.__set_written(ret)
646 return ret
648 def __read_dataset(self, h5obj, name=None):
649 kwargs = {
650 "attributes": self.__read_attrs(h5obj),
651 "dtype": h5obj.dtype,
652 "maxshape": h5obj.maxshape
653 }
654 for key, val in kwargs['attributes'].items():
655 if isinstance(val, bytes):  # coverage: 655 ↛ 656, condition never true
656 kwargs['attributes'][key] = val.decode('UTF-8')
658 if name is None:
659 name = str(os.path.basename(h5obj.name))
660 kwargs['source'] = os.path.abspath(h5obj.file.filename)
661 ndims = len(h5obj.shape)
662 if ndims == 0: # read scalar
663 scalar = h5obj[()]
664 if isinstance(scalar, bytes):  # coverage: 664 ↛ 665, condition never true
665 scalar = scalar.decode('UTF-8')
667 if isinstance(scalar, Reference):  # coverage: 667 ↛ 669, condition never true
668 # TODO (AJTRITT): This should call __read_ref to support Group references
669 target = h5obj.file[scalar]
670 target_builder = self.__read_dataset(target)
671 self.__set_built(target.file.filename, target.id, target_builder)
672 if isinstance(scalar, RegionReference):
673 d = RegionBuilder(scalar, target_builder)
674 else:
675 d = ReferenceBuilder(target_builder)
676 kwargs['data'] = d
677 kwargs['dtype'] = d.dtype
678 else:
679 kwargs["data"] = scalar
680 else:
681 d = None
682 if h5obj.dtype.kind == 'O' and len(h5obj) > 0:
683 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])]
684 if isinstance(elem1, (str, bytes)):
685 d = self._check_str_dtype(h5obj)
686 elif isinstance(elem1, RegionReference):  # read list of references; coverage: 686 ↛ 687, condition never true
687 d = BuilderH5RegionDataset(h5obj, self)
688 kwargs['dtype'] = d.dtype
689 elif isinstance(elem1, Reference):  # coverage: 689 ↛ 699, condition never false
690 d = BuilderH5ReferenceDataset(h5obj, self)
691 kwargs['dtype'] = d.dtype
692 elif h5obj.dtype.kind == 'V': # table / compound data type
693 cpd_dt = h5obj.dtype
694 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))]
695 d = BuilderH5TableDataset(h5obj, self, ref_cols)
696 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype)
697 else:
698 d = h5obj
699 kwargs["data"] = d
700 ret = DatasetBuilder(name, **kwargs)
701 ret.location = os.path.dirname(h5obj.name)
702 self.__set_written(ret)
703 return ret
705 def _check_str_dtype(self, h5obj):
706 dtype = h5obj.dtype
707 if dtype.kind == 'O':  # coverage: 707 ↛ 710, condition never false
708 if dtype.metadata.get('vlen') == str and H5PY_3:  # coverage: 708 ↛ 710, condition never false
709 return StrDataset(h5obj, None)
710 return h5obj
712 @classmethod
713 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype):
714 ret = []
715 for name, dtype in zip(h5obj_dtype.fields, dset_dtype):
716 ret.append({'name': name, 'dtype': dtype})
717 return ret
719 def __read_attrs(self, h5obj):
720 ret = dict()
721 for k, v in h5obj.attrs.items():
722 if k == SPEC_LOC_ATTR: # ignore cached spec
723 continue
724 if isinstance(v, RegionReference):  # coverage: 724 ↛ 725, condition never true
725 raise ValueError("cannot read region reference attributes yet")
726 elif isinstance(v, Reference):
727 ret[k] = self.__read_ref(h5obj.file[v])
728 else:
729 ret[k] = v
730 return ret
732 def __read_ref(self, h5obj):
733 ret = None
734 ret = self.__get_built(h5obj.file.filename, h5obj.id)
735 if ret is None:
736 if isinstance(h5obj, Dataset):
737 ret = self.__read_dataset(h5obj)
738 elif isinstance(h5obj, Group):  # coverage: 738 ↛ 741, condition never false
739 ret = self.__read_group(h5obj)
740 else:
741 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj))
742 self.__set_built(h5obj.file.filename, h5obj.id, ret)
743 return ret
745 def open(self):
746 if self.__file is None:
747 open_flag = self.__mode
748 kwargs = dict()
749 if self.comm:  # coverage: 749 ↛ 750, condition never true
750 kwargs.update(driver='mpio', comm=self.comm)
752 if self.driver is not None:  # coverage: 752 ↛ 753, condition never true
753 kwargs.update(driver=self.driver)
755 self.__file = File(self.source, open_flag, **kwargs)
757 def close(self, close_links=True):
758 """Close this file and any files linked to from this file.
760 :param close_links: Whether to close all files linked to from this file. (default: True)
761 :type close_links: bool
762 """
763 if close_links:
764 self.close_linked_files()
765 try:
766 if self.__file is not None:
767 self.__file.close()
768 except AttributeError:
769 # Do not do anything in case that self._file does not exist. This
770 # may happen in case that an error occurs before HDF5IO has been fully
771 # setup in __init__, e.g,. if a child class (such as NWBHDF5IO) raises
772 # an error before self.__file has been created
773 self.__file = None
775 def close_linked_files(self):
776 """Close all opened, linked-to files.
778 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does
779 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened,
780 linked-to files.
781 """
782 # Make sure each linked-to file is closed; always clear the list of open links afterwards.
783 try:
784 for obj in self.__open_links:
785 if obj:
786 obj.file.close()
787 except AttributeError:
788 # Do not do anything in case that self.__open_links does not exist. This
789 # may happen in case that an error occurs before HDF5IO has been fully
790 # setup in __init__, e.g,. if a child class (such as NWBHDF5IO) raises
791 # an error before self.__open_links has been created.
792 pass
793 finally:
794 self.__open_links = []
796 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'},
797 {'name': 'link_data', 'type': bool,
798 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
799 {'name': 'exhaust_dci', 'type': bool,
800 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
801 'default': True},
802 {'name': 'export_source', 'type': str,
803 'doc': 'The source of the builders when exporting', 'default': None})
804 def write_builder(self, **kwargs):
805 f_builder = popargs('builder', kwargs)
806 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
807 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
808 % (f_builder.name, self.source, kwargs))
809 for name, gbldr in f_builder.groups.items():
810 self.write_group(self.__file, gbldr, **kwargs)
811 for name, dbldr in f_builder.datasets.items():
812 self.write_dataset(self.__file, dbldr, **kwargs)
813 for name, lbldr in f_builder.links.items():
814 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source"))
815 self.set_attributes(self.__file, f_builder.attributes)
816 self.__add_refs()
817 self.__dci_queue.exhaust_queue()
818 self.__set_written(f_builder)
819 self.logger.debug("Done writing %s '%s' to path '%s'" %
820 (f_builder.__class__.__qualname__, f_builder.name, self.source))
822 def __add_refs(self):
823 '''
824 Add all references in the file.
826 References get queued to be added at the end of write. This is because
827 the current traversal algorithm (i.e. iterating over GroupBuilder items)
828 does not happen in a guaranteed order. We need to figure out what objects
829 will be references, and then write them after we write everything else.
830 '''
831 failed = set()
832 while len(self.__ref_queue) > 0:
833 call = self.__ref_queue.popleft()
834 self.logger.debug("Adding reference with call id %d from queue (length %d)"
835 % (id(call), len(self.__ref_queue)))
836 try:
837 call()
838 except KeyError:
839 if id(call) in failed:
840 raise RuntimeError('Unable to resolve reference')
841 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call))
842 failed.add(id(call))
843 self.__ref_queue.append(call)
845 @classmethod
846 def get_type(cls, data):
847 if isinstance(data, str):
848 return H5_TEXT
849 elif isinstance(data, bytes):  # coverage: 849 ↛ 850, condition never true
850 return H5_BINARY
851 elif isinstance(data, Container):  # coverage: 851 ↛ 852, condition never true
852 return H5_REF
853 elif not hasattr(data, '__len__'):
854 return type(data)
855 else:
856 if len(data) == 0:
857 if hasattr(data, 'dtype'):  # coverage: 857 ↛ 858, condition never true
858 return data.dtype
859 else:
860 raise ValueError('cannot determine type for empty data')
861 return cls.get_type(data[0])
863 __dtypes = {
864 "float": np.float32,
865 "float32": np.float32,
866 "double": np.float64,
867 "float64": np.float64,
868 "long": np.int64,
869 "int64": np.int64,
870 "int": np.int32,
871 "int32": np.int32,
872 "short": np.int16,
873 "int16": np.int16,
874 "int8": np.int8,
875 "uint64": np.uint64,
876 "uint": np.uint32,
877 "uint32": np.uint32,
878 "uint16": np.uint16,
879 "uint8": np.uint8,
880 "bool": np.bool_,
881 "text": H5_TEXT,
882 "utf": H5_TEXT,
883 "utf8": H5_TEXT,
884 "utf-8": H5_TEXT,
885 "ascii": H5_BINARY,
886 "bytes": H5_BINARY,
887 "ref": H5_REF,
888 "reference": H5_REF,
889 "object": H5_REF,
890 "region": H5_REGREF,
891 "isodatetime": H5_TEXT,
892 "datetime": H5_TEXT,
893 }
895 @classmethod
896 def __resolve_dtype__(cls, dtype, data):
897 # TODO: These values exist, but I haven't solved them yet
898 # binary
899 # number
900 dtype = cls.__resolve_dtype_helper__(dtype)
901 if dtype is None:
902 dtype = cls.get_type(data)
903 return dtype
905 @classmethod
906 def __resolve_dtype_helper__(cls, dtype):
907 if dtype is None:
908 return None
909 elif isinstance(dtype, str):
910 return cls.__dtypes.get(dtype)
911 elif isinstance(dtype, dict):
912 return cls.__dtypes.get(dtype['reftype'])
913 elif isinstance(dtype, np.dtype):
914 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types
915 return dtype
916 else:
917 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype])
919 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'},
920 {'name': 'attributes',
921 'type': dict,
922 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'})
923 def set_attributes(self, **kwargs):
924 obj, attributes = getargs('obj', 'attributes', kwargs)
925 for key, value in attributes.items():
926 try:
927 if isinstance(value, (set, list, tuple)):
928 tmp = tuple(value)
929 if len(tmp) > 0:
930 if isinstance(tmp[0], (str, bytes)):  # coverage: 930 ↛ 932, condition never false
931 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0])))
932 elif isinstance(tmp[0], Container): # a list of references
933 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp))
934 else:
935 value = np.array(value)
936 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
937 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
938 obj.attrs[key] = value
939 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference
940 self.__queue_ref(self._make_attr_ref_filler(obj, key, value))
941 else:
942 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
943 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
944 if isinstance(value, np.ndarray) and value.dtype.kind == 'U':  # coverage: 944 ↛ 945, condition never true
945 value = np.array(value, dtype=H5_TEXT)
946 obj.attrs[key] = value # a regular scalar
947 except Exception as e:
948 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name)
949 raise RuntimeError(msg) from e
951 def _make_attr_ref_filler(self, obj, key, value):
952 '''
953 Make the callable for setting references to attributes
954 '''
955 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s"
956 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
957 if isinstance(value, (tuple, list)):  # coverage: 957 ↛ 958, condition never true
958 def _filler():
959 ret = list()
960 for item in value:
961 ret.append(self.__get_ref(item))
962 obj.attrs[key] = ret
963 else:
964 def _filler():
965 obj.attrs[key] = self.__get_ref(value)
966 return _filler
968 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
969 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'},
970 {'name': 'link_data', 'type': bool,
971 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
972 {'name': 'exhaust_dci', 'type': bool,
973 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
974 'default': True},
975 {'name': 'export_source', 'type': str,
976 'doc': 'The source of the builders when exporting', 'default': None},
977 returns='the Group that was created', rtype='Group')
978 def write_group(self, **kwargs):
979 parent, builder = popargs('parent', 'builder', kwargs)
980 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
981 if self.get_written(builder):
982 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name)
983 group = parent[builder.name]
984 else:
985 self.logger.debug(" Creating group '%s'" % builder.name)
986 group = parent.create_group(builder.name)
987 # write all groups
988 subgroups = builder.groups
989 if subgroups:
990 for subgroup_name, sub_builder in subgroups.items():
991 # do not create an empty group without attributes or links
992 self.write_group(group, sub_builder, **kwargs)
993 # write all datasets
994 datasets = builder.datasets
995 if datasets:
996 for dset_name, sub_builder in datasets.items():
997 self.write_dataset(group, sub_builder, **kwargs)
998 # write all links
999 links = builder.links
1000 if links:
1001 for link_name, sub_builder in links.items():
1002 self.write_link(group, sub_builder, export_source=kwargs.get("export_source"))
1003 attributes = builder.attributes
1004 self.set_attributes(group, attributes)
1005 self.__set_written(builder)
1006 return group
1008 def __get_path(self, builder):
1009 """Get the path to the builder.
1011 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored.
1012 If builder.location is set then it is used as the path, otherwise the function
1013 determines the path by constructing it iteratively from the parents of the
1014 builder.
1015 """
1016 if builder.location is not None:
1017 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/")
1018 else:
1019 curr = builder
1020 names = list()
1021 while curr.parent is not None:
1022 names.append(curr.name)
1023 curr = curr.parent
1024 delim = "/"
1025 path = "%s%s" % (delim, delim.join(reversed(names)))
1026 return path
1028 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
1029 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'},
1030 {'name': 'export_source', 'type': str,
1031 'doc': 'The source of the builders when exporting', 'default': None},
1032 returns='the Link that was created', rtype='Link')
1033 def write_link(self, **kwargs):
1034 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs)
1035 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1036 if self.get_written(builder):  # coverage: 1036 ↛ 1037, condition never true
1037 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name)
1038 return None
1039 name = builder.name
1040 target_builder = builder.builder
1041 path = self.__get_path(target_builder)
1042 # source will indicate target_builder's location
1043 if export_source is None:
1044 write_source = builder.source
1045 else:
1046 write_source = export_source
1048 parent_filename = os.path.abspath(parent.file.filename)
1049 if target_builder.source in (write_source, parent_filename):
1050 link_obj = SoftLink(path)
1051 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1052 % (parent.name, name, link_obj.path))
1053 elif target_builder.source is not None:  # coverage: 1053 ↛ 1062, condition never false
1054 target_filename = os.path.abspath(target_builder.source)
1055 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename))
1056 if target_builder.location is not None:
1057 path = target_builder.location + "/" + target_builder.name
1058 link_obj = ExternalLink(relative_path, path)
1059 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1060 % (parent.name, name, link_obj.filename, link_obj.path))
1061 else:
1062 msg = 'cannot create external link to %s' % path
1063 raise ValueError(msg)
1064 parent[name] = link_obj
1065 self.__set_written(builder)
1066 return link_obj
1068 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901
1069 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'},
1070 {'name': 'link_data', 'type': bool,
1071 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
1072 {'name': 'exhaust_dci', 'type': bool,
1073 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
1074 'default': True},
1075 {'name': 'export_source', 'type': str,
1076 'doc': 'The source of the builders when exporting', 'default': None},
1077 returns='the Dataset that was created', rtype=Dataset)
1078 def write_dataset(self, **kwargs): # noqa: C901
1079 """ Write a dataset to HDF5
1081 The function uses other dataset-dependent write functions, e.g.,
1082 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
1083 """
1084 parent, builder = popargs('parent', 'builder', kwargs)
1085 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
1086 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1087 if self.get_written(builder):
1088 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name)
1089 return None
1090 name = builder.name
1091 data = builder.data
1092 dataio = None
1093 options = dict() # dict with additional dataset creation options ('dtype' and 'io_settings')
1094 if isinstance(data, H5DataIO):
1095 options['io_settings'] = data.io_settings
1096 dataio = data
1097 link_data = data.link_data
1098 data = data.data
1099 else:
1100 options['io_settings'] = {}
1101 attributes = builder.attributes
1102 options['dtype'] = builder.dtype
1103 dset = None
1104 link = None
1106 # The user provided an existing h5py dataset as input and asked to create a link to the dataset
1107 if isinstance(data, Dataset):
1108 data_filename = os.path.abspath(data.file.filename)
1109 if link_data:
1110 if export_source is None: # not exporting
1111 parent_filename = os.path.abspath(parent.file.filename)
1112 if data_filename != parent_filename: # create external link to data
1113 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1114 link = ExternalLink(relative_path, data.name)
1115 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1116 % (parent.name, name, link.filename, link.path))
1117 else: # create soft link to dataset already in this file -- possible if mode == 'r+'
1118 link = SoftLink(data.name)
1119 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1120 % (parent.name, name, link.path))
1121 parent[name] = link
1122 else: # exporting
1123 export_source = os.path.abspath(export_source)
1124 parent_filename = os.path.abspath(parent.file.filename)
1125 if data_filename != export_source: # dataset is in different file than export source
1126 # possible if user adds a link to a dataset in a different file after reading export source
1127 # to memory
1128 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1129 link = ExternalLink(relative_path, data.name)
1130 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1131 % (parent.name, name, link.filename, link.path))
1132 parent[name] = link
1133 elif parent.name != data.parent.name: # dataset is in export source and has different path
1134 # so create a soft link to the dataset in this file
1135 # possible if user adds a link to a dataset in export source after reading to memory
1136 # TODO check that there is/will be still a dataset at data.name -- if the dataset has
1137 # been removed, then this link will be broken
1138 link = SoftLink(data.name)
1139 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1140 % (parent.name, name, link.path))
1141 parent[name] = link
1142 else: # dataset is in export source and has same path as the builder, so copy the dataset
1143 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1144 % (data.file.filename, data.name, parent.name, name))
1145 parent.copy(source=data,
1146 dest=parent,
1147 name=name,
1148 expand_soft=False,
1149 expand_external=False,
1150 expand_refs=False,
1151 without_attrs=True)
1152 dset = parent[name]
1153 else:
1154 # TODO add option for case where there are multiple links to the same dataset within a file:
1155 # instead of copying the dset N times, copy it once and create soft links to it within the file
1156 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1157 % (data.file.filename, data.name, parent.name, name))
1158 parent.copy(source=data,
1159 dest=parent,
1160 name=name,
1161 expand_soft=False,
1162 expand_external=False,
1163 expand_refs=False,
1164 without_attrs=True)
1165 dset = parent[name]
1167 # Write a compound dataset, i.e, a dataset with compound data type
1168 elif isinstance(options['dtype'], list):
1169 # determine which fields of the compound dtype are references
1170 refs = list()
1171 for i, dts in enumerate(options['dtype']):
1172 if self.__is_ref(dts):
1173 refs.append(i)
1174 # If one or more of the parts of the compound data type are references then we need to deal with those
1175 if len(refs) > 0:
1176 try:
1177 _dtype = self.__resolve_dtype__(options['dtype'], data)
1178 except Exception as exc:
1179 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1180 raise Exception(msg) from exc
1181 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1182 self.__set_written(builder)
1183 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1184 "object references. attributes: %s"
1185 % (name, list(attributes.keys())))
1187 @self.__queue_ref
1188 def _filler():
1189 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1190 "containing attributes: %s"
1191 % (name, list(attributes.keys())))
1192 ret = list()
1193 for item in data:
1194 new_item = list(item)
1195 for i in refs:
1196 new_item[i] = self.__get_ref(item[i])
1197 ret.append(tuple(new_item))
1198 dset = parent[name]
1199 dset[:] = ret
1200 self.set_attributes(dset, attributes)
1202 return
1203 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
1204 else:
1205 dset = self.__list_fill__(parent, name, data, options)
1206 # Write a dataset containing references, i.e., a region or object reference.
1207 # NOTE: we can ignore options['io_settings'] for scalar data
1208 elif self.__is_ref(options['dtype']):
1209 _dtype = self.__dtypes.get(options['dtype'])
1210 # Write a scalar data region reference dataset
1211 if isinstance(data, RegionBuilder):  # coverage: 1211 ↛ 1212, condition never true
1212 dset = parent.require_dataset(name, shape=(), dtype=_dtype)
1213 self.__set_written(builder)
1214 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a "
1215 "region reference. attributes: %s"
1216 % (name, list(attributes.keys())))
1218 @self.__queue_ref
1219 def _filler():
1220 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' "
1221 "containing attributes: %s"
1222 % (name, list(attributes.keys())))
1223 ref = self.__get_ref(data.builder, data.region)
1224 dset = parent[name]
1225 dset[()] = ref
1226 self.set_attributes(dset, attributes)
1227 # Write a scalar object reference dataset
1228 elif isinstance(data, ReferenceBuilder):  # coverage: 1228 ↛ 1229, condition never true
1229 dset = parent.require_dataset(name, dtype=_dtype, shape=())
1230 self.__set_written(builder)
1231 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an "
1232 "object reference. attributes: %s"
1233 % (name, list(attributes.keys())))
1235 @self.__queue_ref
1236 def _filler():
1237 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' "
1238 "containing attributes: %s"
1239 % (name, list(attributes.keys())))
1240 ref = self.__get_ref(data.builder)
1241 dset = parent[name]
1242 dset[()] = ref
1243 self.set_attributes(dset, attributes)
1244 # Write an array dataset of references
1245 else:
1246 # Write an array of region references
1247 if options['dtype'] == 'region':  # coverage: 1247 ↛ 1248, condition never true
1248 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings'])
1249 self.__set_written(builder)
1250 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1251 "region references. attributes: %s"
1252 % (name, list(attributes.keys())))
1254 @self.__queue_ref
1255 def _filler():
1256 self.logger.debug("Resolving region references and setting attribute on dataset '%s' "
1257 "containing attributes: %s"
1258 % (name, list(attributes.keys())))
1259 refs = list()
1260 for item in data:
1261 refs.append(self.__get_ref(item.builder, item.region))
1262 dset = parent[name]
1263 dset[()] = refs
1264 self.set_attributes(dset, attributes)
1265 # Write array of object references
1266 else:
1267 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1268 self.__set_written(builder)
1269 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1270 "object references. attributes: %s"
1271 % (name, list(attributes.keys())))
1273 @self.__queue_ref
1274 def _filler():
1275 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1276 "containing attributes: %s"
1277 % (name, list(attributes.keys())))
1278 refs = list()
1279 for item in data:
1280 refs.append(self.__get_ref(item))
1281 dset = parent[name]
1282 dset[()] = refs
1283 self.set_attributes(dset, attributes)
1284 return
1285 # write a "regular" dataset
1286 else:
1287 # Create an empty dataset
1288 if data is None:
1289 dset = self.__setup_empty_dset__(parent, name, options['io_settings'])
1290 dataio.dataset = dset
1291 # Write a scalar dataset containing a single string
1292 elif isinstance(data, (str, bytes)):
1293 dset = self.__scalar_fill__(parent, name, data, options)
1294 # Iterative write of a data chunk iterator
1295 elif isinstance(data, AbstractDataChunkIterator):
1296 dset = self.__setup_chunked_dset__(parent, name, data, options)
1297 self.__dci_queue.append(dataset=dset, data=data)
1298 # Write a regular in memory array (e.g., numpy array, list etc.)
1299 elif hasattr(data, '__len__'):
1300 dset = self.__list_fill__(parent, name, data, options)
1301 # Write a regular scalar dataset
1302 else:
1303 dset = self.__scalar_fill__(parent, name, data, options)
1304 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link
1305 if link is None:
1306 self.set_attributes(dset, attributes)
1307 # Validate the attributes on the linked dataset
1308 elif len(attributes) > 0:
1309 pass
1310 self.__set_written(builder)
1311 if exhaust_dci:  # coverage: 1311 ↛ exit, condition never false
1312 self.__dci_queue.exhaust_queue()
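# Standalone usage sketch: control the io_settings consumed by write_dataset by wrapping data
# in H5DataIO, here requesting gzip compression and explicit chunking. File and object names
# are hypothetical.
from hdmf.common import DynamicTable, VectorData, get_manager
from hdmf.backends.hdf5 import HDF5IO, H5DataIO

col = VectorData(name='col1', description='a compressed column',
                 data=H5DataIO(data=list(range(1000)), compression='gzip', chunks=(100,)))
table = DynamicTable(name='chunked_table', description='table with a chunked, compressed column',
                     columns=[col])
with HDF5IO('chunked.h5', mode='w', manager=get_manager()) as io:
    io.write(table)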
1314 @classmethod
1315 def __scalar_fill__(cls, parent, name, data, options=None):
1316 dtype = None
1317 io_settings = {}
1318 if options is not None:  # coverage: 1318 ↛ 1321, condition never false
1319 dtype = options.get('dtype')
1320 io_settings = options.get('io_settings')
1321 if not isinstance(dtype, type):  # coverage: 1321 ↛ 1327, condition never false
1322 try:
1323 dtype = cls.__resolve_dtype__(dtype, data)
1324 except Exception as exc:
1325 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1326 raise Exception(msg) from exc
1327 try:
1328 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings)
1329 except Exception as exc:
1330 msg = "Could not create scalar dataset %s in %s" % (name, parent.name)
1331 raise Exception(msg) from exc
1332 return dset
1334 @classmethod
1335 def __setup_chunked_dset__(cls, parent, name, data, options=None):
1336 """
1337 Set up a dataset for writing one chunk at a time based on the given DataChunkIterator
1339 :param parent: The parent object to which the dataset should be added
1340 :type parent: h5py.Group, h5py.File
1341 :param name: The name of the dataset
1342 :type name: str
1343 :param data: The data to be written.
1344 :type data: DataChunkIterator
1345 :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings'
1346 :type options: dict
1348 """
1349 io_settings = {}
1350 if options is not None:
1351 if 'io_settings' in options:
1352 io_settings = options.get('io_settings')
1353 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write.
1354 if 'chunks' not in io_settings:
1355 recommended_chunks = data.recommended_chunk_shape()
1356 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks
1357 # Define the shape of the data if not provided by the user
1358 if 'shape' not in io_settings:
1359 io_settings['shape'] = data.recommended_data_shape()
1360 # Define the maxshape of the data if not provided by the user
1361 if 'maxshape' not in io_settings:
1362 io_settings['maxshape'] = data.maxshape
1363 if 'dtype' not in io_settings:
1364 if (options is not None) and ('dtype' in options):
1365 io_settings['dtype'] = options['dtype']
1366 else:
1367 io_settings['dtype'] = data.dtype
1368 if isinstance(io_settings['dtype'], str):
1369 # map to real dtype if we were given a string
1370 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1371 try:
1372 dset = parent.create_dataset(name, **io_settings)
1373 except Exception as exc:
1374 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1375 return dset
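A minimal sketch (not part of this module) of the kind of h5py call the io_settings assembled above resolve to: a chunked, resizable dataset that can grow as chunks arrive. The file name, dataset name, and chunk size are made up for the example:

import h5py
import numpy as np

with h5py.File("example_chunked.h5", "w") as f:
    # an explicit chunk shape and an unlimited maxshape allow the dataset to grow
    dset = f.create_dataset("timeseries", shape=(0,), maxshape=(None,), chunks=(10,), dtype=np.float64)
    chunk = np.arange(10, dtype=np.float64)
    dset.resize((dset.shape[0] + len(chunk),))  # grow the dataset by one chunk
    dset[-len(chunk):] = chunk                  # write the new chunk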
1377 @classmethod
1378 def __setup_empty_dset__(cls, parent, name, io_settings):
1379 """
1380 Set up an empty dataset to be written to later, given only its shape and dtype
1382 :param parent: The parent object to which the dataset should be added
1383 :type parent: h5py.Group, h5py.File
1384 :param name: The name of the dataset
1385 :type name: str
1386 :param io_settings: dict of settings passed to h5py create_dataset; must include 'shape' and 'dtype'
1387 :type io_settings: dict
1391 """
1392 # Define the shape of the data if not provided by the user
1393 if 'shape' not in io_settings:
1394 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without shape")
1395 if 'dtype' not in io_settings:
1396 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without dtype")
1397 if isinstance(io_settings['dtype'], str):
1398 # map to real dtype if we were given a string
1399 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1400 try:
1401 dset = parent.create_dataset(name, **io_settings)
1402 except Exception as exc:
1403 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1404 return dset
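A minimal sketch (not part of this module) of pre-allocating an empty dataset from only a shape and dtype, which is what the io_settings checked above require; data can be filled in afterwards. Names and sizes are illustrative:

import h5py
import numpy as np

with h5py.File("example_empty.h5", "w") as f:
    # allocate space without writing any data yet
    dset = f.create_dataset("placeholder", shape=(100, 3), dtype=np.int32)
    dset[0:2, :] = np.zeros((2, 3), dtype=np.int32)  # values can be written later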
1406 @classmethod
1407 def __chunked_iter_fill__(cls, parent, name, data, options=None):
1408 """
1409 Write data to a dataset one chunk at a time, based on the given DataChunkIterator
1411 :param parent: The parent object to which the dataset should be added
1412 :type parent: h5py.Group, h5py.File
1413 :param name: The name of the dataset
1414 :type name: str
1415 :param data: The data to be written.
1416 :type data: DataChunkIterator
1417 :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings'
1418 :type options: dict
1420 """
1421 dset = cls.__setup_chunked_dset__(parent, name, data, options=options)
1422 read = True
1423 while read:
1424 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data)
1425 return dset
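A minimal sketch (not part of this module) of the kind of iterator this method consumes, using hdmf's DataChunkIterator; the generator, maxshape, and buffer size below are made up for the example:

import numpy as np
from hdmf.data_utils import DataChunkIterator

# wrap a generator whose total length is not known up front
dci = DataChunkIterator(data=(float(i) for i in range(100)), maxshape=(None,), buffer_size=10)
print(dci.recommended_data_shape())  # the initial shape used when the dataset is created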
1427 @classmethod
1428 def __list_fill__(cls, parent, name, data, options=None):
1429 # define the io settings and data type if necessary
1430 io_settings = {}
1431 dtype = None
1432 if options is not None:
1433 dtype = options.get('dtype')
1434 io_settings = options.get('io_settings')
1435 if not isinstance(dtype, type):
1436 try:
1437 dtype = cls.__resolve_dtype__(dtype, data)
1438 except Exception as exc:
1439 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1440 raise Exception(msg) from exc
1441 # define the data shape
1442 if 'shape' in io_settings:
1443 data_shape = io_settings.pop('shape')
1444 elif hasattr(data, 'shape'):
1445 data_shape = data.shape
1446 elif isinstance(dtype, np.dtype):
1447 data_shape = (len(data),)
1448 else:
1449 data_shape = get_data_shape(data)
1451 # Create the dataset
1452 try:
1453 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
1454 except Exception as exc:
1455 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \
1456 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc))
1457 raise Exception(msg) from exc
1458 # Write the data
1459 if len(data) > dset.shape[0]:
1460 new_shape = list(dset.shape)
1461 new_shape[0] = len(data)
1462 dset.resize(new_shape)
1463 try:
1464 dset[:] = data
1465 except Exception as e:
1466 raise e
1467 return dset
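A minimal sketch (not part of this module) of how the shape of a plain Python container is inferred before the dataset is created, using hdmf's get_data_shape; the nested list is made up for the example:

from hdmf.utils import get_data_shape

nested = [[1, 2, 3], [4, 5, 6]]
print(get_data_shape(nested))  # expected to print (2, 3) for this example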
1469 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference',
1470 'default': None},
1471 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object',
1472 'default': None},
1473 returns='the reference', rtype=Reference)
1474 def __get_ref(self, **kwargs):
1475 container, region = getargs('container', 'region', kwargs)
1476 if container is None:
1477 return None
1478 if isinstance(container, Builder):
1479 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1480 if isinstance(container, LinkBuilder):
1481 builder = container.target_builder
1482 else:
1483 builder = container
1484 elif isinstance(container, ReferenceBuilder):
1485 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name))
1486 builder = container.builder
1487 else:
1488 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1489 builder = self.manager.build(container)
1490 path = self.__get_path(builder)
1491 self.logger.debug("Getting reference at path '%s'" % path)
1492 if isinstance(container, RegionBuilder):
1493 region = container.region
1494 if region is not None:
1495 dset = self.__file[path]
1496 if not isinstance(dset, Dataset):
1497 raise ValueError('cannot create region reference without Dataset')
1498 return self.__file[path].regionref[region]
1499 else:
1500 return self.__file[path].ref
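A minimal sketch (not part of this module) of the h5py object references this method returns and how they round-trip through a reference-typed dataset; the file and dataset names are illustrative:

import h5py
from h5py import special_dtype, Reference

with h5py.File("example_refs.h5", "w") as f:
    target = f.create_dataset("data", data=[1, 2, 3])
    refs = f.create_dataset("pointers", shape=(1,), dtype=special_dtype(ref=Reference))
    refs[0] = target.ref      # store an object reference, like the one returned above
    resolved = f[refs[0]]     # dereferencing through the file yields the target dataset
    print(resolved.name)      # expected to print /data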
1502 def __is_ref(self, dtype):
1503 if isinstance(dtype, DtypeSpec):
1504 return self.__is_ref(dtype.dtype)
1505 if isinstance(dtype, RefSpec):
1506 return True
1507 if isinstance(dtype, dict): # may be dict from reading a compound dataset
1508 return self.__is_ref(dtype['dtype'])
1509 if isinstance(dtype, str):
1510 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE
1511 return False
1513 def __queue_ref(self, func):
1514 '''Queue a callable that fills a dataset with references.
1516 References are resolved at the end of the write, once the builders they
1517 point to have been written to the file, by calling the queued function.
1518 Args:
1519 func: a zero-argument callable that resolves the references and writes
1520 them into the target h5py.Dataset
1523 '''
1524 # TODO: come up with more intelligent way of
1525 # queueing reference resolution, based on reference
1526 # dependency
1527 self.__ref_queue.append(func)
1529 def __rec_get_ref(self, ref_list):
1530 ret = list()
1531 for elem in ref_list:
1532 if isinstance(elem, (list, tuple)):
1533 ret.append(self.__rec_get_ref(elem))
1534 elif isinstance(elem, (Builder, Container)):
1535 ret.append(self.__get_ref(elem))
1536 else:
1537 ret.append(elem)
1538 return ret
1540 @property
1541 def mode(self):
1542 """
1543 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x").
1544 """
1545 return self.__mode
1547 @classmethod
1548 @docval(*get_docval(H5DataIO.__init__))
1549 def set_dataio(cls, **kwargs):
1550 """
1551 Wrap the given Data object with an H5DataIO.
1553 This method is provided merely for convenience. It is the equivalent
1554 of the following:
1556 .. code-block:: python
1558 from hdmf.backends.hdf5 import H5DataIO
1559 data = ...
1560 data = H5DataIO(data)
1561 """
1562 return H5DataIO(**kwargs)
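As a usage note, a minimal sketch (not part of this module) of the equivalence described in the docstring above; the data values and compression settings are made up, and the second call assumes set_dataio behaves as defined here:

from hdmf.backends.hdf5 import HDF5IO, H5DataIO

wrapped_a = H5DataIO(data=[1, 2, 3], compression="gzip")            # direct wrapping
wrapped_b = HDF5IO.set_dataio(data=[1, 2, 3], compression="gzip")   # convenience classmethod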