Coverage for src/hdmf/backends/hdf5/h5tools.py: 87% (906 statements)
coverage.py v7.2.5, created at 2023-08-18 20:49 +0000
1import logging
2import os.path
3import warnings
4from collections import deque
5from functools import partial
6from pathlib import Path, PurePosixPath as pp
8import numpy as np
9import h5py
10from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype
12from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO,
13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue)
14from ..io import HDMFIO
15from ..errors import UnsupportedOperation
16from ..warnings import BrokenLinkWarning
17from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder,
18 ReferenceBuilder, TypeMap, ObjectMapper)
19from ...container import Container
20from ...data_utils import AbstractDataChunkIterator
21from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
22from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset
23from ..utils import NamespaceToBuilderHelper, WriteStatusTracker
25ROOT_NAME = 'root'
26SPEC_LOC_ATTR = '.specloc'
27H5_TEXT = special_dtype(vlen=str)
28H5_BINARY = special_dtype(vlen=bytes)
29H5_REF = special_dtype(ref=Reference)
30H5_REGREF = special_dtype(ref=RegionReference)
32RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB
34H5PY_3 = h5py.__version__.startswith('3')
37class HDF5IO(HDMFIO):
39 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group
41 @staticmethod
42 def can_read(path):
43 """Determines whether a given path is readable by the HDF5IO class"""
44 if not os.path.isfile(path):
45 return False
46 try:
47 with h5py.File(path, "r"):
48 return True
49 except IOError:
50 return False
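# A minimal usage sketch (hypothetical filename, not part of this module): can_read lets a
# caller check a path before constructing an HDF5IO, avoiding the UnsupportedOperation that
# __init__ raises for a missing or non-HDF5 file.
#
#     if HDF5IO.can_read("data.h5"):
#         with HDF5IO("data.h5", mode="r") as io:
#             container = io.read()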
52 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
53 {'name': 'mode', 'type': str,
54 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). '
55 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for '
56 'more details.'),
57 'default': 'r'},
58 {'name': 'manager', 'type': (TypeMap, BuildManager),
59 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None},
60 {'name': 'comm', 'type': 'Intracomm',
61 'doc': 'the MPI communicator to use for parallel I/O', 'default': None},
62 {'name': 'file', 'type': [File, "S3File"], 'doc': 'a pre-existing h5py.File object', 'default': None},
63 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
64 {'name': 'herd_path', 'type': str,
65 'doc': 'The path to the HERD', 'default': None},)
66 def __init__(self, **kwargs):
67 """Open an HDF5 file for IO.
68 """
69 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
70 path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode',
71 'comm', 'file', 'driver',
72 'herd_path',
73 kwargs)
75 self.__open_links = [] # keep track of other files opened from links in this file
76 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close
78 path = self.__check_path_file_obj(path, file_obj)
80 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3':
81 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode)
82 raise UnsupportedOperation(msg)
84 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'):
85 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode)
86 raise UnsupportedOperation(msg)
88 if manager is None:
89 manager = BuildManager(TypeMap(NamespaceCatalog()))
90 elif isinstance(manager, TypeMap):  # 90 ↛ 91: condition was never true
91 manager = BuildManager(manager)
92 self.__driver = driver
93 self.__comm = comm
94 self.__mode = mode
95 self.__file = file_obj
96 super().__init__(manager, source=path, herd_path=herd_path)
97 # NOTE: source is not set if path is None and file_obj is passed
98 self.__built = dict() # keep track of each builder for each dataset/group/link for each file
99 self.__read = dict() # keep track of which files have been read. Key is the filename, value is the builder
100 self.__ref_queue = deque() # a queue of the references that need to be added
101 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted
102 ObjectMapper.no_convert(Dataset)
103 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object
105 @property
106 def comm(self):
107 """The MPI communicator to use for parallel I/O."""
108 return self.__comm
110 @property
111 def _file(self):
112 return self.__file
114 @property
115 def driver(self):
116 return self.__driver
118 @classmethod
119 def __check_path_file_obj(cls, path, file_obj):
120 if isinstance(path, Path):
121 path = str(path)
123 if path is None and file_obj is None:
124 raise ValueError("Either the 'path' or 'file' argument must be supplied.")
126 if path is not None and file_obj is not None: # consistency check
127 if os.path.abspath(file_obj.filename) != os.path.abspath(path):
128 msg = ("You argued '%s' as this object's path, but supplied a file with filename: %s"
129 % (path, file_obj.filename))
130 raise ValueError(msg)
132 return path
134 @classmethod
135 def __resolve_file_obj(cls, path, file_obj, driver):
136 path = cls.__check_path_file_obj(path, file_obj)
138 if file_obj is None:
139 file_kwargs = dict()
140 if driver is not None:  # 140 ↛ 141: condition was never true
141 file_kwargs.update(driver=driver)
142 file_obj = File(path, 'r', **file_kwargs)
143 return file_obj
145 @classmethod
146 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap),
147 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'},
148 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
149 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None},
150 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
151 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
152 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and "
153 "the included data types"),
154 rtype=dict)
155 def load_namespaces(cls, **kwargs):
156 """Load cached namespaces from a file.
158 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the
159 namespaces will be read, and the File object will be closed. If `file` is supplied, then
160 the given File object will be read from and not closed.
162 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
163 """
164 namespace_catalog, path, namespaces, file_obj, driver = popargs(
165 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs)
167 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
168 if file_obj is None: # need to close the file object that we just opened
169 with open_file_obj:
170 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
171 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
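# A hedged usage sketch (hypothetical path; the empty TypeMap is only for illustration):
# cached namespaces can be loaded into a catalog before reading the file.
#
#     type_map = TypeMap(NamespaceCatalog())
#     loaded = HDF5IO.load_namespaces(type_map, path="data.h5")
#     # 'loaded' maps each loaded namespace name to its included namespaces and data types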
173 @classmethod
174 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj):
175 d = {}
177 if not cls.__check_specloc(file_obj):
178 return d
180 namespace_versions = cls.__get_namespaces(file_obj)
182 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
183 if namespaces is None:  # 183 ↛ 186: condition was never false
184 namespaces = list(spec_group.keys())
186 readers = dict()
187 deps = dict()
188 for ns in namespaces:
189 latest_version = namespace_versions[ns]
190 ns_group = spec_group[ns][latest_version]
191 reader = H5SpecReader(ns_group)
192 readers[ns] = reader
193 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies)
194 for spec_ns in reader.read_namespace(cls.__ns_spec_path):
195 deps[ns] = list()
196 for s in spec_ns['schema']:
197 dep = s.get('namespace')
198 if dep is not None:
199 deps[ns].append(dep)
201 order = cls._order_deps(deps)
202 for ns in order:
203 reader = readers[ns]
204 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader))
206 return d
208 @classmethod
209 def __check_specloc(cls, file_obj):
210 if SPEC_LOC_ATTR not in file_obj.attrs:
211 # this occurs in legacy files
212 msg = "No cached namespaces found in %s" % file_obj.filename
213 warnings.warn(msg)
214 return False
215 return True
217 @classmethod
218 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
219 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
220 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
221 returns="dict mapping names to versions of the namespaces in the file", rtype=dict)
222 def get_namespaces(cls, **kwargs):
223 """Get the names and versions of the cached namespaces from a file.
225 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the
226 namespaces will be read, and the File object will be closed. If `file` is supplied, then
227 the given File object will be read from and not closed.
229 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
230 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...).
232 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
233 """
234 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs)
236 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
237 if file_obj is None: # need to close the file object that we just opened
238 with open_file_obj:
239 return cls.__get_namespaces(open_file_obj)
240 return cls.__get_namespaces(open_file_obj)
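# Illustration only (hypothetical file and versions): HDF5IO.get_namespaces(path="data.h5")
# might return {"hdmf-common": "1.5.0", "my-extension": "0.1.0"}; only the latest cached
# version of each namespace is reported.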
242 @classmethod
243 def __get_namespaces(cls, file_obj):
244 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file.
246 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
247 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``.
248 """
249 used_version_names = dict()
250 if not cls.__check_specloc(file_obj):
251 return used_version_names
253 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
254 namespaces = list(spec_group.keys())
255 for ns in namespaces:
256 ns_group = spec_group[ns]
257 # NOTE: by default, objects within groups are iterated in alphanumeric order
258 version_names = list(ns_group.keys())
259 if len(version_names) > 1:
260 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned"
261 # make sure that if there is another group representing a newer version, that is read instead
262 if 'unversioned' in version_names:
263 version_names.remove('unversioned')
264 if len(version_names) > 1:
265 # as of HDMF 1.6.1, extensions without a version are written under the group name "None"
266 # make sure that if there is another group representing a newer version, that is read instead
267 if 'None' in version_names:
268 version_names.remove('None')
269 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order
271 return used_version_names
273 @classmethod
274 def _order_deps(cls, deps):
275 """
276 Order namespaces according to dependency for loading into a NamespaceCatalog
278 Args:
279 deps (dict): a dictionary that maps a namespace name to a list of name of
280 the namespaces on which the namespace is directly dependent
281 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
282 Expected output: ['d', 'b', 'c', 'a']
283 """
284 order = list()
285 keys = list(deps.keys())
286 deps = dict(deps)
287 for k in keys:
288 if k in deps:
289 cls.__order_deps_aux(order, deps, k)
290 return order
292 @classmethod
293 def __order_deps_aux(cls, order, deps, key):
294 """
295 A recursive helper function for _order_deps
296 """
297 if key not in deps:
298 return
299 subdeps = deps.pop(key)
300 for subk in subdeps:
301 cls.__order_deps_aux(order, deps, subk)
302 order.append(key)
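# Worked trace of the docstring example (illustrative only): with
# deps = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}, _order_deps starts at 'a',
# recurses into 'b' -> 'd' (appending 'd', then 'b'), then into 'c' (appending 'c'),
# and finally appends 'a', giving ['d', 'b', 'c', 'a'].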
304 @classmethod
305 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'},
306 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'},
307 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True},
308 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference',
309 'default': False},
310 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False}
311 )
312 def copy_file(self, **kwargs):
313 """
314 Convenience function to copy an HDF5 file while allowing external links to be resolved.
316 .. warning::
318 As of HDMF 2.0, this method is no longer supported and may be removed in a future version.
319 Please use the export method or h5py.File.copy method instead.
321 .. note::
323 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode
324 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is
325 not opened already when calling this function.
327 """
329 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of "
330 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning)
332 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename',
333 'dest_filename',
334 'expand_external',
335 'expand_refs',
336 'expand_soft',
337 kwargs)
338 source_file = File(source_filename, 'r')
339 dest_file = File(dest_filename, 'w')
340 for objname in source_file["/"].keys():
341 source_file.copy(source=objname,
342 dest=dest_file,
343 name=objname,
344 expand_external=expand_external,
345 expand_refs=expand_refs,
346 expand_soft=expand_soft,
347 shallow=False,
348 without_attrs=False,
349 )
350 for objname in source_file['/'].attrs:
351 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname]
352 source_file.close()
353 dest_file.close()
355 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'},
356 {'name': 'cache_spec', 'type': bool,
357 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache '
358 'specification to file. The appropriate specification will then need to be loaded prior to '
359 'reading the file.'),
360 'default': True},
361 {'name': 'link_data', 'type': bool,
362 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.',
363 'default': True},
364 {'name': 'exhaust_dci', 'type': bool,
365 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.',
366 'default': True})
367 def write(self, **kwargs):
368 """Write the container to an HDF5 file."""
369 if self.__mode == 'r':
370 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. "
371 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'")
372 % (self.source, self.__mode))
374 cache_spec = popargs('cache_spec', kwargs)
375 super().write(**kwargs)
376 if cache_spec:
377 self.__cache_spec()
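# A minimal write sketch (hypothetical container, manager, and path; not from this file):
#
#     with HDF5IO("out.h5", mode="w", manager=manager) as io:
#         io.write(container)   # caches the namespace specifications by default (cache_spec=True)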
379 def __cache_spec(self):
380 ref = self.__file.attrs.get(SPEC_LOC_ATTR)
381 spec_group = None
382 if ref is not None:
383 spec_group = self.__file[ref]
384 else:
385 path = 'specifications' # do something to figure out where the specifications should go
386 spec_group = self.__file.require_group(path)
387 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref
388 ns_catalog = self.manager.namespace_catalog
389 for ns_name in ns_catalog.namespaces:
390 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name)
391 namespace = ns_catalog.get_namespace(ns_name)
392 group_name = '%s/%s' % (ns_name, namespace.version)
393 if group_name in spec_group:
394 continue
395 ns_group = spec_group.create_group(group_name)
396 writer = H5SpecWriter(ns_group)
397 ns_builder.export(self.__ns_spec_path, writer=writer)
399 _export_args = (
400 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'},
401 {'name': 'container', 'type': Container,
402 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be '
403 'exported'),
404 'default': None},
405 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`',
406 'default': None},
407 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file',
408 'default': True}
409 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage
410 # so it is not available on HDF5IO
411 )
413 @docval(*_export_args)
414 def export(self, **kwargs):
415 """Export data read from a file from any backend to HDF5.
417 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
418 """
419 if self.__mode != 'w':
420 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
421 % (self.source, self.__mode))
423 src_io = getargs('src_io', kwargs)
424 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
425 if write_args is None:
426 write_args = dict()
428 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True):
429 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument "
430 "link_data=True." % src_io.__class__.__name__)
432 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None
433 ckwargs = kwargs.copy()
434 ckwargs['write_args'] = write_args
435 if not write_args.get('link_data', True):
436 ckwargs['clear_cache'] = True
437 super().export(**ckwargs)
438 if cache_spec:
439 # add any namespaces from the src_io that have not yet been loaded
440 for namespace in src_io.manager.namespace_catalog.namespaces:
441 if namespace not in self.manager.namespace_catalog.namespaces:  # 441 ↛ 440: condition was never false
442 self.manager.namespace_catalog.add_namespace(
443 name=namespace,
444 namespace=src_io.manager.namespace_catalog.get_namespace(namespace)
445 )
446 self.__cache_spec()
448 @classmethod
449 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'},
450 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O',
451 'default': None},
452 *_export_args) # NOTE: src_io is required and is the second positional argument
453 def export_io(self, **kwargs):
454 """Export from one backend to HDF5 (class method).
456 Convenience function for :py:meth:`export` where you do not need to
457 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given
458 arguments.
460 Example usage:
462 .. code-block:: python
464 old_io = HDF5IO('old.h5', 'r')
465 HDF5IO.export_io(path='new_copy.h5', src_io=old_io)
467 See :py:meth:`export` for more details.
468 """
469 path, comm = popargs('path', 'comm', kwargs)
471 with HDF5IO(path=path, comm=comm, mode='w') as write_io:
472 write_io.export(**kwargs)
474 def read(self, **kwargs):
475 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x':
476 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'."
477 % (self.source, self.__mode))
478 try:
479 return super().read(**kwargs)
480 except UnsupportedOperation as e:
481 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover
482 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values."
483 % (self.source, self.__mode))
485 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder')
486 def read_builder(self):
487 """
488 Read data and return the GroupBuilder representing it.
490 NOTE: On read, the Builder.source will usually not be set on the Builders.
491 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export)
492 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects.
493 """
494 if not self.__file:
495 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source)
496 f_builder = self.__read.get(self.__file)
497 # ignore cached specs when reading builder
498 ignore = set()
499 specloc = self.__file.attrs.get(SPEC_LOC_ATTR)
500 if specloc is not None:
501 ignore.add(self.__file[specloc].name)
502 if f_builder is None:
503 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore)
504 self.__read[self.__file] = f_builder
505 return f_builder
507 def __set_written(self, builder):
508 """
509 Helper function used to set the written status for builders
511 :param builder: Builder object to be marked as written
512 :type builder: Builder
513 """
514 self._written_builders.set_written(builder)
516 def get_written(self, builder):
517 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise.
519 :param builder: Builder object to get the written flag for
520 :type builder: Builder
522 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise
523 """
524 return self._written_builders.get_written(builder)
526 def __set_built(self, fpath, id, builder):
527 """
528 Update self.__built to cache the given builder for the given file and id.
530 :param fpath: Path to the HDF5 file containing the object
531 :type fpath: str
532 :param id: ID of the HDF5 object in the path
533 :type id: h5py GroupID object
534 :param builder: The builder to be cached
535 """
536 self.__built.setdefault(fpath, dict()).setdefault(id, builder)
538 def __get_built(self, fpath, id):
539 """
540 Look up a builder for the given file and id in self.__built cache
542 :param fpath: Path to the HDF5 file containing the object
543 :type fpath: str
544 :param id: ID of the HDF5 object in the path
545 :type id: h5py GroupID object
547 :return: Builder in the self.__built cache or None
548 """
550 fdict = self.__built.get(fpath)
551 if fdict:
552 return fdict.get(id)
553 else:
554 return None
556 @docval({'name': 'h5obj', 'type': (Dataset, Group),
557 'doc': 'the HDF5 object to get the corresponding Builder object for'})
558 def get_builder(self, **kwargs):
559 """
560 Get the builder for the corresponding h5py Group or Dataset
562 :raises ValueError: When no builder has been constructed yet for the given h5py object
563 """
564 h5obj = getargs('h5obj', kwargs)
565 fpath = h5obj.file.filename
566 builder = self.__get_built(fpath, h5obj.id)
567 if builder is None:  # 567 ↛ 568: condition was never true
568 msg = '%s:%s has not been built' % (fpath, h5obj.name)
569 raise ValueError(msg)
570 return builder
572 @docval({'name': 'h5obj', 'type': (Dataset, Group),
573 'doc': 'the HDF5 object to get the corresponding Container/Data object for'})
574 def get_container(self, **kwargs):
575 """
576 Get the container for the corresponding h5py Group or Dataset
578 :raises ValueError: When no builder has been constructed yet for the given h5py object
579 """
580 h5obj = getargs('h5obj', kwargs)
581 builder = self.get_builder(h5obj)
582 container = self.manager.construct(builder)
583 return container
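# Illustrative round trip (hypothetical HDF5 path): after read(), an h5py Group or Dataset
# from the same file can be mapped back to its container, e.g.
#
#     container = io.get_container(io._file["/path/to/group"])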
585 def __read_group(self, h5obj, name=None, ignore=set()):
586 kwargs = {
587 "attributes": self.__read_attrs(h5obj),
588 "groups": dict(),
589 "datasets": dict(),
590 "links": dict()
591 }
593 for key, val in kwargs['attributes'].items():
594 if isinstance(val, bytes):  # 594 ↛ 595: condition was never true
595 kwargs['attributes'][key] = val.decode('UTF-8')
597 if name is None:
598 name = str(os.path.basename(h5obj.name))
599 for k in h5obj:
600 sub_h5obj = h5obj.get(k)
601 if sub_h5obj is not None:
602 if sub_h5obj.name in ignore:
603 continue
604 link_type = h5obj.get(k, getlink=True)
605 if isinstance(link_type, (SoftLink, ExternalLink)):
606 # Reading links might be better suited to its own function
607 # get path of link (the key used for tracking what's been built)
608 target_path = link_type.path
609 target_obj = sub_h5obj.file[target_path]
610 builder_name = os.path.basename(target_path)
611 # get builder if already read, else build it
612 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id)
613 if builder is None:
614 # NOTE: all links must have absolute paths
615 if isinstance(target_obj, Dataset):
616 builder = self.__read_dataset(target_obj, builder_name)
617 else:
618 builder = self.__read_group(target_obj, builder_name, ignore=ignore)
619 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder)
620 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename))
621 link_builder.location = h5obj.name
622 self.__set_written(link_builder)
623 kwargs['links'][builder_name] = link_builder
624 if isinstance(link_type, ExternalLink):
625 self.__open_links.append(sub_h5obj)
626 else:
627 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id)
628 obj_type = None
629 read_method = None
630 if isinstance(sub_h5obj, Dataset):
631 read_method = self.__read_dataset
632 obj_type = kwargs['datasets']
633 else:
634 read_method = partial(self.__read_group, ignore=ignore)
635 obj_type = kwargs['groups']
636 if builder is None:
637 builder = read_method(sub_h5obj)
638 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder)
639 obj_type[builder.name] = builder
640 else:
641 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning)
642 kwargs['datasets'][k] = None
643 continue
644 kwargs['source'] = os.path.abspath(h5obj.file.filename)
645 ret = GroupBuilder(name, **kwargs)
646 ret.location = os.path.dirname(h5obj.name)
647 self.__set_written(ret)
648 return ret
650 def __read_dataset(self, h5obj, name=None):
651 kwargs = {
652 "attributes": self.__read_attrs(h5obj),
653 "dtype": h5obj.dtype,
654 "maxshape": h5obj.maxshape
655 }
656 for key, val in kwargs['attributes'].items():
657 if isinstance(val, bytes):  # 657 ↛ 658: condition was never true
658 kwargs['attributes'][key] = val.decode('UTF-8')
660 if name is None:
661 name = str(os.path.basename(h5obj.name))
662 kwargs['source'] = os.path.abspath(h5obj.file.filename)
663 ndims = len(h5obj.shape)
664 if ndims == 0: # read scalar
665 scalar = h5obj[()]
666 if isinstance(scalar, bytes):  # 666 ↛ 667: condition was never true
667 scalar = scalar.decode('UTF-8')
669 if isinstance(scalar, Reference):  # 669 ↛ 671: condition was never true
670 # TODO (AJTRITT): This should call __read_ref to support Group references
671 target = h5obj.file[scalar]
672 target_builder = self.__read_dataset(target)
673 self.__set_built(target.file.filename, target.id, target_builder)
674 if isinstance(scalar, RegionReference):
675 d = RegionBuilder(scalar, target_builder)
676 else:
677 d = ReferenceBuilder(target_builder)
678 kwargs['data'] = d
679 kwargs['dtype'] = d.dtype
680 else:
681 kwargs["data"] = scalar
682 else:
683 d = None
684 if h5obj.dtype.kind == 'O' and len(h5obj) > 0:
685 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])]
686 if isinstance(elem1, (str, bytes)):
687 d = self._check_str_dtype(h5obj)
688 elif isinstance(elem1, RegionReference):  # read list of references; 688 ↛ 689: condition was never true
689 d = BuilderH5RegionDataset(h5obj, self)
690 kwargs['dtype'] = d.dtype
691 elif isinstance(elem1, Reference):  # 691 ↛ 701: condition was never false
692 d = BuilderH5ReferenceDataset(h5obj, self)
693 kwargs['dtype'] = d.dtype
694 elif h5obj.dtype.kind == 'V': # table / compound data type
695 cpd_dt = h5obj.dtype
696 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))]
697 d = BuilderH5TableDataset(h5obj, self, ref_cols)
698 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype)
699 else:
700 d = h5obj
701 kwargs["data"] = d
702 ret = DatasetBuilder(name, **kwargs)
703 ret.location = os.path.dirname(h5obj.name)
704 self.__set_written(ret)
705 return ret
707 def _check_str_dtype(self, h5obj):
708 dtype = h5obj.dtype
709 if dtype.kind == 'O':  # 709 ↛ 712: condition was never false
710 if dtype.metadata.get('vlen') == str and H5PY_3:  # 710 ↛ 712: condition was never false
711 return StrDataset(h5obj, None)
712 return h5obj
714 @classmethod
715 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype):
716 ret = []
717 for name, dtype in zip(h5obj_dtype.fields, dset_dtype):
718 ret.append({'name': name, 'dtype': dtype})
719 return ret
721 def __read_attrs(self, h5obj):
722 ret = dict()
723 for k, v in h5obj.attrs.items():
724 if k == SPEC_LOC_ATTR: # ignore cached spec
725 continue
726 if isinstance(v, RegionReference):  # 726 ↛ 727: condition was never true
727 raise ValueError("cannot read region reference attributes yet")
728 elif isinstance(v, Reference):
729 ret[k] = self.__read_ref(h5obj.file[v])
730 else:
731 ret[k] = v
732 return ret
734 def __read_ref(self, h5obj):
735 ret = None
736 ret = self.__get_built(h5obj.file.filename, h5obj.id)
737 if ret is None:
738 if isinstance(h5obj, Dataset):
739 ret = self.__read_dataset(h5obj)
740 elif isinstance(h5obj, Group):  # 740 ↛ 743: condition was never false
741 ret = self.__read_group(h5obj)
742 else:
743 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj))
744 self.__set_built(h5obj.file.filename, h5obj.id, ret)
745 return ret
747 def open(self):
748 if self.__file is None:
749 open_flag = self.__mode
750 kwargs = dict(rdcc_nbytes=RDCC_NBYTES)
751 if self.comm:  # 751 ↛ 752: condition was never true
752 kwargs.update(driver='mpio', comm=self.comm)
754 if self.driver is not None:  # 754 ↛ 755: condition was never true
755 kwargs.update(driver=self.driver)
757 self.__file = File(self.source, open_flag, **kwargs)
759 def close(self, close_links=True):
760 """Close this file and any files linked to from this file.
762 :param close_links: Whether to close all files linked to from this file. (default: True)
763 :type close_links: bool
764 """
765 if close_links:
766 self.close_linked_files()
767 try:
768 if self.__file is not None:
769 self.__file.close()
770 except AttributeError:
771 # Do nothing if self.__file does not exist. This
772 # may happen if an error occurs before HDF5IO has been fully
773 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises
774 # an error before self.__file has been created
775 self.__file = None
777 def close_linked_files(self):
778 """Close all opened, linked-to files.
780 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does
781 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened,
782 linked-to files.
783 """
784 # Make sure self.__open_links is always reset, even if an error occurs while closing
785 try:
786 for obj in self.__open_links:
787 if obj:
788 obj.file.close()
789 except AttributeError:
790 # Do nothing if self.__open_links does not exist. This
791 # may happen if an error occurs before HDF5IO has been fully
792 # set up in __init__, e.g., if a child class (such as NWBHDF5IO) raises
793 # an error before self.__open_links has been created.
794 pass
795 finally:
796 self.__open_links = []
798 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'},
799 {'name': 'link_data', 'type': bool,
800 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
801 {'name': 'exhaust_dci', 'type': bool,
802 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
803 'default': True},
804 {'name': 'export_source', 'type': str,
805 'doc': 'The source of the builders when exporting', 'default': None})
806 def write_builder(self, **kwargs):
807 f_builder = popargs('builder', kwargs)
808 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
809 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
810 % (f_builder.name, self.source, kwargs))
811 for name, gbldr in f_builder.groups.items():
812 self.write_group(self.__file, gbldr, **kwargs)
813 for name, dbldr in f_builder.datasets.items():
814 self.write_dataset(self.__file, dbldr, **kwargs)
815 for name, lbldr in f_builder.links.items():
816 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source"))
817 self.set_attributes(self.__file, f_builder.attributes)
818 self.__add_refs()
819 self.__dci_queue.exhaust_queue()
820 self.__set_written(f_builder)
821 self.logger.debug("Done writing %s '%s' to path '%s'" %
822 (f_builder.__class__.__qualname__, f_builder.name, self.source))
824 def __add_refs(self):
825 '''
826 Add all references in the file.
828 References get queued to be added at the end of write. This is because
829 the current traversal algorithm (i.e. iterating over GroupBuilder items)
830 does not happen in a guaranteed order. We need to figure out what objects
831 will be references, and then write them after we write everything else.
832 '''
833 failed = set()
834 while len(self.__ref_queue) > 0:
835 call = self.__ref_queue.popleft()
836 self.logger.debug("Adding reference with call id %d from queue (length %d)"
837 % (id(call), len(self.__ref_queue)))
838 try:
839 call()
840 except KeyError:
841 if id(call) in failed:
842 raise RuntimeError('Unable to resolve reference')
843 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call))
844 failed.add(id(call))
845 self.__ref_queue.append(call)
847 @classmethod
848 def get_type(cls, data):
849 if isinstance(data, str):
850 return H5_TEXT
851 elif isinstance(data, bytes):  # 851 ↛ 852: condition was never true
852 return H5_BINARY
853 elif isinstance(data, Container):  # 853 ↛ 854: condition was never true
854 return H5_REF
855 elif not hasattr(data, '__len__'):
856 return type(data)
857 else:
858 if len(data) == 0:
859 if hasattr(data, 'dtype'):  # 859 ↛ 860: condition was never true
860 return data.dtype
861 else:
862 raise ValueError('cannot determine type for empty data')
863 return cls.get_type(data[0])
865 __dtypes = {
866 "float": np.float32,
867 "float32": np.float32,
868 "double": np.float64,
869 "float64": np.float64,
870 "long": np.int64,
871 "int64": np.int64,
872 "int": np.int32,
873 "int32": np.int32,
874 "short": np.int16,
875 "int16": np.int16,
876 "int8": np.int8,
877 "uint64": np.uint64,
878 "uint": np.uint32,
879 "uint32": np.uint32,
880 "uint16": np.uint16,
881 "uint8": np.uint8,
882 "bool": np.bool_,
883 "text": H5_TEXT,
884 "utf": H5_TEXT,
885 "utf8": H5_TEXT,
886 "utf-8": H5_TEXT,
887 "ascii": H5_BINARY,
888 "bytes": H5_BINARY,
889 "ref": H5_REF,
890 "reference": H5_REF,
891 "object": H5_REF,
892 "region": H5_REGREF,
893 "isodatetime": H5_TEXT,
894 "datetime": H5_TEXT,
895 }
897 @classmethod
898 def __resolve_dtype__(cls, dtype, data):
899 # TODO: These values exist, but I haven't solved them yet
900 # binary
901 # number
902 dtype = cls.__resolve_dtype_helper__(dtype)
903 if dtype is None:
904 dtype = cls.get_type(data)
905 return dtype
907 @classmethod
908 def __resolve_dtype_helper__(cls, dtype):
909 if dtype is None:
910 return None
911 elif isinstance(dtype, str):
912 return cls.__dtypes.get(dtype)
913 elif isinstance(dtype, dict):
914 return cls.__dtypes.get(dtype['reftype'])
915 elif isinstance(dtype, np.dtype):
916 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types
917 return dtype
918 else:
919 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype])
921 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'},
922 {'name': 'attributes',
923 'type': dict,
924 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'})
925 def set_attributes(self, **kwargs):
926 obj, attributes = getargs('obj', 'attributes', kwargs)
927 for key, value in attributes.items():
928 try:
929 if isinstance(value, (set, list, tuple)):
930 tmp = tuple(value)
931 if len(tmp) > 0:
932 if isinstance(tmp[0], (str, bytes)):  # 932 ↛ 934: condition was never false
933 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0])))
934 elif isinstance(tmp[0], Container): # a list of references
935 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp))
936 else:
937 value = np.array(value)
938 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
939 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
940 obj.attrs[key] = value
941 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference
942 self.__queue_ref(self._make_attr_ref_filler(obj, key, value))
943 else:
944 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
945 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
946 if isinstance(value, np.ndarray) and value.dtype.kind == 'U':  # 946 ↛ 947: condition was never true
947 value = np.array(value, dtype=H5_TEXT)
948 obj.attrs[key] = value # a regular scalar
949 except Exception as e:
950 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name)
951 raise RuntimeError(msg) from e
953 def _make_attr_ref_filler(self, obj, key, value):
954 '''
955 Make the callable for setting references to attributes
956 '''
957 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s"
958 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
959 if isinstance(value, (tuple, list)):  # 959 ↛ 960: condition was never true
960 def _filler():
961 ret = list()
962 for item in value:
963 ret.append(self.__get_ref(item))
964 obj.attrs[key] = ret
965 else:
966 def _filler():
967 obj.attrs[key] = self.__get_ref(value)
968 return _filler
970 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
971 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'},
972 {'name': 'link_data', 'type': bool,
973 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
974 {'name': 'exhaust_dci', 'type': bool,
975 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
976 'default': True},
977 {'name': 'export_source', 'type': str,
978 'doc': 'The source of the builders when exporting', 'default': None},
979 returns='the Group that was created', rtype='Group')
980 def write_group(self, **kwargs):
981 parent, builder = popargs('parent', 'builder', kwargs)
982 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
983 if self.get_written(builder):
984 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name)
985 group = parent[builder.name]
986 else:
987 self.logger.debug(" Creating group '%s'" % builder.name)
988 group = parent.create_group(builder.name)
989 # write all groups
990 subgroups = builder.groups
991 if subgroups:
992 for subgroup_name, sub_builder in subgroups.items():
993 # do not create an empty group without attributes or links
994 self.write_group(group, sub_builder, **kwargs)
995 # write all datasets
996 datasets = builder.datasets
997 if datasets:
998 for dset_name, sub_builder in datasets.items():
999 self.write_dataset(group, sub_builder, **kwargs)
1000 # write all links
1001 links = builder.links
1002 if links:
1003 for link_name, sub_builder in links.items():
1004 self.write_link(group, sub_builder, export_source=kwargs.get("export_source"))
1005 attributes = builder.attributes
1006 self.set_attributes(group, attributes)
1007 self.__set_written(builder)
1008 return group
1010 def __get_path(self, builder):
1011 """Get the path to the builder.
1013 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored.
1014 If builder.location is set then it is used as the path, otherwise the function
1015 determines the path by constructing it iteratively from the parents of the
1016 builder.
1017 """
1018 if builder.location is not None:
1019 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/")
1020 else:
1021 curr = builder
1022 names = list()
1023 while curr.parent is not None:
1024 names.append(curr.name)
1025 curr = curr.parent
1026 delim = "/"
1027 path = "%s%s" % (delim, delim.join(reversed(names)))
1028 return path
1030 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
1031 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'},
1032 {'name': 'export_source', 'type': str,
1033 'doc': 'The source of the builders when exporting', 'default': None},
1034 returns='the Link that was created', rtype='Link')
1035 def write_link(self, **kwargs):
1036 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs)
1037 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1038 if self.get_written(builder):  # 1038 ↛ 1039: condition was never true
1039 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name)
1040 return None
1041 name = builder.name
1042 target_builder = builder.builder
1043 path = self.__get_path(target_builder)
1044 # source will indicate target_builder's location
1045 if export_source is None:
1046 write_source = builder.source
1047 else:
1048 write_source = export_source
1050 parent_filename = os.path.abspath(parent.file.filename)
1051 if target_builder.source in (write_source, parent_filename):
1052 link_obj = SoftLink(path)
1053 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1054 % (parent.name, name, link_obj.path))
1055 elif target_builder.source is not None:  # 1055 ↛ 1064: condition was never false
1056 target_filename = os.path.abspath(target_builder.source)
1057 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename))
1058 if target_builder.location is not None:
1059 path = target_builder.location + "/" + target_builder.name
1060 link_obj = ExternalLink(relative_path, path)
1061 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1062 % (parent.name, name, link_obj.filename, link_obj.path))
1063 else:
1064 msg = 'cannot create external link to %s' % path
1065 raise ValueError(msg)
1066 parent[name] = link_obj
1067 self.__set_written(builder)
1068 return link_obj
1070 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901
1071 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'},
1072 {'name': 'link_data', 'type': bool,
1073 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
1074 {'name': 'exhaust_dci', 'type': bool,
1075 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
1076 'default': True},
1077 {'name': 'export_source', 'type': str,
1078 'doc': 'The source of the builders when exporting', 'default': None},
1079 returns='the Dataset that was created', rtype=Dataset)
1080 def write_dataset(self, **kwargs): # noqa: C901
1081 """ Write a dataset to HDF5
1083 The function uses other dataset-dependent write functions, e.g.,
1084 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
1085 """
1086 parent, builder = popargs('parent', 'builder', kwargs)
1087 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
1088 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1089 if self.get_written(builder):
1090 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name)
1091 return None
1092 name = builder.name
1093 data = builder.data
1094 dataio = None
1095 options = dict() # dict with additional dataset creation options ('dtype' and 'io_settings')
1096 if isinstance(data, H5DataIO):
1097 options['io_settings'] = data.io_settings
1098 dataio = data
1099 link_data = data.link_data
1100 data = data.data
1101 else:
1102 options['io_settings'] = {}
1103 attributes = builder.attributes
1104 options['dtype'] = builder.dtype
1105 dset = None
1106 link = None
1108 # The user provided an existing h5py dataset as input and asked to create a link to the dataset
1109 if isinstance(data, Dataset):
1110 data_filename = os.path.abspath(data.file.filename)
1111 if link_data:
1112 if export_source is None: # not exporting
1113 parent_filename = os.path.abspath(parent.file.filename)
1114 if data_filename != parent_filename: # create external link to data
1115 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1116 link = ExternalLink(relative_path, data.name)
1117 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1118 % (parent.name, name, link.filename, link.path))
1119 else: # create soft link to dataset already in this file -- possible if mode == 'r+'
1120 link = SoftLink(data.name)
1121 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1122 % (parent.name, name, link.path))
1123 parent[name] = link
1124 else: # exporting
1125 export_source = os.path.abspath(export_source)
1126 parent_filename = os.path.abspath(parent.file.filename)
1127 if data_filename != export_source: # dataset is in different file than export source
1128 # possible if user adds a link to a dataset in a different file after reading export source
1129 # to memory
1130 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1131 link = ExternalLink(relative_path, data.name)
1132 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1133 % (parent.name, name, link.filename, link.path))
1134 parent[name] = link
1135 elif parent.name != data.parent.name: # dataset is in export source and has different path
1136 # so create a soft link to the dataset in this file
1137 # possible if user adds a link to a dataset in export source after reading to memory
1138 # TODO check that there is/will be still a dataset at data.name -- if the dataset has
1139 # been removed, then this link will be broken
1140 link = SoftLink(data.name)
1141 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1142 % (parent.name, name, link.path))
1143 parent[name] = link
1144 else: # dataset is in export source and has same path as the builder, so copy the dataset
1145 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1146 % (data.file.filename, data.name, parent.name, name))
1147 parent.copy(source=data,
1148 dest=parent,
1149 name=name,
1150 expand_soft=False,
1151 expand_external=False,
1152 expand_refs=False,
1153 without_attrs=True)
1154 dset = parent[name]
1155 else:
1156 # TODO add option for case where there are multiple links to the same dataset within a file:
1157 # instead of copying the dset N times, copy it once and create soft links to it within the file
1158 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1159 % (data.file.filename, data.name, parent.name, name))
1160 parent.copy(source=data,
1161 dest=parent,
1162 name=name,
1163 expand_soft=False,
1164 expand_external=False,
1165 expand_refs=False,
1166 without_attrs=True)
1167 dset = parent[name]
1169 # Write a compound dataset, i.e., a dataset with a compound data type
1170 elif isinstance(options['dtype'], list):
1171 # do some stuff to figure out what data is a reference
1172 refs = list()
1173 for i, dts in enumerate(options['dtype']):
1174 if self.__is_ref(dts):
1175 refs.append(i)
1176 # If one or more of the parts of the compound data type are references then we need to deal with those
1177 if len(refs) > 0:
1178 try:
1179 _dtype = self.__resolve_dtype__(options['dtype'], data)
1180 except Exception as exc:
1181 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1182 raise Exception(msg) from exc
1183 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1184 self.__set_written(builder)
1185 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1186 "object references. attributes: %s"
1187 % (name, list(attributes.keys())))
1189 @self.__queue_ref
1190 def _filler():
1191 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1192 "containing attributes: %s"
1193 % (name, list(attributes.keys())))
1194 ret = list()
1195 for item in data:
1196 new_item = list(item)
1197 for i in refs:
1198 new_item[i] = self.__get_ref(item[i])
1199 ret.append(tuple(new_item))
1200 dset = parent[name]
1201 dset[:] = ret
1202 self.set_attributes(dset, attributes)
1204 return
1205 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
1206 else:
1207 dset = self.__list_fill__(parent, name, data, options)
1208 # Write a dataset containing references, i.e., a region or object reference.
1209 # NOTE: we can ignore options['io_settings'] for scalar data
1210 elif self.__is_ref(options['dtype']):
1211 _dtype = self.__dtypes.get(options['dtype'])
1212 # Write a scalar data region reference dataset
1213 if isinstance(data, RegionBuilder):  # 1213 ↛ 1214: condition was never true
1214 dset = parent.require_dataset(name, shape=(), dtype=_dtype)
1215 self.__set_written(builder)
1216 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a "
1217 "region reference. attributes: %s"
1218 % (name, list(attributes.keys())))
1220 @self.__queue_ref
1221 def _filler():
1222 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' "
1223 "containing attributes: %s"
1224 % (name, list(attributes.keys())))
1225 ref = self.__get_ref(data.builder, data.region)
1226 dset = parent[name]
1227 dset[()] = ref
1228 self.set_attributes(dset, attributes)
1229 # Write a scalar object reference dataset
1230 elif isinstance(data, ReferenceBuilder):  # 1230 ↛ 1231: condition was never true
1231 dset = parent.require_dataset(name, dtype=_dtype, shape=())
1232 self.__set_written(builder)
1233 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an "
1234 "object reference. attributes: %s"
1235 % (name, list(attributes.keys())))
1237 @self.__queue_ref
1238 def _filler():
1239 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' "
1240 "containing attributes: %s"
1241 % (name, list(attributes.keys())))
1242 ref = self.__get_ref(data.builder)
1243 dset = parent[name]
1244 dset[()] = ref
1245 self.set_attributes(dset, attributes)
1246 # Write an array dataset of references
1247 else:
1248 # Write an array of region references
1249 if options['dtype'] == 'region':  # 1249 ↛ 1250: condition was never true
1250 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings'])
1251 self.__set_written(builder)
1252 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1253 "region references. attributes: %s"
1254 % (name, list(attributes.keys())))
1256 @self.__queue_ref
1257 def _filler():
1258 self.logger.debug("Resolving region references and setting attribute on dataset '%s' "
1259 "containing attributes: %s"
1260 % (name, list(attributes.keys())))
1261 refs = list()
1262 for item in data:
1263 refs.append(self.__get_ref(item.builder, item.region))
1264 dset = parent[name]
1265 dset[()] = refs
1266 self.set_attributes(dset, attributes)
1267 # Write array of object references
1268 else:
1269 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1270 self.__set_written(builder)
1271 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1272 "object references. attributes: %s"
1273 % (name, list(attributes.keys())))
1275 @self.__queue_ref
1276 def _filler():
1277 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1278 "containing attributes: %s"
1279 % (name, list(attributes.keys())))
1280 refs = list()
1281 for item in data:
1282 refs.append(self.__get_ref(item))
1283 dset = parent[name]
1284 dset[()] = refs
1285 self.set_attributes(dset, attributes)
1286 return
1287 # write a "regular" dataset
1288 else:
1289 # Create an empty dataset
1290 if data is None:
1291 dset = self.__setup_empty_dset__(parent, name, options['io_settings'])
1292 dataio.dataset = dset
1293 # Write a scalar dataset containing a single string
1294 elif isinstance(data, (str, bytes)):
1295 dset = self.__scalar_fill__(parent, name, data, options)
1296 # Iterative write of a data chunk iterator
1297 elif isinstance(data, AbstractDataChunkIterator):
1298 dset = self.__setup_chunked_dset__(parent, name, data, options)
1299 self.__dci_queue.append(dataset=dset, data=data)
1300 # Write a regular in memory array (e.g., numpy array, list etc.)
1301 elif hasattr(data, '__len__'):
1302 dset = self.__list_fill__(parent, name, data, options)
1303 # Write a regular scalar dataset
1304 else:
1305 dset = self.__scalar_fill__(parent, name, data, options)
1306 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link
1307 if link is None:
1308 self.set_attributes(dset, attributes)
1309 # Validate the attributes on the linked dataset
1310 elif len(attributes) > 0:
1311 pass
1312 self.__set_written(builder)
1313 if exhaust_dci:  # 1313 ↛ exit: condition was never false
1314 self.__dci_queue.exhaust_queue()
1316 @classmethod
1317 def __scalar_fill__(cls, parent, name, data, options=None):
1318 dtype = None
1319 io_settings = {}
1320 if options is not None:  # 1320 ↛ 1323: condition was never false
1321 dtype = options.get('dtype')
1322 io_settings = options.get('io_settings')
1323 if not isinstance(dtype, type):  # 1323 ↛ 1329: condition was never false
1324 try:
1325 dtype = cls.__resolve_dtype__(dtype, data)
1326 except Exception as exc:
1327 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1328 raise Exception(msg) from exc
1329 try:
1330 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings)
1331 except Exception as exc:
1332 msg = "Could not create scalar dataset %s in %s" % (name, parent.name)
1333 raise Exception(msg) from exc
1334 return dset
1336 @classmethod
1337 def __setup_chunked_dset__(cls, parent, name, data, options=None):
1338 """
1339 Set up a dataset for writing one chunk at a time based on the given DataChunkIterator
1341 :param parent: The parent object to which the dataset should be added
1342 :type parent: h5py.Group, h5py.File
1343 :param name: The name of the dataset
1344 :type name: str
1345 :param data: The data to be written.
1346 :type data: DataChunkIterator
1347 :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings'
1348 :type options: dict
1350 """
1351 io_settings = {}
1352 if options is not None:
1353 if 'io_settings' in options: 1353 ↛ 1356: line 1353 didn't jump to line 1356, because the condition on line 1353 was never false
1354 io_settings = options.get('io_settings')
1355 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write.
1356 if 'chunks' not in io_settings:
1357 recommended_chunks = data.recommended_chunk_shape()
1358 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks
1359 # Define the shape of the data if not provided by the user
1360 if 'shape' not in io_settings: 1360 ↛ 1363: line 1360 didn't jump to line 1363, because the condition on line 1360 was never false
1361 io_settings['shape'] = data.recommended_data_shape()
1362 # Define the maxshape of the data if not provided by the user
1363 if 'maxshape' not in io_settings:
1364 io_settings['maxshape'] = data.maxshape
1365 if 'dtype' not in io_settings: 1365 ↛ 1373: line 1365 didn't jump to line 1373, because the condition on line 1365 was never false
1366 if (options is not None) and ('dtype' in options):
1367 io_settings['dtype'] = options['dtype']
1368 else:
1369 io_settings['dtype'] = data.dtype
1370 if isinstance(io_settings['dtype'], str): 1370 ↛ 1372: line 1370 didn't jump to line 1372, because the condition on line 1370 was never true
1371 # map to real dtype if we were given a string
1372 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1373 try:
1374 dset = parent.create_dataset(name, **io_settings)
1375 except Exception as exc:
1376 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1377 return dset
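The io_settings defaults above come straight from the iterator: recommended_chunk_shape() seeds 'chunks' (with h5py auto-chunking as the fallback), recommended_data_shape() seeds the initial 'shape', and maxshape and dtype fill in the rest. A sketch of the same defaulting outside HDF5IO, assuming hdmf.data_utils.DataChunkIterator and a made-up file name:

import numpy as np
import h5py
from hdmf.data_utils import DataChunkIterator

dci = DataChunkIterator(data=np.arange(100).reshape(20, 5), buffer_size=4)
io_settings = {
    'chunks': dci.recommended_chunk_shape() or True,  # True lets h5py pick a chunk shape
    'shape': dci.recommended_data_shape(),            # initial shape; may grow during iteration
    'maxshape': dci.maxshape,                         # axes marked None stay resizable
    'dtype': dci.dtype,
}
with h5py.File("chunked_example.h5", "w") as f:
    dset = f.create_dataset("data", **io_settings)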
1379 @classmethod
1380 def __setup_empty_dset__(cls, parent, name, io_settings):
1381 """
1382 Set up an empty dataset, to be written to later, from explicit creation settings
1384 :param parent: The parent object to which the dataset should be added
1385 :type parent: h5py.Group, h5py.File
1386 :param name: The name of the dataset
1387 :type name: str
1388 :param io_settings: Dict of h5py dataset creation settings; must include 'shape' and 'dtype'
1389 :type io_settings: dict
1393 """
1394 # Define the shape of the data if not provided by the user
1395 if 'shape' not in io_settings:
1396 raise ValueError(f"Cannot set up empty dataset {pp(parent.name, name)} without shape")
1397 if 'dtype' not in io_settings:
1398 raise ValueError(f"Cannot set up empty dataset {pp(parent.name, name)} without dtype")
1399 if isinstance(io_settings['dtype'], str):
1400 # map to real dtype if we were given a string
1401 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1402 try:
1403 dset = parent.create_dataset(name, **io_settings)
1404 except Exception as exc:
1405 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1406 return dset
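An empty dataset can only be created when 'shape' and 'dtype' are supplied explicitly, since there is no data to infer them from; in hdmf these settings typically arrive through an H5DataIO that wraps data=None (the data is None branch of write_dataset above). A plain-h5py sketch with illustrative values:

import h5py

# 'shape' and 'dtype' are required; 'maxshape' keeps the first axis resizable for later writes
io_settings = {'shape': (0, 3), 'dtype': 'f8', 'maxshape': (None, 3)}
with h5py.File("empty_example.h5", "w") as f:
    dset = f.create_dataset("placeholder", **io_settings)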
1408 @classmethod
1409 def __chunked_iter_fill__(cls, parent, name, data, options=None):
1410 """
1411 Write data to a dataset one-chunk-at-a-time based on the given DataChunkIterator
1413 :param parent: The parent object to which the dataset should be added
1414 :type parent: h5py.Group, h5py.File
1415 :param name: The name of the dataset
1416 :type name: str
1417 :param data: The data to be written.
1418 :type data: DataChunkIterator
1419 :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings'
1420 :type options: dict
1422 """
1423 dset = cls.__setup_chunked_dset__(parent, name, data, options=options)
1424 read = True
1425 while read:
1426 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data)
1427 return dset
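The loop above drains the iterator one chunk at a time through HDF5IODataChunkIteratorQueue._write_chunk, growing the dataset as chunks arrive. Roughly the same pattern, hand-rolled with plain h5py and a generic row generator (all names and values here are made up for illustration):

import numpy as np
import h5py

def chunk_source():
    for i in range(4):
        yield np.full((5,), i, dtype='f8')   # four rows of five values each

with h5py.File("iter_example.h5", "w") as f:
    dset = f.create_dataset("data", shape=(0, 5), maxshape=(None, 5), dtype='f8', chunks=True)
    for row in chunk_source():
        dset.resize(dset.shape[0] + 1, axis=0)   # grow along the unlimited axis
        dset[-1, :] = row                        # write the newly appended row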
1429 @classmethod
1430 def __list_fill__(cls, parent, name, data, options=None):
1431 # define the io settings and data type if necessary
1432 io_settings = {}
1433 dtype = None
1434 if options is not None:
1435 dtype = options.get('dtype')
1436 io_settings = options.get('io_settings')
1437 if not isinstance(dtype, type):
1438 try:
1439 dtype = cls.__resolve_dtype__(dtype, data)
1440 except Exception as exc:
1441 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1442 raise Exception(msg) from exc
1443 # define the data shape
1444 if 'shape' in io_settings: 1444 ↛ 1445: line 1444 didn't jump to line 1445, because the condition on line 1444 was never true
1445 data_shape = io_settings.pop('shape')
1446 elif hasattr(data, 'shape'):
1447 data_shape = data.shape
1448 elif isinstance(dtype, np.dtype):
1449 data_shape = (len(data),)
1450 else:
1451 data_shape = get_data_shape(data)
1453 # Create the dataset
1454 try:
1455 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
1456 except Exception as exc:
1457 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \
1458 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc))
1459 raise Exception(msg) from exc
1460 # Write the data
1461 if len(data) > dset.shape[0]: 1461 ↛ 1462: line 1461 didn't jump to line 1462, because the condition on line 1461 was never true
1462 new_shape = list(dset.shape)
1463 new_shape[0] = len(data)
1464 dset.resize(new_shape)
1465 try:
1466 dset[:] = data
1467 except Exception as e:
1468 raise e
1469 return dset
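__list_fill__ creates the dataset with the resolved shape and dtype and then assigns the whole array at once with dset[:] = data. The same pattern in plain h5py; variable-length strings are the case where the explicit dtype matters most (file and dataset names are illustrative):

import h5py

with h5py.File("list_example.h5", "w") as f:
    text_dtype = h5py.special_dtype(vlen=str)   # variable-length UTF-8 strings
    dset = f.create_dataset("words", shape=(3,), dtype=text_dtype)
    dset[:] = ["alpha", "beta", "gamma"]        # same dset[:] = data pattern as above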
1471 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference',
1472 'default': None},
1473 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object',
1474 'default': None},
1475 returns='the reference', rtype=Reference)
1476 def __get_ref(self, **kwargs):
1477 container, region = getargs('container', 'region', kwargs)
1478 if container is None: 1478 ↛ 1479: line 1478 didn't jump to line 1479, because the condition on line 1478 was never true
1479 return None
1480 if isinstance(container, Builder):
1481 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1482 if isinstance(container, LinkBuilder): 1482 ↛ 1483: line 1482 didn't jump to line 1483, because the condition on line 1482 was never true
1483 builder = container.target_builder
1484 else:
1485 builder = container
1486 elif isinstance(container, ReferenceBuilder):
1487 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name))
1488 builder = container.builder
1489 else:
1490 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1491 builder = self.manager.build(container)
1492 path = self.__get_path(builder)
1493 self.logger.debug("Getting reference at path '%s'" % path)
1494 if isinstance(container, RegionBuilder): 1494 ↛ 1495: line 1494 didn't jump to line 1495, because the condition on line 1494 was never true
1495 region = container.region
1496 if region is not None: 1496 ↛ 1497: line 1496 didn't jump to line 1497, because the condition on line 1496 was never true
1497 dset = self.__file[path]
1498 if not isinstance(dset, Dataset):
1499 raise ValueError('cannot create region reference without Dataset')
1500 return self.__file[path].regionref[region]
1501 else:
1502 return self.__file[path].ref
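__get_ref resolves a builder (or container) to its HDF5 path and returns either the object reference at that path or, when a region is given, a region reference into the dataset. The underlying h5py operations look like this (file, dataset, and region values are made up):

import numpy as np
import h5py

ref_dtype = h5py.special_dtype(ref=h5py.Reference)            # object references
regref_dtype = h5py.special_dtype(ref=h5py.RegionReference)   # region references

with h5py.File("ref_example.h5", "w") as f:
    target = f.create_dataset("table", data=np.arange(10))
    obj_ref = target.ref              # like __get_ref(container) without a region
    reg_ref = target.regionref[2:5]   # like __get_ref(container, region=slice(2, 5))
    refs = f.create_dataset("refs", shape=(1,), dtype=ref_dtype)
    refs[0] = obj_ref
    regs = f.create_dataset("regs", shape=(1,), dtype=regref_dtype)
    regs[0] = reg_ref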
1504 def __is_ref(self, dtype):
1505 if isinstance(dtype, DtypeSpec):
1506 return self.__is_ref(dtype.dtype)
1507 if isinstance(dtype, RefSpec):
1508 return True
1509 if isinstance(dtype, dict): # may be dict from reading a compound dataset
1510 return self.__is_ref(dtype['dtype'])
1511 if isinstance(dtype, str):
1512 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE
1513 return False
1515 def __queue_ref(self, func):
1516 '''Queue a deferred filler that writes references into a dataset.
1518 The queued callable runs later, once every reference target exists in the file.
1520 Args:
1521 func: a callable that resolves the object references and writes them,
1522 along with any related attributes, to the already-created dataset
1525 '''
1526 # TODO: come up with more intelligent way of
1527 # queueing reference resolution, based on reference
1528 # dependency
1529 self.__ref_queue.append(func)
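Because a reference may point at an object that has not been written yet, __queue_ref only stores the filler; the queued callables run later, when the reference queue is drained after the rest of the hierarchy exists. The deferred-fill pattern itself, stripped of any HDF5 (all names here are illustrative):

ref_queue = []

def queue_ref(func):
    ref_queue.append(func)            # remember the filler; its targets may not exist yet
    return func

targets = {}

@queue_ref
def _filler():
    # by the time this runs, every target has been written and can be referenced
    return [targets[name] for name in ("a", "b")]

targets.update(a=1, b=2)              # targets become available later in the write
resolved = [f() for f in ref_queue]   # draining the queue resolves all references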
1531 def __rec_get_ref(self, ref_list):
1532 ret = list()
1533 for elem in ref_list:
1534 if isinstance(elem, (list, tuple)):
1535 ret.append(self.__rec_get_ref(elem))
1536 elif isinstance(elem, (Builder, Container)):
1537 ret.append(self.__get_ref(elem))
1538 else:
1539 ret.append(elem)
1540 return ret
1542 @property
1543 def mode(self):
1544 """
1545 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x").
1546 """
1547 return self.__mode
1549 @classmethod
1550 @docval(*get_docval(H5DataIO.__init__))
1551 def set_dataio(cls, **kwargs):
1552 """
1553 Wrap the given Data object with an H5DataIO.
1555 This method is provided merely for convenience. It is the equivalent
1556 of the following:
1558 .. code-block:: python
1560 from hdmf.backends.hdf5 import H5DataIO
1561 data = ...
1562 data = H5DataIO(data)
1563 """
1564 return H5DataIO(**kwargs)
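Because set_dataio takes the same keyword arguments as H5DataIO (via get_docval above) and forwards them, it can also carry dataset creation options. A hedged usage sketch with made-up argument values, assuming the forwarding shown above:

import numpy as np
from hdmf.backends.hdf5 import HDF5IO, H5DataIO

wrapped = HDF5IO.set_dataio(data=np.arange(1000), chunks=(100,), compression='gzip')
# equivalent to wrapping the data directly:
wrapped = H5DataIO(data=np.arange(1000), chunks=(100,), compression='gzip')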