Coverage for src/hdmf/backends/hdf5/h5tools.py: 87%
905 statements
coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
1import logging
2import os.path
3import warnings
4from collections import deque
5from functools import partial
6from pathlib import Path, PurePosixPath as pp
8import numpy as np
9import h5py
10from h5py import File, Group, Dataset, special_dtype, SoftLink, ExternalLink, Reference, RegionReference, check_dtype
12from .h5_utils import (BuilderH5ReferenceDataset, BuilderH5RegionDataset, BuilderH5TableDataset, H5DataIO,
13 H5SpecReader, H5SpecWriter, HDF5IODataChunkIteratorQueue)
14from ..io import HDMFIO
15from ..errors import UnsupportedOperation
16from ..warnings import BrokenLinkWarning
17from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder,
18 ReferenceBuilder, TypeMap, ObjectMapper)
19from ...container import Container
20from ...term_set import TermSetWrapper
21from ...data_utils import AbstractDataChunkIterator
22from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
23from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset
24from ..utils import NamespaceToBuilderHelper, WriteStatusTracker
26ROOT_NAME = 'root'
27SPEC_LOC_ATTR = '.specloc'
28H5_TEXT = special_dtype(vlen=str)
29H5_BINARY = special_dtype(vlen=bytes)
30H5_REF = special_dtype(ref=Reference)
31H5_REGREF = special_dtype(ref=RegionReference)
33RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB
35H5PY_3 = h5py.__version__.startswith('3')
38class HDF5IO(HDMFIO):
40 __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group
42 @staticmethod
43 def can_read(path):
44 """Determines whether a given path is readable by the HDF5IO class"""
45 if not os.path.isfile(path):
46 return False
47 try:
48 with h5py.File(path, "r"):
49 return True
50 except IOError:
51 return False
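# Illustrative usage sketch for ``can_read`` and the constructor; 'data.h5' and
# 'manager' are placeholders assumed to exist in the caller's code:
#
#     if HDF5IO.can_read("data.h5"):
#         with HDF5IO("data.h5", mode="r", manager=manager) as io:  # 'manager' is an assumed BuildManager
#             container = io.read()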
53 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
54 {'name': 'mode', 'type': str,
55 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). '
56 'See `h5py.File <http://docs.h5py.org/en/latest/high/file.html#opening-creating-files>`_ for '
57 'more details.'),
58 'default': 'r'},
59 {'name': 'manager', 'type': (TypeMap, BuildManager),
60 'doc': 'the BuildManager or a TypeMap to construct a BuildManager to use for I/O', 'default': None},
61 {'name': 'comm', 'type': 'Intracomm',
62 'doc': 'the MPI communicator to use for parallel I/O', 'default': None},
63 {'name': 'file', 'type': [File, "S3File", "RemFile"],
64 'doc': 'a pre-existing h5py.File, S3File, or RemFile object', 'default': None},
65 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
66 {'name': 'herd_path', 'type': str,
67 'doc': 'The path to read/write the HERD file', 'default': None},)
68 def __init__(self, **kwargs):
69 """Open an HDF5 file for IO.
70 """
71 self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
72 path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode',
73 'comm', 'file', 'driver',
74 'herd_path',
75 kwargs)
77 self.__open_links = [] # keep track of other files opened from links in this file
78 self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close
80 path = self.__check_path_file_obj(path, file_obj)
82 if file_obj is None and not os.path.exists(path) and (mode == 'r' or mode == 'r+') and driver != 'ros3':
83 msg = "Unable to open file %s in '%s' mode. File does not exist." % (path, mode)
84 raise UnsupportedOperation(msg)
86 if file_obj is None and os.path.exists(path) and (mode == 'w-' or mode == 'x'):
87 msg = "Unable to open file %s in '%s' mode. File already exists." % (path, mode)
88 raise UnsupportedOperation(msg)
90 if manager is None:
91 manager = BuildManager(TypeMap(NamespaceCatalog()))
92 elif isinstance(manager, TypeMap): 92 ↛ 93: line 92 didn't jump to line 93, because the condition on line 92 was never true
93 manager = BuildManager(manager)
94 self.__driver = driver
95 self.__comm = comm
96 self.__mode = mode
97 self.__file = file_obj
98 super().__init__(manager, source=path, herd_path=herd_path)
99 # NOTE: source is not set if path is None and file_obj is passed
100 self.__built = dict() # keep track of each builder for each dataset/group/link for each file
101 self.__read = dict() # keep track of which files have been read; maps the open file object to its root GroupBuilder
102 self.__ref_queue = deque() # a queue of the references that need to be added
103 self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted
104 ObjectMapper.no_convert(Dataset)
105 self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object
107 @property
108 def comm(self):
109 """The MPI communicator to use for parallel I/O."""
110 return self.__comm
112 @property
113 def _file(self):
114 return self.__file
116 @property
117 def driver(self):
118 return self.__driver
120 @classmethod
121 def __check_path_file_obj(cls, path, file_obj):
122 if isinstance(path, Path):
123 path = str(path)
125 if path is None and file_obj is None:
126 raise ValueError("Either the 'path' or 'file' argument must be supplied.")
128 if path is not None and file_obj is not None: # consistency check
129 if os.path.abspath(file_obj.filename) != os.path.abspath(path):
130 msg = ("You argued '%s' as this object's path, but supplied a file with filename: %s"
131 % (path, file_obj.filename))
132 raise ValueError(msg)
134 return path
136 @classmethod
137 def __resolve_file_obj(cls, path, file_obj, driver):
138 path = cls.__check_path_file_obj(path, file_obj)
140 if file_obj is None:
141 file_kwargs = dict()
142 if driver is not None: 142 ↛ 143: line 142 didn't jump to line 143, because the condition on line 142 was never true
143 file_kwargs.update(driver=driver)
144 file_obj = File(path, 'r', **file_kwargs)
145 return file_obj
147 @classmethod
148 @docval({'name': 'namespace_catalog', 'type': (NamespaceCatalog, TypeMap),
149 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'},
150 {'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
151 {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None},
152 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
153 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
154 returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and "
155 "the included data types"),
156 rtype=dict)
157 def load_namespaces(cls, **kwargs):
158 """Load cached namespaces from a file.
160 If `file` is not supplied, then an :py:class:`h5py.File` object will be opened for the given `path`, the
161 namespaces will be read, and the File object will be closed. If `file` is supplied, then
162 the given File object will be read from and not closed.
164 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
165 """
166 namespace_catalog, path, namespaces, file_obj, driver = popargs(
167 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs)
169 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
170 if file_obj is None: # need to close the file object that we just opened
171 with open_file_obj:
172 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
173 return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
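# Illustrative usage sketch for ``load_namespaces``; 'data.h5' is an assumed file
# with cached specifications:
#
#     type_map = TypeMap(NamespaceCatalog())
#     loaded = HDF5IO.load_namespaces(type_map, "data.h5")
#     io = HDF5IO("data.h5", mode="r", manager=BuildManager(type_map))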
175 @classmethod
176 def __load_namespaces(cls, namespace_catalog, namespaces, file_obj):
177 d = {}
179 if not cls.__check_specloc(file_obj):
180 return d
182 namespace_versions = cls.__get_namespaces(file_obj)
184 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
185 if namespaces is None: 185 ↛ 188: line 185 didn't jump to line 188, because the condition on line 185 was never false
186 namespaces = list(spec_group.keys())
188 readers = dict()
189 deps = dict()
190 for ns in namespaces:
191 latest_version = namespace_versions[ns]
192 ns_group = spec_group[ns][latest_version]
193 reader = H5SpecReader(ns_group)
194 readers[ns] = reader
195 # for each namespace in the 'namespace' dataset, track all included namespaces (dependencies)
196 for spec_ns in reader.read_namespace(cls.__ns_spec_path):
197 deps[ns] = list()
198 for s in spec_ns['schema']:
199 dep = s.get('namespace')
200 if dep is not None:
201 deps[ns].append(dep)
203 order = cls._order_deps(deps)
204 for ns in order:
205 reader = readers[ns]
206 d.update(namespace_catalog.load_namespaces(cls.__ns_spec_path, reader=reader))
208 return d
210 @classmethod
211 def __check_specloc(cls, file_obj):
212 return SPEC_LOC_ATTR in file_obj.attrs
214 @classmethod
215 @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
216 {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
217 {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
218 returns="dict mapping names to versions of the namespaces in the file", rtype=dict)
219 def get_namespaces(cls, **kwargs):
220 """Get the names and versions of the cached namespaces from a file.
222 If ``file`` is not supplied, then an :py:class:`h5py.File` object will be opened for the given ``path``, the
223 namespaces will be read, and the File object will be closed. If `file` is supplied, then
224 the given File object will be read from and not closed.
226 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
227 ordering) is returned. This is the version of the namespace that is loaded by HDF5IO.load_namespaces(...).
229 :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
230 """
231 path, file_obj, driver = popargs('path', 'file', 'driver', kwargs)
233 open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
234 if file_obj is None: # need to close the file object that we just opened
235 with open_file_obj:
236 return cls.__get_namespaces(open_file_obj)
237 return cls.__get_namespaces(open_file_obj)
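# Illustrative usage sketch for ``get_namespaces``; the returned mapping depends on
# what is cached in the file (example values below are hypothetical):
#
#     versions = HDF5IO.get_namespaces(path="data.h5")
#     # e.g., {'hdmf-common': '1.8.0', 'my-extension': '0.1.0'}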
239 @classmethod
240 def __get_namespaces(cls, file_obj):
241 """Return a dict mapping namespace name to version string for the latest version of that namespace in the file.
243 If there are multiple versions of a namespace cached in the file, then only the latest one (using alphanumeric
244 ordering) is returned. This is the version of the namespace that is loaded by ``HDF5IO.load_namespaces``.
245 """
246 used_version_names = dict()
247 if not cls.__check_specloc(file_obj):
248 return used_version_names
250 spec_group = file_obj[file_obj.attrs[SPEC_LOC_ATTR]]
251 namespaces = list(spec_group.keys())
252 for ns in namespaces:
253 ns_group = spec_group[ns]
254 # NOTE: by default, objects within groups are iterated in alphanumeric order
255 version_names = list(ns_group.keys())
256 if len(version_names) > 1:
257 # prior to HDMF 1.6.1, extensions without a version were written under the group name "unversioned"
258 # make sure that if there is another group representing a newer version, that is read instead
259 if 'unversioned' in version_names:
260 version_names.remove('unversioned')
261 if len(version_names) > 1:
262 # as of HDMF 1.6.1, extensions without a version are written under the group name "None"
263 # make sure that if there is another group representing a newer version, that is read instead
264 if 'None' in version_names:
265 version_names.remove('None')
266 used_version_names[ns] = version_names[-1] # save the largest in alphanumeric order
268 return used_version_names
270 @classmethod
271 def _order_deps(cls, deps):
272 """
273 Order namespaces according to dependency for loading into a NamespaceCatalog
275 Args:
276 deps (dict): a dictionary that maps a namespace name to a list of name of
277 the namespaces on which the namespace is directly dependent
278 Example: {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
279 Expected output: ['d', 'b', 'c', 'a']
280 """
281 order = list()
282 keys = list(deps.keys())
283 deps = dict(deps)
284 for k in keys:
285 if k in deps:
286 cls.__order_deps_aux(order, deps, k)
287 return order
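# Worked example for ``_order_deps`` using the case from its docstring:
#
#     deps = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
#     HDF5IO._order_deps(deps)  # -> ['d', 'b', 'c', 'a']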
289 @classmethod
290 def __order_deps_aux(cls, order, deps, key):
291 """
292 A recursive helper function for _order_deps
293 """
294 if key not in deps:
295 return
296 subdeps = deps.pop(key)
297 for subk in subdeps:
298 cls.__order_deps_aux(order, deps, subk)
299 order.append(key)
301 @classmethod
302 @docval({'name': 'source_filename', 'type': str, 'doc': 'the path to the HDF5 file to copy'},
303 {'name': 'dest_filename', 'type': str, 'doc': 'the name of the destination file'},
304 {'name': 'expand_external', 'type': bool, 'doc': 'expand external links into new objects', 'default': True},
305 {'name': 'expand_refs', 'type': bool, 'doc': 'copy objects which are pointed to by reference',
306 'default': False},
307 {'name': 'expand_soft', 'type': bool, 'doc': 'expand soft links into new objects', 'default': False}
308 )
309 def copy_file(self, **kwargs):
310 """
311 Convenience function to copy an HDF5 file while allowing external links to be resolved.
313 .. warning::
315 As of HDMF 2.0, this method is no longer supported and may be removed in a future version.
316 Please use the export method or h5py.File.copy method instead.
318 .. note::
320 The source file will be opened in 'r' mode and the destination file will be opened in 'w' mode
321 using h5py. To avoid possible collisions, care should be taken that, e.g., the source file is
322 not opened already when calling this function.
324 """
326 warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of "
327 "HDMF. Please use the export method or h5py.File.copy method instead.", DeprecationWarning)
329 source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename',
330 'dest_filename',
331 'expand_external',
332 'expand_refs',
333 'expand_soft',
334 kwargs)
335 source_file = File(source_filename, 'r')
336 dest_file = File(dest_filename, 'w')
337 for objname in source_file["/"].keys():
338 source_file.copy(source=objname,
339 dest=dest_file,
340 name=objname,
341 expand_external=expand_external,
342 expand_refs=expand_refs,
343 expand_soft=expand_soft,
344 shallow=False,
345 without_attrs=False,
346 )
347 for objname in source_file['/'].attrs:
348 dest_file['/'].attrs[objname] = source_file['/'].attrs[objname]
349 source_file.close()
350 dest_file.close()
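# Sketch of the recommended alternative to the deprecated ``copy_file``; file names
# and 'manager' are placeholders:
#
#     with HDF5IO("old.h5", mode="r", manager=manager) as src_io:
#         HDF5IO.export_io(path="copy.h5", src_io=src_io)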
352 @docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'},
353 {'name': 'cache_spec', 'type': bool,
354 'doc': ('If True (default), cache specification to file (highly recommended). If False, do not cache '
355 'specification to file. The appropriate specification will then need to be loaded prior to '
356 'reading the file.'),
357 'default': True},
358 {'name': 'link_data', 'type': bool,
359 'doc': 'If True (default), create external links to HDF5 Datasets. If False, copy HDF5 Datasets.',
360 'default': True},
361 {'name': 'exhaust_dci', 'type': bool,
362 'doc': 'If True (default), exhaust DataChunkIterators one at a time. If False, exhaust them concurrently.',
363 'default': True},
364 {'name': 'herd', 'type': 'HERD',
365 'doc': 'A HERD object to populate with references.',
366 'default': None})
367 def write(self, **kwargs):
368 """Write the container to an HDF5 file."""
369 if self.__mode == 'r':
370 raise UnsupportedOperation(("Cannot write to file %s in mode '%s'. "
371 "Please use mode 'r+', 'w', 'w-', 'x', or 'a'")
372 % (self.source, self.__mode))
374 cache_spec = popargs('cache_spec', kwargs)
375 super().write(**kwargs)
376 if cache_spec:
377 self.__cache_spec()
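# Illustrative usage sketch for ``write``; 'container' and 'manager' are assumed to
# come from the caller:
#
#     with HDF5IO("out.h5", mode="w", manager=manager) as io:
#         io.write(container, cache_spec=True)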
379 def __cache_spec(self):
380 ref = self.__file.attrs.get(SPEC_LOC_ATTR)
381 spec_group = None
382 if ref is not None:
383 spec_group = self.__file[ref]
384 else:
385 path = 'specifications' # default group name under which cached specifications are stored
386 spec_group = self.__file.require_group(path)
387 self.__file.attrs[SPEC_LOC_ATTR] = spec_group.ref
388 ns_catalog = self.manager.namespace_catalog
389 for ns_name in ns_catalog.namespaces:
390 ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name)
391 namespace = ns_catalog.get_namespace(ns_name)
392 group_name = '%s/%s' % (ns_name, namespace.version)
393 if group_name in spec_group:
394 continue
395 ns_group = spec_group.create_group(group_name)
396 writer = H5SpecWriter(ns_group)
397 ns_builder.export(self.__ns_spec_path, writer=writer)
399 _export_args = (
400 {'name': 'src_io', 'type': 'HDMFIO', 'doc': 'the HDMFIO object for reading the data to export'},
401 {'name': 'container', 'type': Container,
402 'doc': ('the Container object to export. If None, then the entire contents of the HDMFIO object will be '
403 'exported'),
404 'default': None},
405 {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`',
406 'default': None},
407 {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file',
408 'default': True}
409 # clear_cache is an arg on HDMFIO.export but it is intended for internal usage
410 # so it is not available on HDF5IO
411 )
413 @docval(*_export_args)
414 def export(self, **kwargs):
415 """Export data read from a file from any backend to HDF5.
417 See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
418 """
419 if self.__mode != 'w':
420 raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
421 % (self.source, self.__mode))
423 src_io = getargs('src_io', kwargs)
424 write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
425 if write_args is None:
426 write_args = dict()
428 if not isinstance(src_io, HDF5IO) and write_args.get('link_data', True):
429 raise UnsupportedOperation("Cannot export from non-HDF5 backend %s to HDF5 with write argument "
430 "link_data=True." % src_io.__class__.__name__)
432 write_args['export_source'] = os.path.abspath(src_io.source) if src_io.source is not None else None
433 ckwargs = kwargs.copy()
434 ckwargs['write_args'] = write_args
435 if not write_args.get('link_data', True):
436 ckwargs['clear_cache'] = True
437 super().export(**ckwargs)
438 if cache_spec:
439 # add any namespaces from the src_io that have not yet been loaded
440 for namespace in src_io.manager.namespace_catalog.namespaces:
441 if namespace not in self.manager.namespace_catalog.namespaces: 441 ↛ 440: line 441 didn't jump to line 440, because the condition on line 441 was never false
442 self.manager.namespace_catalog.add_namespace(
443 name=namespace,
444 namespace=src_io.manager.namespace_catalog.get_namespace(namespace)
445 )
446 self.__cache_spec()
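# Illustrative usage sketch for ``export``; the destination IO must be opened in
# mode 'w', as enforced above. File names and 'manager' are placeholders:
#
#     with HDF5IO("old.h5", mode="r", manager=manager) as read_io:
#         with HDF5IO("new.h5", mode="w") as write_io:
#             write_io.export(src_io=read_io)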
448 @classmethod
449 @docval({'name': 'path', 'type': str, 'doc': 'the path to the destination HDF5 file'},
450 {'name': 'comm', 'type': 'Intracomm', 'doc': 'the MPI communicator to use for parallel I/O',
451 'default': None},
452 *_export_args) # NOTE: src_io is required and is the second positional argument
453 def export_io(self, **kwargs):
454 """Export from one backend to HDF5 (class method).
456 Convenience function for :py:meth:`export` where you do not need to
457 instantiate a new ``HDF5IO`` object for writing. An ``HDF5IO`` object is created with mode 'w' and the given
458 arguments.
460 Example usage:
462 .. code-block:: python
464 old_io = HDF5IO('old.h5', 'r')
465 HDF5IO.export_io(path='new_copy.h5', src_io=old_io)
467 See :py:meth:`export` for more details.
468 """
469 path, comm = popargs('path', 'comm', kwargs)
471 with HDF5IO(path=path, comm=comm, mode='w') as write_io:
472 write_io.export(**kwargs)
474 def read(self, **kwargs):
475 if self.__mode == 'w' or self.__mode == 'w-' or self.__mode == 'x':
476 raise UnsupportedOperation("Cannot read from file %s in mode '%s'. Please use mode 'r', 'r+', or 'a'."
477 % (self.source, self.__mode))
478 try:
479 return super().read(**kwargs)
480 except UnsupportedOperation as e:
481 if str(e) == 'Cannot build data. There are no values.': # pragma: no cover
482 raise UnsupportedOperation("Cannot read data from file %s in mode '%s'. There are no values."
483 % (self.source, self.__mode))
485 @docval(returns='a GroupBuilder representing the data object', rtype='GroupBuilder')
486 def read_builder(self):
487 """
488 Read data and return the GroupBuilder representing it.
490 NOTE: On read, Builder.source will usually not be set on the individual Builders.
491 NOTE: The Builder.location is used internally to ensure correct handling of links (in particular on export)
492 and should be set on read for all GroupBuilder, DatasetBuilder, and LinkBuilder objects.
493 """
494 if not self.__file:
495 raise UnsupportedOperation("Cannot read data from closed HDF5 file '%s'" % self.source)
496 f_builder = self.__read.get(self.__file)
497 # ignore cached specs when reading builder
498 ignore = set()
499 specloc = self.__file.attrs.get(SPEC_LOC_ATTR)
500 if specloc is not None:
501 ignore.add(self.__file[specloc].name)
502 if f_builder is None:
503 f_builder = self.__read_group(self.__file, ROOT_NAME, ignore=ignore)
504 self.__read[self.__file] = f_builder
505 return f_builder
507 def __set_written(self, builder):
508 """
509 Helper function used to set the written status for builders
511 :param builder: Builder object to be marked as written
512 :type builder: Builder
513 """
514 self._written_builders.set_written(builder)
516 def get_written(self, builder):
517 """Return True if this builder has been written to (or read from) disk by this IO object, False otherwise.
519 :param builder: Builder object to get the written flag for
520 :type builder: Builder
522 :return: True if the builder is found in self._written_builders using the builder ID, False otherwise
523 """
524 return self._written_builders.get_written(builder)
526 def __set_built(self, fpath, id, builder):
527 """
528 Update self.__built to cache the given builder for the given file and id.
530 :param fpath: Path to the HDF5 file containing the object
531 :type fpath: str
532 :param id: ID of the HDF5 object in the path
533 :type id: h5py GroupID object
534 :param builder: The builder to be cached
535 """
536 self.__built.setdefault(fpath, dict()).setdefault(id, builder)
538 def __get_built(self, fpath, id):
539 """
540 Look up a builder for the given file and id in self.__built cache
542 :param fpath: Path to the HDF5 file containing the object
543 :type fpath: str
544 :param id: ID of the HDF5 object in the path
545 :type id: h5py GroupID object
547 :return: Builder in the self.__built cache or None
548 """
550 fdict = self.__built.get(fpath)
551 if fdict:
552 return fdict.get(id)
553 else:
554 return None
556 @docval({'name': 'h5obj', 'type': (Dataset, Group),
557 'doc': 'the HDF5 object to get the corresponding Builder object for'})
558 def get_builder(self, **kwargs):
559 """
560 Get the builder for the corresponding h5py Group or Dataset
562 :raises ValueError: When no builder has been constructed yet for the given h5py object
563 """
564 h5obj = getargs('h5obj', kwargs)
565 fpath = h5obj.file.filename
566 builder = self.__get_built(fpath, h5obj.id)
567 if builder is None: 567 ↛ 568: line 567 didn't jump to line 568, because the condition on line 567 was never true
568 msg = '%s:%s has not been built' % (fpath, h5obj.name)
569 raise ValueError(msg)
570 return builder
572 @docval({'name': 'h5obj', 'type': (Dataset, Group),
573 'doc': 'the HDF5 object to get the corresponding Container/Data object for'})
574 def get_container(self, **kwargs):
575 """
576 Get the container for the corresponding h5py Group or Dataset
578 :raises ValueError: When no builder has been constructed yet for the given h5py object
579 """
580 h5obj = getargs('h5obj', kwargs)
581 builder = self.get_builder(h5obj)
582 container = self.manager.construct(builder)
583 return container
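# Illustrative usage sketch for ``get_builder``/``get_container``; assumes the file has
# already been read so the builders are cached ('some/dataset' is a placeholder path):
#
#     container = io.read()
#     h5_dataset = io._file["some/dataset"]
#     sub_container = io.get_container(h5_dataset)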
585 def __read_group(self, h5obj, name=None, ignore=set()):
586 kwargs = {
587 "attributes": self.__read_attrs(h5obj),
588 "groups": dict(),
589 "datasets": dict(),
590 "links": dict()
591 }
593 for key, val in kwargs['attributes'].items():
594 if isinstance(val, bytes): 594 ↛ 595: line 594 didn't jump to line 595, because the condition on line 594 was never true
595 kwargs['attributes'][key] = val.decode('UTF-8')
597 if name is None:
598 name = str(os.path.basename(h5obj.name))
599 for k in h5obj:
600 sub_h5obj = h5obj.get(k)
601 if sub_h5obj is not None:
602 if sub_h5obj.name in ignore:
603 continue
604 link_type = h5obj.get(k, getlink=True)
605 if isinstance(link_type, (SoftLink, ExternalLink)):
606 # Reading links might be better handled in its own function
607 # get path of link (the key used for tracking what's been built)
608 target_path = link_type.path
609 target_obj = sub_h5obj.file[target_path]
610 builder_name = os.path.basename(target_path)
611 # get builder if already read, else build it
612 builder = self.__get_built(sub_h5obj.file.filename, target_obj.id)
613 if builder is None:
614 # NOTE: all links must have absolute paths
615 if isinstance(target_obj, Dataset):
616 builder = self.__read_dataset(target_obj, builder_name)
617 else:
618 builder = self.__read_group(target_obj, builder_name, ignore=ignore)
619 self.__set_built(sub_h5obj.file.filename, target_obj.id, builder)
620 link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename))
621 link_builder.location = h5obj.name
622 self.__set_written(link_builder)
623 kwargs['links'][builder_name] = link_builder
624 if isinstance(link_type, ExternalLink):
625 self.__open_links.append(sub_h5obj)
626 else:
627 builder = self.__get_built(sub_h5obj.file.filename, sub_h5obj.id)
628 obj_type = None
629 read_method = None
630 if isinstance(sub_h5obj, Dataset):
631 read_method = self.__read_dataset
632 obj_type = kwargs['datasets']
633 else:
634 read_method = partial(self.__read_group, ignore=ignore)
635 obj_type = kwargs['groups']
636 if builder is None:
637 builder = read_method(sub_h5obj)
638 self.__set_built(sub_h5obj.file.filename, sub_h5obj.id, builder)
639 obj_type[builder.name] = builder
640 else:
641 warnings.warn('Path to Group altered/broken at ' + os.path.join(h5obj.name, k), BrokenLinkWarning)
642 kwargs['datasets'][k] = None
643 continue
644 kwargs['source'] = os.path.abspath(h5obj.file.filename)
645 ret = GroupBuilder(name, **kwargs)
646 ret.location = os.path.dirname(h5obj.name)
647 self.__set_written(ret)
648 return ret
650 def __read_dataset(self, h5obj, name=None):
651 kwargs = {
652 "attributes": self.__read_attrs(h5obj),
653 "dtype": h5obj.dtype,
654 "maxshape": h5obj.maxshape
655 }
656 for key, val in kwargs['attributes'].items():
657 if isinstance(val, bytes): 657 ↛ 658: line 657 didn't jump to line 658, because the condition on line 657 was never true
658 kwargs['attributes'][key] = val.decode('UTF-8')
660 if name is None:
661 name = str(os.path.basename(h5obj.name))
662 kwargs['source'] = os.path.abspath(h5obj.file.filename)
663 ndims = len(h5obj.shape)
664 if ndims == 0: # read scalar
665 scalar = h5obj[()]
666 if isinstance(scalar, bytes): 666 ↛ 667: line 666 didn't jump to line 667, because the condition on line 666 was never true
667 scalar = scalar.decode('UTF-8')
669 if isinstance(scalar, Reference): 669 ↛ 671: line 669 didn't jump to line 671, because the condition on line 669 was never true
670 # TODO (AJTRITT): This should call __read_ref to support Group references
671 target = h5obj.file[scalar]
672 target_builder = self.__read_dataset(target)
673 self.__set_built(target.file.filename, target.id, target_builder)
674 if isinstance(scalar, RegionReference):
675 d = RegionBuilder(scalar, target_builder)
676 else:
677 d = ReferenceBuilder(target_builder)
678 kwargs['data'] = d
679 kwargs['dtype'] = d.dtype
680 else:
681 kwargs["data"] = scalar
682 else:
683 d = None
684 if h5obj.dtype.kind == 'O' and len(h5obj) > 0:
685 elem1 = h5obj[tuple([0] * (h5obj.ndim - 1) + [0])]
686 if isinstance(elem1, (str, bytes)):
687 d = self._check_str_dtype(h5obj)
688 elif isinstance(elem1, RegionReference): # read list of references 688 ↛ 689: line 688 didn't jump to line 689, because the condition on line 688 was never true
689 d = BuilderH5RegionDataset(h5obj, self)
690 kwargs['dtype'] = d.dtype
691 elif isinstance(elem1, Reference): 691 ↛ 701: line 691 didn't jump to line 701, because the condition on line 691 was never false
692 d = BuilderH5ReferenceDataset(h5obj, self)
693 kwargs['dtype'] = d.dtype
694 elif h5obj.dtype.kind == 'V': # table / compound data type
695 cpd_dt = h5obj.dtype
696 ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))]
697 d = BuilderH5TableDataset(h5obj, self, ref_cols)
698 kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype)
699 else:
700 d = h5obj
701 kwargs["data"] = d
702 ret = DatasetBuilder(name, **kwargs)
703 ret.location = os.path.dirname(h5obj.name)
704 self.__set_written(ret)
705 return ret
707 def _check_str_dtype(self, h5obj):
708 dtype = h5obj.dtype
709 if dtype.kind == 'O': 709 ↛ 712: line 709 didn't jump to line 712, because the condition on line 709 was never false
710 if dtype.metadata.get('vlen') == str and H5PY_3: 710 ↛ 712: line 710 didn't jump to line 712, because the condition on line 710 was never false
711 return StrDataset(h5obj, None)
712 return h5obj
714 @classmethod
715 def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype):
716 ret = []
717 for name, dtype in zip(h5obj_dtype.fields, dset_dtype):
718 ret.append({'name': name, 'dtype': dtype})
719 return ret
721 def __read_attrs(self, h5obj):
722 ret = dict()
723 for k, v in h5obj.attrs.items():
724 if k == SPEC_LOC_ATTR: # ignore cached spec
725 continue
726 if isinstance(v, RegionReference): 726 ↛ 727: line 726 didn't jump to line 727, because the condition on line 726 was never true
727 raise ValueError("cannot read region reference attributes yet")
728 elif isinstance(v, Reference):
729 ret[k] = self.__read_ref(h5obj.file[v])
730 else:
731 ret[k] = v
732 return ret
734 def __read_ref(self, h5obj):
735 ret = None
736 ret = self.__get_built(h5obj.file.filename, h5obj.id)
737 if ret is None:
738 if isinstance(h5obj, Dataset):
739 ret = self.__read_dataset(h5obj)
740 elif isinstance(h5obj, Group): 740 ↛ 743: line 740 didn't jump to line 743, because the condition on line 740 was never false
741 ret = self.__read_group(h5obj)
742 else:
743 raise ValueError("h5obj must be a Dataset or a Group - got %s" % str(h5obj))
744 self.__set_built(h5obj.file.filename, h5obj.id, ret)
745 return ret
747 def open(self):
748 if self.__file is None:
749 open_flag = self.__mode
750 kwargs = dict(rdcc_nbytes=RDCC_NBYTES)
751 if self.comm: 751 ↛ 752: line 751 didn't jump to line 752, because the condition on line 751 was never true
752 kwargs.update(driver='mpio', comm=self.comm)
754 if self.driver is not None: 754 ↛ 755: line 754 didn't jump to line 755, because the condition on line 754 was never true
755 kwargs.update(driver=self.driver)
757 self.__file = File(self.source, open_flag, **kwargs)
759 def close(self, close_links=True):
760 """Close this file and any files linked to from this file.
762 :param close_links: Whether to close all files linked to from this file. (default: True)
763 :type close_links: bool
764 """
765 if close_links:
766 self.close_linked_files()
767 try:
768 if self.__file is not None:
769 self.__file.close()
770 except AttributeError:
771 # Do nothing if self.__file does not exist. This may happen if an
772 # error occurs before HDF5IO has been fully set up in __init__,
773 # e.g., if a child class (such as NWBHDF5IO) raises an error
774 # before self.__file has been created
775 self.__file = None
777 def close_linked_files(self):
778 """Close all opened, linked-to files.
780 MacOS and Linux automatically release the linked-to file after the linking file is closed, but Windows does
781 not, which prevents the linked-to file from being deleted or truncated. Use this method to close all opened,
782 linked-to files.
783 """
784 # Make sure the list of open links is reset even if closing one of the files fails
785 try:
786 for obj in self.__open_links:
787 if obj:
788 obj.file.close()
789 except AttributeError:
790 # Do nothing if self.__open_links does not exist. This may happen if an
791 # error occurs before HDF5IO has been fully set up in __init__,
792 # e.g., if a child class (such as NWBHDF5IO) raises an error
793 # before self.__open_links has been created.
794 pass
795 finally:
796 self.__open_links = []
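# Illustrative usage sketch for ``close``/``close_linked_files``:
#
#     io.close(close_links=False)   # leave linked-to files open
#     io.close_linked_files()       # later, release them (needed on Windows before delete/truncate)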
798 @docval({'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the HDF5 file'},
799 {'name': 'link_data', 'type': bool,
800 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
801 {'name': 'exhaust_dci', 'type': bool,
802 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
803 'default': True},
804 {'name': 'export_source', 'type': str,
805 'doc': 'The source of the builders when exporting', 'default': None})
806 def write_builder(self, **kwargs):
807 f_builder = popargs('builder', kwargs)
808 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
809 self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
810 % (f_builder.name, self.source, kwargs))
811 for name, gbldr in f_builder.groups.items():
812 self.write_group(self.__file, gbldr, **kwargs)
813 for name, dbldr in f_builder.datasets.items():
814 self.write_dataset(self.__file, dbldr, **kwargs)
815 for name, lbldr in f_builder.links.items():
816 self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source"))
817 self.set_attributes(self.__file, f_builder.attributes)
818 self.__add_refs()
819 self.__dci_queue.exhaust_queue()
820 self.__set_written(f_builder)
821 self.logger.debug("Done writing %s '%s' to path '%s'" %
822 (f_builder.__class__.__qualname__, f_builder.name, self.source))
824 def __add_refs(self):
825 '''
826 Add all references in the file.
828 References get queued to be added at the end of write. This is because
829 the current traversal algorithm (i.e. iterating over GroupBuilder items)
830 does not happen in a guaranteed order. We need to figure out what objects
831 will be references, and then write them after we write everything else.
832 '''
833 failed = set()
834 while len(self.__ref_queue) > 0:
835 call = self.__ref_queue.popleft()
836 self.logger.debug("Adding reference with call id %d from queue (length %d)"
837 % (id(call), len(self.__ref_queue)))
838 try:
839 call()
840 except KeyError:
841 if id(call) in failed:
842 raise RuntimeError('Unable to resolve reference')
843 self.logger.debug("Adding reference with call id %d failed. Appending call to queue" % id(call))
844 failed.add(id(call))
845 self.__ref_queue.append(call)
847 @classmethod
848 def get_type(cls, data):
849 if isinstance(data, str):
850 return H5_TEXT
851 elif isinstance(data, bytes): 851 ↛ 852: line 851 didn't jump to line 852, because the condition on line 851 was never true
852 return H5_BINARY
853 elif isinstance(data, Container): 853 ↛ 854: line 853 didn't jump to line 854, because the condition on line 853 was never true
854 return H5_REF
855 elif not hasattr(data, '__len__'):
856 return type(data)
857 else:
858 if len(data) == 0:
859 if hasattr(data, 'dtype'): 859 ↛ 860: line 859 didn't jump to line 860, because the condition on line 859 was never true
860 return data.dtype
861 else:
862 raise ValueError('cannot determine type for empty data')
863 return cls.get_type(data[0])
865 __dtypes = {
866 "float": np.float32,
867 "float32": np.float32,
868 "double": np.float64,
869 "float64": np.float64,
870 "long": np.int64,
871 "int64": np.int64,
872 "int": np.int32,
873 "int32": np.int32,
874 "short": np.int16,
875 "int16": np.int16,
876 "int8": np.int8,
877 "uint64": np.uint64,
878 "uint": np.uint32,
879 "uint32": np.uint32,
880 "uint16": np.uint16,
881 "uint8": np.uint8,
882 "bool": np.bool_,
883 "text": H5_TEXT,
884 "utf": H5_TEXT,
885 "utf8": H5_TEXT,
886 "utf-8": H5_TEXT,
887 "ascii": H5_BINARY,
888 "bytes": H5_BINARY,
889 "ref": H5_REF,
890 "reference": H5_REF,
891 "object": H5_REF,
892 "region": H5_REGREF,
893 "isodatetime": H5_TEXT,
894 "datetime": H5_TEXT,
895 }
897 @classmethod
898 def __resolve_dtype__(cls, dtype, data):
899 # TODO: These values exist, but I haven't solved them yet
900 # binary
901 # number
902 dtype = cls.__resolve_dtype_helper__(dtype)
903 if dtype is None:
904 dtype = cls.get_type(data)
905 return dtype
907 @classmethod
908 def __resolve_dtype_helper__(cls, dtype):
909 if dtype is None:
910 return None
911 elif isinstance(dtype, str):
912 return cls.__dtypes.get(dtype)
913 elif isinstance(dtype, dict):
914 return cls.__dtypes.get(dtype['reftype'])
915 elif isinstance(dtype, np.dtype):
916 # NOTE: some dtypes may not be supported, but we need to support writing of read-in compound types
917 return dtype
918 else:
919 return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype])
921 @docval({'name': 'obj', 'type': (Group, Dataset), 'doc': 'the HDF5 object to add attributes to'},
922 {'name': 'attributes',
923 'type': dict,
924 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'})
925 def set_attributes(self, **kwargs):
926 obj, attributes = getargs('obj', 'attributes', kwargs)
927 for key, value in attributes.items():
928 try:
929 if isinstance(value, (set, list, tuple)):
930 tmp = tuple(value)
931 if len(tmp) > 0:
932 if isinstance(tmp[0], (str, bytes)): 932 ↛ 934: line 932 didn't jump to line 934, because the condition on line 932 was never false
933 value = np.array(value, dtype=special_dtype(vlen=type(tmp[0])))
934 elif isinstance(tmp[0], Container): # a list of references
935 self.__queue_ref(self._make_attr_ref_filler(obj, key, tmp))
936 else:
937 value = np.array(value)
938 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
939 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
940 obj.attrs[key] = value
941 elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference
942 self.__queue_ref(self._make_attr_ref_filler(obj, key, value))
943 else:
944 self.logger.debug("Setting %s '%s' attribute '%s' to %s"
945 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
946 if isinstance(value, np.ndarray) and value.dtype.kind == 'U': 946 ↛ 947: line 946 didn't jump to line 947, because the condition on line 946 was never true
947 value = np.array(value, dtype=H5_TEXT)
948 obj.attrs[key] = value # a regular scalar
949 except Exception as e:
950 msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name)
951 raise RuntimeError(msg) from e
953 def _make_attr_ref_filler(self, obj, key, value):
954 '''
955 Make the callable for setting references to attributes
956 '''
957 self.logger.debug("Queueing set %s '%s' attribute '%s' to %s"
958 % (obj.__class__.__name__, obj.name, key, value.__class__.__name__))
959 if isinstance(value, (tuple, list)): 959 ↛ 960: line 959 didn't jump to line 960, because the condition on line 959 was never true
960 def _filler():
961 ret = list()
962 for item in value:
963 ret.append(self.__get_ref(item))
964 obj.attrs[key] = ret
965 else:
966 def _filler():
967 obj.attrs[key] = self.__get_ref(value)
968 return _filler
970 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
971 {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'},
972 {'name': 'link_data', 'type': bool,
973 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
974 {'name': 'exhaust_dci', 'type': bool,
975 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
976 'default': True},
977 {'name': 'export_source', 'type': str,
978 'doc': 'The source of the builders when exporting', 'default': None},
979 returns='the Group that was created', rtype='Group')
980 def write_group(self, **kwargs):
981 parent, builder = popargs('parent', 'builder', kwargs)
982 self.logger.debug("Writing GroupBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
983 if self.get_written(builder):
984 self.logger.debug(" GroupBuilder '%s' is already written" % builder.name)
985 group = parent[builder.name]
986 else:
987 self.logger.debug(" Creating group '%s'" % builder.name)
988 group = parent.create_group(builder.name)
989 # write all groups
990 subgroups = builder.groups
991 if subgroups:
992 for subgroup_name, sub_builder in subgroups.items():
993 # do not create an empty group without attributes or links
994 self.write_group(group, sub_builder, **kwargs)
995 # write all datasets
996 datasets = builder.datasets
997 if datasets:
998 for dset_name, sub_builder in datasets.items():
999 self.write_dataset(group, sub_builder, **kwargs)
1000 # write all links
1001 links = builder.links
1002 if links:
1003 for link_name, sub_builder in links.items():
1004 self.write_link(group, sub_builder, export_source=kwargs.get("export_source"))
1005 attributes = builder.attributes
1006 self.set_attributes(group, attributes)
1007 self.__set_written(builder)
1008 return group
1010 def __get_path(self, builder):
1011 """Get the path to the builder.
1013 Note that the root of the file has no name - it is just "/". Thus, the name of the root container is ignored.
1014 If builder.location is set then it is used as the path, otherwise the function
1015 determines the path by constructing it iteratively from the parents of the
1016 builder.
1017 """
1018 if builder.location is not None:
1019 path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/")
1020 else:
1021 curr = builder
1022 names = list()
1023 while curr.parent is not None:
1024 names.append(curr.name)
1025 curr = curr.parent
1026 delim = "/"
1027 path = "%s%s" % (delim, delim.join(reversed(names)))
1028 return path
1030 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'},
1031 {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'},
1032 {'name': 'export_source', 'type': str,
1033 'doc': 'The source of the builders when exporting', 'default': None},
1034 returns='the Link that was created', rtype='Link')
1035 def write_link(self, **kwargs):
1036 parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs)
1037 self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1038 if self.get_written(builder): 1038 ↛ 1039: line 1038 didn't jump to line 1039, because the condition on line 1038 was never true
1039 self.logger.debug(" LinkBuilder '%s' is already written" % builder.name)
1040 return None
1041 name = builder.name
1042 target_builder = builder.builder
1043 path = self.__get_path(target_builder)
1044 # source will indicate target_builder's location
1045 if export_source is None:
1046 write_source = builder.source
1047 else:
1048 write_source = export_source
1050 parent_filename = os.path.abspath(parent.file.filename)
1051 if target_builder.source in (write_source, parent_filename):
1052 link_obj = SoftLink(path)
1053 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1054 % (parent.name, name, link_obj.path))
1055 elif target_builder.source is not None: 1055 ↛ 1064: line 1055 didn't jump to line 1064, because the condition on line 1055 was never false
1056 target_filename = os.path.abspath(target_builder.source)
1057 relative_path = os.path.relpath(target_filename, os.path.dirname(parent_filename))
1058 if target_builder.location is not None:
1059 path = target_builder.location + "/" + target_builder.name
1060 link_obj = ExternalLink(relative_path, path)
1061 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1062 % (parent.name, name, link_obj.filename, link_obj.path))
1063 else:
1064 msg = 'cannot create external link to %s' % path
1065 raise ValueError(msg)
1066 parent[name] = link_obj
1067 self.__set_written(builder)
1068 return link_obj
1070 @docval({'name': 'parent', 'type': Group, 'doc': 'the parent HDF5 object'}, # noqa: C901
1071 {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'},
1072 {'name': 'link_data', 'type': bool,
1073 'doc': 'If not specified otherwise link (True) or copy (False) HDF5 Datasets', 'default': True},
1074 {'name': 'exhaust_dci', 'type': bool,
1075 'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
1076 'default': True},
1077 {'name': 'export_source', 'type': str,
1078 'doc': 'The source of the builders when exporting', 'default': None},
1079 returns='the Dataset that was created', rtype=Dataset)
1080 def write_dataset(self, **kwargs): # noqa: C901
1081 """ Write a dataset to HDF5
1083 The function uses other dataset-dependent write functions, e.g.,
1084 ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
1085 """
1086 parent, builder = popargs('parent', 'builder', kwargs)
1087 link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
1088 self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
1089 if self.get_written(builder):
1090 self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name)
1091 return None
1092 name = builder.name
1093 data = builder.data
1094 dataio = None
1095 options = dict() # dict with additional write options, i.e., 'dtype' and 'io_settings'
1096 if isinstance(data, H5DataIO):
1097 options['io_settings'] = data.io_settings
1098 dataio = data
1099 link_data = data.link_data
1100 data = data.data
1101 else:
1102 options['io_settings'] = {}
1103 if isinstance(data, TermSetWrapper):
1104 # This is for when the wrapped item is a dataset
1105 # (refer to objectmapper.py for wrapped attributes)
1106 data = data.value
1107 attributes = builder.attributes
1108 options['dtype'] = builder.dtype
1109 dset = None
1110 link = None
1112 # The user provided an existing h5py dataset as input and asked to create a link to the dataset
1113 if isinstance(data, Dataset):
1114 data_filename = os.path.abspath(data.file.filename)
1115 if link_data:
1116 if export_source is None: # not exporting
1117 parent_filename = os.path.abspath(parent.file.filename)
1118 if data_filename != parent_filename: # create external link to data
1119 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1120 link = ExternalLink(relative_path, data.name)
1121 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1122 % (parent.name, name, link.filename, link.path))
1123 else: # create soft link to dataset already in this file -- possible if mode == 'r+'
1124 link = SoftLink(data.name)
1125 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1126 % (parent.name, name, link.path))
1127 parent[name] = link
1128 else: # exporting
1129 export_source = os.path.abspath(export_source)
1130 parent_filename = os.path.abspath(parent.file.filename)
1131 if data_filename != export_source: # dataset is in different file than export source
1132 # possible if user adds a link to a dataset in a different file after reading export source
1133 # to memory
1134 relative_path = os.path.relpath(data_filename, os.path.dirname(parent_filename))
1135 link = ExternalLink(relative_path, data.name)
1136 self.logger.debug(" Creating ExternalLink '%s/%s' to '%s://%s'"
1137 % (parent.name, name, link.filename, link.path))
1138 parent[name] = link
1139 elif parent.name != data.parent.name: # dataset is in export source and has different path
1140 # so create a soft link to the dataset in this file
1141 # possible if user adds a link to a dataset in export source after reading to memory
1142 # TODO check that there is/will be still a dataset at data.name -- if the dataset has
1143 # been removed, then this link will be broken
1144 link = SoftLink(data.name)
1145 self.logger.debug(" Creating SoftLink '%s/%s' to '%s'"
1146 % (parent.name, name, link.path))
1147 parent[name] = link
1148 else: # dataset is in export source and has same path as the builder, so copy the dataset
1149 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1150 % (data.file.filename, data.name, parent.name, name))
1151 parent.copy(source=data,
1152 dest=parent,
1153 name=name,
1154 expand_soft=False,
1155 expand_external=False,
1156 expand_refs=False,
1157 without_attrs=True)
1158 dset = parent[name]
1159 else:
1160 # TODO add option for case where there are multiple links to the same dataset within a file:
1161 # instead of copying the dset N times, copy it once and create soft links to it within the file
1162 self.logger.debug(" Copying data from '%s://%s' to '%s/%s'"
1163 % (data.file.filename, data.name, parent.name, name))
1164 parent.copy(source=data,
1165 dest=parent,
1166 name=name,
1167 expand_soft=False,
1168 expand_external=False,
1169 expand_refs=False,
1170 without_attrs=True)
1171 dset = parent[name]
1173 # Write a compound dataset, i.e., a dataset with compound data type
1174 elif isinstance(options['dtype'], list):
1175 # determine which fields of the compound dtype are references
1176 refs = list()
1177 for i, dts in enumerate(options['dtype']):
1178 if self.__is_ref(dts):
1179 refs.append(i)
1180 # If one or more of the parts of the compound data type are references then we need to deal with those
1181 if len(refs) > 0:
1182 try:
1183 _dtype = self.__resolve_dtype__(options['dtype'], data)
1184 except Exception as exc:
1185 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1186 raise Exception(msg) from exc
1187 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1188 self.__set_written(builder)
1189 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1190 "object references. attributes: %s"
1191 % (name, list(attributes.keys())))
1193 @self.__queue_ref
1194 def _filler():
1195 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1196 "containing attributes: %s"
1197 % (name, list(attributes.keys())))
1198 ret = list()
1199 for item in data:
1200 new_item = list(item)
1201 for i in refs:
1202 new_item[i] = self.__get_ref(item[i])
1203 ret.append(tuple(new_item))
1204 dset = parent[name]
1205 dset[:] = ret
1206 self.set_attributes(dset, attributes)
1208 return
1209 # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
1210 else:
1211 dset = self.__list_fill__(parent, name, data, options)
1212 # Write a dataset containing references, i.e., a region or object reference.
1213 # NOTE: we can ignore options['io_settings'] for scalar data
1214 elif self.__is_ref(options['dtype']):
1215 _dtype = self.__dtypes.get(options['dtype'])
1216 # Write a scalar data region reference dataset
1217 if isinstance(data, RegionBuilder): 1217 ↛ 1218: line 1217 didn't jump to line 1218, because the condition on line 1217 was never true
1218 dset = parent.require_dataset(name, shape=(), dtype=_dtype)
1219 self.__set_written(builder)
1220 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing a "
1221 "region reference. attributes: %s"
1222 % (name, list(attributes.keys())))
1224 @self.__queue_ref
1225 def _filler():
1226 self.logger.debug("Resolving region reference and setting attribute on dataset '%s' "
1227 "containing attributes: %s"
1228 % (name, list(attributes.keys())))
1229 ref = self.__get_ref(data.builder, data.region)
1230 dset = parent[name]
1231 dset[()] = ref
1232 self.set_attributes(dset, attributes)
1233 # Write a scalar object reference dataset
1234 elif isinstance(data, ReferenceBuilder): 1234 ↛ 1235: line 1234 didn't jump to line 1235, because the condition on line 1234 was never true
1235 dset = parent.require_dataset(name, dtype=_dtype, shape=())
1236 self.__set_written(builder)
1237 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing an "
1238 "object reference. attributes: %s"
1239 % (name, list(attributes.keys())))
1241 @self.__queue_ref
1242 def _filler():
1243 self.logger.debug("Resolving object reference and setting attribute on dataset '%s' "
1244 "containing attributes: %s"
1245 % (name, list(attributes.keys())))
1246 ref = self.__get_ref(data.builder)
1247 dset = parent[name]
1248 dset[()] = ref
1249 self.set_attributes(dset, attributes)
1250 # Write an array dataset of references
1251 else:
1252 # Write an array of region references
1253 if options['dtype'] == 'region': 1253 ↛ 1254: line 1253 didn't jump to line 1254, because the condition on line 1253 was never true
1254 dset = parent.require_dataset(name, dtype=_dtype, shape=(len(data),), **options['io_settings'])
1255 self.__set_written(builder)
1256 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1257 "region references. attributes: %s"
1258 % (name, list(attributes.keys())))
1260 @self.__queue_ref
1261 def _filler():
1262 self.logger.debug("Resolving region references and setting attribute on dataset '%s' "
1263 "containing attributes: %s"
1264 % (name, list(attributes.keys())))
1265 refs = list()
1266 for item in data:
1267 refs.append(self.__get_ref(item.builder, item.region))
1268 dset = parent[name]
1269 dset[()] = refs
1270 self.set_attributes(dset, attributes)
1271 # Write array of object references
1272 else:
1273 dset = parent.require_dataset(name, shape=(len(data),), dtype=_dtype, **options['io_settings'])
1274 self.__set_written(builder)
1275 self.logger.debug("Queueing reference resolution and set attribute on dataset '%s' containing "
1276 "object references. attributes: %s"
1277 % (name, list(attributes.keys())))
1279 @self.__queue_ref
1280 def _filler():
1281 self.logger.debug("Resolving object references and setting attribute on dataset '%s' "
1282 "containing attributes: %s"
1283 % (name, list(attributes.keys())))
1284 refs = list()
1285 for item in data:
1286 refs.append(self.__get_ref(item))
1287 dset = parent[name]
1288 dset[()] = refs
1289 self.set_attributes(dset, attributes)
1290 return
1291 # write a "regular" dataset
1292 else:
1293 # Create an empty dataset
1294 if data is None:
1295 dset = self.__setup_empty_dset__(parent, name, options['io_settings'])
1296 dataio.dataset = dset
1297 # Write a scalar dataset containing a single string
1298 elif isinstance(data, (str, bytes)):
1299 dset = self.__scalar_fill__(parent, name, data, options)
1300 # Iterative write of a data chunk iterator
1301 elif isinstance(data, AbstractDataChunkIterator):
1302 dset = self.__setup_chunked_dset__(parent, name, data, options)
1303 self.__dci_queue.append(dataset=dset, data=data)
1304 # Write a regular in memory array (e.g., numpy array, list etc.)
1305 elif hasattr(data, '__len__'):
1306 dset = self.__list_fill__(parent, name, data, options)
1307 # Write a regular scalar dataset
1308 else:
1309 dset = self.__scalar_fill__(parent, name, data, options)
1310 # Create the attributes on the dataset only if we are the primary and not just a Soft/External link
1311 if link is None:
1312 self.set_attributes(dset, attributes)
1313 # Validate the attributes on the linked dataset
1314 elif len(attributes) > 0:
1315 pass
1316 self.__set_written(builder)
1317 if exhaust_dci: 1317 ↛ exit: line 1317 didn't return from function 'write_dataset', because the condition on line 1317 was never false
1318 self.__dci_queue.exhaust_queue()
1320 @classmethod
1321 def __scalar_fill__(cls, parent, name, data, options=None):
1322 dtype = None
1323 io_settings = {}
1324 if options is not None: 1324 ↛ 1327: line 1324 didn't jump to line 1327, because the condition on line 1324 was never false
1325 dtype = options.get('dtype')
1326 io_settings = options.get('io_settings')
1327 if not isinstance(dtype, type): 1327 ↛ 1333: line 1327 didn't jump to line 1333, because the condition on line 1327 was never false
1328 try:
1329 dtype = cls.__resolve_dtype__(dtype, data)
1330 except Exception as exc:
1331 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1332 raise Exception(msg) from exc
1333 try:
1334 dset = parent.create_dataset(name, data=data, shape=None, dtype=dtype, **io_settings)
1335 except Exception as exc:
1336 msg = "Could not create scalar dataset %s in %s" % (name, parent.name)
1337 raise Exception(msg) from exc
1338 return dset
1340 @classmethod
1341 def __setup_chunked_dset__(cls, parent, name, data, options=None):
1342 """
1343 Set up a dataset for writing one chunk at a time, based on the given DataChunkIterator
1345 :param parent: The parent object to which the dataset should be added
1346 :type parent: h5py.Group, h5py.File
1347 :param name: The name of the dataset
1348 :type name: str
1349 :param data: The data to be written.
1350 :type data: DataChunkIterator
1351 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'
1352 :type options: dict
1354 """
1355 io_settings = {}
1356 if options is not None:
1357 if 'io_settings' in options: 1357 ↛ 1360: line 1357 didn't jump to line 1360, because the condition on line 1357 was never false
1358 io_settings = options.get('io_settings')
1359 # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write.
1360 if 'chunks' not in io_settings:
1361 recommended_chunks = data.recommended_chunk_shape()
1362 io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks
1363 # Define the shape of the data if not provided by the user
1364 if 'shape' not in io_settings: 1364 ↛ 1367: line 1364 didn't jump to line 1367, because the condition on line 1364 was never false
1365 io_settings['shape'] = data.recommended_data_shape()
1366 # Define the maxshape of the data if not provided by the user
1367 if 'maxshape' not in io_settings:
1368 io_settings['maxshape'] = data.maxshape
1369 if 'dtype' not in io_settings: 1369 ↛ 1377: line 1369 didn't jump to line 1377, because the condition on line 1369 was never false
1370 if (options is not None) and ('dtype' in options):
1371 io_settings['dtype'] = options['dtype']
1372 else:
1373 io_settings['dtype'] = data.dtype
1374 if isinstance(io_settings['dtype'], str): 1374 ↛ 1376: line 1374 didn't jump to line 1376, because the condition on line 1374 was never true
1375 # map to real dtype if we were given a string
1376 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1377 try:
1378 dset = parent.create_dataset(name, **io_settings)
1379 except Exception as exc:
1380 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1381 return dset
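The defaulting logic above can be summarized in a few lines. The sketch below mirrors it with a duck-typed stand-in for the iterator (class and function names are hypothetical; only the attribute and method names recommended_chunk_shape, recommended_data_shape, maxshape, and dtype come from the iterator interface used above):

    import numpy as np

    class FakeChunkIterator:
        """Duck-typed stand-in for a DataChunkIterator, only for this illustration."""
        dtype = np.dtype('float64')
        maxshape = (None, 10)                       # first axis can grow without bound
        def recommended_chunk_shape(self): return None
        def recommended_data_shape(self): return (0, 10)

    def chunked_io_settings(data, io_settings=None):
        io_settings = dict(io_settings or {})
        if 'chunks' not in io_settings:
            rec = data.recommended_chunk_shape()
            io_settings['chunks'] = True if rec is None else rec   # True lets HDF5 auto-chunk
        io_settings.setdefault('shape', data.recommended_data_shape())
        io_settings.setdefault('maxshape', data.maxshape)
        io_settings.setdefault('dtype', data.dtype)
        return io_settings

    print(chunked_io_settings(FakeChunkIterator()))
    # {'chunks': True, 'shape': (0, 10), 'maxshape': (None, 10), 'dtype': dtype('float64')}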
1383 @classmethod
1384 def __setup_empty_dset__(cls, parent, name, io_settings):
1385 """
1386 Setup a dataset for writing to one-chunk-at-a-time based on the given DataChunkIterator
1388 :param parent: The parent object to which the dataset should be added
1389 :type parent: h5py.Group, h5py.File
1390 :param name: The name of the dataset
1391 :type name: str
1392 :param data: The data to be written.
1393 :type data: DataChunkIterator
1394 :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings'
1395 :type options: dict
1397 """
1398 # Define the shape of the data if not provided by the user
1399 if 'shape' not in io_settings:
1400 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without shape")
1401 if 'dtype' not in io_settings:
1402 raise ValueError(f"Cannot setup empty dataset {pp(parent.name, name)} without dtype")
1403 if isinstance(io_settings['dtype'], str):
1404 # map to real dtype if we were given a string
1405 io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype'])
1406 try:
1407 dset = parent.create_dataset(name, **io_settings)
1408 except Exception as exc:
1409 raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc
1410 return dset
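In other words, an "empty" dataset is simply pre-allocated and filled in a later pass, which is why 'shape' and 'dtype' are mandatory here. A minimal h5py sketch (names are illustrative):

    import h5py
    import numpy as np

    with h5py.File("empty_demo.h5", "w") as f:
        # shape and dtype must be known up front, exactly what the two checks above enforce
        dset = f.create_dataset("filled_later", shape=(4, 3), dtype=np.float32)
        dset[...] = np.zeros((4, 3), dtype=np.float32)   # the data arrives in a later write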
1412 @classmethod
1413 def __chunked_iter_fill__(cls, parent, name, data, options=None):
1414 """
1415 Write data to a dataset one chunk at a time, based on the given DataChunkIterator
1417 :param parent: The parent object to which the dataset should be added
1418 :type parent: h5py.Group, h5py.File
1419 :param name: The name of the dataset
1420 :type name: str
1421 :param data: The data to be written.
1422 :type data: DataChunkIterator
1423 :param options: Dict with options for creating a dataset. Available options are 'dtype' and 'io_settings'
1424 :type options: dict
1426 """
1427 dset = cls.__setup_chunked_dset__(parent, name, data, options=options)
1428 read = True
1429 while read:
1430 read = HDF5IODataChunkIteratorQueue._write_chunk(dset, data)
1431 return dset
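The loop above keeps asking the queue helper to write the next chunk until the iterator is exhausted. A hedged sketch of the same one-chunk-at-a-time idea with plain h5py, using a hypothetical generator of (selection, block) pairs in place of the DataChunkIterator machinery:

    import h5py
    import numpy as np

    def chunk_source():
        """Yield (selection, array) pairs, the two pieces of information a data chunk carries."""
        for start in range(0, 100, 25):
            yield np.s_[start:start + 25], np.arange(start, start + 25)

    with h5py.File("chunked_demo.h5", "w") as f:
        dset = f.create_dataset("series", shape=(0,), maxshape=(None,), chunks=(25,), dtype='int64')
        for selection, block in chunk_source():
            if selection.stop > dset.shape[0]:
                dset.resize((selection.stop,))   # grow along the unlimited axis before writing
            dset[selection] = block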
1433 @classmethod
1434 def __list_fill__(cls, parent, name, data, options=None):
1435 # define the io settings and data type if necessary
1436 io_settings = {}
1437 dtype = None
1438 if options is not None:
1439 dtype = options.get('dtype')
1440 io_settings = options.get('io_settings')
1441 if not isinstance(dtype, type):
1442 try:
1443 dtype = cls.__resolve_dtype__(dtype, data)
1444 except Exception as exc:
1445 msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
1446 raise Exception(msg) from exc
1447 # define the data shape
1448 if 'shape' in io_settings: 1448 ↛ 1449: line 1448 didn't jump to line 1449, because the condition on line 1448 was never true
1449 data_shape = io_settings.pop('shape')
1450 elif hasattr(data, 'shape'):
1451 data_shape = data.shape
1452 elif isinstance(dtype, np.dtype):
1453 data_shape = (len(data),)
1454 else:
1455 data_shape = get_data_shape(data)
1457 # Create the dataset
1458 try:
1459 dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
1460 except Exception as exc:
1461 msg = "Could not create dataset %s in %s with shape %s, dtype %s, and iosettings %s. %s" % \
1462 (name, parent.name, str(data_shape), str(dtype), str(io_settings), str(exc))
1463 raise Exception(msg) from exc
1464 # Write the data
1465 if len(data) > dset.shape[0]: 1465 ↛ 1466: line 1465 didn't jump to line 1466, because the condition on line 1465 was never true
1466 new_shape = list(dset.shape)
1467 new_shape[0] = len(data)
1468 dset.resize(new_shape)
1469 try:
1470 dset[:] = data
1471 except Exception as exc:
1472 raise Exception("Could not write data to dataset %s in %s" % (name, parent.name)) from exc
1473 return dset
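A compact sketch of the list-fill path for plain in-memory data (file and dataset names are made up; np.asarray(...).shape plays the role of get_data_shape for this simple case):

    import h5py
    import numpy as np

    data = [[1, 2, 3], [4, 5, 6]]
    with h5py.File("list_demo.h5", "w") as f:
        shape = np.asarray(data).shape                    # (2, 3)
        dset = f.create_dataset("table", shape=shape, dtype='int64')
        dset[:] = data                                     # bulk write of the in-memory array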
1475 @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference',
1476 'default': None},
1477 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object',
1478 'default': None},
1479 returns='the reference', rtype=Reference)
1480 def __get_ref(self, **kwargs):
1481 container, region = getargs('container', 'region', kwargs)
1482 if container is None: 1482 ↛ 1483: line 1482 didn't jump to line 1483, because the condition on line 1482 was never true
1483 return None
1484 if isinstance(container, Builder):
1485 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1486 if isinstance(container, LinkBuilder): 1486 ↛ 1487: line 1486 didn't jump to line 1487, because the condition on line 1486 was never true
1487 builder = container.target_builder
1488 else:
1489 builder = container
1490 elif isinstance(container, ReferenceBuilder):
1491 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.builder.name))
1492 builder = container.builder
1493 else:
1494 self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
1495 builder = self.manager.build(container)
1496 path = self.__get_path(builder)
1497 self.logger.debug("Getting reference at path '%s'" % path)
1498 if isinstance(container, RegionBuilder): 1498 ↛ 1499: line 1498 didn't jump to line 1499, because the condition on line 1498 was never true
1499 region = container.region
1500 if region is not None: 1500 ↛ 1501: line 1500 didn't jump to line 1501, because the condition on line 1500 was never true
1501 dset = self.__file[path]
1502 if not isinstance(dset, Dataset):
1503 raise ValueError('cannot create region reference without Dataset')
1504 return self.__file[path].regionref[region]
1505 else:
1506 return self.__file[path].ref
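The two return paths above correspond to the two reference kinds h5py offers. A short sketch (file name hypothetical) showing both and how they dereference:

    import h5py
    import numpy as np

    with h5py.File("ref_kinds_demo.h5", "w") as f:
        dset = f.create_dataset("values", data=np.arange(10))
        obj_ref = dset.ref                 # object reference: points at the whole dataset
        reg_ref = dset.regionref[2:5]      # region reference: points at a selection within it
        assert np.array_equal(f[obj_ref][reg_ref], np.arange(10)[2:5])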
1508 def __is_ref(self, dtype):
1509 if isinstance(dtype, DtypeSpec):
1510 return self.__is_ref(dtype.dtype)
1511 if isinstance(dtype, RefSpec):
1512 return True
1513 if isinstance(dtype, dict): # may be dict from reading a compound dataset
1514 return self.__is_ref(dtype['dtype'])
1515 if isinstance(dtype, str):
1516 return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE
1517 return False
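For illustration, these are the kinds of dtype descriptions the check above treats as references (a rough sketch; the RefSpec arguments shown assume its usual target_type/reftype signature, and the dict form mimics what a compound dtype looks like after being read back from a file):

    from hdmf.spec import RefSpec

    reference_like_dtypes = [
        RefSpec(target_type='Data', reftype='object'),            # a RefSpec is always a reference
        {'name': 'target', 'dtype': RefSpec('Data', 'object')},   # compound-field dict, checked recursively
        'object',                                                  # DatasetBuilder.OBJECT_REF_TYPE
        'region',                                                  # DatasetBuilder.REGION_REF_TYPE
    ]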
1519 def __queue_ref(self, func):
1520 '''Set aside a function that fills a dataset with references
1524 Args:
1527 func: a zero-argument callable that resolves the references and writes them
1528 into the target dataset; it is queued here and invoked once all targets exist
1529 '''
1530 # TODO: come up with more intelligent way of
1531 # queueing reference resolution, based on reference
1532 # dependency
1533 self.__ref_queue.append(func)
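The method doubles as a decorator: decorating _filler simply appends it to the reference queue for later execution. A self-contained sketch of that decorate-to-queue pattern (class and names hypothetical; unlike the method above, the sketch returns the function so it stays usable by name):

    from collections import deque

    class DeferredWriter:
        def __init__(self):
            self._queue = deque()

        def queue(self, func):
            """Decorator: register func to run later and hand it back unchanged."""
            self._queue.append(func)
            return func

        def run_all(self):
            while self._queue:
                self._queue.popleft()()

    writer = DeferredWriter()

    @writer.queue
    def _filler():
        print("references are resolved here, after all targets exist")

    writer.run_all()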
1535 def __rec_get_ref(self, ref_list):
1536 ret = list()
1537 for elem in ref_list:
1538 if isinstance(elem, (list, tuple)):
1539 ret.append(self.__rec_get_ref(elem))
1540 elif isinstance(elem, (Builder, Container)):
1541 ret.append(self.__get_ref(elem))
1542 else:
1543 ret.append(elem)
1544 return ret
1546 @property
1547 def mode(self):
1548 """
1549 Return the HDF5 file mode. One of ("w", "r", "r+", "a", "w-", "x").
1550 """
1551 return self.__mode
1553 @classmethod
1554 @docval(*get_docval(H5DataIO.__init__))
1555 def set_dataio(cls, **kwargs):
1556 """
1557 Wrap the given Data object with an H5DataIO.
1559 This method is provided merely for convenience. It is the equivalent
1560 of the following:
1562 .. code-block:: python
1564 from hdmf.backends.hdf5 import H5DataIO
1565 data = ...
1566 data = H5DataIO(data)
1567 """
1568 return H5DataIO(**kwargs)
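A hedged usage sketch of the wrapping described in the docstring, passing common HDF5 filter and chunking options through H5DataIO (the keyword names shown are the commonly documented ones; consult H5DataIO's docval for the full set):

    import numpy as np
    from hdmf.backends.hdf5 import H5DataIO

    data = np.random.rand(1000, 10)
    wrapped = H5DataIO(data=data, chunks=True, compression='gzip', compression_opts=4)
    # `wrapped` can be used in place of the raw array when building a container; at write time
    # its settings end up in the options['io_settings'] dict consumed by the methods above.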