Coverage for src/hdmf/backends/hdf5/h5_utils.py: 89%
328 statements
coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
1"""
2Utilities for the HDF5 I/O backend,
3e.g., for wrapping HDF5 datasets on read, wrapping arrays for configuring write, or
4writing the spec among others"""
6from collections import deque
7from abc import ABCMeta, abstractmethod
8from collections.abc import Iterable
9from copy import copy
11from h5py import Group, Dataset, RegionReference, Reference, special_dtype
12from h5py import filters as h5py_filters
13import json
14import numpy as np
15import warnings
16import os
17import logging
19from ...array import Array
20from ...data_utils import DataIO, AbstractDataChunkIterator
21from ...query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver
22from ...region import RegionSlicer
23from ...spec import SpecWriter, SpecReader
24from ...utils import docval, getargs, popargs, get_docval

class HDF5IODataChunkIteratorQueue(deque):
    """
    Helper class used by HDF5IO to manage the write for DataChunkIterators

    Each queue element must be a tuple of two elements:
    1) the dataset to write to and 2) the AbstractDataChunkIterator with the data
    """
    def __init__(self):
        self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
        super().__init__()

    @classmethod
    def _write_chunk(cls, dset, data):
        """
        Read a chunk from the given DataChunkIterator and write it to the given Dataset

        :param dset: The Dataset to write to
        :type dset: Dataset
        :param data: The DataChunkIterator to read from
        :type data: AbstractDataChunkIterator
        :return: True if a chunk was written, False otherwise
        :rtype: bool
        """
        # Read the next data block
        try:
            chunk_i = next(data)
        except StopIteration:
            return False
        # Determine the minimum array size required to store the chunk
        max_bounds = chunk_i.get_min_bounds()
        # Expand the dataset if needed
        dset.id.extend(max_bounds)
        # Write the data
        dset[chunk_i.selection] = chunk_i.data

        return True

    def exhaust_queue(self):
        """
        Read and write from any queued DataChunkIterators in a round-robin fashion
        """
        while len(self) > 0:
            self.logger.debug("Exhausting DataChunkIterator from queue (length %d)" % len(self))
            dset, data = self.popleft()
            if self._write_chunk(dset, data):
                self.append(dataset=dset, data=data)

    def append(self, dataset, data):
        """
        Append a value to the queue

        :param dataset: The dataset where the DataChunkIterator is written to
        :type dataset: Dataset
        :param data: DataChunkIterator with the data to be written
        :type data: AbstractDataChunkIterator
        """
        super().append((dataset, data))
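
A minimal usage sketch of how such a queue can drive incremental writes; this queue is normally managed internally by HDF5IO, and the file name, dataset name, and data below are hypothetical:

import h5py
import numpy as np
from hdmf.data_utils import DataChunkIterator
from hdmf.backends.hdf5.h5_utils import HDF5IODataChunkIteratorQueue

# Chunked iterator over a 1-D array, 10 elements per chunk (illustrative data)
data_iter = DataChunkIterator(data=np.arange(30), buffer_size=10)

with h5py.File("example_queue.h5", "w") as f:
    # Resizable, chunked dataset so that dset.id.extend() can grow it chunk by chunk
    dset = f.create_dataset("values", shape=(0,), maxshape=(None,), dtype="i8", chunks=True)
    queue = HDF5IODataChunkIteratorQueue()
    queue.append(dataset=dset, data=data_iter)
    queue.exhaust_queue()  # round-robin writes until every queued iterator is exhausted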

class H5Dataset(HDMFDataset):
    @docval({'name': 'dataset', 'type': (Dataset, Array), 'doc': 'the HDF5 dataset to lazily evaluate'},
            {'name': 'io', 'type': 'HDF5IO', 'doc': 'the IO object that was used to read the underlying dataset'})
    def __init__(self, **kwargs):
        self.__io = popargs('io', kwargs)
        super().__init__(**kwargs)

    @property
    def io(self):
        return self.__io

    @property
    def regionref(self):
        return self.dataset.regionref

    @property
    def ref(self):
        return self.dataset.ref

    @property
    def shape(self):
        return self.dataset.shape

class DatasetOfReferences(H5Dataset, ReferenceResolver, metaclass=ABCMeta):
    """
    An extension of the base ReferenceResolver class to add more abstract methods for
    subclasses that will read HDF5 references
    """

    @abstractmethod
    def get_object(self, h5obj):
        """
        Map the given HDF5 object to its corresponding Builder or Container
        """
        pass

    def invert(self):
        """
        Return an object that defers reference resolution
        but in the opposite direction.
        """
        if not hasattr(self, '__inverted'):  # coverage: partial branch (condition never false)
            cls = self.get_inverse_class()
            docval = get_docval(cls.__init__)
            kwargs = dict()
            for arg in docval:
                kwargs[arg['name']] = getattr(self, arg['name'])
            self.__inverted = cls(**kwargs)
        return self.__inverted

    def _get_ref(self, ref):
        return self.get_object(self.dataset.file[ref])

    def __iter__(self):
        for ref in super().__iter__():
            yield self._get_ref(ref)

    def __next__(self):
        return self._get_ref(super().__next__())

class BuilderResolverMixin(BuilderResolver):
    """
    A mixin for adding to HDF5 reference-resolving types
    the get_object method that returns Builders
    """

    def get_object(self, h5obj):
        """
        Map the given HDF5 object to a Builder
        """
        return self.io.get_builder(h5obj)


class ContainerResolverMixin(ContainerResolver):
    """
    A mixin for adding to HDF5 reference-resolving types
    the get_object method that returns Containers
    """

    def get_object(self, h5obj):
        """
        Map the given HDF5 object to a Container
        """
        return self.io.get_container(h5obj)

class AbstractH5TableDataset(DatasetOfReferences):

    @docval({'name': 'dataset', 'type': (Dataset, Array), 'doc': 'the HDF5 dataset to lazily evaluate'},
            {'name': 'io', 'type': 'HDF5IO', 'doc': 'the IO object that was used to read the underlying dataset'},
            {'name': 'types', 'type': (list, tuple),
             'doc': 'the types of the elements in the compound dataset, used to identify reference columns'})
    def __init__(self, **kwargs):
        types = popargs('types', kwargs)
        super().__init__(**kwargs)
        self.__refgetters = dict()
        for i, t in enumerate(types):
            if t is RegionReference:  # coverage: partial branch (condition never true)
                self.__refgetters[i] = self.__get_regref
            elif t is Reference:
                self.__refgetters[i] = self._get_ref
            elif t is str:
                # we need this for when we read compound data types
                # that have unicode sub-dtypes since h5py does not
                # store UTF-8 in compound dtypes
                self.__refgetters[i] = self._get_utf
        self.__types = types
        tmp = list()
        for i in range(len(self.dataset.dtype)):
            sub = self.dataset.dtype[i]
            if sub.metadata:
                if 'vlen' in sub.metadata:
                    t = sub.metadata['vlen']
                    if t is str:  # coverage: partial branch (condition never false)
                        tmp.append('utf')
                    elif t is bytes:
                        tmp.append('ascii')
                elif 'ref' in sub.metadata:  # coverage: partial branch (condition never false)
                    t = sub.metadata['ref']
                    if t is Reference:  # coverage: partial branch (condition never false)
                        tmp.append('object')
                    elif t is RegionReference:
                        tmp.append('region')
            else:
                tmp.append(sub.type.__name__)
        self.__dtype = tmp

    @property
    def types(self):
        return self.__types

    @property
    def dtype(self):
        return self.__dtype

    def __getitem__(self, arg):
        rows = copy(super().__getitem__(arg))
        if np.issubdtype(type(arg), np.integer):  # coverage: partial branch (condition never false)
            self.__swap_refs(rows)
        else:
            for row in rows:
                self.__swap_refs(row)
        return rows

    def __swap_refs(self, row):
        for i in self.__refgetters:
            getref = self.__refgetters[i]
            row[i] = getref(row[i])

    def _get_utf(self, string):
        """
        Decode a dataset element to unicode
        """
        return string.decode('utf-8') if isinstance(string, bytes) else string

    def __get_regref(self, ref):
        obj = self._get_ref(ref)
        return obj[ref]

    def resolve(self, manager):
        return self[0:len(self)]

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

class AbstractH5ReferenceDataset(DatasetOfReferences):

    def __getitem__(self, arg):
        ref = super().__getitem__(arg)
        if isinstance(ref, np.ndarray):
            return [self._get_ref(x) for x in ref]
        else:
            return self._get_ref(ref)

    @property
    def dtype(self):
        return 'object'


class AbstractH5RegionDataset(AbstractH5ReferenceDataset):

    def __getitem__(self, arg):
        obj = super().__getitem__(arg)
        ref = self.dataset[arg]
        return obj[ref]

    @property
    def dtype(self):
        return 'region'

class ContainerH5TableDataset(ContainerResolverMixin, AbstractH5TableDataset):
    """
    A reference-resolving dataset for resolving references inside tables
    (i.e. compound dtypes) that returns resolved references as Containers
    """

    @classmethod
    def get_inverse_class(cls):
        return BuilderH5TableDataset


class BuilderH5TableDataset(BuilderResolverMixin, AbstractH5TableDataset):
    """
    A reference-resolving dataset for resolving references inside tables
    (i.e. compound dtypes) that returns resolved references as Builders
    """

    @classmethod
    def get_inverse_class(cls):
        return ContainerH5TableDataset


class ContainerH5ReferenceDataset(ContainerResolverMixin, AbstractH5ReferenceDataset):
    """
    A reference-resolving dataset for resolving object references that returns
    resolved references as Containers
    """

    @classmethod
    def get_inverse_class(cls):
        return BuilderH5ReferenceDataset


class BuilderH5ReferenceDataset(BuilderResolverMixin, AbstractH5ReferenceDataset):
    """
    A reference-resolving dataset for resolving object references that returns
    resolved references as Builders
    """

    @classmethod
    def get_inverse_class(cls):
        return ContainerH5ReferenceDataset


class ContainerH5RegionDataset(ContainerResolverMixin, AbstractH5RegionDataset):
    """
    A reference-resolving dataset for resolving region references that returns
    resolved references as Containers
    """

    @classmethod
    def get_inverse_class(cls):
        return BuilderH5RegionDataset


class BuilderH5RegionDataset(BuilderResolverMixin, AbstractH5RegionDataset):
    """
    A reference-resolving dataset for resolving region references that returns
    resolved references as Builders
    """

    @classmethod
    def get_inverse_class(cls):
        return ContainerH5RegionDataset
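
A hedged sketch of how these resolver classes are typically used on read; 'io' is assumed to be an open hdmf.backends.hdf5.HDF5IO and f['object_refs'] an existing dataset of HDF5 object references (both hypothetical names):

from hdmf.backends.hdf5.h5_utils import BuilderH5ReferenceDataset

refs = BuilderH5ReferenceDataset(dataset=f['object_refs'], io=io)  # f, io: assumed open handles
builders = list(refs)        # each reference is resolved through io.get_builder()
containers = refs.invert()   # same underlying dataset, now a ContainerH5ReferenceDataset
first = containers[0]        # resolved through io.get_container()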

class H5SpecWriter(SpecWriter):

    __str_type = special_dtype(vlen=str)

    @docval({'name': 'group', 'type': Group, 'doc': 'the HDF5 group to write specs to'})
    def __init__(self, **kwargs):
        self.__group = getargs('group', kwargs)

    @staticmethod
    def stringify(spec):
        '''
        Converts a spec into a JSON string to write to a dataset
        '''
        return json.dumps(spec, separators=(',', ':'))

    def __write(self, d, name):
        data = self.stringify(d)
        # write the spec as a scalar JSON-string dataset; h5py raises an error if the
        # dataset already exists, so an existing spec is not overwritten
        dset = self.__group.create_dataset(name, shape=tuple(), data=data, dtype=self.__str_type)
        return dset

    def write_spec(self, spec, path):
        return self.__write(spec, path)

    def write_namespace(self, namespace, path):
        return self.__write({'namespaces': [namespace]}, path)

class H5SpecReader(SpecReader):
    """Class that reads cached JSON-formatted namespace and spec data from an HDF5 group."""

    @docval({'name': 'group', 'type': Group, 'doc': 'the HDF5 group to read specs from'})
    def __init__(self, **kwargs):
        self.__group = popargs('group', kwargs)
        source = "%s:%s" % (os.path.abspath(self.__group.file.name), self.__group.name)
        super().__init__(source=source)
        self.__cache = None

    def __read(self, path):
        s = self.__group[path][()]
        if isinstance(s, np.ndarray) and s.shape == (1,):  # unpack scalar spec dataset (coverage: branch never taken)
            s = s[0]

        if isinstance(s, bytes):  # coverage: partial branch (condition never false)
            s = s.decode('UTF-8')

        d = json.loads(s)
        return d

    def read_spec(self, spec_path):
        return self.__read(spec_path)

    def read_namespace(self, ns_path):
        if self.__cache is None:
            self.__cache = self.__read(ns_path)
        ret = self.__cache['namespaces']
        return ret
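
A hedged round-trip sketch for the two classes above; the file name, group path, and spec/namespace dicts are hypothetical stand-ins for real cached specifications:

import h5py
from hdmf.backends.hdf5.h5_utils import H5SpecWriter, H5SpecReader

spec = {'data_type_def': 'MyType', 'doc': 'an example type'}            # hypothetical spec
namespace = {'name': 'my-namespace', 'version': '0.1.0',
             'schema': [{'source': 'my-namespace.extensions'}]}         # hypothetical namespace

with h5py.File("example_specs.h5", "a") as f:
    grp = f.require_group("specifications/my-namespace/0.1.0")
    writer = H5SpecWriter(group=grp)
    writer.write_spec(spec, 'my-namespace.extensions')
    writer.write_namespace(namespace, 'namespace')

    reader = H5SpecReader(group=grp)
    read_spec = reader.read_spec('my-namespace.extensions')  # -> the spec dict
    read_ns = reader.read_namespace('namespace')              # -> [namespace], the list under 'namespaces'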

class H5RegionSlicer(RegionSlicer):

    @docval({'name': 'dataset', 'type': (Dataset, H5Dataset), 'doc': 'the HDF5 dataset to slice'},
            {'name': 'region', 'type': RegionReference, 'doc': 'the region reference to use to slice'})
    def __init__(self, **kwargs):
        self.__dataset = getargs('dataset', kwargs)
        self.__regref = getargs('region', kwargs)
        self.__len = self.__dataset.regionref.selection(self.__regref)[0]
        self.__region = None

    def __read_region(self):
        if self.__region is None:
            self.__region = self.__dataset[self.__regref]

    def __getitem__(self, idx):
        self.__read_region()
        return self.__region[idx]

    def __len__(self):
        return self.__len
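
A minimal sketch of H5RegionSlicer on a plain h5py dataset, assuming an HDF5 build with region-reference support; the file and dataset names are hypothetical:

import h5py
import numpy as np
from hdmf.backends.hdf5.h5_utils import H5RegionSlicer

with h5py.File("example_regions.h5", "w") as f:
    dset = f.create_dataset("values", data=np.arange(100))
    regref = dset.regionref[10:20]                        # region reference over 10 elements
    slicer = H5RegionSlicer(dataset=dset, region=regref)
    print(len(slicer))                                    # -> 10
    print(slicer[0])                                      # -> 10, the first element of the region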

class H5DataIO(DataIO):
    """
    Wrap data arrays for write via HDF5IO to customize I/O behavior, such as compression and chunking
    for data arrays.
    """

    @docval({'name': 'data',
             'type': (np.ndarray, list, tuple, Dataset, Iterable),
             'doc': 'the data to be written. NOTE: If an h5py.Dataset is used, all other settings but link_data' +
                    ' will be ignored as the dataset will either be linked to or copied as is in H5DataIO.',
             'default': None},
            {'name': 'maxshape',
             'type': tuple,
             'doc': 'Dataset will be resizable up to this shape (Tuple). Automatically enables chunking. ' +
                    'Use None for the axes you want to be unlimited.',
             'default': None},
            {'name': 'chunks',
             'type': (bool, tuple),
             'doc': 'Chunk shape or True to enable auto-chunking',
             'default': None},
            {'name': 'compression',
             'type': (str, bool, int),
             'doc': 'Compression strategy. If a bool is given, then gzip compression will be used by default. ' +
                    'http://docs.h5py.org/en/latest/high/dataset.html#dataset-compression',
             'default': None},
            {'name': 'compression_opts',
             'type': (int, tuple),
             'doc': 'Parameter for compression filter',
             'default': None},
            {'name': 'fillvalue',
             'type': None,
             'doc': 'Value to be returned when reading uninitialized parts of the dataset',
             'default': None},
            {'name': 'shuffle',
             'type': bool,
             'doc': 'Enable shuffle I/O filter. http://docs.h5py.org/en/latest/high/dataset.html#dataset-shuffle',
             'default': None},
            {'name': 'fletcher32',
             'type': bool,
             'doc': 'Enable fletcher32 checksum. http://docs.h5py.org/en/latest/high/dataset.html#dataset-fletcher32',
             'default': None},
            {'name': 'link_data',
             'type': bool,
             'doc': 'If data is an h5py.Dataset should it be linked to or copied. NOTE: This parameter is only ' +
                    'allowed if data is an h5py.Dataset',
             'default': False},
            {'name': 'allow_plugin_filters',
             'type': bool,
             'doc': 'Enable passing dynamically loaded filters as compression parameter',
             'default': False},
            {'name': 'shape',
             'type': tuple,
             'doc': 'the shape of the new dataset, used only if data is None',
             'default': None},
            {'name': 'dtype',
             'type': (str, type, np.dtype),
             'doc': 'the data type of the new dataset, used only if data is None',
             'default': None}
            )
    def __init__(self, **kwargs):
        # Get the list of I/O options that user has passed in
        ioarg_names = [name for name in kwargs.keys() if name not in ['data', 'link_data', 'allow_plugin_filters',
                                                                      'dtype', 'shape']]

        # Remove the ioargs from kwargs
        ioarg_values = [popargs(argname, kwargs) for argname in ioarg_names]
        # Consume link_data parameter
        self.__link_data = popargs('link_data', kwargs)
        # Consume allow_plugin_filters parameter
        self.__allow_plugin_filters = popargs('allow_plugin_filters', kwargs)
        # Check for possible collision with other parameters
        if not isinstance(getargs('data', kwargs), Dataset) and self.__link_data:
            self.__link_data = False
            warnings.warn('link_data parameter in H5DataIO will be ignored')
        # Call the super constructor and consume the data parameter
        super().__init__(**kwargs)
        # Construct the dict with the io args, ignoring all options that were set to None
        self.__iosettings = {k: v for k, v in zip(ioarg_names, ioarg_values) if v is not None}
        if self.data is None:
            self.__iosettings['dtype'] = self.dtype
            self.__iosettings['shape'] = self.shape
        # Set io_properties for DataChunkIterators
        if isinstance(self.data, AbstractDataChunkIterator):
            # Define the chunking options if the user has not set them explicitly.
            if 'chunks' not in self.__iosettings and self.data.recommended_chunk_shape() is not None:
                self.__iosettings['chunks'] = self.data.recommended_chunk_shape()
            # Define the maxshape of the data if not provided by the user
            if 'maxshape' not in self.__iosettings:  # coverage: partial branch (condition never false)
                self.__iosettings['maxshape'] = self.data.maxshape
        # Make default settings when compression set to bool (True/False)
        if isinstance(self.__iosettings.get('compression', None), bool):
            if self.__iosettings['compression']:
                self.__iosettings['compression'] = 'gzip'
            else:
                self.__iosettings.pop('compression', None)
                if 'compression_opts' in self.__iosettings:  # coverage: partial branch (condition never false)
                    warnings.warn('Compression disabled by compression=False setting. ' +
                                  'compression_opts parameter will, therefore, be ignored.')
                    self.__iosettings.pop('compression_opts', None)
        # Validate the compression options used
        self._check_compression_options()
        # Confirm that the compressor is supported by h5py
        if not self.filter_available(self.__iosettings.get('compression', None),
                                     self.__allow_plugin_filters):
            msg = "%s compression may not be supported by this version of h5py." % str(self.__iosettings['compression'])
            if not self.__allow_plugin_filters:
                msg += " Set `allow_plugin_filters=True` to enable the use of dynamically-loaded plugin filters."
            raise ValueError(msg)
        # Check possible parameter collisions
        if isinstance(self.data, Dataset):
            for k in self.__iosettings.keys():
                warnings.warn("%s in H5DataIO will be ignored with H5DataIO.data being an HDF5 dataset" % k)

        self.__dataset = None

    @property
    def dataset(self):
        return self.__dataset

    @dataset.setter
    def dataset(self, val):
        if self.__dataset is not None:
            raise ValueError("Cannot overwrite H5DataIO.dataset")
        self.__dataset = val

    def get_io_params(self):
        """
        Returns a dict with the I/O parameters specified in this DataIO.
        """
        ret = dict(self.__iosettings)
        ret['link_data'] = self.__link_data
        return ret

    def _check_compression_options(self):
        """
        Internal helper function used to check if compression options are compliant
        with the compression filter used.

        :raises ValueError: If incompatible options are detected
        """
        if 'compression' in self.__iosettings:
            if 'compression_opts' in self.__iosettings:
                if self.__iosettings['compression'] == 'gzip':
                    if self.__iosettings['compression_opts'] not in range(10):
                        raise ValueError("GZIP compression_opts setting must be an integer from 0-9, "
                                         "not " + str(self.__iosettings['compression_opts']))
                elif self.__iosettings['compression'] == 'lzf':
                    if self.__iosettings['compression_opts'] is not None:  # coverage: partial branch (never false)
                        raise ValueError("LZF compression filter accepts no compression_opts")
                elif self.__iosettings['compression'] == 'szip':  # coverage: partial branch (never false)
                    szip_opts_error = False
                    # Check that we have a tuple
                    szip_opts_error |= not isinstance(self.__iosettings['compression_opts'], tuple)
                    # Check that we have a tuple of the right length and correct settings
                    if not szip_opts_error:
                        try:
                            szmethod, szpix = self.__iosettings['compression_opts']
                            szip_opts_error |= (szmethod not in ('ec', 'nn'))
                            szip_opts_error |= (not (0 < szpix <= 32 and szpix % 2 == 0))
                        except ValueError:  # ValueError is raised if tuple does not have the right length to unpack
                            szip_opts_error = True
                    if szip_opts_error:
                        raise ValueError("SZIP compression filter compression_opts"
                                         " must be a 2-tuple ('ec'|'nn', even integer 0-32).")
            # Warn if compressor other than gzip is being used
            if self.__iosettings['compression'] not in ['gzip', h5py_filters.h5z.FILTER_DEFLATE]:
                warnings.warn(str(self.__iosettings['compression']) + " compression may not be available "
                              "on all installations of HDF5. Use of gzip is recommended to ensure portability of "
                              "the generated HDF5 files.")

    @staticmethod
    def filter_available(filter, allow_plugin_filters):
        """
        Check if a given I/O filter is available

        :param filter: String with the name of the filter, e.g., gzip, szip etc.,
                       or int with the registered filter ID, e.g. 307
        :type filter: String, int
        :param allow_plugin_filters: bool indicating whether the given filter can be dynamically loaded
        :return: bool indicating whether the given filter is available
        """
        if filter is not None:
            if filter in h5py_filters.encode:
                return True
            elif allow_plugin_filters is True:
                if isinstance(filter, int):  # coverage: partial branch (condition never false)
                    if h5py_filters.h5z.filter_avail(filter):
                        filter_info = h5py_filters.h5z.get_filter_info(filter)
                        if filter_info == (h5py_filters.h5z.FILTER_CONFIG_DECODE_ENABLED +  # coverage: partial branch
                                           h5py_filters.h5z.FILTER_CONFIG_ENCODE_ENABLED):
                            return True
            return False
        else:
            return True

    @property
    def link_data(self):
        return self.__link_data

    @property
    def io_settings(self):
        return self.__iosettings

    @property
    def valid(self):
        if isinstance(self.data, Dataset) and not self.data.id.valid:
            return False
        return super().valid
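
A hedged usage sketch for H5DataIO; the array shape and settings below are illustrative choices, not requirements of the class:

import numpy as np
from hdmf.backends.hdf5 import H5DataIO

wrapped = H5DataIO(
    data=np.random.rand(1000, 3),   # illustrative data
    compression='gzip',             # passing True instead would also select gzip
    compression_opts=4,             # gzip level 0-9
    chunks=(100, 3),
    maxshape=(None, 3),             # first axis stays resizable after write
    fillvalue=np.nan,
)
print(wrapped.get_io_params())      # the settings HDF5IO will apply, plus link_data
# 'wrapped' is then passed wherever the unwrapped array would go, e.g., as a container
# field that is subsequently written through hdmf.backends.hdf5.HDF5IO.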