Coverage for src/hdmf/common/resources.py: 98%
432 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
1import pandas as pd
2import numpy as np
3from . import register_class, EXP_NAMESPACE
4from . import get_type_map
5from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager
6from ..utils import docval, popargs, AllowPositional
7from ..build import TypeMap
8from ..term_set import TermSetWrapper
9from glob import glob
10import os
11import zipfile
12from collections import namedtuple
15class KeyTable(Table):
16 """
17 A table for storing keys used to reference external resources.
18 """
20 __defaultname__ = 'keys'
22 __columns__ = (
23 {'name': 'key', 'type': str,
24 'doc': 'The user key that maps to the resource term / registry symbol.'},
25 )
28class Key(Row):
29 """
30 A Row class for representing rows in the KeyTable.
31 """
33 __table__ = KeyTable
36class EntityTable(Table):
37 """
38 A table for storing the external resources a key refers to.
39 """
41 __defaultname__ = 'entities'
43 __columns__ = (
44 {'name': 'entity_id', 'type': str,
45 'doc': 'The unique ID for the resource term / registry symbol.'},
46 {'name': 'entity_uri', 'type': str,
47 'doc': 'The URI for the resource term / registry symbol.'},
48 )
51class Entity(Row):
52 """
53 A Row class for representing rows in the EntityTable.
54 """
56 __table__ = EntityTable
59class FileTable(Table):
60 """
61 A table for storing file ids used in external resources.
62 """
64 __defaultname__ = 'files'
66 __columns__ = (
67 {'name': 'file_object_id', 'type': str,
68 'doc': 'The file id of the file that contains the object'},
69 )
72class File(Row):
73 """
74 A Row class for representing rows in the FileTable.
75 """
77 __table__ = FileTable
80class ObjectTable(Table):
81 """
82 A table for storing objects (i.e. Containers) that contain keys that refer to external resources.
83 """
85 __defaultname__ = 'objects'
87 __columns__ = (
88 {'name': 'files_idx', 'type': int,
89 'doc': 'The row idx for the file_object_id in FileTable containing the object.'},
90 {'name': 'object_id', 'type': str,
91 'doc': 'The object ID for the Container/Data.'},
92 {'name': 'object_type', 'type': str,
93 'doc': 'The type of the object. This is also the parent in relative_path.'},
94 {'name': 'relative_path', 'type': str,
95 'doc': ('The relative_path of the attribute of the object that uses ',
96 'an external resource reference key. Use an empty string if not applicable.')},
97 {'name': 'field', 'type': str,
98 'doc': ('The field of the compound data type using an external resource. '
99 'Use an empty string if not applicable.')}
100 )
103class Object(Row):
104 """
105 A Row class for representing rows in the ObjectTable.
106 """
108 __table__ = ObjectTable
111class ObjectKeyTable(Table):
112 """
113 A table for identifying which keys are used by which objects for referring to external resources.
114 """
116 __defaultname__ = 'object_keys'
118 __columns__ = (
119 {'name': 'objects_idx', 'type': (int, Object),
120 'doc': 'The index into the objects table for the Object that uses the Key.'},
121 {'name': 'keys_idx', 'type': (int, Key),
122 'doc': 'The index into the keys table that is used to make an external resource reference.'}
123 )
126class EntityKeyTable(Table):
127 """
128 A table for identifying which entities are used by which keys for referring to external resources.
129 """
131 __defaultname__ = 'entity_keys'
133 __columns__ = (
134 {'name': 'entities_idx', 'type': (int, Entity),
135 'doc': 'The index into the EntityTable for the Entity that associated with the Key.'},
136 {'name': 'keys_idx', 'type': (int, Key),
137 'doc': 'The index into the KeyTable that is used to make an external resource reference.'}
138 )
141class EntityKey(Row):
142 """
143 A Row class for representing rows in the EntityKeyTable.
144 """
146 __table__ = EntityKeyTable
149class ObjectKey(Row):
150 """
151 A Row class for representing rows in the ObjectKeyTable.
152 """
154 __table__ = ObjectKeyTable
157@register_class('HERD', EXP_NAMESPACE)
158class HERD(Container):
159 """
160 HDMF External Resources Data Structure.
161 A table for mapping user terms (i.e. keys) to resource entities.
162 """
164 __fields__ = (
165 {'name': 'keys', 'child': True},
166 {'name': 'files', 'child': True},
167 {'name': 'objects', 'child': True},
168 {'name': 'object_keys', 'child': True},
169 {'name': 'entity_keys', 'child': True},
170 {'name': 'entities', 'child': True},
171 )
173 @docval({'name': 'keys', 'type': KeyTable, 'default': None,
174 'doc': 'The table storing user keys for referencing resources.'},
175 {'name': 'files', 'type': FileTable, 'default': None,
176 'doc': 'The table for storing file ids used in external resources.'},
177 {'name': 'entities', 'type': EntityTable, 'default': None,
178 'doc': 'The table storing entity information.'},
179 {'name': 'objects', 'type': ObjectTable, 'default': None,
180 'doc': 'The table storing object information.'},
181 {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None,
182 'doc': 'The table storing object-key relationships.'},
183 {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None,
184 'doc': 'The table storing entity-key relationships.'},
185 {'name': 'type_map', 'type': TypeMap, 'default': None,
186 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'},
187 allow_positional=AllowPositional.WARNING)
188 def __init__(self, **kwargs):
189 name = 'external_resources'
190 super().__init__(name)
191 self.keys = kwargs['keys'] or KeyTable()
192 self.files = kwargs['files'] or FileTable()
193 self.entities = kwargs['entities'] or EntityTable()
194 self.objects = kwargs['objects'] or ObjectTable()
195 self.object_keys = kwargs['object_keys'] or ObjectKeyTable()
196 self.entity_keys = kwargs['entity_keys'] or EntityKeyTable()
197 self.type_map = kwargs['type_map'] or get_type_map()
199 @staticmethod
200 def assert_external_resources_equal(left, right, check_dtype=True):
201 """
202 Compare that the keys, resources, entities, objects, and object_keys tables match
204 :param left: HERD object to compare with right
205 :param right: HERD object to compare with left
206 :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different
207 for example for ids, where depending on how the data was saved
208 ids may change from int64 to int32. (Default: True)
209 :returns: The function returns True if all values match. If mismatches are found,
210 AssertionError will be raised.
211 :raises AssertionError: Raised if any differences are found. The function collects
212 all differences into a single error so that the assertion will indicate
213 all found differences.
214 """
215 errors = []
216 try:
217 pd.testing.assert_frame_equal(left.keys.to_dataframe(),
218 right.keys.to_dataframe(),
219 check_dtype=check_dtype)
220 except AssertionError as e:
221 errors.append(e)
222 try:
223 pd.testing.assert_frame_equal(left.files.to_dataframe(),
224 right.files.to_dataframe(),
225 check_dtype=check_dtype)
226 except AssertionError as e:
227 errors.append(e)
228 try:
229 pd.testing.assert_frame_equal(left.objects.to_dataframe(),
230 right.objects.to_dataframe(),
231 check_dtype=check_dtype)
232 except AssertionError as e:
233 errors.append(e)
234 try:
235 pd.testing.assert_frame_equal(left.entities.to_dataframe(),
236 right.entities.to_dataframe(),
237 check_dtype=check_dtype)
238 except AssertionError as e:
239 errors.append(e)
240 try:
241 pd.testing.assert_frame_equal(left.object_keys.to_dataframe(),
242 right.object_keys.to_dataframe(),
243 check_dtype=check_dtype)
244 except AssertionError as e:
245 errors.append(e)
246 if len(errors) > 0:
247 msg = ''.join(str(e)+"\n\n" for e in errors)
248 raise AssertionError(msg)
249 return True
251 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'})
252 def _add_key(self, **kwargs):
253 """
254 Add a key to be used for making references to external resources.
256 It is possible to use the same *key_name* to refer to different resources so long as the *key_name* is not
257 used within the same object, relative_path, and field. To do so, this method must be called for the
258 two different resources.
260 The returned Key objects must be managed by the caller so as to be appropriately passed to subsequent calls
261 to methods for storing information about the different resources.
262 """
263 key = kwargs['key_name']
264 return Key(key, table=self.keys)
266 @docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'})
267 def _add_file(self, **kwargs):
268 """
269 Add a file to be used for making references to external resources.
271 This is optional when working in HDMF.
272 """
273 file_object_id = kwargs['file_object_id']
274 return File(file_object_id, table=self.files)
276 @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'},
277 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'})
278 def _add_entity(self, **kwargs):
279 """
280 Add an entity that will be referenced to using keys specified in HERD.entity_keys.
281 """
282 entity_id = kwargs['entity_id']
283 entity_uri = kwargs['entity_uri']
284 entity = Entity( entity_id, entity_uri, table=self.entities)
285 return entity
287 @docval({'name': 'container', 'type': (str, AbstractContainer),
288 'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'},
289 {'name': 'files_idx', 'type': int,
290 'doc': 'The file_object_id row idx.'},
291 {'name': 'object_type', 'type': str, 'default': None,
292 'doc': ('The type of the object. This is also the parent in relative_path. If omitted, '
293 'the name of the container class is used.')},
294 {'name': 'relative_path', 'type': str,
295 'doc': ('The relative_path of the attribute of the object that uses ',
296 'an external resource reference key. Use an empty string if not applicable.')},
297 {'name': 'field', 'type': str, 'default': '',
298 'doc': ('The field of the compound data type using an external resource.')})
299 def _add_object(self, **kwargs):
300 """
301 Add an object that references an external resource.
302 """
303 files_idx, container, object_type, relative_path, field = popargs('files_idx',
304 'container',
305 'object_type',
306 'relative_path',
307 'field', kwargs)
309 if object_type is None: 309 ↛ 312line 309 didn't jump to line 312, because the condition on line 309 was never false
310 object_type = container.__class__.__name__
312 if isinstance(container, AbstractContainer): 312 ↛ 314line 312 didn't jump to line 314, because the condition on line 312 was never false
313 container = container.object_id
314 obj = Object(files_idx, container, object_type, relative_path, field, table=self.objects)
315 return obj
317 @docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'},
318 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'})
319 def _add_object_key(self, **kwargs):
320 """
321 Specify that an object (i.e. container and relative_path) uses a key to reference
322 an external resource.
323 """
324 obj, key = popargs('obj', 'key', kwargs)
325 return ObjectKey(obj, key, table=self.object_keys)
327 @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'},
328 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the connected to the Entity.'})
329 def _add_entity_key(self, **kwargs):
330 """
331 Add entity-key relationship to the EntityKeyTable.
332 """
333 entity, key = popargs('entity', 'key', kwargs)
334 return EntityKey(entity, key, table=self.entity_keys)
336 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.'},
337 {'name': 'container', 'type': AbstractContainer,
338 'doc': ('The Container/Data object that uses the key or '
339 'the object id for the Container/Data object that uses the key.')},
340 {'name': 'relative_path', 'type': str,
341 'doc': ('The relative_path of the attribute of the object that uses ',
342 'an external resource reference key. Use an empty string if not applicable.'),
343 'default': ''},
344 {'name': 'field', 'type': str, 'default': '',
345 'doc': ('The field of the compound data type using an external resource.')},
346 {'name': 'create', 'type': bool, 'default': True})
347 def _check_object_field(self, **kwargs):
348 """
349 Check if a container, relative path, and field have been added.
351 The container can be either an object_id string or an AbstractContainer.
353 If the container, relative_path, and field have not been added, add them
354 and return the corresponding Object. Otherwise, just return the Object.
355 """
356 file = kwargs['file']
357 container = kwargs['container']
358 relative_path = kwargs['relative_path']
359 field = kwargs['field']
360 create = kwargs['create']
361 file_object_id = file.object_id
362 files_idx = self.files.which(file_object_id=file_object_id)
364 if len(files_idx) > 1:
365 raise ValueError("Found multiple instances of the same file.")
366 elif len(files_idx) == 1:
367 files_idx = files_idx[0]
368 else:
369 self._add_file(file_object_id)
370 files_idx = self.files.which(file_object_id=file_object_id)[0]
372 objecttable_idx = self.objects.which(object_id=container.object_id)
374 if len(objecttable_idx) > 0:
375 relative_path_idx = self.objects.which(relative_path=relative_path)
376 field_idx = self.objects.which(field=field)
377 objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx))
378 if len(objecttable_idx) == 1:
379 return self.objects.row[objecttable_idx[0]]
380 elif len(objecttable_idx) == 0 and create:
381 return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field)
382 elif len(objecttable_idx) == 0 and not create:
383 raise ValueError("Object not in Object Table.")
384 else:
385 raise ValueError("Found multiple instances of the same object id, relative path, "
386 "and field in objects table.")
388 @docval({'name': 'container', 'type': (str, AbstractContainer),
389 'doc': ('The Container/Data object that uses the key or '
390 'the object id for the Container/Data object that uses the key.')})
391 def _get_file_from_container(self, **kwargs):
392 """
393 Method to retrieve a file associated with the container in the case a file is not provided.
394 """
395 container = kwargs['container']
397 if isinstance(container, HERDManager):
398 file = container
399 return file
400 else:
401 parent = container.parent
402 if parent is not None:
403 while parent is not None: 403 ↛ exitline 403 didn't return from function '_get_file_from_container', because the condition on line 403 was never false
404 if isinstance(parent, HERDManager):
405 file = parent
406 return file
407 else:
408 parent = parent.parent
409 else:
410 msg = 'Could not find file. Add container to the file.'
411 raise ValueError(msg)
413 @docval({'name': 'objects', 'type': list,
414 'doc': 'List of objects to check for TermSetWrapper within the fields.'})
415 def __check_termset_wrapper(self, **kwargs):
416 """
417 Takes a list of objects and checks the fields for TermSetWrapper.
419 wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper'])
420 :return: [wrapped_obj(object1, attribute_name1, wrapper1), ...]
421 """
422 objects = kwargs['objects']
424 ret = [] # list to be returned with the objects, attributes and corresponding termsets
426 for obj in objects:
427 # Get all the fields, parse out the methods and internal variables
428 obj_fields = [a for a in dir(obj) if not a.startswith('_') and not callable(getattr(obj, a))]
429 for attribute in obj_fields:
430 attr = getattr(obj, attribute)
431 if isinstance(attr, TermSetWrapper):
432 # Search objects that are wrapped
433 wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper'])
434 ret.append(wrapped_obj(obj, attribute, attr))
436 return ret
438 @docval({'name': 'root_container', 'type': HERDManager,
439 'doc': 'The root container or file containing objects with a TermSet.'})
440 def add_ref_term_set(self, **kwargs):
441 """
442 Method to search through the root_container for all instances of TermSet.
443 Currently, only datasets are supported. By using a TermSet, the data comes validated
444 and can use the permissible values within the set to populate HERD.
445 """
446 root_container = kwargs['root_container']
448 all_objects = root_container.all_children() # list of child objects and the container itself
450 add_ref_items = self.__check_termset_wrapper(objects=all_objects)
451 for ref in add_ref_items:
452 container, attr_name, wrapper = ref
453 if isinstance(wrapper.value, (list, np.ndarray, tuple)):
454 values = wrapper.value
455 else:
456 # create list for single values (edge-case) for a simple iteration downstream
457 values = [wrapper.value]
458 for term in values:
459 term_info = wrapper.termset[term]
460 entity_id = term_info[0]
461 entity_uri = term_info[2]
462 self.add_ref(file=root_container,
463 container=container,
464 attribute=attr_name,
465 key=term,
466 entity_id=entity_id,
467 entity_uri=entity_uri)
469 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
470 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
471 'default': None},
472 {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
473 'doc': ('The Container/Data object that uses the key or '
474 'the object id for the Container/Data object that uses the key.')},
475 {'name': 'relative_path', 'type': str,
476 'doc': ('The relative_path of the attribute of the object that uses ',
477 'an external resource reference key. Use an empty string if not applicable.'),
478 'default': ''},
479 {'name': 'field', 'type': str, 'default': '',
480 'doc': ('The field of the compound data type using an external resource.')})
481 def get_key(self, **kwargs):
482 """
483 Return a Key.
485 If container, relative_path, and field are provided, the Key that corresponds to the given name of the key
486 for the given container, relative_path, and field is returned.
487 """
488 key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs)
489 key_idx_matches = self.keys.which(key=key_name)
491 file = kwargs['file']
493 if container is not None:
494 if file is None:
495 file = self._get_file_from_container(container=container)
496 # if same key is used multiple times, determine
497 # which instance based on the Container
498 object_field = self._check_object_field(file=file,
499 container=container,
500 relative_path=relative_path,
501 field=field)
502 for row_idx in self.object_keys.which(objects_idx=object_field.idx):
503 key_idx = self.object_keys['keys_idx', row_idx]
504 if key_idx in key_idx_matches:
505 return self.keys.row[key_idx]
506 msg = "No key found with that container."
507 raise ValueError(msg)
508 else:
509 if len(key_idx_matches) == 0:
510 # the key has never been used before
511 raise ValueError("key '%s' does not exist" % key_name)
512 elif len(key_idx_matches) > 1:
513 msg = "There are more than one key with that name. Please search with additional information."
514 raise ValueError(msg)
515 else:
516 return self.keys.row[key_idx_matches[0]]
518 @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'})
519 def get_entity(self, **kwargs):
520 entity_id = kwargs['entity_id']
521 entity = self.entities.which(entity_id=entity_id)
522 if len(entity)>0:
523 return self.entities.row[entity[0]]
524 else:
525 return None
527 @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None,
528 'doc': ('The Container/Data object that uses the key or '
529 'the object_id for the Container/Data object that uses the key.')},
530 {'name': 'attribute', 'type': str,
531 'doc': 'The attribute of the container for the external reference.', 'default': None},
532 {'name': 'field', 'type': str, 'default': '',
533 'doc': ('The field of the compound data type using an external resource.')},
534 {'name': 'key', 'type': (str, Key), 'default': None,
535 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
536 {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'},
537 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None},
538 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
539 'default': None},
540 )
541 def add_ref(self, **kwargs):
542 """
543 Add information about an external reference used in this file.
545 It is possible to use the same name of the key to refer to different resources
546 so long as the name of the key is not used within the same object, relative_path, and
547 field combination. This method does not support such functionality by default.
548 """
549 ###############################################################
550 container = kwargs['container']
551 attribute = kwargs['attribute']
552 if isinstance(container, Data):
553 if attribute == 'data':
554 attribute = None
555 key = kwargs['key']
556 field = kwargs['field']
557 entity_id = kwargs['entity_id']
558 entity_uri = kwargs['entity_uri']
559 file = kwargs['file']
561 if file is None:
562 file = self._get_file_from_container(container=container)
564 if attribute is None: # Trivial Case
565 relative_path = ''
566 object_field = self._check_object_field(file=file,
567 container=container,
568 relative_path=relative_path,
569 field=field)
570 else: # DataType Attribute Case
571 attribute_object = getattr(container, attribute) # returns attribute object
572 if isinstance(attribute_object, AbstractContainer):
573 relative_path = ''
574 object_field = self._check_object_field(file=file,
575 container=attribute_object,
576 relative_path=relative_path,
577 field=field)
578 else: # Non-DataType Attribute Case:
579 obj_mapper = self.type_map.get_map(container)
580 spec = obj_mapper.get_attr_spec(attr_name=attribute)
581 parent_spec = spec.parent # return the parent spec of the attribute
582 if parent_spec.data_type is None:
583 while parent_spec.data_type is None:
584 parent_spec = parent_spec.parent # find the closest parent with a data_type
585 parent_cls = self.type_map.get_dt_container_cls(data_type=parent_spec.data_type, autogen=False)
586 if isinstance(container, parent_cls): 586 ↛ 596line 586 didn't jump to line 596, because the condition on line 586 was never false
587 parent = container
588 # We need to get the path of the spec for relative_path
589 absolute_path = spec.path
590 relative_path = absolute_path[absolute_path.find('/')+1:]
591 object_field = self._check_object_field(file=file,
592 container=parent,
593 relative_path=relative_path,
594 field=field)
595 else:
596 msg = 'Container not the nearest data_type'
597 raise ValueError(msg)
598 else:
599 parent = container # container needs to be the parent
600 absolute_path = spec.path
601 relative_path = absolute_path[absolute_path.find('/')+1:]
602 # this regex removes everything prior to the container on the absolute_path
603 object_field = self._check_object_field(file=file,
604 container=parent,
605 relative_path=relative_path,
606 field=field)
608 if not isinstance(key, Key):
609 key_idx_matches = self.keys.which(key=key)
610 # if same key is used multiple times, determine
611 # which instance based on the Container
612 for row_idx in self.object_keys.which(objects_idx=object_field.idx):
613 key_idx = self.object_keys['keys_idx', row_idx]
614 if key_idx in key_idx_matches: 614 ↛ 615line 614 didn't jump to line 615, because the condition on line 614 was never true
615 msg = "Use Key Object when referencing an existing (container, relative_path, key)"
616 raise ValueError(msg)
618 key = self._add_key(key)
619 self._add_object_key(object_field, key)
621 else:
622 # Check to see that the existing key is being used with the object.
623 # If true, do nothing. If false, create a new obj/key relationship
624 # in the ObjectKeyTable
625 key_idx = key.idx
626 object_key_row_idx = self.object_keys.which(keys_idx=key_idx)
627 if len(object_key_row_idx)!=0:
628 obj_key_check = False
629 for row_idx in object_key_row_idx:
630 obj_idx = self.object_keys['objects_idx', row_idx]
631 if obj_idx == object_field.idx:
632 obj_key_check = True
633 if not obj_key_check:
634 self._add_object_key(object_field, key)
635 else:
636 msg = "Cannot find key object. Create new Key with string."
637 raise ValueError(msg)
638 # check if the key and object have been related in the ObjectKeyTable
640 entity = self.get_entity(entity_id=entity_id)
641 if entity is None:
642 if entity_uri is None:
643 msg = 'New entities must have an entity_uri.'
644 raise ValueError(msg)
645 entity = self._add_entity(entity_id, entity_uri)
646 self._add_entity_key(entity, key)
647 else:
648 if entity_uri is not None:
649 msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.'
650 raise ValueError(msg)
651 # check for entity-key relationship in EntityKeyTable
652 key_idx = key.idx
653 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx)
654 if len(entity_key_row_idx)!=0:
655 # this means there exists rows where the key is in the EntityKeyTable
656 entity_key_check = False
657 for row_idx in entity_key_row_idx:
658 entity_idx = self.entity_keys['entities_idx', row_idx]
659 if entity_idx == entity.idx:
660 entity_key_check = True
661 # this means there is already a key-entity relationship recorded
662 if not entity_key_check:
663 # this means that though the key is there, there is not key-entity relationship
664 # a.k.a add it now
665 self._add_entity_key(entity, key)
666 else:
667 # this means that specific key is not in the EntityKeyTable, so add it and establish
668 # the relationship with the entity
669 self._add_entity_key(entity, key)
670 return key, entity
672 @docval({'name': 'object_type', 'type': str,
673 'doc': 'The type of the object. This is also the parent in relative_path.'},
674 {'name': 'relative_path', 'type': str,
675 'doc': ('The relative_path of the attribute of the object that uses ',
676 'an external resource reference key. Use an empty string if not applicable.'),
677 'default': ''},
678 {'name': 'field', 'type': str, 'default': '',
679 'doc': ('The field of the compound data type using an external resource.')},
680 {'name': 'all_instances', 'type': bool, 'default': False,
681 'doc': ('The bool to return a dataframe with all instances of the object_type.',
682 'If True, relative_path and field inputs will be ignored.')})
683 def get_object_type(self, **kwargs):
684 """
685 Get all entities/resources associated with an object_type.
686 """
687 object_type = kwargs['object_type']
688 relative_path = kwargs['relative_path']
689 field = kwargs['field']
690 all_instances = kwargs['all_instances']
692 df = self.to_dataframe()
694 if all_instances:
695 df = df.loc[df['object_type'] == object_type]
696 else:
697 df = df.loc[(df['object_type'] == object_type)
698 & (df['relative_path'] == relative_path)
699 & (df['field'] == field)]
700 return df
702 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file.',
703 'default': None},
704 {'name': 'container', 'type': (str, AbstractContainer),
705 'doc': 'The Container/data object that is linked to resources/entities.'},
706 {'name': 'attribute', 'type': str,
707 'doc': 'The attribute of the container for the external reference.', 'default': None},
708 {'name': 'relative_path', 'type': str,
709 'doc': ('The relative_path of the attribute of the object that uses ',
710 'an external resource reference key. Use an empty string if not applicable.'),
711 'default': ''},
712 {'name': 'field', 'type': str, 'default': '',
713 'doc': ('The field of the compound data type using an external resource.')})
714 def get_object_entities(self, **kwargs):
715 """
716 Get all entities/resources associated with an object.
717 """
718 file = kwargs['file']
719 container = kwargs['container']
720 attribute = kwargs['attribute']
721 relative_path = kwargs['relative_path']
722 field = kwargs['field']
724 if file is None:
725 file = self._get_file_from_container(container=container)
727 keys = []
728 entities = []
729 if attribute is None:
730 object_field = self._check_object_field(file=file,
731 container=container,
732 relative_path=relative_path,
733 field=field,
734 create=False)
735 else:
736 object_field = self._check_object_field(file=file,
737 container=container[attribute],
738 relative_path=relative_path,
739 field=field,
740 create=False)
741 # Find all keys associated with the object
742 for row_idx in self.object_keys.which(objects_idx=object_field.idx):
743 keys.append(self.object_keys['keys_idx', row_idx])
744 # Find all the entities/resources for each key.
745 for key_idx in keys:
746 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx)
747 for row_idx in entity_key_row_idx:
748 entity_idx = self.entity_keys['entities_idx', row_idx]
749 entities.append(self.entities.__getitem__(entity_idx))
750 df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri'])
751 return df
753 @docval({'name': 'use_categories', 'type': bool, 'default': False,
754 'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'},
755 rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.')
756 def to_dataframe(self, **kwargs):
757 """
758 Convert the data from the keys, resources, entities, objects, and object_keys tables
759 to a single joint dataframe. I.e., here data is being denormalized, e.g., keys that
760 are used across multiple entities or objects will duplicated across the corresponding
761 rows.
763 Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table.
765 """
766 use_categories = popargs('use_categories', kwargs)
767 # Step 1: Combine the entities, keys, and entity_keys table
768 ent_key_df = self.entity_keys.to_dataframe()
769 entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True)
770 keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True)
771 ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df],
772 axis=1,
773 verify_integrity=False)
774 # Step 2: Combine the the files, object_keys and objects tables
775 object_keys_df = self.object_keys.to_dataframe()
776 objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True)
777 object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df],
778 axis=1,
779 verify_integrity=False)
780 files_df = self.files.to_dataframe().iloc[object_keys_df['files_idx']].reset_index(drop=True)
781 file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df],
782 axis=1,
783 verify_integrity=False)
784 # Step 3: merge the combined entities_df and object_keys_df DataFrames
785 result_df = pd.concat(
786 # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables
787 objs=[pd.merge(
788 # Find all entities that correspond to the row i of the object_keys_table
789 ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True),
790 # Get a DataFrame for row i of the objects_keys_table
791 file_object_object_key_df.iloc[[i, ]],
792 # Merge the entities and object_keys on the keys_idx column so that the values from the single
793 # object_keys_table row are copied across all corresponding rows in the entities table
794 on='keys_idx')
795 for i in range(len(object_keys_df))],
796 # Concatenate the rows of the objs
797 axis=0,
798 verify_integrity=False)
799 # Step 4: Clean up the index and sort columns by table type and name
800 result_df.reset_index(inplace=True, drop=True)
801 # ADD files
802 file_id_col = []
803 for idx in result_df['files_idx']:
804 file_id_val = self.files.to_dataframe().iloc[int(idx)]['file_object_id']
805 file_id_col.append(file_id_val)
807 result_df['file_object_id'] = file_id_col
808 column_labels = [('files', 'file_object_id'),
809 ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'),
810 ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'),
811 ('keys', 'keys_idx'), ('keys', 'key'),
812 ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')]
813 # sort the columns based on our custom order
814 result_df = result_df.reindex(labels=[c[1] for c in column_labels],
815 axis=1)
816 result_df = result_df.astype({'keys_idx': 'uint32',
817 'objects_idx': 'uint32',
818 'files_idx': 'uint32',
819 'entities_idx': 'uint32'})
820 # Add the categories if requested
821 if use_categories: 821 ↛ 822line 821 didn't jump to line 822, because the condition on line 821 was never true
822 result_df.columns = pd.MultiIndex.from_tuples(column_labels)
823 # return the result
824 return result_df
826 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
827 def to_zip(self, **kwargs):
828 """
829 Write the tables in HERD to zipped tsv files.
830 """
831 zip_file = kwargs['path']
832 directory = os.path.dirname(zip_file)
834 files = [os.path.join(directory, child.name)+'.tsv' for child in self.children]
835 for i in range(len(self.children)):
836 df = self.children[i].to_dataframe()
837 df.to_csv(files[i], sep='\t', index=False)
839 with zipfile.ZipFile(zip_file, 'w') as zipF:
840 for file in files:
841 zipF.write(file)
843 # remove tsv files
844 for file in files:
845 os.remove(file)
847 @classmethod
848 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
849 def from_zip(cls, **kwargs):
850 """
851 Method to read in zipped tsv files to populate HERD.
852 """
853 zip_file = kwargs['path']
854 directory = os.path.dirname(zip_file)
856 with zipfile.ZipFile(zip_file, 'r') as zip:
857 zip.extractall(directory)
858 tsv_paths = glob(directory+'/*')
860 for file in tsv_paths:
861 file_name = os.path.basename(file)
862 if file_name == 'files.tsv':
863 files_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
864 files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False)
865 os.remove(file)
866 continue
867 if file_name == 'keys.tsv':
868 keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
869 keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False)
870 os.remove(file)
871 continue
872 if file_name == 'entities.tsv':
873 entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
874 entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False)
875 os.remove(file)
876 continue
877 if file_name == 'objects.tsv':
878 objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
879 objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False)
880 os.remove(file)
881 continue
882 if file_name == 'object_keys.tsv':
883 object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
884 object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False)
885 os.remove(file)
886 continue
887 if file_name == 'entity_keys.tsv':
888 ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '')
889 entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False)
890 os.remove(file)
891 continue
893 # we need to check the idx columns in entities, objects, and object_keys
894 entity_idx = entity_keys['entities_idx']
895 for idx in entity_idx:
896 if not int(idx) < len(entities):
897 msg = "Entity Index out of range in EntityTable. Please check for alterations."
898 raise ValueError(msg)
900 files_idx = objects['files_idx']
901 for idx in files_idx:
902 if not int(idx) < len(files):
903 msg = "File_ID Index out of range in ObjectTable. Please check for alterations."
904 raise ValueError(msg)
906 object_idx = object_keys['objects_idx']
907 for idx in object_idx:
908 if not int(idx) < len(objects):
909 msg = "Object Index out of range in ObjectKeyTable. Please check for alterations."
910 raise ValueError(msg)
912 keys_idx = object_keys['keys_idx']
913 for idx in keys_idx:
914 if not int(idx) < len(keys):
915 msg = "Key Index out of range in ObjectKeyTable. Please check for alterations."
916 raise ValueError(msg)
918 keys_idx = entity_keys['keys_idx']
919 for idx in keys_idx:
920 if not int(idx) < len(keys):
921 msg = "Key Index out of range in EntityKeyTable. Please check for alterations."
922 raise ValueError(msg)
925 er = HERD(files=files,
926 keys=keys,
927 entities=entities,
928 entity_keys=entity_keys,
929 objects=objects,
930 object_keys=object_keys)
931 return er