Coverage for src/hdmf/common/resources.py: 97%
443 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-07-10 23:48 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-07-10 23:48 +0000
1import pandas as pd
2import numpy as np
3from . import register_class, EXP_NAMESPACE
4from . import get_type_map
5from ..container import Table, Row, Container, AbstractContainer, Data, ExternalResourcesManager
6from ..data_utils import DataIO
7from ..utils import docval, popargs, AllowPositional
8from ..build import TypeMap
9from ..term_set import TermSet
10from glob import glob
11import os
12import zipfile
class KeyTable(Table):
    """
    Table holding the user-facing keys through which external resources are referenced.
    """

    __defaultname__ = 'keys'

    # single column: the key string itself
    __columns__ = (
        dict(name='key', type=str,
             doc='The user key that maps to the resource term / registry symbol.'),
    )
class Key(Row):
    """
    Row type bound to KeyTable; one instance represents one row of that table.
    """

    __table__ = KeyTable
class EntityTable(Table):
    """
    Table holding the external resource entities (id and URI) that keys resolve to.
    """

    __defaultname__ = 'entities'

    __columns__ = (
        dict(name='entity_id', type=str,
             doc='The unique ID for the resource term / registry symbol.'),
        dict(name='entity_uri', type=str,
             doc='The URI for the resource term / registry symbol.'),
    )
class Entity(Row):
    """
    Row type bound to EntityTable; one instance represents one row of that table.
    """

    __table__ = EntityTable
class FileTable(Table):
    """
    Table holding the ids of the files whose objects use external resources.
    """

    __defaultname__ = 'files'

    __columns__ = (
        dict(name='file_object_id', type=str,
             doc='The file id of the file that contains the object'),
    )
class File(Row):
    """
    Row type bound to FileTable; one instance represents one row of that table.
    """

    __table__ = FileTable
class ObjectTable(Table):
    """
    A table for storing objects (i.e. Containers) that contain keys that refer to external resources.

    Each row identifies one (file, object, relative_path, field) combination.
    """

    __defaultname__ = 'objects'

    __columns__ = (
        {'name': 'files_idx', 'type': int,
         'doc': 'The row idx for the file_object_id in FileTable containing the object.'},
        {'name': 'object_id', 'type': str,
         'doc': 'The object ID for the Container/Data.'},
        {'name': 'object_type', 'type': str,
         'doc': 'The type of the object. This is also the parent in relative_path.'},
        # NOTE: adjacent string literals (no comma) so that 'doc' is a single str, not a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.')},
        {'name': 'field', 'type': str,
         'doc': ('The field of the compound data type using an external resource. '
                 'Use an empty string if not applicable.')}
    )
class Object(Row):
    """
    Row type bound to ObjectTable; one instance represents one row of that table.
    """

    __table__ = ObjectTable
class ObjectKeyTable(Table):
    """
    Link table recording which keys each object uses to refer to external resources.
    """

    __defaultname__ = 'object_keys'

    # both columns accept either a raw row index or the corresponding Row object
    __columns__ = (
        dict(name='objects_idx', type=(int, Object),
             doc='The index into the objects table for the Object that uses the Key.'),
        dict(name='keys_idx', type=(int, Key),
             doc='The index into the keys table that is used to make an external resource reference.'),
    )
class EntityKeyTable(Table):
    """
    Link table recording which entities each key resolves to.
    """

    __defaultname__ = 'entity_keys'

    # both columns accept either a raw row index or the corresponding Row object
    __columns__ = (
        dict(name='entities_idx', type=(int, Entity),
             doc='The index into the EntityTable for the Entity that associated with the Key.'),
        dict(name='keys_idx', type=(int, Key),
             doc='The index into the KeyTable that is used to make an external resource reference.'),
    )
class EntityKey(Row):
    """
    Row type bound to EntityKeyTable; one instance represents one row of that table.
    """

    __table__ = EntityKeyTable
class ObjectKey(Row):
    """
    Row type bound to ObjectKeyTable; one instance represents one row of that table.
    """

    __table__ = ObjectKeyTable
157@register_class('ExternalResources', EXP_NAMESPACE)
158class ExternalResources(Container):
159 """A table for mapping user terms (i.e. keys) to resource entities."""
161 __fields__ = (
162 {'name': 'keys', 'child': True},
163 {'name': 'files', 'child': True},
164 {'name': 'objects', 'child': True},
165 {'name': 'object_keys', 'child': True},
166 {'name': 'entity_keys', 'child': True},
167 {'name': 'entities', 'child': True},
168 )
@docval({'name': 'keys', 'type': KeyTable, 'default': None,
         'doc': 'The table storing user keys for referencing resources.'},
        {'name': 'files', 'type': FileTable, 'default': None,
         'doc': 'The table for storing file ids used in external resources.'},
        {'name': 'entities', 'type': EntityTable, 'default': None,
         'doc': 'The table storing entity information.'},
        {'name': 'objects', 'type': ObjectTable, 'default': None,
         'doc': 'The table storing object information.'},
        {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None,
         'doc': 'The table storing object-key relationships.'},
        {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None,
         'doc': 'The table storing entity-key relationships.'},
        {'name': 'type_map', 'type': TypeMap, 'default': None,
         'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'},
        allow_positional=AllowPositional.WARNING)
def __init__(self, **kwargs):
    """
    Create the container, always named 'external_resources'.

    Every table argument that is None is replaced by a freshly created, empty table of the
    matching type; a None type_map is replaced by the result of get_type_map().
    """
    name = 'external_resources'
    super().__init__(name)

    # Compare against None explicitly: relying on truthiness (``x or Default()``) would
    # silently throw away a caller-supplied object that happens to evaluate as falsy.
    # The assignment order is kept as keys, files, entities, objects, object_keys, entity_keys.
    fallbacks = (
        ('keys', KeyTable),
        ('files', FileTable),
        ('entities', EntityTable),
        ('objects', ObjectTable),
        ('object_keys', ObjectKeyTable),
        ('entity_keys', EntityKeyTable),
        ('type_map', get_type_map),
    )
    for attr_name, make_default in fallbacks:
        supplied = kwargs[attr_name]
        setattr(self, attr_name, make_default() if supplied is None else supplied)
196 @staticmethod
197 def assert_external_resources_equal(left, right, check_dtype=True):
198 """
199 Compare that the keys, resources, entities, objects, and object_keys tables match
201 :param left: ExternalResources object to compare with right
202 :param right: ExternalResources object to compare with left
203 :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different
204 for example for ids, where depending on how the data was saved
205 ids may change from int64 to int32. (Default: True)
206 :returns: The function returns True if all values match. If mismatches are found,
207 AssertionError will be raised.
208 :raises AssertionError: Raised if any differences are found. The function collects
209 all differences into a single error so that the assertion will indicate
210 all found differences.
211 """
212 errors = []
213 try:
214 pd.testing.assert_frame_equal(left.keys.to_dataframe(),
215 right.keys.to_dataframe(),
216 check_dtype=check_dtype)
217 except AssertionError as e:
218 errors.append(e)
219 try:
220 pd.testing.assert_frame_equal(left.files.to_dataframe(),
221 right.files.to_dataframe(),
222 check_dtype=check_dtype)
223 except AssertionError as e:
224 errors.append(e)
225 try:
226 pd.testing.assert_frame_equal(left.objects.to_dataframe(),
227 right.objects.to_dataframe(),
228 check_dtype=check_dtype)
229 except AssertionError as e:
230 errors.append(e)
231 try:
232 pd.testing.assert_frame_equal(left.entities.to_dataframe(),
233 right.entities.to_dataframe(),
234 check_dtype=check_dtype)
235 except AssertionError as e:
236 errors.append(e)
237 try:
238 pd.testing.assert_frame_equal(left.object_keys.to_dataframe(),
239 right.object_keys.to_dataframe(),
240 check_dtype=check_dtype)
241 except AssertionError as e:
242 errors.append(e)
243 if len(errors) > 0:
244 msg = ''.join(str(e)+"\n\n" for e in errors)
245 raise AssertionError(msg)
246 return True
@docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'})
def _add_key(self, **kwargs):
    """
    Append a new key to the keys table and return the resulting Key row.

    The same *key_name* may be added more than once in order to refer to different resources,
    provided it is not reused within the same object, relative_path, and field. Each such use
    requires its own call to this method.

    The caller is responsible for holding on to the returned Key objects and passing the right
    one to the other methods that record information about the different resources.
    """
    return Key(kwargs['key_name'], table=self.keys)
@docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'})
def _add_file(self, **kwargs):
    """
    Append a new file id to the files table and return the resulting File row.

    This is optional when working in HDMF.
    """
    return File(kwargs['file_object_id'], table=self.files)
@docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'},
        {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'})
def _add_entity(self, **kwargs):
    """
    Append a new entity (id and URI) to the entities table and return the resulting Entity row.

    Entities are tied to keys through ExternalResources.entity_keys.
    """
    return Entity(kwargs['entity_id'], kwargs['entity_uri'], table=self.entities)
@docval({'name': 'container', 'type': (str, AbstractContainer),
         'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'},
        {'name': 'files_idx', 'type': int,
         'doc': 'The file_object_id row idx.'},
        {'name': 'object_type', 'type': str, 'default': None,
         'doc': ('The type of the object. This is also the parent in relative_path. If omitted, '
                 'the name of the container class is used.')},
        # adjacent literals (no comma) so that 'doc' is a single str rather than a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.')},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')})
def _add_object(self, **kwargs):
    """
    Add an object that references an external resource.

    Appends a row to the objects table and returns the resulting Object row.
    """
    files_idx, container, object_type, relative_path, field = popargs('files_idx',
                                                                      'container',
                                                                      'object_type',
                                                                      'relative_path',
                                                                      'field', kwargs)

    # must be derived before the container is reduced to its object id below
    if object_type is None:
        object_type = container.__class__.__name__

    # the table stores the object id, not the container itself
    if isinstance(container, AbstractContainer):
        container = container.object_id
    return Object(files_idx, container, object_type, relative_path, field, table=self.objects)
@docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'},
        {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'})
def _add_object_key(self, **kwargs):
    """
    Record in the object_keys table that an object (i.e. container and relative_path)
    uses a key to reference an external resource, and return the new ObjectKey row.
    """
    return ObjectKey(kwargs['obj'], kwargs['key'], table=self.object_keys)
@docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'},
        {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the connected to the Entity.'})
def _add_entity_key(self, **kwargs):
    """
    Record an entity-key relationship in the EntityKeyTable and return the new EntityKey row.
    """
    return EntityKey(kwargs['entity'], kwargs['key'], table=self.entity_keys)
@docval({'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.'},
        {'name': 'container', 'type': AbstractContainer,
         'doc': ('The Container/Data object that uses the key or '
                 'the object id for the Container/Data object that uses the key.')},
        # adjacent literals (no comma) so that 'doc' is a single str rather than a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.'),
         'default': ''},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')},
        {'name': 'create', 'type': bool, 'default': True,
         'doc': 'Whether to add the object to the objects table if it is not there yet.'})
def _check_object_field(self, **kwargs):
    """
    Check if a container, relative path, and field have been added.

    The container must be an AbstractContainer; its ``object_id`` is used for the lookup.
    The file is added to the files table first if it is not registered yet.

    If the container, relative_path, and field have not been added, add them
    and return the corresponding Object (only when ``create`` is True).
    Otherwise, just return the Object.

    :raises ValueError: if the file is registered more than once, if the object is missing
        and ``create`` is False, or if more than one matching object row exists.
    """
    file = kwargs['file']
    container = kwargs['container']
    relative_path = kwargs['relative_path']
    field = kwargs['field']
    create = kwargs['create']
    file_object_id = file.object_id
    files_idx = self.files.which(file_object_id=file_object_id)

    if len(files_idx) > 1:
        raise ValueError("Found multiple instances of the same file.")
    elif len(files_idx) == 1:
        files_idx = files_idx[0]
    else:
        # first time this file is seen: register it, then look up its row index
        self._add_file(file_object_id)
        files_idx = self.files.which(file_object_id=file_object_id)[0]

    objecttable_idx = self.objects.which(object_id=container.object_id)

    if len(objecttable_idx) > 0:
        # narrow the rows of this object down to the requested relative_path and field
        relative_path_idx = self.objects.which(relative_path=relative_path)
        field_idx = self.objects.which(field=field)
        objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx))

    if len(objecttable_idx) == 1:
        return self.objects.row[objecttable_idx[0]]
    elif len(objecttable_idx) == 0 and create:
        return self._add_object(files_idx=files_idx, container=container,
                                relative_path=relative_path, field=field)
    elif len(objecttable_idx) == 0 and not create:
        raise ValueError("Object not in Object Table.")
    else:
        raise ValueError("Found multiple instances of the same object id, relative path, "
                         "and field in objects table.")
@docval({'name': 'container', 'type': (str, AbstractContainer),
         'doc': ('The Container/Data object that uses the key or '
                 'the object id for the Container/Data object that uses the key.')})
def _get_file_from_container(self, **kwargs):
    """
    Method to retrieve a file associated with the container in the case a file is not provided.

    The container itself is returned if it is an ExternalResourcesManager; otherwise its chain
    of ``parent`` attributes is walked until an ExternalResourcesManager is found.

    :raises ValueError: if no ExternalResourcesManager is found, i.e. the container has no
        parent (which includes a plain object-id string) or none of its ancestors is a file.
    """
    container = kwargs['container']

    if isinstance(container, ExternalResourcesManager):
        return container

    # a str object id has no ``parent``; treat it like an unattached container
    parent = getattr(container, 'parent', None)
    while parent is not None:
        if isinstance(parent, ExternalResourcesManager):
            return parent
        parent = parent.parent

    # Reached both when there is no parent at all and when the top of the hierarchy is
    # not a file; previously the latter case silently returned None.
    msg = 'Could not find file. Add container to the file.'
    raise ValueError(msg)
@docval({'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.',
         'default': None},
        {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
         'doc': ('The Container/Data object that uses the key or '
                 'the object_id for the Container/Data object that uses the key.')},
        {'name': 'attribute', 'type': str,
         'doc': 'The attribute of the container for the external reference.', 'default': None},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')},
        {'name': 'key', 'type': (str, Key), 'default': None,
         'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
        {'name': 'term_set', 'type': TermSet, 'default': None,
         'doc': 'The TermSet to be used if the container/attribute does not have one.'}
        )
def add_ref_term_set(self, **kwargs):
    """
    Add external references for terms by looking each term up in a TermSet.

    If ``term_set`` is not given, it is taken from ``container.term_set`` (no attribute) or
    from ``container[attribute].term_set``. If ``key`` is given, it is the only term looked
    up; otherwise every value of the container (or of ``container[attribute]``) is looked up.
    For every term found in the TermSet, ``add_ref`` is called with the entity id and URI
    taken from the TermSet entry.

    :returns: True if every term was found, otherwise a dict
        ``{"Missing Values in TermSet": [...]}`` listing the terms that were not found.
    :raises AttributeError: if the container has no ``term_set`` attribute.
    :raises ValueError: if no TermSet can be determined, or if the data to look up is of an
        unsupported type.
    """
    file = kwargs['file']
    container = kwargs['container']
    attribute = kwargs['attribute']
    key = kwargs['key']
    field = kwargs['field']
    term_set = kwargs['term_set']

    if term_set is None:
        if attribute is None:
            try:
                term_set = container.term_set
            except AttributeError:
                msg = "Cannot Find TermSet"
                raise AttributeError(msg)
        else:
            term_set = container[attribute].term_set
            if term_set is None:
                msg = "Cannot Find TermSet"
                raise ValueError(msg)

    if file is None:
        file = self._get_file_from_container(container=container)

    # if key is provided then add_ref proceeds as normal
    # use key provided as the term in the term_set for entity look-up
    if key is not None:
        data = [key]
    else:
        if attribute is None:
            data_object = container
        else:
            data_object = container[attribute]
        if isinstance(data_object, (Data, DataIO)):
            data = data_object.data
        elif isinstance(data_object, (list, np.ndarray)):
            data = data_object
        else:
            # without this branch ``data`` would be unbound and the loop below would
            # fail with an UnboundLocalError
            msg = ("Cannot retrieve terms from object of type '%s'. Expected Data, DataIO, "
                   "list, or numpy.ndarray." % type(data_object).__name__)
            raise ValueError(msg)

    missing_terms = []
    for term in data:
        try:
            term_info = term_set[term]
        except ValueError:
            # the term is not part of the TermSet; report it at the end
            missing_terms.append(term)
            continue
        # the code reads item 0 as the entity id and item 2 as the entity URI
        entity_id = term_info[0]
        entity_uri = term_info[2]
        self.add_ref(file=file,
                     container=container,
                     attribute=attribute,
                     key=term,
                     field=field,
                     entity_id=entity_id,
                     entity_uri=entity_uri)
    if len(missing_terms) > 0:
        return {"Missing Values in TermSet": missing_terms}
    else:
        return True
@docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
        {'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.',
         'default': None},
        {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
         'doc': ('The Container/Data object that uses the key or '
                 'the object id for the Container/Data object that uses the key.')},
        # adjacent literals (no comma) so that 'doc' is a single str rather than a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.'),
         'default': ''},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')})
def get_key(self, **kwargs):
    """
    Return a Key.

    If container, relative_path, and field are provided, the Key that corresponds to the given name of the key
    for the given container, relative_path, and field is returned. Without a container, the key name
    must be unique in the keys table.

    :raises ValueError: if no matching key exists, or if the name is ambiguous and no container is given.
    """
    key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs)
    key_idx_matches = self.keys.which(key=key_name)

    file = kwargs['file']

    if container is not None:
        if file is None:
            file = self._get_file_from_container(container=container)
        # if same key is used multiple times, determine
        # which instance based on the Container
        object_field = self._check_object_field(file=file,
                                                container=container,
                                                relative_path=relative_path,
                                                field=field)
        for row_idx in self.object_keys.which(objects_idx=object_field.idx):
            key_idx = self.object_keys['keys_idx', row_idx]
            if key_idx in key_idx_matches:
                return self.keys.row[key_idx]
        msg = "No key found with that container."
        raise ValueError(msg)
    else:
        if len(key_idx_matches) == 0:
            # the key has never been used before
            raise ValueError("key '%s' does not exist" % key_name)
        elif len(key_idx_matches) > 1:
            msg = "There are more than one key with that name. Please search with additional information."
            raise ValueError(msg)
        else:
            return self.keys.row[key_idx_matches[0]]
@docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'})
def get_entity(self, **kwargs):
    """
    Return the first Entity row whose entity_id matches, or None if there is no such row.
    """
    matching_rows = self.entities.which(entity_id=kwargs['entity_id'])
    if len(matching_rows) == 0:
        return None
    return self.entities.row[matching_rows[0]]
@docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None,
         'doc': ('The Container/Data object that uses the key or '
                 'the object_id for the Container/Data object that uses the key.')},
        {'name': 'attribute', 'type': str,
         'doc': 'The attribute of the container for the external reference.', 'default': None},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')},
        {'name': 'key', 'type': (str, Key), 'default': None,
         'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
        {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'},
        {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None},
        {'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.',
         'default': None},
        )
def add_ref(self, **kwargs):
    """
    Add information about an external reference used in this file.

    The same key name may refer to different resources as long as it is not used
    within the same object, relative_path, and field combination. This method does
    not support such functionality by default.

    Returns the tuple ``(key, entity)`` of the Key and Entity rows that were used or created.
    """
    container = kwargs['container']
    attribute = kwargs['attribute']
    key = kwargs['key']
    field = kwargs['field']
    entity_id = kwargs['entity_id']
    entity_uri = kwargs['entity_uri']
    file = kwargs['file']

    if file is None:
        file = self._get_file_from_container(container=container)

    # ---- Step 1: work out which object / relative_path the reference is attached to ----
    if attribute is None:
        # Trivial case: the reference is on the container itself
        target, relative_path = container, ''
    else:
        attribute_object = getattr(container, attribute)
        if isinstance(attribute_object, AbstractContainer):
            # DataType attribute: the attribute is an object in its own right
            target, relative_path = attribute_object, ''
        else:
            # Non-DataType attribute: reference it through the container and the spec path
            spec = self.type_map.get_map(container).get_attr_spec(attr_name=attribute)
            nearest_spec = spec.parent
            if nearest_spec.data_type is None:
                # climb to the closest enclosing spec that has a data_type
                while nearest_spec.data_type is None:
                    nearest_spec = nearest_spec.parent
                nearest_cls = self.type_map.get_dt_container_cls(data_type=nearest_spec.data_type,
                                                                 autogen=False)
                if not isinstance(container, nearest_cls):
                    msg = 'Container not the nearest data_type'
                    raise ValueError(msg)
            # drop everything up to and including the first '/' of the spec path
            spec_path = spec.path
            target = container
            relative_path = spec_path[spec_path.find('/') + 1:]

    object_field = self._check_object_field(file=file,
                                            container=target,
                                            relative_path=relative_path,
                                            field=field)

    # ---- Step 2: resolve the key and make sure it is linked to the object ----
    if isinstance(key, Key):
        # An existing Key must already appear in the ObjectKeyTable; link it to this
        # object only if that particular link is not recorded yet.
        linked_rows = self.object_keys.which(keys_idx=key.idx)
        if len(linked_rows) == 0:
            msg = "Cannot find key object. Create new Key with string."
            raise ValueError(msg)
        already_linked = any(self.object_keys['objects_idx', row_idx] == object_field.idx
                             for row_idx in linked_rows)
        if not already_linked:
            self._add_object_key(object_field, key)
    else:
        same_name_keys = self.keys.which(key=key)
        # a key name may exist several times; refuse to duplicate it for the same object
        for row_idx in self.object_keys.which(objects_idx=object_field.idx):
            if self.object_keys['keys_idx', row_idx] in same_name_keys:
                msg = "Use Key Object when referencing an existing (container, relative_path, key)"
                raise ValueError(msg)
        key = self._add_key(key)
        self._add_object_key(object_field, key)

    # ---- Step 3: resolve the entity and make sure it is linked to the key ----
    entity = self.get_entity(entity_id=entity_id)
    if entity is None:
        if entity_uri is None:
            msg = 'New entities must have an entity_uri.'
            raise ValueError(msg)
        entity = self._add_entity(entity_id, entity_uri)
        self._add_entity_key(entity, key)
    else:
        if entity_uri is not None:
            msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.'
            raise ValueError(msg)
        # add the key-entity link unless exactly this pair is already in the EntityKeyTable
        # (this also covers the case where the key has no rows there at all)
        rows_for_key = self.entity_keys.which(keys_idx=key.idx)
        pair_recorded = any(self.entity_keys['entities_idx', row_idx] == entity.idx
                            for row_idx in rows_for_key)
        if not pair_recorded:
            self._add_entity_key(entity, key)
    return key, entity
@docval({'name': 'object_type', 'type': str,
         'doc': 'The type of the object. This is also the parent in relative_path.'},
        # adjacent literals (no comma) so that each 'doc' is a single str rather than a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.'),
         'default': ''},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')},
        {'name': 'all_instances', 'type': bool, 'default': False,
         'doc': ('The bool to return a dataframe with all instances of the object_type. '
                 'If True, relative_path and field inputs will be ignored.')})
def get_object_type(self, **kwargs):
    """
    Get all entities/resources associated with an object_type.

    Returns the rows of ``to_dataframe()`` whose object_type matches; unless ``all_instances``
    is True, the rows must also match ``relative_path`` and ``field``.
    """
    object_type = kwargs['object_type']
    relative_path = kwargs['relative_path']
    field = kwargs['field']
    all_instances = kwargs['all_instances']

    df = self.to_dataframe()

    row_mask = df['object_type'] == object_type
    if not all_instances:
        row_mask = row_mask & (df['relative_path'] == relative_path) & (df['field'] == field)
    return df.loc[row_mask]
@docval({'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file.',
         'default': None},
        {'name': 'container', 'type': (str, AbstractContainer),
         'doc': 'The Container/data object that is linked to resources/entities.'},
        {'name': 'attribute', 'type': str,
         'doc': 'The attribute of the container for the external reference.', 'default': None},
        # adjacent literals (no comma) so that 'doc' is a single str rather than a tuple
        {'name': 'relative_path', 'type': str,
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.'),
         'default': ''},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')})
def get_object_entities(self, **kwargs):
    """
    Get all entities/resources associated with an object.

    The object must already be in the objects table (it is looked up with ``create=False``).
    Returns a DataFrame with the columns ``entity_id`` and ``entity_uri``, one row per
    key-entity link of the object.
    """
    file = kwargs['file']
    container = kwargs['container']
    attribute = kwargs['attribute']
    relative_path = kwargs['relative_path']
    field = kwargs['field']

    if file is None:
        file = self._get_file_from_container(container=container)

    # with an attribute, the object of interest is the attribute rather than the container
    target = container if attribute is None else container[attribute]
    object_field = self._check_object_field(file=file,
                                            container=target,
                                            relative_path=relative_path,
                                            field=field,
                                            create=False)
    # Find all keys associated with the object
    keys = [self.object_keys['keys_idx', row_idx]
            for row_idx in self.object_keys.which(objects_idx=object_field.idx)]
    # Find all the entities/resources for each key.
    entities = []
    for key_idx in keys:
        for row_idx in self.entity_keys.which(keys_idx=key_idx):
            entity_idx = self.entity_keys['entities_idx', row_idx]
            entities.append(self.entities[entity_idx])
    return pd.DataFrame(entities, columns=['entity_id', 'entity_uri'])
@docval({'name': 'use_categories', 'type': bool, 'default': False,
         'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'},
        rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.')
def to_dataframe(self, **kwargs):
    """
    Convert the data from the keys, files, entities, entity_keys, objects, and object_keys tables
    to a single joint dataframe. I.e., here data is being denormalized, e.g., keys that
    are used across multiple entities or objects will be duplicated across the corresponding
    rows.

    Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table.

    """
    use_categories = popargs('use_categories', kwargs)
    # Step 1: Combine the entities, keys, and entity_keys table
    ent_key_df = self.entity_keys.to_dataframe()
    entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True)
    keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True)
    ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df],
                           axis=1,
                           verify_integrity=False)
    # Step 2: Combine the files, object_keys and objects tables
    object_keys_df = self.object_keys.to_dataframe()
    objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True)
    object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df],
                               axis=1,
                               verify_integrity=False)
    # built once and reused below instead of being regenerated for every result row
    all_files_df = self.files.to_dataframe()
    files_df = all_files_df.iloc[object_keys_df['files_idx']].reset_index(drop=True)
    file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df],
                                          axis=1,
                                          verify_integrity=False)
    # Step 3: merge the combined entities_df and object_keys_df DataFrames
    result_df = pd.concat(
        # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables
        objs=[pd.merge(
                # Find all entities that correspond to the row i of the object_keys_table
                ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True),
                # Get a DataFrame for row i of the objects_keys_table
                file_object_object_key_df.iloc[[i, ]],
                # Merge the entities and object_keys on the keys_idx column so that the values from the single
                # object_keys_table row are copied across all corresponding rows in the entities table
                on='keys_idx')
              for i in range(len(object_keys_df))],
        # Concatenate the rows of the objs
        axis=0,
        verify_integrity=False)
    # Step 4: Clean up the index and sort columns by table type and name
    result_df.reset_index(inplace=True, drop=True)
    # ADD files: look up the file id for every row through its files_idx
    result_df['file_object_id'] = [all_files_df.iloc[int(idx)]['file_object_id']
                                   for idx in result_df['files_idx']]
    column_labels = [('files', 'file_object_id'),
                     ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'),
                     ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'),
                     ('keys', 'keys_idx'), ('keys', 'key'),
                     ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')]
    # sort the columns based on our custom order
    result_df = result_df.reindex(labels=[c[1] for c in column_labels],
                                  axis=1)
    result_df = result_df.astype({'keys_idx': 'uint32',
                                  'objects_idx': 'uint32',
                                  'files_idx': 'uint32',
                                  'entities_idx': 'uint32'})
    # Add the categories if requested
    if use_categories:
        result_df.columns = pd.MultiIndex.from_tuples(column_labels)
    # return the result
    return result_df
@docval({'name': 'path', 'type': str, 'doc': 'path of the folder tsv file to write'})
def to_norm_tsv(self, **kwargs):
    """
    Write the tables in ExternalResources to individual tsv files and bundle them
    into a single ``er.zip`` archive inside the folder given by ``path``.

    Each child table is written to ``<path>/<child.name>.tsv``, added to the archive
    under its bare file name, and then deleted again so that only ``er.zip`` remains.
    """
    folder = kwargs['path']
    tsv_paths = []
    try:
        for child in self.children:
            # os.path.join inserts the separator when the folder has no trailing slash
            tsv_path = os.path.join(folder, child.name + '.tsv')
            tsv_paths.append(tsv_path)
            child.to_dataframe().to_csv(tsv_path, sep='\t', index=False)

        # Write the archive next to the tsv files, where from_norm_tsv looks for it
        with zipfile.ZipFile(os.path.join(folder, 'er.zip'), 'w') as zipF:
            for tsv_path in tsv_paths:
                # Store members by bare file name so the archive layout does not depend on `path`
                zipF.write(tsv_path, arcname=os.path.basename(tsv_path))
    finally:
        # remove the temporary tsv files, even when writing or zipping failed part-way
        for tsv_path in tsv_paths:
            if os.path.exists(tsv_path):
                os.remove(tsv_path)
@classmethod
@docval({'name': 'path', 'type': str, 'doc': 'path of the folder containing the tsv files to read'},
        returns="ExternalResources loaded from TSV", rtype="ExternalResources")
def from_norm_tsv(cls, **kwargs):
    """
    Load ExternalResources from the ``er.zip`` archive located in the folder ``path``.

    The table files are read directly from the archive, so nothing is extracted into
    (or deleted from) the folder. Archive members are matched by their base file name;
    members with other names are ignored. Missing values in the tsv files are replaced
    by empty strings.

    Raises:
        ValueError: if one of the expected table files is missing from the archive, or
            if an index column refers to a row outside of the table it points to.
    """
    path = kwargs['path']

    # Archive member name -> (table class, table name) it populates
    table_specs = {
        'files.tsv': (FileTable, 'files'),
        'keys.tsv': (KeyTable, 'keys'),
        'entities.tsv': (EntityTable, 'entities'),
        'objects.tsv': (ObjectTable, 'objects'),
        'object_keys.tsv': (ObjectKeyTable, 'object_keys'),
        'entity_keys.tsv': (EntityKeyTable, 'entity_keys'),
    }

    tables = {}
    with zipfile.ZipFile(os.path.join(path, 'er.zip'), 'r') as archive:
        for member in archive.namelist():
            spec = table_specs.get(os.path.basename(member))
            if spec is None:
                continue
            table_cls, table_name = spec
            with archive.open(member) as tsv_file:
                table_df = pd.read_csv(tsv_file, sep='\t').replace(np.nan, '')
            tables[table_name] = table_cls().from_dataframe(df=table_df, name=table_name, extra_ok=False)

    missing = [file_name for file_name, (_, table_name) in table_specs.items() if table_name not in tables]
    if missing:
        msg = "Missing table file(s) in er.zip: " + ", ".join(missing)
        raise ValueError(msg)

    # we need to check the idx columns in entity_keys, objects, and object_keys:
    # (table holding the index column, index column, table being indexed, error message)
    index_checks = (
        ('entity_keys', 'entities_idx', 'entities',
         "Entity Index out of range in EntityTable. Please check for alterations."),
        ('objects', 'files_idx', 'files',
         "File_ID Index out of range in ObjectTable. Please check for alterations."),
        ('object_keys', 'objects_idx', 'objects',
         "Object Index out of range in ObjectKeyTable. Please check for alterations."),
        ('object_keys', 'keys_idx', 'keys',
         "Key Index out of range in ObjectKeyTable. Please check for alterations."),
        ('entity_keys', 'keys_idx', 'keys',
         "Key Index out of range in EntityKeyTable. Please check for alterations."),
    )
    for source_name, column, target_name, msg in index_checks:
        limit = len(tables[target_name])
        for idx in tables[source_name][column]:
            # Negative values are rejected as well; they are not valid row indices here
            if not 0 <= int(idx) < limit:
                raise ValueError(msg)

    er = ExternalResources(files=tables['files'],
                           keys=tables['keys'],
                           entities=tables['entities'],
                           entity_keys=tables['entity_keys'],
                           objects=tables['objects'],
                           object_keys=tables['object_keys'])
    return er