Coverage for src/hdmf/common/resources.py: 98%

432 statements  

coverage.py v7.3.2, created at 2023-10-04 02:57 +0000

1import pandas as pd 

2import numpy as np 

3from . import register_class, EXP_NAMESPACE 

4from . import get_type_map 

5from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager 

6from ..utils import docval, popargs, AllowPositional 

7from ..build import TypeMap 

8from ..term_set import TermSetWrapper 

9from glob import glob 

10import os 

11import zipfile 

12from collections import namedtuple 

13 

14 

15class KeyTable(Table): 

16 """ 

17 A table for storing keys used to reference external resources. 

18 """ 

19 

20 __defaultname__ = 'keys' 

21 

22 __columns__ = ( 

23 {'name': 'key', 'type': str, 

24 'doc': 'The user key that maps to the resource term / registry symbol.'}, 

25 ) 

26 

27 

28class Key(Row): 

29 """ 

30 A Row class for representing rows in the KeyTable. 

31 """ 

32 

33 __table__ = KeyTable 

34 

35 
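A minimal usage sketch of the Row/Table pairing used throughout this module (grounded in the `table=` pattern that `_add_key` and the other `_add_*` helpers below rely on): constructing a Row with a `table` argument appends it to that table. The key value here is illustrative.

from hdmf.common.resources import KeyTable, Key

kt = KeyTable()
k = Key('Homo sapiens', table=kt)   # constructing the Row appends it to kt
kt.which(key='Homo sapiens')        # -> [0]
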

36class EntityTable(Table): 

37 """ 

38 A table for storing the external resources a key refers to. 

39 """ 

40 

41 __defaultname__ = 'entities' 

42 

43 __columns__ = ( 

44 {'name': 'entity_id', 'type': str, 

45 'doc': 'The unique ID for the resource term / registry symbol.'}, 

46 {'name': 'entity_uri', 'type': str, 

47 'doc': 'The URI for the resource term / registry symbol.'}, 

48 ) 

49 

50 

51class Entity(Row): 

52 """ 

53 A Row class for representing rows in the EntityTable. 

54 """ 

55 

56 __table__ = EntityTable 

57 

58 

59class FileTable(Table): 

60 """ 

61 A table for storing file ids used in external resources. 

62 """ 

63 

64 __defaultname__ = 'files' 

65 

66 __columns__ = ( 

67 {'name': 'file_object_id', 'type': str, 

68 'doc': 'The file id of the file that contains the object'}, 

69 ) 

70 

71 

72class File(Row): 

73 """ 

74 A Row class for representing rows in the FileTable. 

75 """ 

76 

77 __table__ = FileTable 

78 

79 

80class ObjectTable(Table): 

81 """ 

82 A table for storing objects (i.e. Containers) that contain keys that refer to external resources. 

83 """ 

84 

85 __defaultname__ = 'objects' 

86 

87 __columns__ = ( 

88 {'name': 'files_idx', 'type': int, 

89 'doc': 'The row idx for the file_object_id in FileTable containing the object.'}, 

90 {'name': 'object_id', 'type': str, 

91 'doc': 'The object ID for the Container/Data.'}, 

92 {'name': 'object_type', 'type': str, 

93 'doc': 'The type of the object. This is also the parent in relative_path.'}, 

94 {'name': 'relative_path', 'type': str, 

95 'doc': ('The relative_path of the attribute of the object that uses '

96 'an external resource reference key. Use an empty string if not applicable.')}, 

97 {'name': 'field', 'type': str, 

98 'doc': ('The field of the compound data type using an external resource. ' 

99 'Use an empty string if not applicable.')} 

100 ) 

101 

102 

103class Object(Row): 

104 """ 

105 A Row class for representing rows in the ObjectTable. 

106 """ 

107 

108 __table__ = ObjectTable 

109 

110 

111class ObjectKeyTable(Table): 

112 """ 

113 A table for identifying which keys are used by which objects for referring to external resources. 

114 """ 

115 

116 __defaultname__ = 'object_keys' 

117 

118 __columns__ = ( 

119 {'name': 'objects_idx', 'type': (int, Object), 

120 'doc': 'The index into the objects table for the Object that uses the Key.'}, 

121 {'name': 'keys_idx', 'type': (int, Key), 

122 'doc': 'The index into the keys table that is used to make an external resource reference.'} 

123 ) 

124 

125 

126class EntityKeyTable(Table): 

127 """ 

128 A table for identifying which entities are used by which keys for referring to external resources. 

129 """ 

130 

131 __defaultname__ = 'entity_keys' 

132 

133 __columns__ = ( 

134 {'name': 'entities_idx', 'type': (int, Entity), 

135 'doc': 'The index into the EntityTable for the Entity that is associated with the Key.'},

136 {'name': 'keys_idx', 'type': (int, Key), 

137 'doc': 'The index into the KeyTable that is used to make an external resource reference.'} 

138 ) 

139 

140 

141class EntityKey(Row): 

142 """ 

143 A Row class for representing rows in the EntityKeyTable. 

144 """ 

145 

146 __table__ = EntityKeyTable 

147 

148 

149class ObjectKey(Row): 

150 """ 

151 A Row class for representing rows in the ObjectKeyTable. 

152 """ 

153 

154 __table__ = ObjectKeyTable 

155 

156 

157@register_class('HERD', EXP_NAMESPACE) 

158class HERD(Container): 

159 """ 

160 HDMF External Resources Data Structure. 

161 A table for mapping user terms (i.e. keys) to resource entities. 

162 """ 

163 

164 __fields__ = ( 

165 {'name': 'keys', 'child': True}, 

166 {'name': 'files', 'child': True}, 

167 {'name': 'objects', 'child': True}, 

168 {'name': 'object_keys', 'child': True}, 

169 {'name': 'entity_keys', 'child': True}, 

170 {'name': 'entities', 'child': True}, 

171 ) 

172 

173 @docval({'name': 'keys', 'type': KeyTable, 'default': None, 

174 'doc': 'The table storing user keys for referencing resources.'}, 

175 {'name': 'files', 'type': FileTable, 'default': None, 

176 'doc': 'The table for storing file ids used in external resources.'}, 

177 {'name': 'entities', 'type': EntityTable, 'default': None, 

178 'doc': 'The table storing entity information.'}, 

179 {'name': 'objects', 'type': ObjectTable, 'default': None, 

180 'doc': 'The table storing object information.'}, 

181 {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None, 

182 'doc': 'The table storing object-key relationships.'}, 

183 {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None, 

184 'doc': 'The table storing entity-key relationships.'}, 

185 {'name': 'type_map', 'type': TypeMap, 'default': None, 

186 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'}, 

187 allow_positional=AllowPositional.WARNING) 

188 def __init__(self, **kwargs): 

189 name = 'external_resources' 

190 super().__init__(name) 

191 self.keys = kwargs['keys'] or KeyTable() 

192 self.files = kwargs['files'] or FileTable() 

193 self.entities = kwargs['entities'] or EntityTable() 

194 self.objects = kwargs['objects'] or ObjectTable() 

195 self.object_keys = kwargs['object_keys'] or ObjectKeyTable() 

196 self.entity_keys = kwargs['entity_keys'] or EntityKeyTable() 

197 self.type_map = kwargs['type_map'] or get_type_map() 

198 
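A short construction sketch: called with no arguments, the constructor above creates empty tables and falls back to the HDMF-common type map.

from hdmf.common.resources import HERD

herd = HERD()      # name is always 'external_resources'
len(herd.keys)     # -> 0; all six tables start empty
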

199 @staticmethod 

200 def assert_external_resources_equal(left, right, check_dtype=True): 

201 """ 

202 Compare that the keys, files, entities, objects, and object_keys tables match.

203 

204 :param left: HERD object to compare with right 

205 :param right: HERD object to compare with left 

206 :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different 

207 for example for ids, where depending on how the data was saved 

208 ids may change from int64 to int32. (Default: True) 

209 :returns: The function returns True if all values match. If mismatches are found, 

210 AssertionError will be raised. 

211 :raises AssertionError: Raised if any differences are found. The function collects 

212 all differences into a single error so that the assertion will indicate 

213 all found differences. 

214 """ 

215 errors = [] 

216 try: 

217 pd.testing.assert_frame_equal(left.keys.to_dataframe(), 

218 right.keys.to_dataframe(), 

219 check_dtype=check_dtype) 

220 except AssertionError as e: 

221 errors.append(e) 

222 try: 

223 pd.testing.assert_frame_equal(left.files.to_dataframe(), 

224 right.files.to_dataframe(), 

225 check_dtype=check_dtype) 

226 except AssertionError as e: 

227 errors.append(e) 

228 try: 

229 pd.testing.assert_frame_equal(left.objects.to_dataframe(), 

230 right.objects.to_dataframe(), 

231 check_dtype=check_dtype) 

232 except AssertionError as e: 

233 errors.append(e) 

234 try: 

235 pd.testing.assert_frame_equal(left.entities.to_dataframe(), 

236 right.entities.to_dataframe(), 

237 check_dtype=check_dtype) 

238 except AssertionError as e: 

239 errors.append(e) 

240 try: 

241 pd.testing.assert_frame_equal(left.object_keys.to_dataframe(), 

242 right.object_keys.to_dataframe(), 

243 check_dtype=check_dtype) 

244 except AssertionError as e: 

245 errors.append(e) 

246 if len(errors) > 0: 

247 msg = ''.join(str(e)+"\n\n" for e in errors) 

248 raise AssertionError(msg) 

249 return True 

250 
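A usage sketch for the comparison helper above; two freshly constructed HERD instances should compare equal, and any mismatch is reported as a single combined AssertionError.

left, right = HERD(), HERD()
HERD.assert_external_resources_equal(left, right)                     # -> True
HERD.assert_external_resources_equal(left, right, check_dtype=False)  # tolerate e.g. int32/int64 id differences
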

251 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'}) 

252 def _add_key(self, **kwargs): 

253 """ 

254 Add a key to be used for making references to external resources. 

255 

256 It is possible to use the same *key_name* to refer to different resources so long as the *key_name* is not 

257 used within the same object, relative_path, and field. To do so, this method must be called for the 

258 two different resources. 

259 

260 The returned Key objects must be managed by the caller so as to be appropriately passed to subsequent calls 

261 to methods for storing information about the different resources. 

262 """ 

263 key = kwargs['key_name'] 

264 return Key(key, table=self.keys) 

265 

266 @docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'}) 

267 def _add_file(self, **kwargs): 

268 """ 

269 Add a file to be used for making references to external resources. 

270 

271 This is optional when working in HDMF. 

272 """ 

273 file_object_id = kwargs['file_object_id'] 

274 return File(file_object_id, table=self.files) 

275 

276 @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, 

277 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'}) 

278 def _add_entity(self, **kwargs): 

279 """ 

280 Add an entity that will be referenced using keys specified in HERD.entity_keys.

281 """ 

282 entity_id = kwargs['entity_id'] 

283 entity_uri = kwargs['entity_uri'] 

284 entity = Entity(entity_id, entity_uri, table=self.entities)

285 return entity 

286 

287 @docval({'name': 'container', 'type': (str, AbstractContainer), 

288 'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'}, 

289 {'name': 'files_idx', 'type': int, 

290 'doc': 'The file_object_id row idx.'}, 

291 {'name': 'object_type', 'type': str, 'default': None, 

292 'doc': ('The type of the object. This is also the parent in relative_path. If omitted, ' 

293 'the name of the container class is used.')}, 

294 {'name': 'relative_path', 'type': str, 

295 'doc': ('The relative_path of the attribute of the object that uses '

296 'an external resource reference key. Use an empty string if not applicable.')}, 

297 {'name': 'field', 'type': str, 'default': '', 

298 'doc': ('The field of the compound data type using an external resource.')}) 

299 def _add_object(self, **kwargs): 

300 """ 

301 Add an object that references an external resource. 

302 """ 

303 files_idx, container, object_type, relative_path, field = popargs('files_idx', 

304 'container', 

305 'object_type', 

306 'relative_path', 

307 'field', kwargs) 

308 

309 if object_type is None:  # coverage: branch 309 ↛ 312 not taken (the condition on line 309 was never false)

310 object_type = container.__class__.__name__ 

311 

312 if isinstance(container, AbstractContainer):  # coverage: branch 312 ↛ 314 not taken (the condition on line 312 was never false)

313 container = container.object_id 

314 obj = Object(files_idx, container, object_type, relative_path, field, table=self.objects) 

315 return obj 

316 

317 @docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'}, 

318 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'}) 

319 def _add_object_key(self, **kwargs): 

320 """ 

321 Specify that an object (i.e. container and relative_path) uses a key to reference 

322 an external resource. 

323 """ 

324 obj, key = popargs('obj', 'key', kwargs) 

325 return ObjectKey(obj, key, table=self.object_keys) 

326 

327 @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'}, 

328 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that is connected to the Entity.'})

329 def _add_entity_key(self, **kwargs): 

330 """ 

331 Add entity-key relationship to the EntityKeyTable. 

332 """ 

333 entity, key = popargs('entity', 'key', kwargs) 

334 return EntityKey(entity, key, table=self.entity_keys) 

335 

336 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.'}, 

337 {'name': 'container', 'type': AbstractContainer, 

338 'doc': ('The Container/Data object that uses the key or ' 

339 'the object id for the Container/Data object that uses the key.')}, 

340 {'name': 'relative_path', 'type': str, 

341 'doc': ('The relative_path of the attribute of the object that uses '

342 'an external resource reference key. Use an empty string if not applicable.'), 

343 'default': ''}, 

344 {'name': 'field', 'type': str, 'default': '', 

345 'doc': ('The field of the compound data type using an external resource.')}, 

346 {'name': 'create', 'type': bool, 'default': True}) 

347 def _check_object_field(self, **kwargs): 

348 """ 

349 Check if a container, relative path, and field have been added. 

350 

351 The container can be either an object_id string or an AbstractContainer. 

352 

353 If the container, relative_path, and field have not been added, add them 

354 and return the corresponding Object. Otherwise, just return the Object. 

355 """ 

356 file = kwargs['file'] 

357 container = kwargs['container'] 

358 relative_path = kwargs['relative_path'] 

359 field = kwargs['field'] 

360 create = kwargs['create'] 

361 file_object_id = file.object_id 

362 files_idx = self.files.which(file_object_id=file_object_id) 

363 

364 if len(files_idx) > 1: 

365 raise ValueError("Found multiple instances of the same file.") 

366 elif len(files_idx) == 1: 

367 files_idx = files_idx[0] 

368 else: 

369 self._add_file(file_object_id) 

370 files_idx = self.files.which(file_object_id=file_object_id)[0] 

371 

372 objecttable_idx = self.objects.which(object_id=container.object_id) 

373 

374 if len(objecttable_idx) > 0: 

375 relative_path_idx = self.objects.which(relative_path=relative_path) 

376 field_idx = self.objects.which(field=field) 

377 objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx)) 

378 if len(objecttable_idx) == 1: 

379 return self.objects.row[objecttable_idx[0]] 

380 elif len(objecttable_idx) == 0 and create: 

381 return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) 

382 elif len(objecttable_idx) == 0 and not create: 

383 raise ValueError("Object not in Object Table.") 

384 else: 

385 raise ValueError("Found multiple instances of the same object id, relative path, " 

386 "and field in objects table.") 

387 

388 @docval({'name': 'container', 'type': (str, AbstractContainer), 

389 'doc': ('The Container/Data object that uses the key or ' 

390 'the object id for the Container/Data object that uses the key.')}) 

391 def _get_file_from_container(self, **kwargs): 

392 """ 

393 Method to retrieve a file associated with the container in the case a file is not provided. 

394 """ 

395 container = kwargs['container'] 

396 

397 if isinstance(container, HERDManager): 

398 file = container 

399 return file 

400 else: 

401 parent = container.parent 

402 if parent is not None: 

403 while parent is not None:  # coverage: branch 403 ↛ exit not taken (never left '_get_file_from_container' here; the condition on line 403 was never false)

404 if isinstance(parent, HERDManager): 

405 file = parent 

406 return file 

407 else: 

408 parent = parent.parent 

409 else: 

410 msg = 'Could not find file. Add container to the file.' 

411 raise ValueError(msg) 

412 

413 @docval({'name': 'objects', 'type': list, 

414 'doc': 'List of objects to check for TermSetWrapper within the fields.'}) 

415 def __check_termset_wrapper(self, **kwargs): 

416 """ 

417 Takes a list of objects and checks the fields for TermSetWrapper. 

418 

419 wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) 

420 :return: [wrapped_obj(object1, attribute_name1, wrapper1), ...] 

421 """ 

422 objects = kwargs['objects'] 

423 

424 ret = [] # list to be returned with the objects, attributes and corresponding termsets 

425 

426 for obj in objects: 

427 # Get all the fields, parse out the methods and internal variables 

428 obj_fields = [a for a in dir(obj) if not a.startswith('_') and not callable(getattr(obj, a))] 

429 for attribute in obj_fields: 

430 attr = getattr(obj, attribute) 

431 if isinstance(attr, TermSetWrapper): 

432 # Search objects that are wrapped 

433 wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) 

434 ret.append(wrapped_obj(obj, attribute, attr)) 

435 

436 return ret 

437 

438 @docval({'name': 'root_container', 'type': HERDManager, 

439 'doc': 'The root container or file containing objects with a TermSet.'}) 

440 def add_ref_term_set(self, **kwargs): 

441 """ 

442 Method to search through the root_container for all instances of TermSet. 

443 Currently, only datasets are supported. By using a TermSet, the data is validated

444 and can use the permissible values within the set to populate HERD. 

445 """ 

446 root_container = kwargs['root_container'] 

447 

448 all_objects = root_container.all_children() # list of child objects and the container itself 

449 

450 add_ref_items = self.__check_termset_wrapper(objects=all_objects) 

451 for ref in add_ref_items: 

452 container, attr_name, wrapper = ref 

453 if isinstance(wrapper.value, (list, np.ndarray, tuple)): 

454 values = wrapper.value 

455 else: 

456 # create list for single values (edge-case) for a simple iteration downstream 

457 values = [wrapper.value] 

458 for term in values: 

459 term_info = wrapper.termset[term] 

460 entity_id = term_info[0] 

461 entity_uri = term_info[2] 

462 self.add_ref(file=root_container, 

463 container=container, 

464 attribute=attr_name, 

465 key=term, 

466 entity_id=entity_id, 

467 entity_uri=entity_uri) 

468 
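A hedged sketch of the flow above, assuming `file` is a placeholder for a HERDManager container whose datasets hold TermSetWrapper-wrapped values (the `wrapper.value` / `wrapper.termset` attributes this method reads):

herd = HERD()
herd.add_ref_term_set(root_container=file)   # one add_ref() per wrapped term, using the TermSet's id and URI
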

469 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, 

470 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', 

471 'default': None}, 

472 {'name': 'container', 'type': (str, AbstractContainer), 'default': None, 

473 'doc': ('The Container/Data object that uses the key or ' 

474 'the object id for the Container/Data object that uses the key.')}, 

475 {'name': 'relative_path', 'type': str, 

476 'doc': ('The relative_path of the attribute of the object that uses '

477 'an external resource reference key. Use an empty string if not applicable.'), 

478 'default': ''}, 

479 {'name': 'field', 'type': str, 'default': '', 

480 'doc': ('The field of the compound data type using an external resource.')}) 

481 def get_key(self, **kwargs): 

482 """ 

483 Return a Key. 

484 

485 If container, relative_path, and field are provided, the Key that corresponds to the given name of the key 

486 for the given container, relative_path, and field is returned. 

487 """ 

488 key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) 

489 key_idx_matches = self.keys.which(key=key_name) 

490 

491 file = kwargs['file'] 

492 

493 if container is not None: 

494 if file is None: 

495 file = self._get_file_from_container(container=container) 

496 # if same key is used multiple times, determine 

497 # which instance based on the Container 

498 object_field = self._check_object_field(file=file, 

499 container=container, 

500 relative_path=relative_path, 

501 field=field) 

502 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

503 key_idx = self.object_keys['keys_idx', row_idx] 

504 if key_idx in key_idx_matches: 

505 return self.keys.row[key_idx] 

506 msg = "No key found with that container." 

507 raise ValueError(msg) 

508 else: 

509 if len(key_idx_matches) == 0: 

510 # the key has never been used before 

511 raise ValueError("key '%s' does not exist" % key_name) 

512 elif len(key_idx_matches) > 1: 

513 msg = "There is more than one key with that name. Please search with additional information."

514 raise ValueError(msg) 

515 else: 

516 return self.keys.row[key_idx_matches[0]] 

517 
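Lookup sketches for get_key; `herd`, `file`, and `data` are placeholders for a populated HERD, its file, and a container added via add_ref (see the sketch after add_ref below).

key = herd.get_key(key_name='Homo sapiens')        # key name used only once
key = herd.get_key(key_name='Homo sapiens',
                   file=file, container=data)      # disambiguate a reused key name
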

518 @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) 

519 def get_entity(self, **kwargs): 

520 entity_id = kwargs['entity_id'] 

521 entity = self.entities.which(entity_id=entity_id) 

522 if len(entity) > 0:

523 return self.entities.row[entity[0]] 

524 else: 

525 return None 

526 
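And the entity lookup, which returns None rather than raising when the id is unknown; the id below is illustrative.

entity = herd.get_entity(entity_id='NCBI_TAXON:9606')   # Entity row, or None if not present
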

527 @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, 

528 'doc': ('The Container/Data object that uses the key or ' 

529 'the object_id for the Container/Data object that uses the key.')}, 

530 {'name': 'attribute', 'type': str, 

531 'doc': 'The attribute of the container for the external reference.', 'default': None}, 

532 {'name': 'field', 'type': str, 'default': '', 

533 'doc': ('The field of the compound data type using an external resource.')}, 

534 {'name': 'key', 'type': (str, Key), 'default': None, 

535 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, 

536 {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'}, 

537 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None}, 

538 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', 

539 'default': None}, 

540 ) 

541 def add_ref(self, **kwargs): 

542 """ 

543 Add information about an external reference used in this file. 

544 

545 It is possible to use the same key name to refer to different resources so long as

546 the key name is not reused within the same object, relative_path, and field combination.

547 To reference an existing (container, relative_path, key) combination, pass the Key object rather than the key name.

548 """ 

549 ############################################################### 

550 container = kwargs['container'] 

551 attribute = kwargs['attribute'] 

552 if isinstance(container, Data): 

553 if attribute == 'data': 

554 attribute = None 

555 key = kwargs['key'] 

556 field = kwargs['field'] 

557 entity_id = kwargs['entity_id'] 

558 entity_uri = kwargs['entity_uri'] 

559 file = kwargs['file'] 

560 

561 if file is None: 

562 file = self._get_file_from_container(container=container) 

563 

564 if attribute is None: # Trivial Case 

565 relative_path = '' 

566 object_field = self._check_object_field(file=file, 

567 container=container, 

568 relative_path=relative_path, 

569 field=field) 

570 else: # DataType Attribute Case 

571 attribute_object = getattr(container, attribute) # returns attribute object 

572 if isinstance(attribute_object, AbstractContainer): 

573 relative_path = '' 

574 object_field = self._check_object_field(file=file, 

575 container=attribute_object, 

576 relative_path=relative_path, 

577 field=field) 

578 else: # Non-DataType Attribute Case: 

579 obj_mapper = self.type_map.get_map(container) 

580 spec = obj_mapper.get_attr_spec(attr_name=attribute) 

581 parent_spec = spec.parent # return the parent spec of the attribute 

582 if parent_spec.data_type is None: 

583 while parent_spec.data_type is None: 

584 parent_spec = parent_spec.parent # find the closest parent with a data_type 

585 parent_cls = self.type_map.get_dt_container_cls(data_type=parent_spec.data_type, autogen=False) 

586 if isinstance(container, parent_cls):  # coverage: branch 586 ↛ 596 not taken (the condition on line 586 was never false)

587 parent = container 

588 # We need to get the path of the spec for relative_path 

589 absolute_path = spec.path 

590 relative_path = absolute_path[absolute_path.find('/')+1:] 

591 object_field = self._check_object_field(file=file, 

592 container=parent, 

593 relative_path=relative_path, 

594 field=field) 

595 else: 

596 msg = 'Container not the nearest data_type' 

597 raise ValueError(msg) 

598 else: 

599 parent = container # container needs to be the parent 

600 absolute_path = spec.path 

601 relative_path = absolute_path[absolute_path.find('/')+1:] 

602 # this strips everything prior to the container from the absolute_path

603 object_field = self._check_object_field(file=file, 

604 container=parent, 

605 relative_path=relative_path, 

606 field=field) 

607 

608 if not isinstance(key, Key): 

609 key_idx_matches = self.keys.which(key=key) 

610 # if same key is used multiple times, determine 

611 # which instance based on the Container 

612 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

613 key_idx = self.object_keys['keys_idx', row_idx] 

614 if key_idx in key_idx_matches:  # coverage: branch 614 ↛ 615 not taken (the condition on line 614 was never true)

615 msg = "Use Key Object when referencing an existing (container, relative_path, key)" 

616 raise ValueError(msg) 

617 

618 key = self._add_key(key) 

619 self._add_object_key(object_field, key) 

620 

621 else: 

622 # Check to see that the existing key is being used with the object. 

623 # If true, do nothing. If false, create a new obj/key relationship 

624 # in the ObjectKeyTable 

625 key_idx = key.idx 

626 object_key_row_idx = self.object_keys.which(keys_idx=key_idx) 

627 if len(object_key_row_idx) != 0:

628 obj_key_check = False 

629 for row_idx in object_key_row_idx: 

630 obj_idx = self.object_keys['objects_idx', row_idx] 

631 if obj_idx == object_field.idx: 

632 obj_key_check = True 

633 if not obj_key_check: 

634 self._add_object_key(object_field, key) 

635 else: 

636 msg = "Cannot find key object. Create new Key with string." 

637 raise ValueError(msg) 

638 # check if the key and object have been related in the ObjectKeyTable 

639 

640 entity = self.get_entity(entity_id=entity_id) 

641 if entity is None: 

642 if entity_uri is None: 

643 msg = 'New entities must have an entity_uri.' 

644 raise ValueError(msg) 

645 entity = self._add_entity(entity_id, entity_uri) 

646 self._add_entity_key(entity, key) 

647 else: 

648 if entity_uri is not None: 

649 msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' 

650 raise ValueError(msg) 

651 # check for entity-key relationship in EntityKeyTable 

652 key_idx = key.idx 

653 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) 

654 if len(entity_key_row_idx) != 0:

655 # this means there exists rows where the key is in the EntityKeyTable 

656 entity_key_check = False 

657 for row_idx in entity_key_row_idx: 

658 entity_idx = self.entity_keys['entities_idx', row_idx] 

659 if entity_idx == entity.idx: 

660 entity_key_check = True 

661 # this means there is already a key-entity relationship recorded 

662 if not entity_key_check: 

663 # this means that though the key is there, there is not key-entity relationship 

664 # a.k.a add it now 

665 self._add_entity_key(entity, key) 

666 else: 

667 # this means that specific key is not in the EntityKeyTable, so add it and establish 

668 # the relationship with the entity 

669 self._add_entity_key(entity, key) 

670 return key, entity 

671 
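A usage sketch of the main entry point above; `file` stands for the HERDManager container (it can be omitted when `container` is already attached to one) and `data` for the Container/Data object that uses the key. The species key, id, and URI are illustrative values.

herd = HERD()
key, entity = herd.add_ref(
    file=file,        # placeholder: the HERDManager container
    container=data,   # placeholder: the object that uses the key
    key='Homo sapiens',
    entity_id='NCBI_TAXON:9606',
    entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606',
)
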

672 @docval({'name': 'object_type', 'type': str, 

673 'doc': 'The type of the object. This is also the parent in relative_path.'}, 

674 {'name': 'relative_path', 'type': str, 

675 'doc': ('The relative_path of the attribute of the object that uses '

676 'an external resource reference key. Use an empty string if not applicable.'), 

677 'default': ''}, 

678 {'name': 'field', 'type': str, 'default': '', 

679 'doc': ('The field of the compound data type using an external resource.')}, 

680 {'name': 'all_instances', 'type': bool, 'default': False, 

681 'doc': ('Whether to return a DataFrame with all instances of the object_type. '

682 'If True, relative_path and field inputs will be ignored.')}) 

683 def get_object_type(self, **kwargs): 

684 """ 

685 Get all entities/resources associated with an object_type. 

686 """ 

687 object_type = kwargs['object_type'] 

688 relative_path = kwargs['relative_path'] 

689 field = kwargs['field'] 

690 all_instances = kwargs['all_instances'] 

691 

692 df = self.to_dataframe() 

693 

694 if all_instances: 

695 df = df.loc[df['object_type'] == object_type] 

696 else: 

697 df = df.loc[(df['object_type'] == object_type) 

698 & (df['relative_path'] == relative_path) 

699 & (df['field'] == field)] 

700 return df 

701 
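A lookup sketch for get_object_type; 'Data' is an illustrative object_type value.

df = herd.get_object_type(object_type='Data')                      # matches relative_path='' and field=''
df = herd.get_object_type(object_type='Data', all_instances=True)  # ignore relative_path and field
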

702 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file.', 

703 'default': None}, 

704 {'name': 'container', 'type': (str, AbstractContainer), 

705 'doc': 'The Container/data object that is linked to resources/entities.'}, 

706 {'name': 'attribute', 'type': str, 

707 'doc': 'The attribute of the container for the external reference.', 'default': None}, 

708 {'name': 'relative_path', 'type': str, 

709 'doc': ('The relative_path of the attribute of the object that uses '

710 'an external resource reference key. Use an empty string if not applicable.'), 

711 'default': ''}, 

712 {'name': 'field', 'type': str, 'default': '', 

713 'doc': ('The field of the compound data type using an external resource.')}) 

714 def get_object_entities(self, **kwargs): 

715 """ 

716 Get all entities/resources associated with an object. 

717 """ 

718 file = kwargs['file'] 

719 container = kwargs['container'] 

720 attribute = kwargs['attribute'] 

721 relative_path = kwargs['relative_path'] 

722 field = kwargs['field'] 

723 

724 if file is None: 

725 file = self._get_file_from_container(container=container) 

726 

727 keys = [] 

728 entities = [] 

729 if attribute is None: 

730 object_field = self._check_object_field(file=file, 

731 container=container, 

732 relative_path=relative_path, 

733 field=field, 

734 create=False) 

735 else: 

736 object_field = self._check_object_field(file=file, 

737 container=container[attribute], 

738 relative_path=relative_path, 

739 field=field, 

740 create=False) 

741 # Find all keys associated with the object 

742 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

743 keys.append(self.object_keys['keys_idx', row_idx]) 

744 # Find all the entities/resources for each key. 

745 for key_idx in keys: 

746 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) 

747 for row_idx in entity_key_row_idx: 

748 entity_idx = self.entity_keys['entities_idx', row_idx] 

749 entities.append(self.entities.__getitem__(entity_idx)) 

750 df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri']) 

751 return df 

752 
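A lookup sketch for get_object_entities, reusing the `file` and `data` placeholders from the add_ref example:

df = herd.get_object_entities(file=file, container=data)   # DataFrame with entity_id / entity_uri columns
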

753 @docval({'name': 'use_categories', 'type': bool, 'default': False, 

754 'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'}, 

755 rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.') 

756 def to_dataframe(self, **kwargs): 

757 """ 

758 Convert the data from the keys, files, entities, objects, object_keys, and entity_keys tables

759 to a single joint dataframe. I.e., here data is being denormalized, e.g., keys that 

760 are used across multiple entities or objects will be duplicated across the corresponding

761 rows. 

762 

763 Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table. 

764 

765 """ 

766 use_categories = popargs('use_categories', kwargs) 

767 # Step 1: Combine the entities, keys, and entity_keys table 

768 ent_key_df = self.entity_keys.to_dataframe() 

769 entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True) 

770 keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True) 

771 ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df], 

772 axis=1, 

773 verify_integrity=False) 

774 # Step 2: Combine the files, object_keys and objects tables

775 object_keys_df = self.object_keys.to_dataframe() 

776 objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True) 

777 object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df], 

778 axis=1, 

779 verify_integrity=False) 

780 files_df = self.files.to_dataframe().iloc[object_keys_df['files_idx']].reset_index(drop=True) 

781 file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df], 

782 axis=1, 

783 verify_integrity=False) 

784 # Step 3: merge the combined entities_df and object_keys_df DataFrames 

785 result_df = pd.concat( 

786 # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables 

787 objs=[pd.merge( 

788 # Find all entities that correspond to the row i of the object_keys_table 

789 ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), 

790 # Get a DataFrame for row i of the objects_keys_table 

791 file_object_object_key_df.iloc[[i, ]], 

792 # Merge the entities and object_keys on the keys_idx column so that the values from the single 

793 # object_keys_table row are copied across all corresponding rows in the entities table 

794 on='keys_idx') 

795 for i in range(len(object_keys_df))], 

796 # Concatenate the rows of the objs 

797 axis=0, 

798 verify_integrity=False) 

799 # Step 4: Clean up the index and sort columns by table type and name 

800 result_df.reset_index(inplace=True, drop=True) 

801 # ADD files 

802 file_id_col = [] 

803 for idx in result_df['files_idx']: 

804 file_id_val = self.files.to_dataframe().iloc[int(idx)]['file_object_id'] 

805 file_id_col.append(file_id_val) 

806 

807 result_df['file_object_id'] = file_id_col 

808 column_labels = [('files', 'file_object_id'), 

809 ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'), 

810 ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'), 

811 ('keys', 'keys_idx'), ('keys', 'key'), 

812 ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')] 

813 # sort the columns based on our custom order 

814 result_df = result_df.reindex(labels=[c[1] for c in column_labels], 

815 axis=1) 

816 result_df = result_df.astype({'keys_idx': 'uint32', 

817 'objects_idx': 'uint32', 

818 'files_idx': 'uint32', 

819 'entities_idx': 'uint32'}) 

820 # Add the categories if requested 

821 if use_categories:  # coverage: branch 821 ↛ 822 not taken (the condition on line 821 was never true)

822 result_df.columns = pd.MultiIndex.from_tuples(column_labels) 

823 # return the result 

824 return result_df 

825 
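A usage sketch for the denormalized export above:

flat = herd.to_dataframe()                      # one row per (object, key, entity) combination
flat = herd.to_dataframe(use_categories=True)   # same data with MultiIndex columns grouped by table
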

826 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) 

827 def to_zip(self, **kwargs): 

828 """ 

829 Write the tables in HERD to zipped tsv files. 

830 """ 

831 zip_file = kwargs['path'] 

832 directory = os.path.dirname(zip_file) 

833 

834 files = [os.path.join(directory, child.name)+'.tsv' for child in self.children] 

835 for i in range(len(self.children)): 

836 df = self.children[i].to_dataframe() 

837 df.to_csv(files[i], sep='\t', index=False) 

838 

839 with zipfile.ZipFile(zip_file, 'w') as zipF: 

840 for file in files: 

841 zipF.write(file) 

842 

843 # remove tsv files 

844 for file in files: 

845 os.remove(file) 

846 
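A round-trip sketch for the zip export above; the path is illustrative.

herd.to_zip(path='./HERD.zip')   # writes one .tsv per table, zips them, then deletes the .tsv files
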

847 @classmethod 

848 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) 

849 def from_zip(cls, **kwargs): 

850 """ 

851 Method to read in zipped tsv files to populate HERD. 

852 """ 

853 zip_file = kwargs['path'] 

854 directory = os.path.dirname(zip_file) 

855 

856 with zipfile.ZipFile(zip_file, 'r') as zf:

857 zf.extractall(directory)

858 tsv_paths = glob(directory+'/*') 

859 

860 for file in tsv_paths: 

861 file_name = os.path.basename(file) 

862 if file_name == 'files.tsv': 

863 files_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

864 files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False) 

865 os.remove(file) 

866 continue 

867 if file_name == 'keys.tsv': 

868 keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

869 keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False) 

870 os.remove(file) 

871 continue 

872 if file_name == 'entities.tsv': 

873 entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

874 entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False) 

875 os.remove(file) 

876 continue 

877 if file_name == 'objects.tsv': 

878 objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

879 objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False) 

880 os.remove(file) 

881 continue 

882 if file_name == 'object_keys.tsv': 

883 object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

884 object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False) 

885 os.remove(file) 

886 continue 

887 if file_name == 'entity_keys.tsv': 

888 ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

889 entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False) 

890 os.remove(file) 

891 continue 

892 

893 # we need to check the idx columns in entities, objects, and object_keys 

894 entity_idx = entity_keys['entities_idx'] 

895 for idx in entity_idx: 

896 if not int(idx) < len(entities): 

897 msg = "Entity Index out of range in EntityTable. Please check for alterations." 

898 raise ValueError(msg) 

899 

900 files_idx = objects['files_idx'] 

901 for idx in files_idx: 

902 if not int(idx) < len(files): 

903 msg = "File_ID Index out of range in ObjectTable. Please check for alterations." 

904 raise ValueError(msg) 

905 

906 object_idx = object_keys['objects_idx'] 

907 for idx in object_idx: 

908 if not int(idx) < len(objects): 

909 msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." 

910 raise ValueError(msg) 

911 

912 keys_idx = object_keys['keys_idx'] 

913 for idx in keys_idx: 

914 if not int(idx) < len(keys): 

915 msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." 

916 raise ValueError(msg) 

917 

918 keys_idx = entity_keys['keys_idx'] 

919 for idx in keys_idx: 

920 if not int(idx) < len(keys): 

921 msg = "Key Index out of range in EntityKeyTable. Please check for alterations." 

922 raise ValueError(msg) 

923 

924 

925 er = HERD(files=files, 

926 keys=keys, 

927 entities=entities, 

928 entity_keys=entity_keys, 

929 objects=objects, 

930 object_keys=object_keys) 

931 return er
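
And the matching import, which extracts the tsv files, checks the index columns, and rebuilds the tables; the path is illustrative.

herd = HERD.from_zip(path='./HERD.zip')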