Coverage for src/hdmf/common/resources.py: 98% (416 statements)

coverage.py v7.2.5, created at 2023-08-18 20:49 +0000

1import pandas as pd 

2import numpy as np 

3from . import register_class, EXP_NAMESPACE 

4from . import get_type_map 

5from ..container import Table, Row, Container, AbstractContainer, HERDManager 

6from ..utils import docval, popargs, AllowPositional 

7from ..build import TypeMap 

8from glob import glob 

9import os 

10import zipfile 

11 

12 

13class KeyTable(Table): 

14 """ 

15 A table for storing keys used to reference external resources. 

16 """ 

17 

18 __defaultname__ = 'keys' 

19 

20 __columns__ = ( 

21 {'name': 'key', 'type': str, 

22 'doc': 'The user key that maps to the resource term / registry symbol.'}, 

23 ) 

24 

25 

26class Key(Row): 

27 """ 

28 A Row class for representing rows in the KeyTable. 

29 """ 

30 

31 __table__ = KeyTable 

32 

33 

34class EntityTable(Table): 

35 """ 

36 A table for storing the external resources a key refers to. 

37 """ 

38 

39 __defaultname__ = 'entities' 

40 

41 __columns__ = ( 

42 {'name': 'entity_id', 'type': str, 

43 'doc': 'The unique ID for the resource term / registry symbol.'}, 

44 {'name': 'entity_uri', 'type': str, 

45 'doc': 'The URI for the resource term / registry symbol.'}, 

46 ) 

47 

48 

49class Entity(Row): 

50 """ 

51 A Row class for representing rows in the EntityTable. 

52 """ 

53 

54 __table__ = EntityTable 

55 

56 

57class FileTable(Table): 

58 """ 

59 A table for storing file ids used in external resources. 

60 """ 

61 

62 __defaultname__ = 'files' 

63 

64 __columns__ = ( 

65 {'name': 'file_object_id', 'type': str, 

66 'doc': 'The file id of the file that contains the object'}, 

67 ) 

68 

69 

70class File(Row): 

71 """ 

72 A Row class for representing rows in the FileTable. 

73 """ 

74 

75 __table__ = FileTable 

76 

77 

78class ObjectTable(Table): 

79 """ 

80 A table for storing objects (i.e. Containers) that contain keys that refer to external resources. 

81 """ 

82 

83 __defaultname__ = 'objects' 

84 

85 __columns__ = ( 

86 {'name': 'files_idx', 'type': int, 

87 'doc': 'The row idx for the file_object_id in FileTable containing the object.'}, 

88 {'name': 'object_id', 'type': str, 

89 'doc': 'The object ID for the Container/Data.'}, 

90 {'name': 'object_type', 'type': str, 

91 'doc': 'The type of the object. This is also the parent in relative_path.'}, 

92 {'name': 'relative_path', 'type': str, 

93 'doc': ('The relative_path of the attribute of the object that uses ' 

94 'an external resource reference key. Use an empty string if not applicable.')}, 

95 {'name': 'field', 'type': str, 

96 'doc': ('The field of the compound data type using an external resource. ' 

97 'Use an empty string if not applicable.')} 

98 ) 

99 

100 

101class Object(Row): 

102 """ 

103 A Row class for representing rows in the ObjectTable. 

104 """ 

105 

106 __table__ = ObjectTable 

107 

108 

109class ObjectKeyTable(Table): 

110 """ 

111 A table for identifying which keys are used by which objects for referring to external resources. 

112 """ 

113 

114 __defaultname__ = 'object_keys' 

115 

116 __columns__ = ( 

117 {'name': 'objects_idx', 'type': (int, Object), 

118 'doc': 'The index into the objects table for the Object that uses the Key.'}, 

119 {'name': 'keys_idx', 'type': (int, Key), 

120 'doc': 'The index into the keys table that is used to make an external resource reference.'} 

121 ) 

122 

123 

124class EntityKeyTable(Table): 

125 """ 

126 A table for identifying which entities are used by which keys for referring to external resources. 

127 """ 

128 

129 __defaultname__ = 'entity_keys' 

130 

131 __columns__ = ( 

132 {'name': 'entities_idx', 'type': (int, Entity), 

133 'doc': 'The index into the EntityTable for the Entity that is associated with the Key.'}, 

134 {'name': 'keys_idx', 'type': (int, Key), 

135 'doc': 'The index into the KeyTable that is used to make an external resource reference.'} 

136 ) 

137 

138 

139class EntityKey(Row): 

140 """ 

141 A Row class for representing rows in the EntityKeyTable. 

142 """ 

143 

144 __table__ = EntityKeyTable 

145 

146 

147class ObjectKey(Row): 

148 """ 

149 A Row class for representing rows in the ObjectKeyTable. 

150 """ 

151 

152 __table__ = ObjectKeyTable 

153 

154 

155@register_class('HERD', EXP_NAMESPACE) 

156class HERD(Container): 

157 """ 

158 HDMF External Resources Data Structure. 

159 A table for mapping user terms (i.e. keys) to resource entities. 

160 """ 

161 

162 __fields__ = ( 

163 {'name': 'keys', 'child': True}, 

164 {'name': 'files', 'child': True}, 

165 {'name': 'objects', 'child': True}, 

166 {'name': 'object_keys', 'child': True}, 

167 {'name': 'entity_keys', 'child': True}, 

168 {'name': 'entities', 'child': True}, 

169 ) 

170 

171 @docval({'name': 'keys', 'type': KeyTable, 'default': None, 

172 'doc': 'The table storing user keys for referencing resources.'}, 

173 {'name': 'files', 'type': FileTable, 'default': None, 

174 'doc': 'The table for storing file ids used in external resources.'}, 

175 {'name': 'entities', 'type': EntityTable, 'default': None, 

176 'doc': 'The table storing entity information.'}, 

177 {'name': 'objects', 'type': ObjectTable, 'default': None, 

178 'doc': 'The table storing object information.'}, 

179 {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None, 

180 'doc': 'The table storing object-key relationships.'}, 

181 {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None, 

182 'doc': 'The table storing entity-key relationships.'}, 

183 {'name': 'type_map', 'type': TypeMap, 'default': None, 

184 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'}, 

185 allow_positional=AllowPositional.WARNING) 

186 def __init__(self, **kwargs): 

187 name = 'external_resources' 

188 super().__init__(name) 

189 self.keys = kwargs['keys'] or KeyTable() 

190 self.files = kwargs['files'] or FileTable() 

191 self.entities = kwargs['entities'] or EntityTable() 

192 self.objects = kwargs['objects'] or ObjectTable() 

193 self.object_keys = kwargs['object_keys'] or ObjectKeyTable() 

194 self.entity_keys = kwargs['entity_keys'] or EntityKeyTable() 

195 self.type_map = kwargs['type_map'] or get_type_map() 

196 
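A minimal construction sketch, assuming the import path matches the module path shown in the coverage header:

    from hdmf.common.resources import HERD

    herd = HERD()                # every table argument defaults to a new, empty table
    assert len(herd.keys) == 0   # the KeyTable starts empty; the same holds for the other five tables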

197 @staticmethod 

198 def assert_external_resources_equal(left, right, check_dtype=True): 

199 """ 

200 Compare that the keys, files, entities, objects, and object_keys tables match. 

201 

202 :param left: HERD object to compare with right 

203 :param right: HERD object to compare with left 

204 :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different 

205 for example for ids, where depending on how the data was saved 

206 ids may change from int64 to int32. (Default: True) 

207 :returns: The function returns True if all values match. If mismatches are found, 

208 AssertionError will be raised. 

209 :raises AssertionError: Raised if any differences are found. The function collects 

210 all differences into a single error so that the assertion will indicate 

211 all found differences. 

212 """ 

213 errors = [] 

214 try: 

215 pd.testing.assert_frame_equal(left.keys.to_dataframe(), 

216 right.keys.to_dataframe(), 

217 check_dtype=check_dtype) 

218 except AssertionError as e: 

219 errors.append(e) 

220 try: 

221 pd.testing.assert_frame_equal(left.files.to_dataframe(), 

222 right.files.to_dataframe(), 

223 check_dtype=check_dtype) 

224 except AssertionError as e: 

225 errors.append(e) 

226 try: 

227 pd.testing.assert_frame_equal(left.objects.to_dataframe(), 

228 right.objects.to_dataframe(), 

229 check_dtype=check_dtype) 

230 except AssertionError as e: 

231 errors.append(e) 

232 try: 

233 pd.testing.assert_frame_equal(left.entities.to_dataframe(), 

234 right.entities.to_dataframe(), 

235 check_dtype=check_dtype) 

236 except AssertionError as e: 

237 errors.append(e) 

238 try: 

239 pd.testing.assert_frame_equal(left.object_keys.to_dataframe(), 

240 right.object_keys.to_dataframe(), 

241 check_dtype=check_dtype) 

242 except AssertionError as e: 

243 errors.append(e) 

244 if len(errors) > 0: 

245 msg = ''.join(str(e)+"\n\n" for e in errors) 

246 raise AssertionError(msg) 

247 return True 

248 
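A short sketch of how this comparison helper might be called; both HERD instances here are empty, so all tables trivially match:

    left, right = HERD(), HERD()
    # Returns True when the keys, files, objects, entities, and object_keys tables
    # all match; check_dtype=False tolerates e.g. int64 vs. int32 id columns.
    HERD.assert_external_resources_equal(left, right, check_dtype=False)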

249 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'}) 

250 def _add_key(self, **kwargs): 

251 """ 

252 Add a key to be used for making references to external resources. 

253 

254 It is possible to use the same *key_name* to refer to different resources so long as the *key_name* is not 

255 used within the same object, relative_path, and field. To do so, this method must be called for the 

256 two different resources. 

257 

258 The returned Key objects must be managed by the caller so as to be appropriately passed to subsequent calls 

259 to methods for storing information about the different resources. 

260 """ 

261 key = kwargs['key_name'] 

262 return Key(key, table=self.keys) 

263 

264 @docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'}) 

265 def _add_file(self, **kwargs): 

266 """ 

267 Add a file to be used for making references to external resources. 

268 

269 This is optional when working in HDMF. 

270 """ 

271 file_object_id = kwargs['file_object_id'] 

272 return File(file_object_id, table=self.files) 

273 

274 @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, 

275 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'}) 

276 def _add_entity(self, **kwargs): 

277 """ 

278 Add an entity that will be referenced using keys specified in HERD.entity_keys. 

279 """ 

280 entity_id = kwargs['entity_id'] 

281 entity_uri = kwargs['entity_uri'] 

282 entity = Entity(entity_id, entity_uri, table=self.entities) 

283 return entity 

284 

285 @docval({'name': 'container', 'type': (str, AbstractContainer), 

286 'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'}, 

287 {'name': 'files_idx', 'type': int, 

288 'doc': 'The file_object_id row idx.'}, 

289 {'name': 'object_type', 'type': str, 'default': None, 

290 'doc': ('The type of the object. This is also the parent in relative_path. If omitted, ' 

291 'the name of the container class is used.')}, 

292 {'name': 'relative_path', 'type': str, 

293 'doc': ('The relative_path of the attribute of the object that uses ' 

294 'an external resource reference key. Use an empty string if not applicable.')}, 

295 {'name': 'field', 'type': str, 'default': '', 

296 'doc': ('The field of the compound data type using an external resource.')}) 

297 def _add_object(self, **kwargs): 

298 """ 

299 Add an object that references an external resource. 

300 """ 

301 files_idx, container, object_type, relative_path, field = popargs('files_idx', 

302 'container', 

303 'object_type', 

304 'relative_path', 

305 'field', kwargs) 

306 

307 if object_type is None:  # coverage: 307 ↛ 310 (condition was never false)

308 object_type = container.__class__.__name__ 

309 

310 if isinstance(container, AbstractContainer):  # coverage: 310 ↛ 312 (condition was never false)

311 container = container.object_id 

312 obj = Object(files_idx, container, object_type, relative_path, field, table=self.objects) 

313 return obj 

314 

315 @docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'}, 

316 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'}) 

317 def _add_object_key(self, **kwargs): 

318 """ 

319 Specify that an object (i.e. container and relative_path) uses a key to reference 

320 an external resource. 

321 """ 

322 obj, key = popargs('obj', 'key', kwargs) 

323 return ObjectKey(obj, key, table=self.object_keys) 

324 

325 @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'}, 

326 {'name': 'key', 'type': (int, Key), 'doc': 'The Key that is connected to the Entity.'}) 

327 def _add_entity_key(self, **kwargs): 

328 """ 

329 Add entity-key relationship to the EntityKeyTable. 

330 """ 

331 entity, key = popargs('entity', 'key', kwargs) 

332 return EntityKey(entity, key, table=self.entity_keys) 

333 

334 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.'}, 

335 {'name': 'container', 'type': AbstractContainer, 

336 'doc': ('The Container/Data object that uses the key or ' 

337 'the object id for the Container/Data object that uses the key.')}, 

338 {'name': 'relative_path', 'type': str, 

339 'doc': ('The relative_path of the attribute of the object that uses ' 

340 'an external resource reference key. Use an empty string if not applicable.'), 

341 'default': ''}, 

342 {'name': 'field', 'type': str, 'default': '', 

343 'doc': ('The field of the compound data type using an external resource.')}, 

344 {'name': 'create', 'type': bool, 'default': True}) 

345 def _check_object_field(self, **kwargs): 

346 """ 

347 Check if a container, relative path, and field have been added. 

348 

349 The container can be either an object_id string or an AbstractContainer. 

350 

351 If the container, relative_path, and field have not been added, add them 

352 and return the corresponding Object. Otherwise, just return the Object. 

353 """ 

354 file = kwargs['file'] 

355 container = kwargs['container'] 

356 relative_path = kwargs['relative_path'] 

357 field = kwargs['field'] 

358 create = kwargs['create'] 

359 file_object_id = file.object_id 

360 files_idx = self.files.which(file_object_id=file_object_id) 

361 

362 if len(files_idx) > 1: 

363 raise ValueError("Found multiple instances of the same file.") 

364 elif len(files_idx) == 1: 

365 files_idx = files_idx[0] 

366 else: 

367 self._add_file(file_object_id) 

368 files_idx = self.files.which(file_object_id=file_object_id)[0] 

369 

370 objecttable_idx = self.objects.which(object_id=container.object_id) 

371 

372 if len(objecttable_idx) > 0: 

373 relative_path_idx = self.objects.which(relative_path=relative_path) 

374 field_idx = self.objects.which(field=field) 

375 objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx)) 

376 if len(objecttable_idx) == 1: 

377 return self.objects.row[objecttable_idx[0]] 

378 elif len(objecttable_idx) == 0 and create: 

379 return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) 

380 elif len(objecttable_idx) == 0 and not create: 

381 raise ValueError("Object not in Object Table.") 

382 else: 

383 raise ValueError("Found multiple instances of the same object id, relative path, " 

384 "and field in objects table.") 

385 

386 @docval({'name': 'container', 'type': (str, AbstractContainer), 

387 'doc': ('The Container/Data object that uses the key or ' 

388 'the object id for the Container/Data object that uses the key.')}) 

389 def _get_file_from_container(self, **kwargs): 

390 """ 

391 Method to retrieve a file associated with the container in the case a file is not provided. 

392 """ 

393 container = kwargs['container'] 

394 

395 if isinstance(container, HERDManager): 

396 file = container 

397 return file 

398 else: 

399 parent = container.parent 

400 if parent is not None: 

401 while parent is not None:  # coverage: 401 ↛ exit (condition was never false)

402 if isinstance(parent, HERDManager): 

403 file = parent 

404 return file 

405 else: 

406 parent = parent.parent 

407 else: 

408 msg = 'Could not find file. Add container to the file.' 

409 raise ValueError(msg) 

410 

411 @docval({'name': 'root_container', 'type': HERDManager, 

412 'doc': 'The root container or file containing objects with a TermSet.'}) 

413 def add_ref_term_set(self, **kwargs): 

414 """ 

415 Method to search through the root_container for all instances of TermSet. 

417 Currently, only datasets are supported. Because the data is validated against a TermSet, 

418 the permissible values within the set can be used to populate HERD. 

418 """ 

419 root_container = kwargs['root_container'] 

420 

421 all_children = root_container.all_objects # dictionary of objects with the IDs as keys 

422 

423 for child in all_children: 

424 try: 

425 term_set = all_children[child].term_set 

426 data = all_children[child].data # TODO: This will be expanded to not just support data 

427 except AttributeError: 

428 continue 

429 

430 if term_set is not None: 

431 for term in data: 

432 term_info = term_set[term] 

433 entity_id = term_info[0] 

434 entity_uri = term_info[2] 

435 self.add_ref(file=root_container, 

436 container=all_children[child], 

437 key=term, 

438 entity_id=entity_id, 

439 entity_uri=entity_uri) 

440 
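A hedged sketch of driving this method; it assumes `file` is an HERDManager (e.g. the root file object) whose child datasets were created with a TermSet attached, so that each exposes `term_set` and `data`:

    herd = HERD()
    # Every term in a TermSet-validated dataset below `file` is added via add_ref(),
    # using the TermSet's entity id and URI for that term.
    herd.add_ref_term_set(root_container=file)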

441 @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, 

442 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', 

443 'default': None}, 

444 {'name': 'container', 'type': (str, AbstractContainer), 'default': None, 

445 'doc': ('The Container/Data object that uses the key or ' 

446 'the object id for the Container/Data object that uses the key.')}, 

447 {'name': 'relative_path', 'type': str, 

448 'doc': ('The relative_path of the attribute of the object that uses ' 

449 'an external resource reference key. Use an empty string if not applicable.'), 

450 'default': ''}, 

451 {'name': 'field', 'type': str, 'default': '', 

452 'doc': ('The field of the compound data type using an external resource.')}) 

453 def get_key(self, **kwargs): 

454 """ 

455 Return a Key. 

456 

457 If container, relative_path, and field are provided, the Key that corresponds to the given name of the key 

458 for the given container, relative_path, and field is returned. 

459 """ 

460 key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) 

461 key_idx_matches = self.keys.which(key=key_name) 

462 

463 file = kwargs['file'] 

464 

465 if container is not None: 

466 if file is None: 

467 file = self._get_file_from_container(container=container) 

468 # if same key is used multiple times, determine 

469 # which instance based on the Container 

470 object_field = self._check_object_field(file=file, 

471 container=container, 

472 relative_path=relative_path, 

473 field=field) 

474 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

475 key_idx = self.object_keys['keys_idx', row_idx] 

476 if key_idx in key_idx_matches: 

477 return self.keys.row[key_idx] 

478 msg = "No key found with that container." 

479 raise ValueError(msg) 

480 else: 

481 if len(key_idx_matches) == 0: 

482 # the key has never been used before 

483 raise ValueError("key '%s' does not exist" % key_name) 

484 elif len(key_idx_matches) > 1: 

485 msg = "There is more than one key with that name. Please search with additional information." 

486 raise ValueError(msg) 

487 else: 

488 return self.keys.row[key_idx_matches[0]] 

489 
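A usage sketch, assuming `herd` already holds references added with add_ref() and that `file` (an HERDManager) and `container` are the objects used there:

    # Unambiguous lookup by name (raises ValueError if the name was never added
    # or is used by more than one object):
    key = herd.get_key(key_name='Homo sapiens')
    # Disambiguate a reused key name by the object that uses it:
    key = herd.get_key(key_name='Homo sapiens', file=file, container=container)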

490 @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) 

491 def get_entity(self, **kwargs): 

492 entity_id = kwargs['entity_id'] 

493 entity = self.entities.which(entity_id=entity_id) 

494 if len(entity)>0: 

495 return self.entities.row[entity[0]] 

496 else: 

497 return None 

498 

499 @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, 

500 'doc': ('The Container/Data object that uses the key or ' 

501 'the object_id for the Container/Data object that uses the key.')}, 

502 {'name': 'attribute', 'type': str, 

503 'doc': 'The attribute of the container for the external reference.', 'default': None}, 

504 {'name': 'field', 'type': str, 'default': '', 

505 'doc': ('The field of the compound data type using an external resource.')}, 

506 {'name': 'key', 'type': (str, Key), 'default': None, 

507 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, 

508 {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'}, 

509 {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None}, 

510 {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', 

511 'default': None}, 

512 ) 

513 def add_ref(self, **kwargs): 

514 """ 

515 Add information about an external reference used in this file. 

516 

517 It is possible to use the same name of the key to refer to different resources 

518 so long as the name of the key is not used within the same object, relative_path, and 

519 field combination. This method does not support such functionality by default. 

520 """ 

521 ############################################################### 

522 container = kwargs['container'] 

523 attribute = kwargs['attribute'] 

524 key = kwargs['key'] 

525 field = kwargs['field'] 

526 entity_id = kwargs['entity_id'] 

527 entity_uri = kwargs['entity_uri'] 

528 file = kwargs['file'] 

529 

530 if file is None: 

531 file = self._get_file_from_container(container=container) 

532 

533 if attribute is None: # Trivial Case 

534 relative_path = '' 

535 object_field = self._check_object_field(file=file, 

536 container=container, 

537 relative_path=relative_path, 

538 field=field) 

539 else: # DataType Attribute Case 

540 attribute_object = getattr(container, attribute) # returns attribute object 

541 if isinstance(attribute_object, AbstractContainer): 

542 relative_path = '' 

543 object_field = self._check_object_field(file=file, 

544 container=attribute_object, 

545 relative_path=relative_path, 

546 field=field) 

547 else: # Non-DataType Attribute Case: 

548 obj_mapper = self.type_map.get_map(container) 

549 spec = obj_mapper.get_attr_spec(attr_name=attribute) 

550 parent_spec = spec.parent # return the parent spec of the attribute 

551 if parent_spec.data_type is None: 

552 while parent_spec.data_type is None: 

553 parent_spec = parent_spec.parent # find the closest parent with a data_type 

554 parent_cls = self.type_map.get_dt_container_cls(data_type=parent_spec.data_type, autogen=False) 

555 if isinstance(container, parent_cls):  # coverage: 555 ↛ 565 (condition was never false)

556 parent = container 

557 # We need to get the path of the spec for relative_path 

558 absolute_path = spec.path 

559 relative_path = absolute_path[absolute_path.find('/')+1:] 

560 object_field = self._check_object_field(file=file, 

561 container=parent, 

562 relative_path=relative_path, 

563 field=field) 

564 else: 

565 msg = 'Container not the nearest data_type' 

566 raise ValueError(msg) 

567 else: 

568 parent = container # container needs to be the parent 

569 absolute_path = spec.path 

570 relative_path = absolute_path[absolute_path.find('/')+1:] 

571 # this slice removes everything prior to the container on the absolute_path 

572 object_field = self._check_object_field(file=file, 

573 container=parent, 

574 relative_path=relative_path, 

575 field=field) 

576 

577 if not isinstance(key, Key): 

578 key_idx_matches = self.keys.which(key=key) 

579 # if same key is used multiple times, determine 

580 # which instance based on the Container 

581 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

582 key_idx = self.object_keys['keys_idx', row_idx] 

583 if key_idx in key_idx_matches:  # coverage: 583 ↛ 584 (condition was never true)

584 msg = "Use Key Object when referencing an existing (container, relative_path, key)" 

585 raise ValueError(msg) 

586 

587 key = self._add_key(key) 

588 self._add_object_key(object_field, key) 

589 

590 else: 

591 # Check to see that the existing key is being used with the object. 

592 # If true, do nothing. If false, create a new obj/key relationship 

593 # in the ObjectKeyTable 

594 key_idx = key.idx 

595 object_key_row_idx = self.object_keys.which(keys_idx=key_idx) 

596 if len(object_key_row_idx)!=0: 

597 obj_key_check = False 

598 for row_idx in object_key_row_idx: 

599 obj_idx = self.object_keys['objects_idx', row_idx] 

600 if obj_idx == object_field.idx: 

601 obj_key_check = True 

602 if not obj_key_check: 

603 self._add_object_key(object_field, key) 

604 else: 

605 msg = "Cannot find key object. Create new Key with string." 

606 raise ValueError(msg) 

607 # check if the key and object have been related in the ObjectKeyTable 

608 

609 entity = self.get_entity(entity_id=entity_id) 

610 if entity is None: 

611 if entity_uri is None: 

612 msg = 'New entities must have an entity_uri.' 

613 raise ValueError(msg) 

614 entity = self._add_entity(entity_id, entity_uri) 

615 self._add_entity_key(entity, key) 

616 else: 

617 if entity_uri is not None: 

618 msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' 

619 raise ValueError(msg) 

620 # check for entity-key relationship in EntityKeyTable 

621 key_idx = key.idx 

622 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) 

623 if len(entity_key_row_idx)!=0: 

624 # this means there exists rows where the key is in the EntityKeyTable 

625 entity_key_check = False 

626 for row_idx in entity_key_row_idx: 

627 entity_idx = self.entity_keys['entities_idx', row_idx] 

628 if entity_idx == entity.idx: 

629 entity_key_check = True 

630 # this means there is already a key-entity relationship recorded 

631 if not entity_key_check: 

632 # this means that though the key is there, there is no key-entity relationship 

633 # yet, so add it now 

634 self._add_entity_key(entity, key) 

635 else: 

636 # this means that specific key is not in the EntityKeyTable, so add it and establish 

637 # the relationship with the entity 

638 self._add_entity_key(entity, key) 

639 return key, entity 

640 
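A minimal sketch of the common path through add_ref(), assuming `herd` is a HERD instance, `file` an HERDManager, and `container` and `other_container` are AbstractContainers stored in that file; the key name, entity id, and URI are illustrative:

    key, entity = herd.add_ref(
        file=file,
        container=container,
        key='Homo sapiens',                                        # a new key name
        entity_id='NCBI_TAXON:9606',                               # a new entity
        entity_uri='https://www.ncbi.nlm.nih.gov/taxonomy/9606',
    )
    # Link a second object to the same entity: pass the returned Key object and
    # omit entity_uri, since reusing an existing entity forbids supplying a URI.
    herd.add_ref(file=file, container=other_container, key=key,
                 entity_id='NCBI_TAXON:9606')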

641 @docval({'name': 'object_type', 'type': str, 

642 'doc': 'The type of the object. This is also the parent in relative_path.'}, 

643 {'name': 'relative_path', 'type': str, 

644 'doc': ('The relative_path of the attribute of the object that uses ' 

645 'an external resource reference key. Use an empty string if not applicable.'), 

646 'default': ''}, 

647 {'name': 'field', 'type': str, 'default': '', 

648 'doc': ('The field of the compound data type using an external resource.')}, 

649 {'name': 'all_instances', 'type': bool, 'default': False, 

650 'doc': ('The bool to return a dataframe with all instances of the object_type. ' 

651 'If True, relative_path and field inputs will be ignored.')}) 

652 def get_object_type(self, **kwargs): 

653 """ 

654 Get all entities/resources associated with an object_type. 

655 """ 

656 object_type = kwargs['object_type'] 

657 relative_path = kwargs['relative_path'] 

658 field = kwargs['field'] 

659 all_instances = kwargs['all_instances'] 

660 

661 df = self.to_dataframe() 

662 

663 if all_instances: 

664 df = df.loc[df['object_type'] == object_type] 

665 else: 

666 df = df.loc[(df['object_type'] == object_type) 

667 & (df['relative_path'] == relative_path) 

668 & (df['field'] == field)] 

669 return df 

670 

671 @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file.', 

672 'default': None}, 

673 {'name': 'container', 'type': (str, AbstractContainer), 

674 'doc': 'The Container/data object that is linked to resources/entities.'}, 

675 {'name': 'attribute', 'type': str, 

676 'doc': 'The attribute of the container for the external reference.', 'default': None}, 

677 {'name': 'relative_path', 'type': str, 

678 'doc': ('The relative_path of the attribute of the object that uses ' 

679 'an external resource reference key. Use an empty string if not applicable.'), 

680 'default': ''}, 

681 {'name': 'field', 'type': str, 'default': '', 

682 'doc': ('The field of the compound data type using an external resource.')}) 

683 def get_object_entities(self, **kwargs): 

684 """ 

685 Get all entities/resources associated with an object. 

686 """ 

687 file = kwargs['file'] 

688 container = kwargs['container'] 

689 attribute = kwargs['attribute'] 

690 relative_path = kwargs['relative_path'] 

691 field = kwargs['field'] 

692 

693 if file is None: 

694 file = self._get_file_from_container(container=container) 

695 

696 keys = [] 

697 entities = [] 

698 if attribute is None: 

699 object_field = self._check_object_field(file=file, 

700 container=container, 

701 relative_path=relative_path, 

702 field=field, 

703 create=False) 

704 else: 

705 object_field = self._check_object_field(file=file, 

706 container=container[attribute], 

707 relative_path=relative_path, 

708 field=field, 

709 create=False) 

710 # Find all keys associated with the object 

711 for row_idx in self.object_keys.which(objects_idx=object_field.idx): 

712 keys.append(self.object_keys['keys_idx', row_idx]) 

713 # Find all the entities/resources for each key. 

714 for key_idx in keys: 

715 entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) 

716 for row_idx in entity_key_row_idx: 

717 entity_idx = self.entity_keys['entities_idx', row_idx] 

718 entities.append(self.entities.__getitem__(entity_idx)) 

719 df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri']) 

720 return df 

721 
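A sketch, with `herd`, `file`, and `container` as in the add_ref() example above, of listing every entity linked through any key to a given container:

    entities_df = herd.get_object_entities(file=file, container=container)
    print(entities_df)   # a DataFrame with columns entity_id and entity_uri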

722 @docval({'name': 'use_categories', 'type': bool, 'default': False, 

723 'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'}, 

724 rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.') 

725 def to_dataframe(self, **kwargs): 

726 """ 

727 Convert the data from the keys, files, entities, objects, and object_keys tables 

728 to a single joint dataframe. I.e., here the data is denormalized: e.g., keys that 

729 are used across multiple entities or objects will be duplicated across the corresponding 

730 rows. 

731 

732 Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table. 

733 

734 """ 

735 use_categories = popargs('use_categories', kwargs) 

736 # Step 1: Combine the entities, keys, and entity_keys table 

737 ent_key_df = self.entity_keys.to_dataframe() 

738 entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True) 

739 keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True) 

740 ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df], 

741 axis=1, 

742 verify_integrity=False) 

743 # Step 2: Combine the files, object_keys, and objects tables 

744 object_keys_df = self.object_keys.to_dataframe() 

745 objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True) 

746 object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df], 

747 axis=1, 

748 verify_integrity=False) 

749 files_df = self.files.to_dataframe().iloc[object_keys_df['files_idx']].reset_index(drop=True) 

750 file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df], 

751 axis=1, 

752 verify_integrity=False) 

753 # Step 3: merge the combined entities_df and object_keys_df DataFrames 

754 result_df = pd.concat( 

755 # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables 

756 objs=[pd.merge( 

757 # Find all entities that correspond to the row i of the object_keys_table 

758 ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), 

759 # Get a DataFrame for row i of the objects_keys_table 

760 file_object_object_key_df.iloc[[i, ]], 

761 # Merge the entities and object_keys on the keys_idx column so that the values from the single 

762 # object_keys_table row are copied across all corresponding rows in the entities table 

763 on='keys_idx') 

764 for i in range(len(object_keys_df))], 

765 # Concatenate the rows of the objs 

766 axis=0, 

767 verify_integrity=False) 

768 # Step 4: Clean up the index and sort columns by table type and name 

769 result_df.reset_index(inplace=True, drop=True) 

770 # ADD files 

771 file_id_col = [] 

772 for idx in result_df['files_idx']: 

773 file_id_val = self.files.to_dataframe().iloc[int(idx)]['file_object_id'] 

774 file_id_col.append(file_id_val) 

775 

776 result_df['file_object_id'] = file_id_col 

777 column_labels = [('files', 'file_object_id'), 

778 ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'), 

779 ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'), 

780 ('keys', 'keys_idx'), ('keys', 'key'), 

781 ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')] 

782 # sort the columns based on our custom order 

783 result_df = result_df.reindex(labels=[c[1] for c in column_labels], 

784 axis=1) 

785 result_df = result_df.astype({'keys_idx': 'uint32', 

786 'objects_idx': 'uint32', 

787 'files_idx': 'uint32', 

788 'entities_idx': 'uint32'}) 

789 # Add the categories if requested 

790 if use_categories:  # coverage: 790 ↛ 791 (condition was never true)

791 result_df.columns = pd.MultiIndex.from_tuples(column_labels) 

792 # return the result 

793 return result_df 

794 
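A sketch of the two output shapes for a populated `herd`:

    flat = herd.to_dataframe()                         # flat column index
    by_table = herd.to_dataframe(use_categories=True)  # ('table', 'column') MultiIndex,
                                                       # e.g. ('entities', 'entity_uri')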

795 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) 

796 def to_zip(self, **kwargs): 

797 """ 

798 Write the tables in HERD to zipped tsv files. 

799 """ 

800 zip_file = kwargs['path'] 

801 directory = os.path.dirname(zip_file) 

802 

803 files = [os.path.join(directory, child.name)+'.tsv' for child in self.children] 

804 for i in range(len(self.children)): 

805 df = self.children[i].to_dataframe() 

806 df.to_csv(files[i], sep='\t', index=False) 

807 

808 with zipfile.ZipFile(zip_file, 'w') as zipF: 

809 for file in files: 

810 zipF.write(file) 

811 

812 # remove tsv files 

813 for file in files: 

814 os.remove(file) 

815 

816 @classmethod 

817 @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) 

818 def from_zip(cls, **kwargs): 

819 """ 

820 Method to read in zipped tsv files to populate HERD. 

821 """ 

822 zip_file = kwargs['path'] 

823 directory = os.path.dirname(zip_file) 

824 

825 with zipfile.ZipFile(zip_file, 'r') as zip: 

826 zip.extractall(directory) 

827 tsv_paths = glob(directory+'/*') 

828 

829 for file in tsv_paths: 

830 file_name = os.path.basename(file) 

831 if file_name == 'files.tsv': 

832 files_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

833 files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False) 

834 os.remove(file) 

835 continue 

836 if file_name == 'keys.tsv': 

837 keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

838 keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False) 

839 os.remove(file) 

840 continue 

841 if file_name == 'entities.tsv': 

842 entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

843 entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False) 

844 os.remove(file) 

845 continue 

846 if file_name == 'objects.tsv': 

847 objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

848 objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False) 

849 os.remove(file) 

850 continue 

851 if file_name == 'object_keys.tsv': 

852 object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

853 object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False) 

854 os.remove(file) 

855 continue 

856 if file_name == 'entity_keys.tsv': 

857 ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '') 

858 entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False) 

859 os.remove(file) 

860 continue 

861 

862 # we need to check the idx columns in entities, objects, and object_keys 

863 entity_idx = entity_keys['entities_idx'] 

864 for idx in entity_idx: 

865 if not int(idx) < len(entities): 

866 msg = "Entity Index out of range in EntityTable. Please check for alterations." 

867 raise ValueError(msg) 

868 

869 files_idx = objects['files_idx'] 

870 for idx in files_idx: 

871 if not int(idx) < len(files): 

872 msg = "File_ID Index out of range in ObjectTable. Please check for alterations." 

873 raise ValueError(msg) 

874 

875 object_idx = object_keys['objects_idx'] 

876 for idx in object_idx: 

877 if not int(idx) < len(objects): 

878 msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." 

879 raise ValueError(msg) 

880 

881 keys_idx = object_keys['keys_idx'] 

882 for idx in keys_idx: 

883 if not int(idx) < len(keys): 

884 msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." 

885 raise ValueError(msg) 

886 

887 keys_idx = entity_keys['keys_idx'] 

888 for idx in keys_idx: 

889 if not int(idx) < len(keys): 

890 msg = "Key Index out of range in EntityKeyTable. Please check for alterations." 

891 raise ValueError(msg) 

892 

893 

894 er = HERD(files=files, 

895 keys=keys, 

896 entities=entities, 

897 entity_keys=entity_keys, 

898 objects=objects, 

899 object_keys=object_keys) 

900 return er
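
An end-to-end sketch (the path is illustrative): write the six tables to zipped TSV files and read them back into an equivalent HERD:

    herd.to_zip(path='./HERD.zip')
    herd2 = HERD.from_zip(path='./HERD.zip')
    HERD.assert_external_resources_equal(herd, herd2, check_dtype=False)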