Coverage for src/hdmf/common/table.py: 89%

768 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-04 02:57 +0000

1""" 

2Collection of Container classes for interacting with data types related to 

3the storage and use of dynamic data tables as part of the hdmf-common schema 

4""" 

5 

6import re 

7from collections import OrderedDict 

8from typing import NamedTuple, Union 

9from warnings import warn 

10 

11import numpy as np 

12import pandas as pd 

13import itertools 

14 

15from . import register_class, EXP_NAMESPACE 

16from ..container import Container, Data 

17from ..data_utils import DataIO, AbstractDataChunkIterator 

18from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional 

19from ..term_set import TermSetWrapper 

20 

21 

@register_class('VectorData')
class VectorData(Data):
    """
    A n-dimensional dataset representing a column of a DynamicTable.
    If used without an accompanying VectorIndex, first dimension is
    along the rows of the DynamicTable and each step along the first
    dimension is a cell of the larger table. VectorData can also be
    used to represent a ragged array if paired with a VectorIndex.
    This allows for storing arrays of varying length in a single cell
    of the DynamicTable by indexing into this VectorData. The first
    vector is at VectorData[0:VectorIndex(0)+1]. The second vector is at
    VectorData[VectorIndex(0)+1:VectorIndex(1)+1], and so on.
    """

    __fields__ = ("description",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        description = popargs('description', kwargs)
        super().__init__(**kwargs)
        self.description = description

    @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'})
    def add_row(self, **kwargs):
        """Append a data value to this VectorData column"""
        val = getargs('val', kwargs)
        self.append(val)

    def get(self, key, **kwargs):
        """
        Retrieve elements from this VectorData

        :param key: Selection of the elements
        :param kwargs: Ignored
        """
        return super().get(key)

    def extend(self, ar, **kwargs):
        """Add all elements of the iterable arg to the end of this VectorData.

        Each subclass of VectorData should have its own extend method to ensure functionality and efficiency.

        :param ar: The iterable to add to the end of this VectorData
        :param kwargs: Additional arguments forwarded to add_row (only used by subclasses)
        """
        #################################################################################
        # Each subclass of VectorData should have its own extend method to ensure
        # functionality AND efficiency of the extend operation. However, because currently
        # they do not all have one of these methods, the only way to ensure functionality
        # is with calls to add_row. Because that is inefficient for basic VectorData,
        # this check is added to ensure we always call extend on a basic VectorData.
        # NOTE: __mro__[0] is the instance's own class, so this branch is taken only
        # when self is exactly a VectorData (not a subclass).
        if self.__class__.__mro__[0] == VectorData:
            super().extend(ar)
        else:
            for i in ar:
                self.add_row(i, **kwargs)

81 

82 

@register_class('VectorIndex')
class VectorIndex(VectorData):
    """
    When paired with a VectorData, this allows for storing arrays of varying
    length in a single cell of the DynamicTable by indexing into this VectorData.
    The first vector is at VectorData[0:VectorIndex(0)+1]. The second vector is at
    VectorData[VectorIndex(0)+1:VectorIndex(1)+1], and so on.
    """

    __fields__ = ("target",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorIndex'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a 1D dataset containing indexes that apply to VectorData object'},
            {'name': 'target', 'type': VectorData,
             'doc': 'the target dataset that this index applies to'},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        target = popargs('target', kwargs)
        kwargs['description'] = "Index for VectorData '%s'" % target.name
        super().__init__(**kwargs)
        self.target = target
        # start with the smallest unsigned dtype; __check_precision widens it as needed
        self.__uint = np.uint8
        self.__maxval = 255
        if isinstance(self.data, (list, np.ndarray)):
            if len(self.data) > 0:
                self.__check_precision(len(self.target))
                # adjust precision for types that we can adjust precision for
                self.__adjust_precision(self.__uint)

    def add_vector(self, arg, **kwargs):
        """
        Add the given data value to the target VectorData and append the corresponding index to this VectorIndex

        :param arg: The data value to be added to self.target
        :param kwargs: Additional arguments forwarded to self.target.extend
        """
        if isinstance(self.target, VectorIndex):
            # nested ragged array: recurse one level down for each element of arg
            for a in arg:
                self.target.add_vector(a)
        else:
            self.target.extend(arg, **kwargs)
        # the stored index is the cumulative end offset into the target
        self.append(self.__check_precision(len(self.target)))

    def __check_precision(self, idx):
        """
        Check precision of current dataset and, if necessary, adjust precision to accommodate new value.

        Returns:
            unsigned integer encoding of idx
        """
        if idx > self.__maxval:
            while idx > self.__maxval:
                # doubling the bit width each pass: 8->16, 16->32, 32->64
                nbits = int(np.log2(self.__maxval + 1) * 2)
                if nbits == 128:  # pragma: no cover
                    msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
                           'allowed for VectorIndex is uint64.')
                    raise ValueError(msg)
                self.__maxval = 2 ** nbits - 1
                self.__uint = np.dtype('uint%d' % nbits).type
                self.__adjust_precision(self.__uint)
        return self.__uint(idx)

    def __adjust_precision(self, uint):
        """
        Adjust precision of data to specified unsigned integer precision.

        :param uint: the numpy unsigned integer scalar type to convert the stored indices to
        :raises ValueError: if self.data is neither a list nor a numpy array
        """
        if isinstance(self.data, list):
            for i in range(len(self.data)):
                self.data[i] = uint(self.data[i])
        elif isinstance(self.data, np.ndarray):
            # use self._Data__data to work around restriction on resetting self.data
            self._Data__data = self.data.astype(uint)
        else:
            # BUG FIX: the original passed the format arguments as a second positional
            # argument to ValueError ("...%s to %s", (args)) so the message was never
            # formatted; apply the % operator instead
            raise ValueError("cannot adjust precision of type %s to %s" % (type(self.data), uint))

    def add_row(self, arg, **kwargs):
        """
        Convenience function. Same as :py:func:`add_vector`
        """
        self.add_vector(arg, **kwargs)

    def __getitem_helper(self, arg, **kwargs):
        """
        Internal helper function used by __getitem__ to retrieve a data value from self.target

        :param arg: Integer index into this VectorIndex indicating the element we want to retrieve from the target
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        # self.data[i] holds the exclusive end offset of element i; element i-1's end is its start
        start = 0 if arg == 0 else self.data[arg - 1]
        end = self.data[arg]
        return self.target.get(slice(start, end), **kwargs)

    def __getitem__(self, arg):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :return: Scalar or list of values retrieved
        """
        return self.get(arg)

    def get(self, arg, **kwargs):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        if np.isscalar(arg):
            return self.__getitem_helper(arg, **kwargs)
        else:
            if isinstance(arg, slice):
                indices = list(range(*arg.indices(len(self.data))))
            else:
                # a sequence of bools is treated as a mask
                if isinstance(arg[0], bool):
                    arg = np.where(arg)[0]
                indices = arg
            ret = list()
            for i in indices:
                ret.append(self.__getitem_helper(i, **kwargs))
            return ret

205 

206 

@register_class('ElementIdentifiers')
class ElementIdentifiers(Data):
    """
    Data container with a list of unique identifiers for values within a dataset, e.g. rows of a DynamicTable.
    """

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this ElementIdentifiers'},
            {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a 1D dataset containing identifiers',
             'default': list()},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @docval({'name': 'other', 'type': (Data, np.ndarray, list, tuple, int),
             'doc': 'List of ids to search for in this ElementIdentifer object'},
            rtype=np.ndarray,
            returns='Array with the list of indices where the elements in the list where found.'
                    'Note, the elements in the returned list are ordered in increasing index'
                    'of the found elements, rather than in the order in which the elements'
                    'where given for the search. Also the length of the result may be different from the length'
                    'of the input array. E.g., if our ids are [1,2,3] and we are search for [3,1,5] the '
                    'result would be [0,2] and NOT [2,0,None]')
    def __eq__(self, other):
        """
        Given a list of ids return the indices in the ElementIdentifiers array where the indices are found.
        """
        # Determine the ids we want to find
        search_ids = other if not isinstance(other, Data) else other.data
        if isinstance(search_ids, int):
            search_ids = [search_ids]
        # Find all matching locations.
        # np.in1d is deprecated as of NumPy 1.25; np.isin is the direct replacement
        # and is equivalent here because self.data is 1-D
        return np.isin(self.data, search_ids).nonzero()[0]

239 

240 

241@register_class('DynamicTable') 

242class DynamicTable(Container): 

243 r""" 

244 A column-based table. Columns are defined by the argument *columns*. This argument 

245 must be a list/tuple of :class:`~hdmf.common.table.VectorData` and :class:`~hdmf.common.table.VectorIndex` objects 

246 or a list/tuple of dicts containing the keys ``name`` and ``description`` that provide the name and description 

247 of each column in the table. Additionally, the keys ``index``, ``table``, ``enum`` can be used for specifying 

248 additional structure to the table columns. Setting the key ``index`` to ``True`` can be used to indicate that the 

249 :class:`~hdmf.common.table.VectorData` column will store a ragged array (i.e. will be accompanied with a 

250 :class:`~hdmf.common.table.VectorIndex`). Setting the key ``table`` to ``True`` can be used to indicate that the 

251 column will store regions to another DynamicTable. Setting the key ``enum`` to ``True`` can be used to indicate 

252 that the column data will come from a fixed set of values. 

253 

254 Columns in DynamicTable subclasses can be statically defined by specifying the class attribute *\_\_columns\_\_*, 

255 rather than specifying them at runtime at the instance level. This is useful for defining a table structure 

256 that will get reused. The requirements for *\_\_columns\_\_* are the same as the requirements described above 

257 for specifying table columns with the *columns* argument to the DynamicTable constructor. 

258 """ 

259 

260 __fields__ = ( 

261 {'name': 'id', 'child': True}, 

262 {'name': 'columns', 'child': True}, 

263 'colnames', 

264 'description' 

265 ) 

266 

267 __columns__ = tuple() 

268 

269 @ExtenderMeta.pre_init 

270 def __gather_columns(cls, name, bases, classdict): 

271 r""" 

272 Gather columns from the *\_\_columns\_\_* class attribute and add them to the class. 

273 

274 This classmethod will be called during class declaration in the metaclass to automatically 

275 include all columns declared in subclasses. 

276 """ 

277 if not isinstance(cls.__columns__, tuple): 

278 msg = "'__columns__' must be of type tuple, found %s" % type(cls.__columns__) 

279 raise TypeError(msg) 

280 

281 if (len(bases) and 'DynamicTable' in globals() and issubclass(bases[-1], Container) 

282 and bases[-1].__columns__ is not cls.__columns__): 

283 new_columns = list(cls.__columns__) 

284 new_columns[0:0] = bases[-1].__columns__ # prepend superclass columns to new_columns 

285 cls.__columns__ = tuple(new_columns) 

286 

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this table'},  # noqa: C901
            {'name': 'description', 'type': str, 'doc': 'a description of what is in this table'},
            {'name': 'id', 'type': ('array_data', 'data', ElementIdentifiers), 'doc': 'the identifiers for this table',
             'default': None},
            {'name': 'columns', 'type': (tuple, list), 'doc': 'the columns in this table', 'default': None},
            {'name': 'colnames', 'type': 'array_data',
             'doc': 'the ordered names of the columns in this table. columns must also be provided.',
             'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):  # noqa: C901
        """Initialize the table: validate columns, set up ids, column order, and attribute access."""
        # NOTE: 'id' shadows the builtin here, but the name is fixed by the docval spec
        id, columns, desc, colnames = popargs('id', 'columns', 'description', 'colnames', kwargs)
        super().__init__(**kwargs)
        self.description = desc

        # hold names of optional columns that are defined in __columns__ that are not yet initialized
        # map name to column specification
        self.__uninit_cols = dict()

        # All tables must have ElementIdentifiers (i.e. a primary key column)
        # Here, we figure out what to do for that
        user_provided_ids = (id is not None)
        if user_provided_ids:
            if not isinstance(id, ElementIdentifiers):
                id = ElementIdentifiers(name='id', data=id)
        else:
            id = ElementIdentifiers(name='id')

        if columns is not None and len(columns) > 0:
            # If columns have been passed in, check them over and process accordingly
            if isinstance(columns[0], dict):
                # dict specs: construct the actual column objects (indices, enums, regions)
                columns = self.__build_columns(columns)
            elif not all(isinstance(c, VectorData) for c in columns):
                raise ValueError("'columns' must be a list of dict, VectorData, DynamicTableRegion, or VectorIndex")

            all_names = [c.name for c in columns]
            if len(all_names) != len(set(all_names)):
                raise ValueError("'columns' contains columns with duplicate names: %s" % all_names)

            all_targets = [c.target.name for c in columns if isinstance(c, VectorIndex)]
            if len(all_targets) != len(set(all_targets)):
                raise ValueError("'columns' contains index columns with the same target: %s" % all_targets)

            # TODO: check columns against __columns__
            # mismatches should raise an error (e.g., a VectorData cannot be passed in with the same name as a
            # prespecified table region column)

            # check column lengths against each other and id length
            # set ids if non-zero cols are provided and ids is empty
            colset = {c.name: c for c in columns}
            for c in columns:  # remove all VectorData objects that have an associated VectorIndex from colset
                if isinstance(c, VectorIndex):
                    if c.target.name in colset:
                        colset.pop(c.target.name)
                    else:
                        raise ValueError("Found VectorIndex '%s' but not its target '%s'" % (c.name, c.target.name))
                elif isinstance(c, EnumData):
                    # the elements column of an EnumData does not contribute a row count
                    if c.elements.name in colset:
                        colset.pop(c.elements.name)
                # columns backed by a DataChunkIterator have no knowable length yet,
                # so exclude them from the length check (unwrap DataIO first)
                _data = c.data
                if isinstance(_data, DataIO):
                    _data = _data.data
                if isinstance(_data, AbstractDataChunkIterator):
                    colset.pop(c.name, None)
            lens = [len(c) for c in colset.values()]
            all_columns_are_iterators = (len(lens) == 0)

            if not all(i == lens[0] for i in lens):
                raise ValueError("Columns must be the same length")
            # If we have columns given, but all columns are AbstractDataChunkIterator's, then we
            # cannot determine how many elements the id column will need. I.e., in this case the
            # user needs to provide the id's as otherwise we may create an invalid table with an
            # empty Id column but data in the rows. See: https://github.com/hdmf-dev/hdmf/issues/952
            if all_columns_are_iterators and not user_provided_ids:
                raise ValueError("Cannot determine row id's for table. Must provide ids with same length "
                                 "as the columns when all columns are specified via DataChunkIterator objects.")
            # If we have columns with a known length but the length (i.e., number of rows)
            # does not match the number of id's then initialize the id's
            if not all_columns_are_iterators and lens[0] != len(id):
                if user_provided_ids and len(id) > 0:
                    raise ValueError("Must provide same number of ids as length of columns")
                else:  # set ids to: 0 to length of columns - 1
                    id.data.extend(range(lens[0]))

        self.id = id

        # NOTE: self.colnames and self.columns are always tuples
        # if kwarg colnames is an h5dataset, self.colnames is still a tuple
        if colnames is None or len(colnames) == 0:
            if columns is None:
                # make placeholder for columns if nothing was given
                self.colnames = tuple()
                self.columns = tuple()
            else:
                # Figure out column names if columns were given
                tmp = OrderedDict()
                skip = set()
                for col in columns:
                    if col.name in skip:
                        continue
                    if isinstance(col, VectorIndex):
                        continue
                    if isinstance(col, EnumData):
                        # the enum's elements column is internal; hide it from colnames
                        skip.add(col.elements.name)
                        tmp.pop(col.elements.name, None)
                    tmp[col.name] = None
                self.colnames = tuple(tmp)
                self.columns = tuple(columns)
        else:
            # Calculate the order of column names
            if columns is None:
                raise ValueError("Must supply 'columns' if specifying 'colnames'")
            else:
                # order the columns according to the column names, which does not include indices
                self.colnames = tuple(pystr(c) for c in colnames)
                col_dict = {col.name: col for col in columns}
                # map from vectordata name to list of vectorindex objects where target of last vectorindex is vectordata
                indices = dict()
                # determine which columns are indexed by another column
                for col in columns:
                    if isinstance(col, VectorIndex):
                        # loop through nested indices to get to non-index column
                        tmp_indices = [col]
                        curr_col = col
                        while isinstance(curr_col.target, VectorIndex):
                            curr_col = curr_col.target
                            tmp_indices.append(curr_col)
                        # make sure the indices values has the full index chain, so replace existing value if it is
                        # shorter
                        if len(tmp_indices) > len(indices.get(curr_col.target.name, [])):
                            indices[curr_col.target.name] = tmp_indices
                    elif isinstance(col, EnumData):
                        # EnumData is the indexing column, so it should go first
                        if col.name not in indices:
                            indices[col.name] = [col]  # EnumData is the indexing object
                            col_dict[col.name] = col.elements  # EnumData.elements is the column with values
                    else:
                        if col.name in indices:
                            continue
                        indices[col.name] = []
                # put columns in order of colnames, with indices before the target vectordata
                tmp = []
                for name in self.colnames:
                    tmp.extend(indices[name])
                    tmp.append(col_dict[name])
                self.columns = tuple(tmp)

        # to make generating DataFrames and Series easier
        col_dict = dict()
        self.__indices = dict()
        for col in self.columns:
            if isinstance(col, VectorIndex):
                # if index has already been added because it is part of a nested index chain, ignore this column
                if col.name in self.__indices:
                    continue
                self.__indices[col.name] = col

                # loop through nested indices to get to non-index column
                curr_col = col
                self.__set_table_attr(curr_col)
                while isinstance(curr_col.target, VectorIndex):
                    curr_col = curr_col.target
                    # check if index has been added. if not, add it
                    if not hasattr(self, curr_col.name):
                        self.__set_table_attr(curr_col)
                        self.__indices[curr_col.name] = col

                # use target vectordata name at end of indexing chain as key to get to the top level index
                col_dict[curr_col.target.name] = col
                if not hasattr(self, curr_col.target.name):
                    self.__set_table_attr(curr_col.target)
            else:  # this is a regular VectorData or EnumData
                # if we added this column using its index, ignore this column
                if col.name in col_dict:
                    continue
                else:
                    col_dict[col.name] = col
                    self.__set_table_attr(col)

        self.__df_cols = [self.id] + [col_dict[name] for name in self.colnames]

        # self.__colids maps the column name to an index starting at 1
        self.__colids = {name: i + 1 for i, name in enumerate(self.colnames)}
        self._init_class_columns()

470 

471 def __set_table_attr(self, col): 

472 if hasattr(self, col.name) and col.name not in self.__uninit_cols: 

473 msg = ("An attribute '%s' already exists on %s '%s' so this column cannot be accessed as an attribute, " 

474 "e.g., table.%s; it can only be accessed using other methods, e.g., table['%s']." 

475 % (col.name, self.__class__.__name__, self.name, col.name, col.name)) 

476 warn(msg) 

477 else: 

478 setattr(self, col.name, col) 

479 

480 __reserved_colspec_keys = ['name', 'description', 'index', 'table', 'required', 'class'] 

481 

482 def _init_class_columns(self): 

483 """ 

484 Process all predefined columns specified in class variable __columns__. 

485 Optional columns are not tracked but not added. 

486 """ 

487 for col in self.__columns__: 

488 if col['name'] not in self.__colids: # if column has not been added in __init__ 

489 if col.get('required', False): 

490 self.add_column(name=col['name'], 

491 description=col['description'], 

492 index=col.get('index', False), 

493 table=col.get('table', False), 

494 col_cls=col.get('class', VectorData), 

495 # Pass through extra kwargs for add_column that subclasses may have added 

496 **{k: col[k] for k in col.keys() 

497 if k not in DynamicTable.__reserved_colspec_keys}) 

498 else: 

499 # track the not yet initialized optional predefined columns 

500 self.__uninit_cols[col['name']] = col 

501 

502 # set the table attributes for not yet init optional predefined columns 

503 setattr(self, col['name'], None) 

504 index = col.get('index', False) 

505 if index is not False: 

506 if index is True: 

507 index = 1 

508 if isinstance(index, int): 508 ↛ 515line 508 didn't jump to line 515, because the condition on line 508 was never false

509 assert index > 0, ValueError("integer index value must be greater than 0") 

510 index_name = col['name'] 

511 for i in range(index): 

512 index_name = index_name + '_index' 

513 self.__uninit_cols[index_name] = col 

514 setattr(self, index_name, None) 

515 if col.get('enum', False): 

516 self.__uninit_cols[col['name'] + '_elements'] = col 

517 setattr(self, col['name'] + '_elements', None) 

518 

519 @staticmethod 

520 def __build_columns(columns, df=None): 

521 """ 

522 Build column objects according to specifications 

523 """ 

524 tmp = list() 

525 for d in columns: 

526 name = d['name'] 

527 desc = d.get('description', 'no description') 

528 col_cls = d.get('class', VectorData) 

529 data = None 

530 if df is not None: 

531 data = list(df[name].values) 

532 index = d.get('index', False) 

533 if index is not False: 533 ↛ 534line 533 didn't jump to line 534, because the condition on line 533 was never true

534 if isinstance(index, int) and index > 1: 

535 raise ValueError('Creating nested index columns using this method is not yet supported. Use ' 

536 'add_column or define the columns using __columns__ instead.') 

537 index_data = None 

538 if data is not None: 

539 index_data = [len(data[0])] 

540 for i in range(1, len(data)): 

541 index_data.append(len(data[i]) + index_data[i - 1]) 

542 # assume data came in through a DataFrame, so we need 

543 # to concatenate it 

544 tmp_data = list() 

545 for d in data: 

546 tmp_data.extend(d) 

547 data = tmp_data 

548 vdata = col_cls(name=name, description=desc, data=data) 

549 vindex = VectorIndex(name="%s_index" % name, data=index_data, target=vdata) 

550 tmp.append(vindex) 

551 tmp.append(vdata) 

552 elif d.get('enum', False): 552 ↛ 554line 552 didn't jump to line 554, because the condition on line 552 was never true

553 # EnumData is the indexing column, so it should go first 

554 if data is not None: 

555 elements, data = np.unique(data, return_inverse=True) 

556 tmp.append(EnumData(name, desc, data=data, elements=elements)) 

557 else: 

558 tmp.append(EnumData(name, desc, data=data)) 

559 # EnumData handles constructing the VectorData object that contains EnumData.elements 

560 # --> use this functionality (rather than creating here) for consistency and less code/complexity 

561 tmp.append(tmp[-1].elements) 

562 else: 

563 if data is None: 

564 data = list() 

565 if d.get('table', False): 565 ↛ 566line 565 didn't jump to line 566, because the condition on line 565 was never true

566 col_cls = DynamicTableRegion 

567 tmp.append(col_cls(name=name, description=desc, data=data)) 

568 return tmp 

569 

570 def __len__(self): 

571 """Number of rows in the table""" 

572 return len(self.id) 

573 

    @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None},
            {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
            {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
             'default': False},
            allow_extra=True)
    def add_row(self, **kwargs):
        """
        Add a row to the table. If *id* is not provided, it will auto-increment.

        Row values may be given either via the *data* dict or as extra keyword arguments
        (the extra kwargs are used only when *data* is None).

        :raises ValueError: if a required column value is missing, a value fails term-set
            validation, the supplied keys do not match the available columns, or
            *enforce_unique_id* is set and the id already exists
        """
        data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
        data = data if data is not None else kwargs

        bad_data = []
        extra_columns = set(list(data.keys())) - set(list(self.__colids.keys()))
        missing_columns = set(list(self.__colids.keys())) - set(list(data.keys()))

        # first pass: validate values of TermSetWrapper-backed columns before mutating anything
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            col = self.__df_cols[colnum]
            if isinstance(col, VectorIndex):
                continue
            else:
                if isinstance(col.data, TermSetWrapper):
                    if col.data.termset.validate(term=data[colname]):
                        continue
                    else:
                        bad_data.append(data[colname])

        if len(bad_data)!=0:
            msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
            raise ValueError(msg)

        # check to see if any of the extra columns just need to be added
        if extra_columns:
            for col in self.__columns__:
                if col['name'] in extra_columns:
                    if data[col['name']] is not None:
                        self.add_column(col['name'], col['description'],
                                        index=col.get('index', False),
                                        table=col.get('table', False),
                                        enum=col.get('enum', False),
                                        col_cls=col.get('class', VectorData),
                                        # Pass through extra keyword arguments for add_column that
                                        # subclasses may have added
                                        **{k: col[k] for k in col.keys()
                                           if k not in DynamicTable.__reserved_colspec_keys})
                    extra_columns.remove(col['name'])

        # any keys left over (or still missing) cannot be reconciled with the table's columns
        if extra_columns or missing_columns:
            raise ValueError(
                '\n'.join([
                    'row data keys don\'t match available columns',
                    'you supplied {} extra keys: {}'.format(len(extra_columns), extra_columns),
                    'and were missing {} keys: {}'.format(len(missing_columns), missing_columns)
                ])
            )
        if row_id is None:
            row_id = data.pop('id', None)
        if row_id is None:
            # auto-increment: next id is the current number of rows
            row_id = len(self)
        if enforce_unique_id:
            if row_id in self.id:
                raise ValueError("id %i already in the table" % row_id)
        self.id.append(row_id)

        # second pass: actually append the values to each column
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            c = self.__df_cols[colnum]
            if isinstance(c, VectorIndex):
                # ragged column: add_vector extends the target and records the end offset
                c.add_vector(data[colname])
            else:
                c.add_row(data[colname])

648 

649 def __eq__(self, other): 

650 """Compare if the two DynamicTables contain the same data. 

651 

652 First this returns False if the other DynamicTable has a different name or 

653 description. Then, this table and the other table are converted to pandas 

654 dataframes and the equality of the two tables is returned. 

655 

656 :param other: DynamicTable to compare to 

657 

658 :return: Bool indicating whether the two DynamicTables contain the same data 

659 """ 

660 if other is self: 

661 return True 

662 if not isinstance(other, DynamicTable): 

663 return False 

664 if self.name != other.name or self.description != other.description: 

665 return False 

666 return self.to_dataframe().equals(other.to_dataframe()) 

667 

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},  # noqa: C901
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
            {'name': 'table', 'type': (bool, 'DynamicTable'),
             'doc': 'whether or not this is a table region or the table the region applies to', 'default': False},
            {'name': 'index', 'type': (bool, VectorIndex, 'array_data', int),
             'doc': ' * ``False`` (default): do not generate a VectorIndex\n\n'
                    ' * ``True``: generate one empty VectorIndex \n\n'
                    ' * ``VectorIndex``: Use the supplied VectorIndex \n\n'
                    ' * array-like of ints: Create a VectorIndex and use these values as the data \n\n'
                    ' * ``int``: Recursively create `n` VectorIndex objects for a multi-ragged array \n',
             'default': False},
            {'name': 'enum', 'type': (bool, 'array_data'), 'default': False,
             'doc': ('whether or not this column contains data from a fixed set of elements')},
            {'name': 'col_cls', 'type': type, 'default': VectorData,
             'doc': ('class to use to represent the column data. If table=True, this field is ignored and a '
                     'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData '
                     'object is used.')},
            allow_extra=True)
    def add_column(self, **kwargs):  # noqa: C901
        """
        Add a column to this table.

        If data is provided, it must contain the same number of rows as the current state of the table.

        Extra keyword arguments will be passed to the constructor of the column class ("col_cls").

        :raises ValueError: if the column has already been added to the table
        """
        name, data = getargs('name', 'data', kwargs)
        index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs)

        if isinstance(index, VectorIndex):
            warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
                 "deprecated in a future version of HDMF.", FutureWarning)

        if name in self.__colids:  # column has already been added
            msg = "column '%s' already exists in %s '%s'" % (name, self.__class__.__name__, self.name)
            raise ValueError(msg)

        if name in self.__uninit_cols:  # column is a predefined optional column from the spec
            # check the given values against the predefined optional column spec. if they do not match, raise a warning
            # and ignore the given arguments. users should not be able to override these values
            # NOTE: "or not isinstance(..., bool)" coerces a passed DynamicTable/array value to True
            table_bool = table or not isinstance(table, bool)
            spec_table = self.__uninit_cols[name].get('table', False)
            if table_bool != spec_table:
                msg = ("Column '%s' is predefined in %s with table=%s which does not match the entered "
                       "table argument. The predefined table spec will be ignored. "
                       "Please ensure the new column complies with the spec. "
                       "This will raise an error in a future version of HDMF."
                       % (name, self.__class__.__name__, spec_table))
                warn(msg)

            index_bool = index or not isinstance(index, bool)
            spec_index = self.__uninit_cols[name].get('index', False)
            if index_bool != spec_index:
                msg = ("Column '%s' is predefined in %s with index=%s which does not match the entered "
                       "index argument. The predefined index spec will be ignored. "
                       "Please ensure the new column complies with the spec. "
                       "This will raise an error in a future version of HDMF."
                       % (name, self.__class__.__name__, spec_index))
                warn(msg)

            spec_col_cls = self.__uninit_cols[name].get('class', VectorData)
            if col_cls != spec_col_cls:
                msg = ("Column '%s' is predefined in %s with class=%s which does not match the entered "
                       "col_cls argument. The predefined class spec will be ignored. "
                       "Please ensure the new column complies with the spec. "
                       "This will raise an error in a future version of HDMF."
                       % (name, self.__class__.__name__, spec_col_cls))
                warn(msg)

        # remaining kwargs (including extras allowed by allow_extra) go to the column constructor
        ckwargs = dict(kwargs)

        # Add table if it's been specified
        if table and enum:
            raise ValueError("column '%s' cannot be both a table region "
                             "and come from an enumerable set of elements" % name)
        if table is not False:
            col_cls = DynamicTableRegion
            if isinstance(table, DynamicTable):
                ckwargs['table'] = table
        if enum is not False:
            col_cls = EnumData
            if isinstance(enum, (list, tuple, np.ndarray, VectorData)):
                ckwargs['elements'] = enum

        # If the user provided a list of lists that needs to be indexed, then we now need to flatten the data
        # We can only create the index actual VectorIndex once we have the VectorData column so we compute
        # the index and flatten the data here and then create the VectorIndex later from create_vector_index
        # once we have created the column
        create_vector_index = None
        if ckwargs.get('data', None) is not None:
            # Check that we are asked to create an index
            # (bool is a subclass of int, so this also covers index=True with an implicit depth of 1)
            if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
                # Iteratively flatten the data we use for the column based on the depth of the index to generate.
                # Also, for each level compute the data for the VectorIndex for that level
                flatten_data = data
                create_vector_index = []
                for i in range(index):
                    try:
                        create_vector_index.append(np.cumsum([len(c) for c in flatten_data]).tolist())
                    except TypeError as e:
                        raise ValueError("Cannot automatically construct VectorIndex for nested array. "
                                         "Invalid data array element found.") from e
                    flatten_data = list(itertools.chain.from_iterable(flatten_data))
                # if our data still is an array (e.g., a list or numpy array) then warn that the index parameter
                # may be incorrect.
                if len(flatten_data) > 0 and isinstance(flatten_data[0], (np.ndarray, list, tuple)):
                    raise ValueError("Cannot automatically construct VectorIndex for nested array. "
                                     "Column data contains arrays as cell values. Please check the 'data' and 'index' "
                                     "parameters. 'index=%s' may be too small for the given data." % str(index))
                # overwrite the data to be used for the VectorData column with the flattened data
                ckwargs['data'] = flatten_data

        # Create the VectorData column
        col = col_cls(**ckwargs)
        col.parent = self
        columns = [col]
        self.__set_table_attr(col)
        if col in self.__uninit_cols:
            self.__uninit_cols.pop(col)

        if col_cls is EnumData:
            # an EnumData column carries a companion 'elements' VectorData that must also be stored
            columns.append(col.elements)
            col.elements.parent = self

        # Add index if it's been specified
        if index is not False:
            if isinstance(index, VectorIndex):
                col_index = index
                self.__add_column_index_helper(col_index)
            elif isinstance(index, bool):
                # create empty index for empty column
                if create_vector_index is None:
                    # NOTE(review): assert message is a ValueError *instance*; asserts vanish under -O
                    assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
                    col_index = VectorIndex(name=name + "_index", data=list(), target=col)
                # create single-level VectorIndex from the data based on the create_vector_index we computed earlier
                else:
                    col_index = VectorIndex(name=name + "_index", data=create_vector_index[0], target=col)
                # add the column with the index
                self.__add_column_index_helper(col_index)
            elif isinstance(index, int):
                if create_vector_index is None:
                    assert index > 0, ValueError("integer index value must be greater than 0")
                    assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
                    index_name = name
                    # build a chain of n empty VectorIndex objects, innermost targeting the data column
                    for i in range(index):
                        index_name = index_name + "_index"
                        col_index = VectorIndex(name=index_name, data=list(), target=col)
                        self.__add_column_index_helper(col_index)
                        if i < index - 1:
                            columns.insert(0, col_index)
                            col = col_index
                # Create the nested VectorIndex from the create_vector_index we computed above
                else:
                    index_name = name
                    for i in range(index):
                        index_name = index_name + "_index"
                        # create_vector_index is ordered innermost-first, hence the reverse indexing
                        col_index = VectorIndex(name=index_name, data=create_vector_index[-(i+1)], target=col)
                        self.__add_column_index_helper(col_index)
                        if i < index - 1:
                            columns.insert(0, col_index)
                            col = col_index
            else:  # make VectorIndex with supplied data
                assert len(col) > 0, ValueError("cannot pass non-empty index with empty data to index")
                col_index = VectorIndex(name=name + "_index", data=index, target=col)
                self.__add_column_index_helper(col_index)
            # the outermost index becomes the user-facing column object
            columns.insert(0, col_index)
            col = col_index

        if len(col) != len(self.id):
            raise ValueError("column must have the same number of rows as 'id'")
        self.__colids[name] = len(self.__df_cols)
        self.fields['colnames'] = tuple(list(self.colnames) + [name])
        self.fields['columns'] = tuple(list(self.columns) + columns)
        self.__df_cols.append(col)

846 

    def __add_column_index_helper(self, col_index):
        """Register a newly created VectorIndex with this table (parent, lookup dict, attribute)."""
        if not isinstance(col_index.parent, Container):
            # claim ownership only if the index is not already owned by another container
            col_index.parent = self
        # else, the ObjectMapper will create a link from self (parent) to col_index (child with existing parent)
        self.__indices[col_index.name] = col_index
        self.__set_table_attr(col_index)
        if col_index in self.__uninit_cols:
            self.__uninit_cols.pop(col_index)

855 

856 @docval({'name': 'name', 'type': str, 'doc': 'the name of the DynamicTableRegion object'}, 

857 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the indices of the table'}, 

858 {'name': 'description', 'type': str, 'doc': 'a brief description of what the region is'}) 

859 def create_region(self, **kwargs): 

860 """ 

861 Create a DynamicTableRegion selecting a region (i.e., rows) in this DynamicTable. 

862 

863 :raises: IndexError if the provided region contains invalid indices 

864 

865 """ 

866 region = getargs('region', kwargs) 

867 if isinstance(region, slice): 

868 if (region.start is not None and region.start < 0) or (region.stop is not None and region.stop > len(self)): 

869 msg = 'region slice %s is out of range for this DynamicTable of length %d' % (str(region), len(self)) 

870 raise IndexError(msg) 

871 region = list(range(*region.indices(len(self)))) 

872 else: 

873 for idx in region: 

874 if idx < 0 or idx >= len(self): 

875 raise IndexError('The index ' + str(idx) + 

876 ' is out of range for this DynamicTable of length ' 

877 + str(len(self))) 

878 desc = getargs('description', kwargs) 

879 name = getargs('name', kwargs) 

880 return DynamicTableRegion(name=name, data=region, description=desc, table=self) 

881 

882 def __getitem__(self, key): 

883 ret = self.get(key) 

884 if ret is None: 

885 raise KeyError(key) 

886 return ret 

887 

    def get(self, key, default=None, df=True, index=True, **kwargs):
        """Select a subset from the table.

        If the table includes a DynamicTableRegion column, then by default,
        the index/indices of the DynamicTableRegion will be returned. If ``df=True`` and ``index=False``,
        then the returned pandas DataFrame will contain a nested DataFrame in each row of the
        DynamicTableRegion column. If ``df=False`` and ``index=True``, then a list of lists will be returned
        where the list containing the DynamicTableRegion column contains the indices of the DynamicTableRegion.
        Note that in this case, the DynamicTable referenced by the DynamicTableRegion can be accessed through
        the ``table`` attribute of the DynamicTableRegion object. ``df=False`` and ``index=False`` is
        not yet supported.

        :param key: Key defining which elements of the table to select. This may be one of the following:

            1) string with the name of the column to select
            2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
               column to select by name
            3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
               scalars are returned for each column that has a single value. If a list, array, or slice is used and
               df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
               single row.

        :return: 1) If key is a string, then return the VectorData object representing the column with the string name
                 2) If key is a tuple of (int, str), then return the scalar value of the selected cell
                 3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame or lists
                    consisting of one or more rows

        :raises: KeyError
        """
        ret = None
        if not df and not index:
            # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
            raise ValueError('DynamicTable.get() with df=False and index=False is not yet supported.')
        if isinstance(key, tuple):
            # index by row and column --> return specific cell
            arg1 = key[0]
            arg2 = key[1]
            if isinstance(arg2, str):
                # resolve a column name to its position in __df_cols
                arg2 = self.__colids[arg2]
            ret = self.__df_cols[arg2][arg1]
        elif isinstance(key, str):
            # index by one string --> return column
            if key == 'id':
                return self.id
            elif key in self.__colids:
                ret = self.__df_cols[self.__colids[key]]
            elif key in self.__indices:
                ret = self.__indices[key]
            else:
                # unknown column name: mirror dict.get semantics rather than raising
                return default
        else:
            # index by int, list, np.ndarray, or slice -->
            # return pandas Dataframe or lists consisting of one or more rows
            sel = self.__get_selection_as_dict(key, df, index, **kwargs)
            if df:
                # reformat objects to fit into a pandas DataFrame
                if np.isscalar(key):
                    ret = self.__get_selection_as_df_single_row(sel)
                else:
                    ret = self.__get_selection_as_df(sel)
            else:
                ret = list(sel.values())

        return ret

952 

    def __get_selection_as_dict(self, arg, df, index, exclude=None, **kwargs):
        """Return a dict mapping column names to values (lists/arrays or dataframes) for the given selection.
        Uses each column's get() method, passing kwargs as necessary.

        :param arg: key passed to get() to return one or more rows
        :type arg: int, list, np.ndarray, or slice
        """
        if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))):
            raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg)))
        if isinstance(arg, np.ndarray) and arg.ndim != 1:
            raise ValueError("Cannot index DynamicTable with multiple dimensions")
        if exclude is None:
            exclude = set([])
        ret = OrderedDict()
        try:
            # index with a python slice or single int to select one or multiple rows
            ret['id'] = self.id[arg]
            for name in self.colnames:
                if name in exclude:
                    continue
                col = self.__df_cols[self.__colids[name]]
                if index and (isinstance(col, DynamicTableRegion) or
                              (isinstance(col, VectorIndex) and isinstance(col.target, DynamicTableRegion))):
                    # return indices (in list, array, etc.) for DTR and ragged DTR
                    ret[name] = col.get(arg, df=False, index=True, **kwargs)
                else:
                    ret[name] = col.get(arg, df=df, index=index, **kwargs)
            return ret
        # if index is out of range, different errors can be generated depending on the dtype of the column
        # but despite the differences, raise an IndexError from that error
        except ValueError as ve:
            # in h5py <2, if the column is an h5py.Dataset, a ValueError was raised
            # in h5py 3+, this became an IndexError
            x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve))
            if x:
                # translate the backend-specific message into a uniform row-index error
                msg = ("Row index %s out of range for %s '%s' (length %d)."
                       % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg) from ve
            else:  # pragma: no cover
                raise ve
        except IndexError as ie:
            x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie))
            if x:
                msg = ("Row index %s out of range for %s '%s' (length %d)."
                       % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg)
            elif str(ie) == 'list index out of range':
                # plain Python list columns raise this exact message
                msg = ("Row index out of range for %s '%s' (length %d)."
                       % (self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg) from ie
            else:  # pragma: no cover
                raise ie

1005 

1006 def __get_selection_as_df_single_row(self, coldata): 

1007 """Return a pandas dataframe for the given row and columns with the id column as the index. 

1008 

1009 This is a special case of __get_selection_as_df where a single row was requested. 

1010 

1011 :param coldata: dict mapping column names to values (list/arrays or dataframes) 

1012 :type coldata: dict 

1013 """ 

1014 id_index_orig = coldata.pop('id') 

1015 id_index = [id_index_orig] 

1016 df_input = OrderedDict() 

1017 for k in coldata: # for each column 

1018 if isinstance(coldata[k], (np.ndarray, list, tuple, pd.DataFrame)): 

1019 # wrap in a list because coldata[k] may be an array/list/tuple with multiple elements (ragged or 

1020 # multi-dim column) and pandas needs to have one element per index row (=1 in this case) 

1021 df_input[k] = [coldata[k]] 

1022 else: # scalar, don't wrap 

1023 df_input[k] = coldata[k] 

1024 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64)) 

1025 ret.name = self.name 

1026 return ret 

1027 

1028 def __get_selection_as_df(self, coldata): 

1029 """Return a pandas dataframe for the given rows and columns with the id column as the index. 

1030 

1031 This is used when multiple row indices are selected (or a list/array/slice of a single index is passed to get). 

1032 __get_selection_as_df_single_row should be used if a single index is passed to get. 

1033 

1034 :param coldata: dict mapping column names to values (list/arrays or dataframes) 

1035 :type coldata: dict 

1036 """ 

1037 id_index = coldata.pop('id') 

1038 df_input = OrderedDict() 

1039 for k in coldata: # for each column 

1040 if isinstance(coldata[k], np.ndarray) and coldata[k].ndim > 1: 

1041 df_input[k] = list(coldata[k]) # convert multi-dim array to list of inner arrays 

1042 elif isinstance(coldata[k], pd.DataFrame): 

1043 # multiple rows were selected and collapsed into a dataframe 

1044 # split up the rows of the df into a list of dataframes, one per row 

1045 # TODO make this more efficient 

1046 df_input[k] = [coldata[k].iloc[[i]] for i in range(len(coldata[k]))] 

1047 else: 

1048 df_input[k] = coldata[k] 

1049 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64)) 

1050 ret.name = self.name 

1051 return ret 

1052 

1053 def __contains__(self, val): 

1054 """ 

1055 Check if the given value (i.e., column) exists in this table 

1056 """ 

1057 return val in self.__colids or val in self.__indices 

1058 

1059 def get_foreign_columns(self): 

1060 """ 

1061 Determine the names of all columns that link to another DynamicTable, i.e., 

1062 find all DynamicTableRegion type columns. Similar to a foreign key in a 

1063 database, a DynamicTableRegion column references elements in another table. 

1064 

1065 :returns: List of strings with the column names 

1066 """ 

1067 col_names = [] 

1068 for col_index, col in enumerate(self.columns): 

1069 if isinstance(col, DynamicTableRegion): 

1070 col_names.append(col.name) 

1071 return col_names 

1072 

1073 def has_foreign_columns(self): 

1074 """ 

1075 Does the table contain DynamicTableRegion columns 

1076 

1077 :returns: True if the table contains a DynamicTableRegion column, else False 

1078 """ 

1079 for col_index, col in enumerate(self.columns): 

1080 if isinstance(col, DynamicTableRegion): 

1081 return True 

1082 return False 

1083 

    @docval({'name': 'other_tables', 'type': (list, tuple, set),
             'doc': "List of additional tables to consider in the search. Usually this "
                    "parameter is used for internal purposes, e.g., when we need to "
                    "consider AlignedDynamicTable", 'default': None},
            allow_extra=False)
    def get_linked_tables(self, **kwargs):
        """
        Get a list of the full list of all tables that are being linked to directly or indirectly
        from this table via foreign DynamicTableColumns included in this table or in any table that
        can be reached through DynamicTableRegion columns

        Returns: List of NamedTuple objects with:
            * 'source_table' : The source table containing the DynamicTableRegion column
            * 'source_column' : The relevant DynamicTableRegion column in the 'source_table'
            * 'target_table' : The target DynamicTable; same as source_column.table.
        """
        link_type = NamedTuple('DynamicTableLink',
                               [('source_table', DynamicTable),
                                ('source_column', Union[DynamicTableRegion, VectorIndex]),
                                ('target_table', DynamicTable)])
        curr_tables = [self, ]  # Set of tables
        other_tables = getargs('other_tables', kwargs)
        if other_tables is not None:
            curr_tables += other_tables
        curr_index = 0
        foreign_cols = []
        # breadth-first traversal: curr_tables grows while being scanned,
        # so iterate by position rather than with a for-loop over the list
        while curr_index < len(curr_tables):
            for col_index, col in enumerate(curr_tables[curr_index].columns):
                if isinstance(col, DynamicTableRegion):
                    foreign_cols.append(link_type(source_table=curr_tables[curr_index],
                                                  source_column=col,
                                                  target_table=col.table))
                    # identity ('is') comparison deliberately avoids table equality semantics
                    curr_table_visited = False
                    for t in curr_tables:
                        if t is col.table:
                            curr_table_visited = True
                    if not curr_table_visited:
                        curr_tables.append(col.table)
            curr_index += 1
        return foreign_cols

1124 

    @docval({'name': 'exclude', 'type': set, 'doc': 'Set of column names to exclude from the dataframe',
             'default': None},
            {'name': 'index', 'type': bool,
             'doc': ('Whether to return indices for a DynamicTableRegion column. If False, nested dataframes will be '
                     'returned.'),
             'default': False}
            )
    def to_dataframe(self, **kwargs):
        """
        Produce a pandas DataFrame containing this table's data.

        If this table contains a DynamicTableRegion column, then by default
        (``index=False``) the referenced rows are returned as nested dataframes;
        pass ``index=True`` to get the raw row indices instead.

        If exclude is None, this is equivalent to table.get(slice(None, None, None), index=False).
        """
        arg = slice(None, None, None)  # select all rows
        sel = self.__get_selection_as_dict(arg, df=True, **kwargs)
        ret = self.__get_selection_as_df(sel)
        return ret

1144 

    @classmethod
    @docval(
        {'name': 'df', 'type': pd.DataFrame, 'doc': 'source DataFrame'},
        {'name': 'name', 'type': str, 'doc': 'the name of this table'},
        {
            'name': 'index_column',
            'type': str,
            'doc': 'if provided, this column will become the table\'s index',
            'default': None
        },
        {
            'name': 'table_description',
            'type': str,
            'doc': 'a description of what is in the resulting table',
            'default': ''
        },
        {
            'name': 'columns',
            'type': (list, tuple),
            'doc': 'a list/tuple of dictionaries specifying columns in the table',
            'default': None
        },
        allow_extra=True
    )
    def from_dataframe(cls, **kwargs):
        '''
        Construct an instance of DynamicTable (or a subclass) from a pandas DataFrame.

        The columns of the resulting table are defined by the columns of the
        dataframe and the index by the dataframe's index (make sure it has a
        name!) or by a column whose name is supplied to the index_column
        parameter. We recommend that you supply *columns* - a list/tuple of
        dictionaries containing the name and description of the column- to help
        others understand the contents of your table. See
        :py:class:`~hdmf.common.table.DynamicTable` for more details on *columns*.
        '''

        columns = kwargs.pop('columns')
        df = kwargs.pop('df')
        name = kwargs.pop('name')
        index_column = kwargs.pop('index_column')
        table_description = kwargs.pop('table_description')
        # optional extra kwarg (permitted by allow_extra): per-column description strings
        column_descriptions = kwargs.pop('column_descriptions', dict())

        supplied_columns = dict()
        if columns:
            supplied_columns = {x['name']: x for x in columns}

        # columns predefined on the class spec take precedence over user-supplied specs
        class_cols = {x['name']: x for x in cls.__columns__}
        required_cols = set(x['name'] for x in cls.__columns__ if 'required' in x and x['required'])
        df_cols = df.columns
        if required_cols - set(df_cols):
            raise ValueError('missing required cols: ' + str(required_cols - set(df_cols)))
        if set(supplied_columns.keys()) - set(df_cols):
            raise ValueError('cols specified but not provided: ' + str(set(supplied_columns.keys()) - set(df_cols)))
        columns = []
        for col_name in df_cols:
            if col_name in class_cols:
                columns.append(class_cols[col_name])
            elif col_name in supplied_columns:
                columns.append(supplied_columns[col_name])
            else:
                columns.append({'name': col_name,
                                'description': column_descriptions.get(col_name, 'no description')})
                # ragged-column detection: non-string sequences of varying length need an index
                # NOTE(review): .iloc[0] assumes the dataframe has at least one row -- confirm callers
                if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
                    lengths = [len(x) for x in df[col_name]]
                    if not lengths[1:] == lengths[:-1]:
                        columns[-1].update(index=True)

        if index_column is not None:
            ids = ElementIdentifiers(name=index_column, data=df[index_column].values.tolist())
        else:
            index_name = df.index.name if df.index.name is not None else 'id'
            ids = ElementIdentifiers(name=index_name, data=df.index.values.tolist())

        columns = cls.__build_columns(columns, df=df)

        return cls(name=name, id=ids, columns=columns, description=table_description, **kwargs)

1223 

1224 def copy(self): 

1225 """ 

1226 Return a copy of this DynamicTable. 

1227 This is useful for linking. 

1228 """ 

1229 kwargs = dict(name=self.name, id=self.id, columns=self.columns, description=self.description, 

1230 colnames=self.colnames) 

1231 return self.__class__(**kwargs) 

1232 

1233 

1234@register_class('DynamicTableRegion') 

1235class DynamicTableRegion(VectorData): 

1236 """ 

1237 DynamicTableRegion provides a link from one table to an index or region of another. The `table` 

1238 attribute is another `DynamicTable`, indicating which table is referenced. The data is int(s) 

1239 indicating the row(s) (0-indexed) of the target array. `DynamicTableRegion`s can be used to 

1240 associate multiple rows with the same meta-data without data duplication. They can also be used to 

1241 create hierarchical relationships between multiple `DynamicTable`s. `DynamicTableRegion` objects 

1242 may be paired with a `VectorIndex` object to create ragged references, so a single cell of a 

1243 `DynamicTable` can reference many rows of another `DynamicTable`. 

1244 """ 

1245 

1246 __fields__ = ( 

1247 'table', 

1248 ) 

1249 

1250 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, 

1251 {'name': 'data', 'type': ('array_data', 'data'), 

1252 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'}, 

1253 {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'}, 

1254 {'name': 'table', 'type': DynamicTable, 

1255 'doc': 'the DynamicTable this region applies to', 'default': None}, 

1256 allow_positional=AllowPositional.WARNING) 

1257 def __init__(self, **kwargs): 

1258 t = popargs('table', kwargs) 

1259 super().__init__(**kwargs) 

1260 self.table = t 

1261 

    @property
    def table(self):
        """The DynamicTable this DynamicTableRegion is pointing to (None if not yet set)."""
        return self.fields.get('table')

1266 

1267 @table.setter 

1268 def table(self, val): 

1269 """ 

1270 Set the table this DynamicTableRegion should be pointing to 

1271 

1272 :param val: The DynamicTable this DynamicTableRegion should be pointing to 

1273 

1274 :raises: AttributeError if table is already in fields 

1275 :raises: IndexError if the current indices are out of bounds for the new table given by val 

1276 """ 

1277 if val is None: 

1278 return 

1279 if 'table' in self.fields: 1279 ↛ 1280line 1279 didn't jump to line 1280, because the condition on line 1279 was never true

1280 msg = "can't set attribute 'table' -- already set" 

1281 raise AttributeError(msg) 

1282 dat = self.data 

1283 if isinstance(dat, DataIO): 1283 ↛ 1284line 1283 didn't jump to line 1284, because the condition on line 1283 was never true

1284 dat = dat.data 

1285 self.fields['table'] = val 

1286 

1287 def __getitem__(self, arg): 

1288 return self.get(arg) 

1289 

    def get(self, arg, index=False, df=True, **kwargs):
        """
        Subset the DynamicTableRegion

        :param arg: Key defining which elements of the table to select. This may be one of the following:

            1) string with the name of the column to select
            2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
               column to select by name
            3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
               scalars are returned for each column that has a single value. If a list, array, or slice is used and
               df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
               single row.

        :param index: Boolean indicating whether to return indices of the DTR (default False)
        :param df: Boolean indicating whether to return the result as a pandas DataFrame (default True)

        :return: Result from self.table[...] with the appropriate selection based on the
                 rows selected by this DynamicTableRegion
        """
        if not df and not index:
            # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
            raise ValueError('DynamicTableRegion.get() with df=False and index=False is not yet supported.')
        # treat the list of indices as data that can be indexed. then pass the
        # result to the table to get the data
        if isinstance(arg, tuple):
            # (row, column) cell access: translate the region-local row to a table row
            arg1 = arg[0]
            arg2 = arg[1]
            return self.table[self.data[arg1], arg2]
        elif isinstance(arg, str):
            # column access passes straight through to the referenced table
            return self.table[arg]
        elif np.issubdtype(type(arg), np.integer):
            if arg >= len(self.data):
                raise IndexError('index {} out of bounds for data of length {}'.format(arg, len(self.data)))
            ret = self.data[arg]
            if not index:
                # dereference the stored row index into the actual table row
                ret = self.table.get(ret, df=df, index=index, **kwargs)
            return ret
        elif isinstance(arg, (list, slice, np.ndarray)):
            idx = arg

            # get the data at the specified indices
            if isinstance(self.data, (tuple, list)) and isinstance(idx, (list, np.ndarray)):
                ret = [self.data[i] for i in idx]
            else:
                ret = self.data[idx]

            # dereference them if necessary
            if not index:
                # These lines are needed because indexing Dataset with a list/ndarray
                # of ints requires the list to be sorted.
                #
                # First get the unique elements, retrieve them from the table, and then
                # reorder the result according to the original index that the user passed in.
                #
                # When not returning a DataFrame, we need to recursively sort the subelements
                # of the list we are returning. This is carried out by the recursive method _index_lol
                uniq = np.unique(ret)
                lut = {val: i for i, val in enumerate(uniq)}
                values = self.table.get(uniq, df=df, index=index, **kwargs)
                if df:
                    ret = values.iloc[[lut[i] for i in ret]]
                else:
                    ret = self._index_lol(values, ret, lut)
            return ret
        else:
            raise ValueError("unrecognized argument: '%s'" % arg)

1357 

    def _index_lol(self, result, index, lut):
        """
        This is a helper function for indexing a list of lists/ndarrays. When not returning a
        DataFrame, indexing a DynamicTable will return a list of lists and ndarrays. To sort
        the result of a DynamicTable index according to the order of the indices passed in by the
        user, we have to recursively sort the sub-lists/sub-ndarrays.

        :param result: per-column values retrieved for the sorted unique indices
        :param index: the original (possibly unsorted, repeated) indices requested by the user
        :param lut: mapping from table row index to its position within ``result``'s columns
        """
        ret = list()
        for col in result:
            if isinstance(col, list):
                # NOTE(review): col[0] assumes each list column is non-empty -- confirm upstream
                if isinstance(col[0], list):
                    # list of columns that need to be sorted
                    ret.append(self._index_lol(col, index, lut))
                else:
                    # list of elements, one for each row to return
                    ret.append([col[lut[i]] for i in index])
            elif isinstance(col, np.ndarray):
                # preserve the original dtype when reordering array-valued columns
                ret.append(np.array([col[lut[i]] for i in index], dtype=col.dtype))
            else:
                raise ValueError('unrecognized column type: %s. Expected list or np.ndarray' % type(col))
        return ret

1379 

1380 def to_dataframe(self, **kwargs): 

1381 """ 

1382 Convert the whole DynamicTableRegion to a pandas dataframe. 

1383 

1384 Keyword arguments are passed through to the to_dataframe method of DynamicTable that 

1385 is being referenced (i.e., self.table). This allows specification of the 'exclude' 

1386 parameter and any other parameters of DynamicTable.to_dataframe. 

1387 """ 

1388 return self.table.to_dataframe(**kwargs).iloc[self.data[:]] 

1389 

1390 @property 

1391 def shape(self): 

1392 """ 

1393 Define the shape, i.e., (num_rows, num_columns) of the selected table region 

1394 :return: Shape tuple with two integers indicating the number of rows and number of columns 

1395 """ 

1396 return (len(self.data), len(self.table.columns)) 

1397 

1398 def __repr__(self): 

1399 """ 

1400 :return: Human-readable string representation of the DynamicTableRegion 

1401 """ 

1402 cls = self.__class__ 

1403 template = "%s %s.%s at 0x%d\n" % (self.name, cls.__module__, cls.__name__, id(self)) 

1404 template += " Target table: %s %s.%s at 0x%d\n" % (self.table.name, 

1405 self.table.__class__.__module__, 

1406 self.table.__class__.__name__, 

1407 id(self.table)) 

1408 return template 

1409 

1410 def _validate_on_set_parent(self): 

1411 # when this DynamicTableRegion is added to a parent, check: 

1412 # 1) if the table was read from a written file, no need to validate further 

1413 p = self.table 

1414 while p is not None: 

1415 if p.container_source is not None: 1415 ↛ 1416line 1415 didn't jump to line 1416, because the condition on line 1415 was never true

1416 return super()._validate_on_set_parent() 

1417 p = p.parent 

1418 

1419 # 2) if none of the ancestors are ancestors of the linked-to table, then when this is written, the table 

1420 # field will point to a table that is not in the file 

1421 table_ancestor_ids = [id(x) for x in self.table.get_ancestors()] 

1422 self_ancestor_ids = [id(x) for x in self.get_ancestors()] 

1423 

1424 if set(table_ancestor_ids).isdisjoint(self_ancestor_ids): 

1425 msg = (f"The linked table for DynamicTableRegion '{self.name}' does not share an ancestor with the " 

1426 "DynamicTableRegion.") 

1427 warn(msg) 

1428 return super()._validate_on_set_parent() 

1429 

1430 

1431def _uint_precision(elements): 

1432 """ Calculate the uint precision needed to encode a set of elements """ 

1433 n_elements = elements 

1434 if hasattr(elements, '__len__'): 1434 ↛ 1436line 1434 didn't jump to line 1436, because the condition on line 1434 was never false

1435 n_elements = len(elements) 

1436 return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type 

1437 

1438 

1439def _map_elements(uint, elements): 

1440 """ Map CV terms to their uint index """ 

1441 return {t[1]: uint(t[0]) for t in enumerate(elements)} 

1442 

1443 

@register_class('EnumData', EXP_NAMESPACE)
class EnumData(VectorData):
    """
    An n-dimensional dataset that can contain elements from a fixed set of elements.

    Row values are stored in ``data`` as unsigned integers that index into the
    fixed set of lookup values stored in the ``elements`` VectorData.
    """

    __fields__ = ('elements', )

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this column'},
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'integers that index into elements for the value of each row', 'default': list()},
            {'name': 'elements', 'type': ('array_data', 'data', VectorData), 'default': list(),
             'doc': 'lookup values for each integer in ``data``'},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        elements = popargs('elements', kwargs)
        super().__init__(**kwargs)
        # wrap a plain array of lookup values in a VectorData column
        if not isinstance(elements, VectorData):
            elements = VectorData(name='%s_elements' % self.name, data=elements,
                                  description='fixed set of elements referenced by %s' % self.name)
        self.elements = elements
        if len(self.elements) > 0:
            self.__uint = _uint_precision(self.elements.data)
            self.__revidx = _map_elements(self.__uint, self.elements.data)
        else:
            self.__revidx = dict()  # a map from term to index
            self.__uint = None  # the precision needed to encode all terms

    def __add_term(self, term):
        """
        Add a new CV term (if not already present) and return its corresponding index.

        If adding the term increases the uint precision needed to encode all element
        indices, the term-to-index map is rebuilt and existing ``data`` values are
        re-cast to the new precision.

        Returns:
            The index of the term
        """
        if term not in self.__revidx:
            # get minimum uint precision needed for elements
            self.elements.append(term)
            uint = _uint_precision(self.elements)
            if self.__uint is uint:
                # precision unchanged -- just add the new term to the index-term map
                self.__revidx[term] = self.__uint(len(self.elements) - 1)
            else:
                # remap terms to their uint and bump the precision of existing data
                self.__uint = uint
                self.__revidx = _map_elements(self.__uint, self.elements)
                for i in range(len(self.data)):
                    self.data[i] = self.__uint(self.data[i])
        return self.__revidx[term]

    def __getitem__(self, arg):
        # resolve stored indices to their corresponding elements
        return self.get(arg, index=False)

    def _get_helper(self, idx, index=False, join=False, **kwargs):
        """
        A helper function for resolving retrieved indices to elements.

        This helper function contains the post-processing of retrieved indices. By separating this,
        it allows customizing the processing of indices before resolving them to elements.
        """
        if index:
            return idx
        if not np.isscalar(idx):
            idx = np.asarray(idx)
            # flatten the indices, resolve each one, then restore the original shape
            ret = np.asarray(self.elements.get(idx.ravel(), **kwargs)).reshape(idx.shape)
            if join:
                ret = ''.join(ret.ravel())
        else:
            ret = self.elements.get(idx, **kwargs)
        return ret

    def get(self, arg, index=False, join=False, **kwargs):
        """
        Return elements for the given argument.

        Args:
            index (bool): Return indices, do not resolve them to CV elements
            join (bool): Concatenate elements together into a single string

        Returns:
            CV elements if *join* is False or a concatenation of all selected
            elements if *join* is True.
        """
        idx = self.data[arg]
        return self._get_helper(idx, index=index, join=join, **kwargs)

    @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'},
            {'name': 'index', 'type': bool, 'doc': 'whether or not the value being added is an index',
             'default': False})
    def add_row(self, **kwargs):
        """Append a data value to this EnumData column

        If an element is provided for *val* (i.e. *index* is False), the correct
        index value will be determined. Otherwise, *val* will be added as provided.
        """
        val, index = getargs('val', 'index', kwargs)
        if not index:
            val = self.__add_term(val)
        super().append(val)