Coverage for src/hdmf/common/table.py: 89%

767 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-07-10 23:48 +0000

1""" 

2Collection of Container classes for interacting with data types related to 

3the storage and use of dynamic data tables as part of the hdmf-common schema 

4""" 

5 

6import re 

7from collections import OrderedDict 

8from typing import NamedTuple, Union 

9from warnings import warn 

10 

11import numpy as np 

12import pandas as pd 

13import itertools 

14 

15from . import register_class, EXP_NAMESPACE 

16from ..container import Container, Data 

17from ..data_utils import DataIO, AbstractDataChunkIterator 

18from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional 

19from ..term_set import TermSet 

20 

21 

@register_class('VectorData')
class VectorData(Data):
    """
    An n-dimensional dataset that serves as a column of a DynamicTable.

    Used on its own (without a VectorIndex), the first dimension runs along the
    rows of the table, one cell per step. Paired with a VectorIndex it becomes
    the flat backing store of a ragged array: vector *i* occupies
    VectorData[VectorIndex(i-1):VectorIndex(i)] (with an implicit start of 0
    for the first vector).
    """

    __fields__ = ("description",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
            {'name': 'term_set', 'type': TermSet, 'doc': 'the set of terms used to validate data on add',
             'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        # 'description' is consumed here; everything else (name, data, term_set)
        # is handled by the Data base class.
        description = popargs('description', kwargs)
        super().__init__(**kwargs)
        self.description = description

    @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'})
    def add_row(self, **kwargs):
        """Append a data value to this VectorData column"""
        val = getargs('val', kwargs)
        # Fast path: no term set configured, nothing to validate.
        if self.term_set is None:
            self.append(val)
            return
        if not self.term_set.validate(term=val):
            raise ValueError("%s is not in the term set." % val)
        self.append(val)

    def get(self, key, **kwargs):
        """
        Retrieve elements from this VectorData.

        :param key: Selection of the elements
        :param kwargs: Ignored
        """
        return super().get(key)

    def extend(self, ar, **kwargs):
        """Add all elements of the iterable arg to the end of this VectorData.

        Each subclass of VectorData should have its own extend method to ensure functionality and efficiency.

        :param arg: The iterable to add to the end of this VectorData
        """
        # A plain VectorData can take the efficient bulk-extend path. Subclasses
        # that have not defined their own extend are routed through add_row so
        # any per-row behavior they define still runs (mro[0] is the instance's
        # own class, so this test only passes for VectorData itself).
        if self.__class__.__mro__[0] == VectorData:
            super().extend(ar)
            return
        for value in ar:
            self.add_row(value, **kwargs)

@register_class('VectorIndex')
class VectorIndex(VectorData):
    """
    When paired with a VectorData, this allows for storing arrays of varying
    length in a single cell of the DynamicTable by indexing into this VectorData.
    The first vector is at VectorData[0:VectorIndex(0)+1]. The second vector is at
    VectorData[VectorIndex(0)+1:VectorIndex(1)+1], and so on.
    """

    __fields__ = ("target",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorIndex'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a 1D dataset containing indexes that apply to VectorData object'},
            {'name': 'target', 'type': VectorData,
             'doc': 'the target dataset that this index applies to'},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        target = popargs('target', kwargs)
        # the description of an index column is fixed and derived from its target's name
        kwargs['description'] = "Index for VectorData '%s'" % target.name
        super().__init__(**kwargs)
        self.target = target
        # start at the smallest unsigned dtype; grown on demand by __check_precision
        self.__uint = np.uint8
        self.__maxval = 255
        if isinstance(self.data, (list, np.ndarray)):
            if len(self.data) > 0:
                self.__check_precision(len(self.target))
                # adjust precision for types that we can adjust precision for
                self.__adjust_precision(self.__uint)

    def add_vector(self, arg, **kwargs):
        """
        Add the given data value to the target VectorData and append the corresponding index to this VectorIndex

        :param arg: The data value to be added to self.target
        """
        if isinstance(self.target, VectorIndex):
            # nested ragged array: recurse one level down the index chain per element
            for a in arg:
                self.target.add_vector(a)
        else:
            self.target.extend(arg, **kwargs)
        # the new end offset of the target marks the end of this vector
        self.append(self.__check_precision(len(self.target)))

    def __check_precision(self, idx):
        """
        Check precision of current dataset and, if necessary, adjust precision to accommodate new value.

        Returns:
            unsigned integer encoding of idx
        """
        if idx > self.__maxval:
            while idx > self.__maxval:
                # int() so nbits/__maxval stay integers (np.log2 returns a float)
                nbits = int(np.log2(self.__maxval + 1)) * 2  # 8->16, 16->32, 32->64
                if nbits == 128:  # pragma: no cover
                    msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
                           'allowed for VectorIndex is uint64.')
                    raise ValueError(msg)
                self.__maxval = 2 ** nbits - 1
                self.__uint = np.dtype('uint%d' % nbits).type
                self.__adjust_precision(self.__uint)
        return self.__uint(idx)

    def __adjust_precision(self, uint):
        """
        Adjust precision of data to specified unsigned integer precision.

        :param uint: numpy unsigned integer scalar type to convert the data to
        :raises ValueError: if the data container type cannot be converted in place
        """
        if isinstance(self.data, list):
            for i in range(len(self.data)):
                self.data[i] = uint(self.data[i])
        elif isinstance(self.data, np.ndarray):
            # use self._Data__data to work around restriction on resetting self.data
            self._Data__data = self.data.astype(uint)
        else:
            # bug fix: the message was previously passed to ValueError as two separate
            # arguments and was never %-formatted
            raise ValueError("cannot adjust precision of type %s to %s" % (type(self.data), uint))

    def add_row(self, arg, **kwargs):
        """
        Convenience function. Same as :py:func:`add_vector`
        """
        self.add_vector(arg, **kwargs)

    def __getitem_helper(self, arg, **kwargs):
        """
        Internal helper function used by __getitem__ to retrieve a data value from self.target

        :param arg: Integer index into this VectorIndex indicating the element we want to retrieve from the target
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        # self.data[i] holds the exclusive end offset of vector i; the start is
        # the previous vector's end (or 0 for the first vector)
        start = 0 if arg == 0 else self.data[arg - 1]
        end = self.data[arg]
        return self.target.get(slice(start, end), **kwargs)

    def __getitem__(self, arg):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :return: Scalar or list of values retrieved
        """
        return self.get(arg)

    def get(self, arg, **kwargs):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        if np.isscalar(arg):
            return self.__getitem_helper(arg, **kwargs)
        else:
            if isinstance(arg, slice):
                indices = list(range(*arg.indices(len(self.data))))
            else:
                # bug fix: accept numpy boolean masks too -- elements of a numpy bool
                # array are np.bool_, which is NOT a subclass of Python bool
                if isinstance(arg[0], (bool, np.bool_)):
                    arg = np.where(arg)[0]
                indices = arg
            ret = list()
            for i in indices:
                ret.append(self.__getitem_helper(i, **kwargs))
            return ret

@register_class('ElementIdentifiers')
class ElementIdentifiers(Data):
    """
    Data container with a list of unique identifiers for values within a dataset, e.g. rows of a DynamicTable.
    """

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this ElementIdentifiers'},
            {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a 1D dataset containing identifiers',
             'default': list()},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @docval({'name': 'other', 'type': (Data, np.ndarray, list, tuple, int),
             'doc': 'List of ids to search for in this ElementIdentifer object'},
            rtype=np.ndarray,
            returns='Array with the list of indices where the elements in the list where found.'
                    'Note, the elements in the returned list are ordered in increasing index'
                    'of the found elements, rather than in the order in which the elements'
                    'where given for the search. Also the length of the result may be different from the length'
                    'of the input array. E.g., if our ids are [1,2,3] and we are search for [3,1,5] the '
                    'result would be [0,2] and NOT [2,0,None]')
    def __eq__(self, other):
        """
        Given a list of ids return the indices in the ElementIdentifiers array where the indices are found.
        """
        # Determine the ids we want to find
        search_ids = other if not isinstance(other, Data) else other.data
        if isinstance(search_ids, int):
            search_ids = [search_ids]
        # Find all matching locations.
        # np.isin replaces np.in1d (deprecated since NumPy 1.25); for 1D input the
        # results are identical.
        return np.isin(self.data, search_ids).nonzero()[0]

250 

251@register_class('DynamicTable') 

252class DynamicTable(Container): 

253 r""" 

254 A column-based table. Columns are defined by the argument *columns*. This argument 

255 must be a list/tuple of :class:`~hdmf.common.table.VectorData` and :class:`~hdmf.common.table.VectorIndex` objects 

256 or a list/tuple of dicts containing the keys ``name`` and ``description`` that provide the name and description 

257 of each column in the table. Additionally, the keys ``index``, ``table``, ``enum`` can be used for specifying 

258 additional structure to the table columns. Setting the key ``index`` to ``True`` can be used to indicate that the 

259 :class:`~hdmf.common.table.VectorData` column will store a ragged array (i.e. will be accompanied with a 

260 :class:`~hdmf.common.table.VectorIndex`). Setting the key ``table`` to ``True`` can be used to indicate that the 

261 column will store regions to another DynamicTable. Setting the key ``enum`` to ``True`` can be used to indicate 

262 that the column data will come from a fixed set of values. 

263 

264 Columns in DynamicTable subclasses can be statically defined by specifying the class attribute *\_\_columns\_\_*, 

265 rather than specifying them at runtime at the instance level. This is useful for defining a table structure 

266 that will get reused. The requirements for *\_\_columns\_\_* are the same as the requirements described above 

267 for specifying table columns with the *columns* argument to the DynamicTable constructor. 

268 """ 

269 

270 __fields__ = ( 

271 {'name': 'id', 'child': True}, 

272 {'name': 'columns', 'child': True}, 

273 'colnames', 

274 'description' 

275 ) 

276 

277 __columns__ = tuple() 

278 

    @ExtenderMeta.pre_init
    def __gather_columns(cls, name, bases, classdict):
        r"""
        Gather columns from the *\_\_columns\_\_* class attribute and add them to the class.

        This classmethod will be called during class declaration in the metaclass to automatically
        include all columns declared in subclasses.
        """
        if not isinstance(cls.__columns__, tuple):
            msg = "'__columns__' must be of type tuple, found %s" % type(cls.__columns__)
            raise TypeError(msg)

        # If this subclass declared its own __columns__ (identity check: it is not simply
        # inheriting the attribute object from its nearest Container base), prepend the base
        # class's column specs so inherited columns come first. The `'DynamicTable' in globals()`
        # guard skips this while DynamicTable itself is still being defined.
        if (len(bases) and 'DynamicTable' in globals() and issubclass(bases[-1], Container)
                and bases[-1].__columns__ is not cls.__columns__):
            new_columns = list(cls.__columns__)
            new_columns[0:0] = bases[-1].__columns__  # prepend superclass columns to new_columns
            cls.__columns__ = tuple(new_columns)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this table'},  # noqa: C901
            {'name': 'description', 'type': str, 'doc': 'a description of what is in this table'},
            {'name': 'id', 'type': ('array_data', 'data', ElementIdentifiers), 'doc': 'the identifiers for this table',
             'default': None},
            {'name': 'columns', 'type': (tuple, list), 'doc': 'the columns in this table', 'default': None},
            {'name': 'colnames', 'type': 'array_data',
             'doc': 'the ordered names of the columns in this table. columns must also be provided.',
             'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):  # noqa: C901
        """
        Initialize the table: normalize the id column, validate/build the columns,
        resolve column ordering, expose columns as attributes, and register any
        required columns predefined in __columns__.
        """
        # NOTE: 'id' shadows the builtin here; the name is fixed by the docval interface
        id, columns, desc, colnames = popargs('id', 'columns', 'description', 'colnames', kwargs)
        super().__init__(**kwargs)
        self.description = desc

        # hold names of optional columns that are defined in __columns__ that are not yet initialized
        # map name to column specification
        self.__uninit_cols = dict()

        # All tables must have ElementIdentifiers (i.e. a primary key column)
        # Here, we figure out what to do for that
        if id is not None:
            if not isinstance(id, ElementIdentifiers):
                # raw array-like was given; wrap it
                id = ElementIdentifiers(name='id', data=id)
        else:
            id = ElementIdentifiers(name='id')

        if columns is not None and len(columns) > 0:
            # If columns have been passed in, check them over and process accordingly
            if isinstance(columns[0], dict):
                # dict specs -> construct VectorData/VectorIndex/EnumData objects
                columns = self.__build_columns(columns)
            elif not all(isinstance(c, VectorData) for c in columns):
                raise ValueError("'columns' must be a list of dict, VectorData, DynamicTableRegion, or VectorIndex")

            all_names = [c.name for c in columns]
            if len(all_names) != len(set(all_names)):
                raise ValueError("'columns' contains columns with duplicate names: %s" % all_names)

            all_targets = [c.target.name for c in columns if isinstance(c, VectorIndex)]
            if len(all_targets) != len(set(all_targets)):
                raise ValueError("'columns' contains index columns with the same target: %s" % all_targets)

            # TODO: check columns against __columns__
            # mismatches should raise an error (e.g., a VectorData cannot be passed in with the same name as a
            # prespecified table region column)

            # check column lengths against each other and id length
            # set ids if non-zero cols are provided and ids is empty
            colset = {c.name: c for c in columns}
            for c in columns:  # remove all VectorData objects that have an associated VectorIndex from colset
                if isinstance(c, VectorIndex):
                    if c.target.name in colset:
                        colset.pop(c.target.name)
                    else:
                        raise ValueError("Found VectorIndex '%s' but not its target '%s'" % (c.name, c.target.name))
                elif isinstance(c, EnumData):
                    # the elements column is indexed by the EnumData column, so drop it too
                    if c.elements.name in colset:
                        colset.pop(c.elements.name)
                # columns backed by data chunk iterators have no known length yet,
                # so exclude them from the length check below
                _data = c.data
                if isinstance(_data, DataIO):
                    _data = _data.data
                if isinstance(_data, AbstractDataChunkIterator):
                    colset.pop(c.name, None)
            lens = [len(c) for c in colset.values()]
            if not all(i == lens[0] for i in lens):
                raise ValueError("columns must be the same length")
            if len(lens) > 0 and lens[0] != len(id):
                # the first part of this conditional is needed in the
                # event that all columns are AbstractDataChunkIterators
                if len(id) > 0:
                    raise ValueError("must provide same number of ids as length of columns")
                else:  # set ids to: 0 to length of columns - 1
                    id.data.extend(range(lens[0]))

        self.id = id

        # NOTE: self.colnames and self.columns are always tuples
        # if kwarg colnames is an h5dataset, self.colnames is still a tuple
        if colnames is None or len(colnames) == 0:
            if columns is None:
                # make placeholder for columns if nothing was given
                self.colnames = tuple()
                self.columns = tuple()
            else:
                # Figure out column names if columns were given
                # (colnames excludes index columns and EnumData elements columns)
                tmp = OrderedDict()
                skip = set()
                for col in columns:
                    if col.name in skip:
                        continue
                    if isinstance(col, VectorIndex):
                        continue
                    if isinstance(col, EnumData):
                        # the EnumData column, not its elements column, is the user-facing one
                        skip.add(col.elements.name)
                        tmp.pop(col.elements.name, None)
                    tmp[col.name] = None
                self.colnames = tuple(tmp)
                self.columns = tuple(columns)
        else:
            # Calculate the order of column names
            if columns is None:
                raise ValueError("Must supply 'columns' if specifying 'colnames'")
            else:
                # order the columns according to the column names, which does not include indices
                self.colnames = tuple(pystr(c) for c in colnames)
                col_dict = {col.name: col for col in columns}
                # map from vectordata name to list of vectorindex objects where target of last vectorindex is vectordata
                indices = dict()
                # determine which columns are indexed by another column
                for col in columns:
                    if isinstance(col, VectorIndex):
                        # loop through nested indices to get to non-index column
                        tmp_indices = [col]
                        curr_col = col
                        while isinstance(curr_col.target, VectorIndex):
                            curr_col = curr_col.target
                            tmp_indices.append(curr_col)
                        # make sure the indices values has the full index chain, so replace existing value if it is
                        # shorter
                        if len(tmp_indices) > len(indices.get(curr_col.target.name, [])):
                            indices[curr_col.target.name] = tmp_indices
                    elif isinstance(col, EnumData):
                        # EnumData is the indexing column, so it should go first
                        if col.name not in indices:
                            indices[col.name] = [col]  # EnumData is the indexing object
                            col_dict[col.name] = col.elements  # EnumData.elements is the column with values
                    else:
                        if col.name in indices:
                            continue
                        indices[col.name] = []
                # put columns in order of colnames, with indices before the target vectordata
                tmp = []
                for name in self.colnames:
                    tmp.extend(indices[name])
                    tmp.append(col_dict[name])
                self.columns = tuple(tmp)

        # to make generating DataFrames and Series easier
        col_dict = dict()
        self.__indices = dict()
        for col in self.columns:
            if isinstance(col, VectorIndex):
                # if index has already been added because it is part of a nested index chain, ignore this column
                if col.name in self.__indices:
                    continue
                self.__indices[col.name] = col

                # loop through nested indices to get to non-index column
                curr_col = col
                self.__set_table_attr(curr_col)
                while isinstance(curr_col.target, VectorIndex):
                    curr_col = curr_col.target
                    # check if index has been added. if not, add it
                    if not hasattr(self, curr_col.name):
                        self.__set_table_attr(curr_col)
                        self.__indices[curr_col.name] = col

                # use target vectordata name at end of indexing chain as key to get to the top level index
                col_dict[curr_col.target.name] = col
                if not hasattr(self, curr_col.target.name):
                    self.__set_table_attr(curr_col.target)
            else:  # this is a regular VectorData or EnumData
                # if we added this column using its index, ignore this column
                if col.name in col_dict:
                    continue
                else:
                    col_dict[col.name] = col
                    self.__set_table_attr(col)

        self.__df_cols = [self.id] + [col_dict[name] for name in self.colnames]

        # self.__colids maps the column name to an index starting at 1
        self.__colids = {name: i + 1 for i, name in enumerate(self.colnames)}
        self._init_class_columns()

470 

471 def __set_table_attr(self, col): 

472 if hasattr(self, col.name) and col.name not in self.__uninit_cols: 

473 msg = ("An attribute '%s' already exists on %s '%s' so this column cannot be accessed as an attribute, " 

474 "e.g., table.%s; it can only be accessed using other methods, e.g., table['%s']." 

475 % (col.name, self.__class__.__name__, self.name, col.name, col.name)) 

476 warn(msg) 

477 else: 

478 setattr(self, col.name, col) 

479 

    # colspec keys consumed by add_column itself; anything else in a spec dict is
    # forwarded to add_column as an extra keyword argument
    __reserved_colspec_keys = ['name', 'description', 'index', 'table', 'required', 'class']

    def _init_class_columns(self):
        """
        Process all predefined columns specified in class variable __columns__.
        Optional columns are not tracked but not added.
        """
        for col in self.__columns__:
            if col['name'] not in self.__colids:  # if column has not been added in __init__
                if col.get('required', False):
                    self.add_column(name=col['name'],
                                    description=col['description'],
                                    index=col.get('index', False),
                                    table=col.get('table', False),
                                    col_cls=col.get('class', VectorData),
                                    # Pass through extra kwargs for add_column that subclasses may have added
                                    **{k: col[k] for k in col.keys()
                                       if k not in DynamicTable.__reserved_colspec_keys})
                else:
                    # track the not yet initialized optional predefined columns
                    self.__uninit_cols[col['name']] = col

                    # set the table attributes for not yet init optional predefined columns
                    setattr(self, col['name'], None)
                    index = col.get('index', False)
                    if index is not False:
                        if index is True:
                            index = 1
                        if isinstance(index, int):
                            # NOTE(review): assert is stripped under `python -O`; an explicit
                            # raise would be more robust here
                            assert index > 0, ValueError("integer index value must be greater than 0")
                            # an index of n creates n nested placeholder names:
                            # name_index, name_index_index, ...
                            index_name = col['name']
                            for i in range(index):
                                index_name = index_name + '_index'
                                self.__uninit_cols[index_name] = col
                                setattr(self, index_name, None)
                    if col.get('enum', False):
                        # enum columns also get a placeholder for their elements column
                        self.__uninit_cols[col['name'] + '_elements'] = col
                        setattr(self, col['name'] + '_elements', None)

519 @staticmethod 

520 def __build_columns(columns, df=None): 

521 """ 

522 Build column objects according to specifications 

523 """ 

524 tmp = list() 

525 for d in columns: 

526 name = d['name'] 

527 desc = d.get('description', 'no description') 

528 col_cls = d.get('class', VectorData) 

529 data = None 

530 if df is not None: 

531 data = list(df[name].values) 

532 index = d.get('index', False) 

533 if index is not False: 533 ↛ 534line 533 didn't jump to line 534, because the condition on line 533 was never true

534 if isinstance(index, int) and index > 1: 

535 raise ValueError('Creating nested index columns using this method is not yet supported. Use ' 

536 'add_column or define the columns using __columns__ instead.') 

537 index_data = None 

538 if data is not None: 

539 index_data = [len(data[0])] 

540 for i in range(1, len(data)): 

541 index_data.append(len(data[i]) + index_data[i - 1]) 

542 # assume data came in through a DataFrame, so we need 

543 # to concatenate it 

544 tmp_data = list() 

545 for d in data: 

546 tmp_data.extend(d) 

547 data = tmp_data 

548 vdata = col_cls(name=name, description=desc, data=data) 

549 vindex = VectorIndex(name="%s_index" % name, data=index_data, target=vdata) 

550 tmp.append(vindex) 

551 tmp.append(vdata) 

552 elif d.get('enum', False): 552 ↛ 554line 552 didn't jump to line 554, because the condition on line 552 was never true

553 # EnumData is the indexing column, so it should go first 

554 if data is not None: 

555 elements, data = np.unique(data, return_inverse=True) 

556 tmp.append(EnumData(name, desc, data=data, elements=elements)) 

557 else: 

558 tmp.append(EnumData(name, desc, data=data)) 

559 # EnumData handles constructing the VectorData object that contains EnumData.elements 

560 # --> use this functionality (rather than creating here) for consistency and less code/complexity 

561 tmp.append(tmp[-1].elements) 

562 else: 

563 if data is None: 

564 data = list() 

565 if d.get('table', False): 565 ↛ 566line 565 didn't jump to line 566, because the condition on line 565 was never true

566 col_cls = DynamicTableRegion 

567 tmp.append(col_cls(name=name, description=desc, data=data)) 

568 return tmp 

569 

570 def __len__(self): 

571 """Number of rows in the table""" 

572 return len(self.id) 

573 

    @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None},
            {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
            {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
             'default': False},
            allow_extra=True)
    def add_row(self, **kwargs):
        """
        Add a row to the table. If *id* is not provided, it will auto-increment.
        """
        data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
        # row values may be given either as the 'data' dict or as extra keyword arguments
        data = data if data is not None else kwargs

        extra_columns = set(list(data.keys())) - set(list(self.__colids.keys()))
        missing_columns = set(list(self.__colids.keys())) - set(list(data.keys()))

        # validate all term-set-controlled values up front so the row is rejected
        # before any column is mutated
        bad_data = []
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            col = self.__df_cols[colnum]
            if isinstance(col, VectorIndex):
                continue
            else:
                if col.term_set is not None:
                    if col.term_set.validate(term=data[colname]):
                        continue
                    else:
                        bad_data.append(data[colname])

        if len(bad_data)!=0:
            msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
            raise ValueError(msg)

        # check to see if any of the extra columns just need to be added
        if extra_columns:
            for col in self.__columns__:
                if col['name'] in extra_columns:
                    if data[col['name']] is not None:
                        self.add_column(col['name'], col['description'],
                                        index=col.get('index', False),
                                        table=col.get('table', False),
                                        enum=col.get('enum', False),
                                        col_cls=col.get('class', VectorData),
                                        # Pass through extra keyword arguments for add_column that
                                        # subclasses may have added
                                        **{k: col[k] for k in col.keys()
                                           if k not in DynamicTable.__reserved_colspec_keys})
                    extra_columns.remove(col['name'])

        # anything still unmatched means the supplied keys do not line up with the table
        if extra_columns or missing_columns:
            raise ValueError(
                '\n'.join([
                    'row data keys don\'t match available columns',
                    'you supplied {} extra keys: {}'.format(len(extra_columns), extra_columns),
                    'and were missing {} keys: {}'.format(len(missing_columns), missing_columns)
                ])
            )
        # resolve the row id: explicit argument, then 'id' key in data, then auto-increment
        if row_id is None:
            row_id = data.pop('id', None)
        if row_id is None:
            row_id = len(self)
        if enforce_unique_id:
            if row_id in self.id:
                raise ValueError("id %i already in the table" % row_id)
        self.id.append(row_id)

        # append the value for every column; ragged columns go through their index
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            c = self.__df_cols[colnum]
            if isinstance(c, VectorIndex):
                c.add_vector(data[colname])
            else:
                c.add_row(data[colname])

649 def __eq__(self, other): 

650 """Compare if the two DynamicTables contain the same data. 

651 

652 First this returns False if the other DynamicTable has a different name or 

653 description. Then, this table and the other table are converted to pandas 

654 dataframes and the equality of the two tables is returned. 

655 

656 :param other: DynamicTable to compare to 

657 

658 :return: Bool indicating whether the two DynamicTables contain the same data 

659 """ 

660 if other is self: 

661 return True 

662 if not isinstance(other, DynamicTable): 

663 return False 

664 if self.name != other.name or self.description != other.description: 

665 return False 

666 return self.to_dataframe().equals(other.to_dataframe()) 

667 

668 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, # noqa: C901 

669 {'name': 'description', 'type': str, 'doc': 'a description for this column'}, 

670 {'name': 'data', 'type': ('array_data', 'data'), 

671 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()}, 

672 {'name': 'table', 'type': (bool, 'DynamicTable'), 

673 'doc': 'whether or not this is a table region or the table the region applies to', 'default': False}, 

674 {'name': 'index', 'type': (bool, VectorIndex, 'array_data', int), 

675 'doc': ' * ``False`` (default): do not generate a VectorIndex\n\n' 

676 ' * ``True``: generate one empty VectorIndex \n\n' 

677 ' * ``VectorIndex``: Use the supplied VectorIndex \n\n' 

678 ' * array-like of ints: Create a VectorIndex and use these values as the data \n\n' 

679 ' * ``int``: Recursively create `n` VectorIndex objects for a multi-ragged array \n', 

680 'default': False}, 

681 {'name': 'enum', 'type': (bool, 'array_data'), 'default': False, 

682 'doc': ('whether or not this column contains data from a fixed set of elements')}, 

683 {'name': 'term_set', 'type': TermSet, 'doc': 'the set of terms used to validate data on add', 

684 'default': None}, 

685 {'name': 'col_cls', 'type': type, 'default': VectorData, 

686 'doc': ('class to use to represent the column data. If table=True, this field is ignored and a ' 

687 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData ' 

688 'object is used.')}, 

689 allow_extra=True) 

690 def add_column(self, **kwargs): # noqa: C901 

691 """ 

692 Add a column to this table. 

693 

694 If data is provided, it must contain the same number of rows as the current state of the table. 

695 

696 Extra keyword arguments will be passed to the constructor of the column class ("col_cls"). 

697 

698 :raises ValueError: if the column has already been added to the table 

699 """ 

700 name, data = getargs('name', 'data', kwargs) 

701 index, table, enum, col_cls, term_set= popargs('index', 'table', 'enum', 'col_cls', 'term_set', kwargs) 

702 

703 if term_set is not None: 

704 bad_data = [] 

705 for val in data: 

706 if term_set.validate(term=val): 

707 continue 

708 else: 

709 bad_data.append(val) 

710 if len(bad_data)!=0: 

711 bad_data_string = str(bad_data)[1:-1] 

712 msg = ("%s is not in the term set." % bad_data_string) 

713 raise ValueError(msg) 

714 

715 if isinstance(index, VectorIndex): 

716 warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be " 

717 "deprecated in a future version of HDMF.", FutureWarning) 

718 

719 if name in self.__colids: # column has already been added 

720 msg = "column '%s' already exists in %s '%s'" % (name, self.__class__.__name__, self.name) 

721 raise ValueError(msg) 

722 

723 if name in self.__uninit_cols: # column is a predefined optional column from the spec 

724 # check the given values against the predefined optional column spec. if they do not match, raise a warning 

725 # and ignore the given arguments. users should not be able to override these values 

726 table_bool = table or not isinstance(table, bool) 

727 spec_table = self.__uninit_cols[name].get('table', False) 

728 if table_bool != spec_table: 

729 msg = ("Column '%s' is predefined in %s with table=%s which does not match the entered " 

730 "table argument. The predefined table spec will be ignored. " 

731 "Please ensure the new column complies with the spec. " 

732 "This will raise an error in a future version of HDMF." 

733 % (name, self.__class__.__name__, spec_table)) 

734 warn(msg) 

735 

736 index_bool = index or not isinstance(index, bool) 

737 spec_index = self.__uninit_cols[name].get('index', False) 

738 if index_bool != spec_index: 

739 msg = ("Column '%s' is predefined in %s with index=%s which does not match the entered " 

740 "index argument. The predefined index spec will be ignored. " 

741 "Please ensure the new column complies with the spec. " 

742 "This will raise an error in a future version of HDMF." 

743 % (name, self.__class__.__name__, spec_index)) 

744 warn(msg) 

745 

746 spec_col_cls = self.__uninit_cols[name].get('class', VectorData) 

747 if col_cls != spec_col_cls: 

748 msg = ("Column '%s' is predefined in %s with class=%s which does not match the entered " 

749 "col_cls argument. The predefined class spec will be ignored. " 

750 "Please ensure the new column complies with the spec. " 

751 "This will raise an error in a future version of HDMF." 

752 % (name, self.__class__.__name__, spec_col_cls)) 

753 warn(msg) 

754 

755 ckwargs = dict(kwargs) 

756 

757 # Add table if it's been specified 

758 if table and enum: 758 ↛ 759line 758 didn't jump to line 759, because the condition on line 758 was never true

759 raise ValueError("column '%s' cannot be both a table region " 

760 "and come from an enumerable set of elements" % name) 

761 if table is not False: 

762 col_cls = DynamicTableRegion 

763 if isinstance(table, DynamicTable): 

764 ckwargs['table'] = table 

765 if enum is not False: 

766 col_cls = EnumData 

767 if isinstance(enum, (list, tuple, np.ndarray, VectorData)): 767 ↛ 768line 767 didn't jump to line 768, because the condition on line 767 was never true

768 ckwargs['elements'] = enum 

769 

770 # If the user provided a list of lists that needs to be indexed, then we now need to flatten the data 

771 # We can only create the index actual VectorIndex once we have the VectorData column so we compute 

772 # the index and flatten the data here and then create the VectorIndex later from create_vector_index 

773 # once we have created the column 

774 create_vector_index = None 

775 if ckwargs.get('data', None) is not None: 775 ↛ 799line 775 didn't jump to line 799, because the condition on line 775 was never false

776 # Check that we are asked to create an index 

777 if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0: 

778 # Iteratively flatten the data we use for the column based on the depth of the index to generate. 

779 # Also, for each level compute the data for the VectorIndex for that level 

780 flatten_data = data 

781 create_vector_index = [] 

782 for i in range(index): 

783 try: 

784 create_vector_index.append(np.cumsum([len(c) for c in flatten_data]).tolist()) 

785 except TypeError as e: 

786 raise ValueError("Cannot automatically construct VectorIndex for nested array. " 

787 "Invalid data array element found.") from e 

788 flatten_data = list(itertools.chain.from_iterable(flatten_data)) 

789 # if our data still is an array (e.g., a list or numpy array) then warn that the index parameter 

790 # may be incorrect. 

791 if len(flatten_data) > 0 and isinstance(flatten_data[0], (np.ndarray, list, tuple)): 

792 raise ValueError("Cannot automatically construct VectorIndex for nested array. " 

793 "Column data contains arrays as cell values. Please check the 'data' and 'index' " 

794 "parameters. 'index=%s' may be too small for the given data." % str(index)) 

795 # overwrite the data to be used for the VectorData column with the flattened data 

796 ckwargs['data'] = flatten_data 

797 

798 # Create the VectorData column 

799 col = col_cls(**ckwargs) 

800 col.parent = self 

801 columns = [col] 

802 self.__set_table_attr(col) 

803 if col in self.__uninit_cols: 803 ↛ 804line 803 didn't jump to line 804, because the condition on line 803 was never true

804 self.__uninit_cols.pop(col) 

805 

806 if col_cls is EnumData: 

807 columns.append(col.elements) 

808 col.elements.parent = self 

809 

810 # Add index if it's been specified 

811 if index is not False: 

812 if isinstance(index, VectorIndex): 

813 col_index = index 

814 self.__add_column_index_helper(col_index) 

815 elif isinstance(index, bool): 

816 # create empty index for empty column 

817 if create_vector_index is None: 

818 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index") 

819 col_index = VectorIndex(name=name + "_index", data=list(), target=col) 

820 # create single-level VectorIndex from the data based on the create_vector_index we computed earlier 

821 else: 

822 col_index = VectorIndex(name=name + "_index", data=create_vector_index[0], target=col) 

823 # add the column with the index 

824 self.__add_column_index_helper(col_index) 

825 elif isinstance(index, int): 

826 if create_vector_index is None: 

827 assert index > 0, ValueError("integer index value must be greater than 0") 

828 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index") 

829 index_name = name 

830 for i in range(index): 

831 index_name = index_name + "_index" 

832 col_index = VectorIndex(name=index_name, data=list(), target=col) 

833 self.__add_column_index_helper(col_index) 

834 if i < index - 1: 

835 columns.insert(0, col_index) 

836 col = col_index 

837 # Create the nested VectorIndex from the create_vector_index we computed above 

838 else: 

839 index_name = name 

840 for i in range(index): 

841 index_name = index_name + "_index" 

842 col_index = VectorIndex(name=index_name, data=create_vector_index[-(i+1)], target=col) 

843 self.__add_column_index_helper(col_index) 

844 if i < index - 1: 

845 columns.insert(0, col_index) 

846 col = col_index 

847 else: # make VectorIndex with supplied data 

848 assert len(col) > 0, ValueError("cannot pass non-empty index with empty data to index") 

849 col_index = VectorIndex(name=name + "_index", data=index, target=col) 

850 self.__add_column_index_helper(col_index) 

851 columns.insert(0, col_index) 

852 col = col_index 

853 

854 if len(col) != len(self.id): 

855 raise ValueError("column must have the same number of rows as 'id'") 

856 self.__colids[name] = len(self.__df_cols) 

857 self.fields['colnames'] = tuple(list(self.colnames) + [name]) 

858 self.fields['columns'] = tuple(list(self.columns) + columns) 

859 self.__df_cols.append(col) 

860 

def __add_column_index_helper(self, col_index):
    """Register a newly created VectorIndex with this table.

    Sets this table as the index's parent unless the index already has a
    Container parent (in which case the ObjectMapper will create a link from
    this table to the index), records the index by name, exposes it as a
    table attribute, and removes it from the set of uninitialized predefined
    columns if present.
    """
    has_container_parent = isinstance(col_index.parent, Container)
    if not has_container_parent:
        col_index.parent = self
    self.__indices[col_index.name] = col_index
    self.__set_table_attr(col_index)
    if col_index in self.__uninit_cols:
        self.__uninit_cols.pop(col_index)

869 

@docval({'name': 'name', 'type': str, 'doc': 'the name of the DynamicTableRegion object'},
        {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the indices of the table'},
        {'name': 'description', 'type': str, 'doc': 'a brief description of what the region is'})
def create_region(self, **kwargs):
    """
    Create a DynamicTableRegion selecting a region (i.e., rows) in this DynamicTable.

    :raises: IndexError if the provided region contains invalid indices
    """
    name, region, desc = getargs('name', 'region', 'description', kwargs)
    n_rows = len(self)
    if isinstance(region, slice):
        # validate the slice bounds before materializing the row indices
        bad_start = region.start is not None and region.start < 0
        bad_stop = region.stop is not None and region.stop > n_rows
        if bad_start or bad_stop:
            msg = 'region slice %s is out of range for this DynamicTable of length %d' % (str(region), n_rows)
            raise IndexError(msg)
        region = list(range(*region.indices(n_rows)))
    else:
        # explicit list/tuple of indices -- every entry must be a valid row
        for idx in region:
            if not 0 <= idx < n_rows:
                raise IndexError('The index ' + str(idx) +
                                 ' is out of range for this DynamicTable of length '
                                 + str(n_rows))
    return DynamicTableRegion(name=name, data=region, description=desc, table=self)

895 

def __getitem__(self, key):
    """Select from the table like :py:meth:`get`, but raise KeyError on a miss.

    :raises: KeyError if the selection returns no result
    """
    result = self.get(key)
    if result is None:
        raise KeyError(key)
    return result

901 

def get(self, key, default=None, df=True, index=True, **kwargs):
    """Select a subset from the table.

    If the table includes a DynamicTableRegion column, then by default the
    index/indices of the DynamicTableRegion will be returned. If ``df=True`` and
    ``index=False``, the returned pandas DataFrame will contain a nested DataFrame
    in each row of the DynamicTableRegion column. If ``df=False`` and ``index=True``,
    a list of lists is returned where the list for the DynamicTableRegion column
    contains the region's indices; the DynamicTable referenced by the
    DynamicTableRegion can then be accessed through the ``table`` attribute of the
    DynamicTableRegion object. ``df=False`` with ``index=False`` is not yet
    supported.

    :param key: Key defining which elements of the table to select. This may be one of the following:

        1) string with the name of the column to select
        2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
           column to select by name
        3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
           scalars are returned for each column that has a single value. If a list, array, or slice is used and
           df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
           single row.

    :return: 1) If key is a string, then return the VectorData object representing the column with the string name
             2) If key is a tuple of (int, str), then return the scalar value of the selected cell
             3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame or lists
                consisting of one or more rows

    :raises: KeyError
    """
    if not df and not index:
        # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
        raise ValueError('DynamicTable.get() with df=False and index=False is not yet supported.')
    if isinstance(key, tuple):
        # (row, column) --> return the value of that specific cell
        row_key, col_key = key
        if isinstance(col_key, str):
            col_key = self.__colids[col_key]
        return self.__df_cols[col_key][row_key]
    if isinstance(key, str):
        # single string --> return the whole column (or index) by name
        if key == 'id':
            return self.id
        if key in self.__colids:
            return self.__df_cols[self.__colids[key]]
        if key in self.__indices:
            return self.__indices[key]
        return default
    # int, list, np.ndarray, or slice -->
    # return pandas DataFrame or lists consisting of one or more rows
    sel = self.__get_selection_as_dict(key, df, index, **kwargs)
    if not df:
        return list(sel.values())
    # reformat objects to fit into a pandas DataFrame
    if np.isscalar(key):
        return self.__get_selection_as_df_single_row(sel)
    return self.__get_selection_as_df(sel)

966 

def __get_selection_as_dict(self, arg, df, index, exclude=None, **kwargs):
    """Return a dict mapping column names to values (lists/arrays or dataframes) for the given selection.
    Uses each column's get() method, passing kwargs as necessary.

    :param arg: key passed to get() to return one or more rows
    :type arg: int, list, np.ndarray, or slice
    :param df: passed through to each column's get()
    :param index: if True, DynamicTableRegion columns (plain or ragged) return raw indices
    :param exclude: set of column names to omit from the result
    :raises KeyError: if arg is not an int, slice, list, or np.ndarray
    :raises ValueError: if arg is a multi-dimensional np.ndarray
    :raises IndexError: if arg selects a row outside the table
    """
    if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))):
        raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg)))
    if isinstance(arg, np.ndarray) and arg.ndim != 1:
        raise ValueError("Cannot index DynamicTable with multiple dimensions")
    if exclude is None:
        exclude = set()
    ret = OrderedDict()
    try:
        # index with a python slice or single int to select one or multiple rows
        ret['id'] = self.id[arg]
        for name in self.colnames:
            if name in exclude:
                continue
            col = self.__df_cols[self.__colids[name]]
            if index and (isinstance(col, DynamicTableRegion) or
                          (isinstance(col, VectorIndex) and isinstance(col.target, DynamicTableRegion))):
                # return indices (in list, array, etc.) for DTR and ragged DTR
                ret[name] = col.get(arg, df=False, index=True, **kwargs)
            else:
                ret[name] = col.get(arg, df=df, index=index, **kwargs)
        return ret
    # if index is out of range, different errors can be generated depending on the dtype of the column
    # but despite the differences, raise an IndexError from that error
    except ValueError as ve:
        # in h5py <2, if the column is an h5py.Dataset, a ValueError was raised
        # in h5py 3+, this became an IndexError
        x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve))
        if x:
            msg = ("Row index %s out of range for %s '%s' (length %d)."
                   % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
            raise IndexError(msg) from ve
        else:  # pragma: no cover
            raise ve
    except IndexError as ie:
        x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie))
        if x:
            msg = ("Row index %s out of range for %s '%s' (length %d)."
                   % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
            # chain the original error like the sibling branches do (was previously unchained)
            raise IndexError(msg) from ie
        elif str(ie) == 'list index out of range':
            msg = ("Row index out of range for %s '%s' (length %d)."
                   % (self.__class__.__name__, self.name, len(self)))
            raise IndexError(msg) from ie
        else:  # pragma: no cover
            raise ie

1019 

def __get_selection_as_df_single_row(self, coldata):
    """Return a pandas DataFrame for a single selected row, indexed by the id column.

    Special case of __get_selection_as_df for a single-row request: each
    array-like cell value is wrapped in a one-element list so pandas sees
    exactly one value per (single) index entry.

    :param coldata: dict mapping column names to values (list/arrays or dataframes)
    :type coldata: dict
    """
    single_id = coldata.pop('id')
    df_input = OrderedDict(
        # wrap array-likes in a list: the cell may hold multiple elements
        # (ragged or multi-dim column) but the frame has only one index row
        (name, [value] if isinstance(value, (np.ndarray, list, tuple, pd.DataFrame)) else value)
        for name, value in coldata.items()
    )
    row_index = pd.Index(name=self.id.name, data=[single_id], dtype=np.int64)
    frame = pd.DataFrame(df_input, index=row_index)
    frame.name = self.name
    return frame

1041 

def __get_selection_as_df(self, coldata):
    """Return a pandas DataFrame for the selected rows, indexed by the id column.

    Used when multiple row indices are selected (or a list/array/slice that
    resolves to a single index is passed to get); use
    __get_selection_as_df_single_row when a single scalar index is passed.

    :param coldata: dict mapping column names to values (list/arrays or dataframes)
    :type coldata: dict
    """
    row_ids = coldata.pop('id')
    df_input = OrderedDict()
    for name, value in coldata.items():
        if isinstance(value, np.ndarray) and value.ndim > 1:
            # convert multi-dim array to a list of inner arrays, one per row
            df_input[name] = list(value)
        elif isinstance(value, pd.DataFrame):
            # multiple rows were selected and collapsed into a dataframe;
            # split it into a list of one-row dataframes, one per index row
            # TODO make this more efficient
            df_input[name] = [value.iloc[[i]] for i in range(len(value))]
        else:
            df_input[name] = value
    frame = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=row_ids, dtype=np.int64))
    frame.name = self.name
    return frame

1066 

def __contains__(self, val):
    """Return True if the given name is a column (or a column index) of this table."""
    return (val in self.__colids) or (val in self.__indices)

1072 

def get_foreign_columns(self):
    """
    Determine the names of all columns that link to another DynamicTable, i.e.,
    find all DynamicTableRegion type columns. Similar to a foreign key in a
    database, a DynamicTableRegion column references elements in another table.

    :returns: List of strings with the column names
    """
    # comprehension replaces an append loop that enumerated columns without
    # ever using the index
    return [col.name for col in self.columns if isinstance(col, DynamicTableRegion)]

1086 

def has_foreign_columns(self):
    """
    Does the table contain DynamicTableRegion columns

    :returns: True if the table contains a DynamicTableRegion column, else False
    """
    # any() short-circuits exactly like the original early-return loop;
    # the enumerate index in the original was never used
    return any(isinstance(col, DynamicTableRegion) for col in self.columns)

1097 

@docval({'name': 'other_tables', 'type': (list, tuple, set),
         'doc': "List of additional tables to consider in the search. Usually this "
                "parameter is used for internal purposes, e.g., when we need to "
                "consider AlignedDynamicTable", 'default': None},
        allow_extra=False)
def get_linked_tables(self, **kwargs):
    """
    Get the full list of all tables that are being linked to, directly or
    indirectly, from this table via foreign DynamicTableRegion columns
    included in this table or in any table reachable through
    DynamicTableRegion columns.

    Returns: List of NamedTuple objects with:
        * 'source_table' : The source table containing the DynamicTableRegion column
        * 'source_column' : The relevant DynamicTableRegion column in the 'source_table'
        * 'target_table' : The target DynamicTable; same as source_column.table.
    """
    link_type = NamedTuple('DynamicTableLink',
                           [('source_table', DynamicTable),
                            ('source_column', Union[DynamicTableRegion, VectorIndex]),
                            ('target_table', DynamicTable)])
    other_tables = getargs('other_tables', kwargs)
    # breadth-first traversal over tables; list doubles as the visited set
    # (membership is checked by identity, not equality)
    tables_to_visit = [self, ]
    if other_tables is not None:
        tables_to_visit += other_tables
    foreign_cols = []
    pos = 0
    while pos < len(tables_to_visit):
        source = tables_to_visit[pos]
        for col in source.columns:
            if isinstance(col, DynamicTableRegion):
                foreign_cols.append(link_type(source_table=source,
                                              source_column=col,
                                              target_table=col.table))
                if not any(t is col.table for t in tables_to_visit):
                    tables_to_visit.append(col.table)
        pos += 1
    return foreign_cols

1138 

@docval({'name': 'exclude', 'type': set, 'doc': 'Set of column names to exclude from the dataframe',
         'default': None},
        {'name': 'index', 'type': bool,
         'doc': ('Whether to return indices for a DynamicTableRegion column. If False, nested dataframes will be '
                 'returned.'),
         'default': False}
        )
def to_dataframe(self, **kwargs):
    """
    Produce a pandas DataFrame containing this table's data.

    If exclude is None, this is equivalent to
    table.get(slice(None, None, None), index=False).
    """
    all_rows = slice(None, None, None)
    selection = self.__get_selection_as_dict(all_rows, df=True, **kwargs)
    return self.__get_selection_as_df(selection)

1158 

@classmethod
@docval(
    {'name': 'df', 'type': pd.DataFrame, 'doc': 'source DataFrame'},
    {'name': 'name', 'type': str, 'doc': 'the name of this table'},
    {
        'name': 'index_column',
        'type': str,
        'doc': 'if provided, this column will become the table\'s index',
        'default': None
    },
    {
        'name': 'table_description',
        'type': str,
        'doc': 'a description of what is in the resulting table',
        'default': ''
    },
    {
        'name': 'columns',
        'type': (list, tuple),
        'doc': 'a list/tuple of dictionaries specifying columns in the table',
        'default': None
    },
    allow_extra=True
)
def from_dataframe(cls, **kwargs):
    '''
    Construct an instance of DynamicTable (or a subclass) from a pandas DataFrame.

    The columns of the resulting table are defined by the columns of the
    dataframe and the index by the dataframe's index (make sure it has a
    name!) or by a column whose name is supplied to the index_column
    parameter. We recommend that you supply *columns* - a list/tuple of
    dictionaries containing the name and description of the column- to help
    others understand the contents of your table. See
    :py:class:`~hdmf.common.table.DynamicTable` for more details on *columns*.
    '''

    columns = kwargs.pop('columns')
    df = kwargs.pop('df')
    name = kwargs.pop('name')
    index_column = kwargs.pop('index_column')
    table_description = kwargs.pop('table_description')
    column_descriptions = kwargs.pop('column_descriptions', dict())

    supplied_columns = dict()
    if columns:
        supplied_columns = {x['name']: x for x in columns}

    class_cols = {x['name']: x for x in cls.__columns__}
    required_cols = set(x['name'] for x in cls.__columns__ if 'required' in x and x['required'])
    df_cols = df.columns
    if required_cols - set(df_cols):
        raise ValueError('missing required cols: ' + str(required_cols - set(df_cols)))
    if set(supplied_columns.keys()) - set(df_cols):
        raise ValueError('cols specified but not provided: ' + str(set(supplied_columns.keys()) - set(df_cols)))
    columns = []
    for col_name in df_cols:
        if col_name in class_cols:
            # copy the spec dict so that a later update(index=True) cannot
            # mutate the class-level __columns__ spec in place
            columns.append(dict(class_cols[col_name]))
        elif col_name in supplied_columns:
            # copy so that a later update(index=True) cannot mutate the
            # caller-supplied dict in place
            columns.append(dict(supplied_columns[col_name]))
        else:
            columns.append({'name': col_name,
                            'description': column_descriptions.get(col_name, 'no description')})
        # detect ragged columns: non-string array-like cells with unequal lengths
        # need a VectorIndex. guard len(df) > 0 so an empty DataFrame does not
        # crash on iloc[0]
        if len(df) > 0 and hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
            lengths = [len(x) for x in df[col_name]]
            if not lengths[1:] == lengths[:-1]:
                columns[-1].update(index=True)

    if index_column is not None:
        ids = ElementIdentifiers(name=index_column, data=df[index_column].values.tolist())
    else:
        index_name = df.index.name if df.index.name is not None else 'id'
        ids = ElementIdentifiers(name=index_name, data=df.index.values.tolist())

    columns = cls.__build_columns(columns, df=df)

    return cls(name=name, id=ids, columns=columns, description=table_description, **kwargs)

1237 

def copy(self):
    """
    Return a copy of this DynamicTable.
    This is useful for linking.
    """
    return self.__class__(
        name=self.name,
        id=self.id,
        columns=self.columns,
        description=self.description,
        colnames=self.colnames,
    )

1246 

1247 

1248@register_class('DynamicTableRegion') 

1249class DynamicTableRegion(VectorData): 

1250 """ 

1251 DynamicTableRegion provides a link from one table to an index or region of another. The `table` 

1252 attribute is another `DynamicTable`, indicating which table is referenced. The data is int(s) 

1253 indicating the row(s) (0-indexed) of the target array. `DynamicTableRegion`s can be used to 

1254 associate multiple rows with the same meta-data without data duplication. They can also be used to 

1255 create hierarchical relationships between multiple `DynamicTable`s. `DynamicTableRegion` objects 

1256 may be paired with a `VectorIndex` object to create ragged references, so a single cell of a 

1257 `DynamicTable` can reference many rows of another `DynamicTable`. 

1258 """ 

1259 

1260 __fields__ = ( 

1261 'table', 

1262 ) 

1263 

@docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
        {'name': 'data', 'type': ('array_data', 'data'),
         'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'},
        {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'},
        {'name': 'table', 'type': DynamicTable,
         'doc': 'the DynamicTable this region applies to', 'default': None},
        allow_positional=AllowPositional.WARNING)
def __init__(self, **kwargs):
    """Split off the 'table' argument and delegate the rest to VectorData."""
    target_table = popargs('table', kwargs)
    super().__init__(**kwargs)
    self.table = target_table

1275 

@property
def table(self):
    """The DynamicTable this DynamicTableRegion is pointing to"""
    return self.fields.get('table')

@table.setter
def table(self, val):
    """
    Set the table this DynamicTableRegion should be pointing to

    :param val: The DynamicTable this DynamicTableRegion should be pointing to

    :raises: AttributeError if table is already in fields
    """
    if val is None:
        return
    # write-once: the table reference may not be rebound after it is set
    if 'table' in self.fields:
        msg = "can't set attribute 'table' -- already set"
        raise AttributeError(msg)
    # NOTE(review): removed dead code that unwrapped self.data from DataIO
    # without ever using the result (leftover of a removed bounds check), and
    # dropped the stale ':raises: IndexError' doc claim -- the setter performs
    # no index validation against the new table
    self.fields['table'] = val

1300 

def __getitem__(self, arg):
    """Subset the DynamicTableRegion; equivalent to :py:meth:`get` with default options."""
    return self.get(arg)

1303 

def get(self, arg, index=False, df=True, **kwargs):
    """
    Subset the DynamicTableRegion

    :param arg: Key defining which elements of the table to select. This may be one of the following:

        1) string with the name of the column to select
        2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
           column to select by name
        3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
           scalars are returned for each column that has a single value. If a list, array, or slice is used and
           df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
           single row.

    :param index: Boolean indicating whether to return indices of the DTR (default False)
    :param df: Boolean indicating whether to return the result as a pandas DataFrame (default True)

    :return: Result from self.table[...] with the appropriate selection based on the
             rows selected by this DynamicTableRegion

    :raises ValueError: if df=False and index=False, or if arg is of an unrecognized type
    :raises IndexError: if an integer arg is out of bounds for this region's data
    """
    if not df and not index:
        # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
        raise ValueError('DynamicTableRegion.get() with df=False and index=False is not yet supported.')
    # treat the list of indices as data that can be indexed. then pass the
    # result to the table to get the data
    if isinstance(arg, tuple):
        # (row, column) selection: dereference the row through this region's data
        arg1 = arg[0]
        arg2 = arg[1]
        return self.table[self.data[arg1], arg2]
    elif isinstance(arg, str):
        # column selection by name: delegate directly to the referenced table
        return self.table[arg]
    elif np.issubdtype(type(arg), np.integer):
        # single row selection; with index=True only the raw row index is returned
        if arg >= len(self.data):
            raise IndexError('index {} out of bounds for data of length {}'.format(arg, len(self.data)))
        ret = self.data[arg]
        if not index:
            ret = self.table.get(ret, df=df, index=index, **kwargs)
        return ret
    elif isinstance(arg, (list, slice, np.ndarray)):
        idx = arg

        # get the data at the specified indices
        if isinstance(self.data, (tuple, list)) and isinstance(idx, (list, np.ndarray)):
            ret = [self.data[i] for i in idx]
        else:
            ret = self.data[idx]

        # dereference them if necessary
        if not index:
            # These lines are needed because indexing Dataset with a list/ndarray
            # of ints requires the list to be sorted.
            #
            # First get the unique elements, retrieve them from the table, and then
            # reorder the result according to the original index that the user passed in.
            #
            # When not returning a DataFrame, we need to recursively sort the subelements
            # of the list we are returning. This is carried out by the recursive method _index_lol
            uniq = np.unique(ret)
            # lut maps each unique row index to its position in the sorted fetch
            lut = {val: i for i, val in enumerate(uniq)}
            values = self.table.get(uniq, df=df, index=index, **kwargs)
            if df:
                ret = values.iloc[[lut[i] for i in ret]]
            else:
                ret = self._index_lol(values, ret, lut)
        return ret
    else:
        raise ValueError("unrecognized argument: '%s'" % arg)

1371 

1372 def _index_lol(self, result, index, lut): 

1373 """ 

1374 This is a helper function for indexing a list of lists/ndarrays. When not returning a 

1375 DataFrame, indexing a DynamicTable will return a list of lists and ndarrays. To sort 

1376 the result of a DynamicTable index according to the order of the indices passed in by the 

1377 user, we have to recursively sort the sub-lists/sub-ndarrays. 

1378 """ 

1379 ret = list() 

1380 for col in result: 

1381 if isinstance(col, list): 

1382 if isinstance(col[0], list): 

1383 # list of columns that need to be sorted 

1384 ret.append(self._index_lol(col, index, lut)) 

1385 else: 

1386 # list of elements, one for each row to return 

1387 ret.append([col[lut[i]] for i in index]) 

1388 elif isinstance(col, np.ndarray): 

1389 ret.append(np.array([col[lut[i]] for i in index], dtype=col.dtype)) 

1390 else: 

1391 raise ValueError('unrecognized column type: %s. Expected list or np.ndarray' % type(col)) 

1392 return ret 

1393 

1394 def to_dataframe(self, **kwargs): 

1395 """ 

1396 Convert the whole DynamicTableRegion to a pandas dataframe. 

1397 

1398 Keyword arguments are passed through to the to_dataframe method of DynamicTable that 

1399 is being referenced (i.e., self.table). This allows specification of the 'exclude' 

1400 parameter and any other parameters of DynamicTable.to_dataframe. 

1401 """ 

1402 return self.table.to_dataframe(**kwargs).iloc[self.data[:]] 

1403 

1404 @property 

1405 def shape(self): 

1406 """ 

1407 Define the shape, i.e., (num_rows, num_columns) of the selected table region 

1408 :return: Shape tuple with two integers indicating the number of rows and number of columns 

1409 """ 

1410 return (len(self.data), len(self.table.columns)) 

1411 

1412 def __repr__(self): 

1413 """ 

1414 :return: Human-readable string representation of the DynamicTableRegion 

1415 """ 

1416 cls = self.__class__ 

1417 template = "%s %s.%s at 0x%d\n" % (self.name, cls.__module__, cls.__name__, id(self)) 

1418 template += " Target table: %s %s.%s at 0x%d\n" % (self.table.name, 

1419 self.table.__class__.__module__, 

1420 self.table.__class__.__name__, 

1421 id(self.table)) 

1422 return template 

1423 

1424 

def _uint_precision(elements):
    """ Calculate the uint precision needed to encode a set of elements """
    # `elements` may be a collection (use its length) or already a count
    count = len(elements) if hasattr(elements, '__len__') else elements
    # bytes needed, rounded up to the nearest power of two (1, 2, 4, 8, ...)
    n_bytes = max(1, int(2 ** np.ceil((np.ceil(np.log2(count)) - 8) / 8)))
    return np.dtype('uint%d' % (8 * n_bytes)).type

1431 

1432 

def _map_elements(uint, elements):
    """ Map CV terms to their uint index """
    # each term maps to its position in `elements`, cast to the given uint type
    return {term: uint(i) for i, term in enumerate(elements)}

1436 

1437 

1438@register_class('EnumData', EXP_NAMESPACE) 

1439class EnumData(VectorData): 

1440 """ 

1441 A n-dimensional dataset that can contain elements from fixed set of elements. 

1442 """ 

1443 

1444 __fields__ = ('elements', ) 

1445 

1446 @docval({'name': 'name', 'type': str, 'doc': 'the name of this column'}, 

1447 {'name': 'description', 'type': str, 'doc': 'a description for this column'}, 

1448 {'name': 'data', 'type': ('array_data', 'data'), 

1449 'doc': 'integers that index into elements for the value of each row', 'default': list()}, 

1450 {'name': 'elements', 'type': ('array_data', 'data', VectorData), 'default': list(), 

1451 'doc': 'lookup values for each integer in ``data``'}, 

1452 allow_positional=AllowPositional.WARNING) 

1453 def __init__(self, **kwargs): 

1454 elements = popargs('elements', kwargs) 

1455 super().__init__(**kwargs) 

1456 if not isinstance(elements, VectorData): 

1457 elements = VectorData(name='%s_elements' % self.name, data=elements, 

1458 description='fixed set of elements referenced by %s' % self.name) 

1459 self.elements = elements 

1460 if len(self.elements) > 0: 

1461 self.__uint = _uint_precision(self.elements.data) 

1462 self.__revidx = _map_elements(self.__uint, self.elements.data) 

1463 else: 

1464 self.__revidx = dict() # a map from term to index 

1465 self.__uint = None # the precision needed to encode all terms 

1466 

1467 def __add_term(self, term): 

1468 """ 

1469 Add a new CV term, and return it's corresponding index 

1470 

1471 Returns: 

1472 The index of the term 

1473 """ 

1474 if term not in self.__revidx: 

1475 # get minimum uint precision needed for elements 

1476 self.elements.append(term) 

1477 uint = _uint_precision(self.elements) 

1478 if self.__uint is uint: 

1479 # add the new term to the index-term map 

1480 self.__revidx[term] = self.__uint(len(self.elements) - 1) 

1481 else: 

1482 # remap terms to their uint and bump the precision of existing data 

1483 self.__uint = uint 

1484 self.__revidx = _map_elements(self.__uint, self.elements) 

1485 for i in range(len(self.data)): 1485 ↛ 1486line 1485 didn't jump to line 1486, because the loop on line 1485 never started

1486 self.data[i] = self.__uint(self.data[i]) 

1487 return self.__revidx[term] 

1488 

    def __getitem__(self, arg):
        # square-bracket access always resolves the stored indices to element values
        return self.get(arg, index=False)

1491 

1492 def _get_helper(self, idx, index=False, join=False, **kwargs): 

1493 """ 

1494 A helper function for getting elements elements 

1495 

1496 This helper function contains the post-processing of retrieve indices. By separating this, 

1497 it allows customizing processing of indices before resolving the elements elements 

1498 """ 

1499 if index: 

1500 return idx 

1501 if not np.isscalar(idx): 

1502 idx = np.asarray(idx) 

1503 ret = np.asarray(self.elements.get(idx.ravel(), **kwargs)).reshape(idx.shape) 

1504 if join: 

1505 ret = ''.join(ret.ravel()) 

1506 else: 

1507 ret = self.elements.get(idx, **kwargs) 

1508 return ret 

1509 

1510 def get(self, arg, index=False, join=False, **kwargs): 

1511 """ 

1512 Return elements elements for the given argument. 

1513 

1514 Args: 

1515 index (bool): Return indices, do not return CV elements 

1516 join (bool): Concatenate elements together into a single string 

1517 

1518 Returns: 

1519 CV elements if *join* is False or a concatenation of all selected 

1520 elements if *join* is True. 

1521 """ 

1522 idx = self.data[arg] 

1523 return self._get_helper(idx, index=index, join=join, **kwargs) 

1524 

1525 @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'}, 

1526 {'name': 'index', 'type': bool, 'doc': 'whether or not the value being added is an index', 

1527 'default': False}) 

1528 def add_row(self, **kwargs): 

1529 """Append a data value to this EnumData column 

1530 

1531 If an element is provided for *val* (i.e. *index* is False), the correct 

1532 index value will be determined. Otherwise, *val* will be added as provided. 

1533 """ 

1534 val, index = getargs('val', 'index', kwargs) 

1535 if not index: 

1536 val = self.__add_term(val) 

1537 super().append(val)