Coverage for src/hdmf/common/table.py: 89%
767 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-07-25 05:02 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-07-25 05:02 +0000
1"""
2Collection of Container classes for interacting with data types related to
3the storage and use of dynamic data tables as part of the hdmf-common schema
4"""
6import re
7from collections import OrderedDict
8from typing import NamedTuple, Union
9from warnings import warn
11import numpy as np
12import pandas as pd
13import itertools
15from . import register_class, EXP_NAMESPACE
16from ..container import Container, Data
17from ..data_utils import DataIO, AbstractDataChunkIterator
18from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional
19from ..term_set import TermSet
@register_class('VectorData')
class VectorData(Data):
    """
    An n-dimensional dataset representing one column of a DynamicTable.

    Used on its own, the first dimension runs along the rows of the table, so
    each step along that dimension is one cell of the larger table. Paired with
    a VectorIndex, a VectorData can hold a ragged array: each table cell then
    corresponds to a slice of this dataset delimited by consecutive entries of
    the VectorIndex, allowing arrays of varying length to live in single cells.
    """

    __fields__ = ("description",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
            {'name': 'term_set', 'type': TermSet, 'doc': 'the set of terms used to validate data on add',
             'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        # only 'description' is consumed here; name/data/term_set are handled
        # by the Data base class
        description = popargs('description', kwargs)
        super().__init__(**kwargs)
        self.description = description

    @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'})
    def add_row(self, **kwargs):
        """Append one value to this column, validating against the term set if one is attached."""
        val = getargs('val', kwargs)
        if self.term_set is None:
            self.append(val)
            return
        # a term set is attached: reject values it does not recognize
        if not self.term_set.validate(term=val):
            raise ValueError("%s is not in the term set." % val)
        self.append(val)

    def get(self, key, **kwargs):
        """
        Retrieve elements from this VectorData.

        :param key: Selection of the elements
        :param kwargs: Ignored
        """
        return super().get(key)

    def extend(self, ar, **kwargs):
        """Append every element of the iterable ``ar`` to the end of this VectorData.

        :param ar: The iterable to add to the end of this VectorData
        """
        # Subclasses may attach per-element semantics (e.g. validation) to
        # add_row, so anything other than a plain VectorData must go through
        # add_row one element at a time. For a plain VectorData that would be
        # needlessly slow, so take the bulk path instead.
        if type(self) is VectorData:
            super().extend(ar)
        else:
            for value in ar:
                self.add_row(value, **kwargs)
@register_class('VectorIndex')
class VectorIndex(VectorData):
    """
    When paired with a VectorData, this allows for storing arrays of varying
    length in a single cell of the DynamicTable by indexing into this VectorData.

    Each entry of this index is the (exclusive) end offset of one vector in the
    target VectorData: vector ``i`` spans ``target[data[i-1]:data[i]]``, with
    vector 0 starting at offset 0 (see ``__getitem_helper``).
    """

    __fields__ = ("target",)

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorIndex'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a 1D dataset containing indexes that apply to VectorData object'},
            {'name': 'target', 'type': VectorData,
             'doc': 'the target dataset that this index applies to'},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        target = popargs('target', kwargs)
        kwargs['description'] = "Index for VectorData '%s'" % target.name
        super().__init__(**kwargs)
        self.target = target
        # offsets are stored as unsigned integers; start at the narrowest dtype
        # and widen (8 -> 16 -> 32 -> 64 bits) on demand in __check_precision
        self.__uint = np.uint8
        self.__maxval = 255
        if isinstance(self.data, (list, np.ndarray)):
            if len(self.data) > 0:
                self.__check_precision(len(self.target))
                # adjust precision for types that we can adjust precision for
                self.__adjust_precision(self.__uint)

    def add_vector(self, arg, **kwargs):
        """
        Add the given data value to the target VectorData and append the corresponding index to this VectorIndex.

        :param arg: The data value to be added to self.target
        """
        if isinstance(self.target, VectorIndex):
            # nested (multi-ragged) index: recurse one level down per element
            for a in arg:
                self.target.add_vector(a)
        else:
            self.target.extend(arg, **kwargs)
        self.append(self.__check_precision(len(self.target)))

    def __check_precision(self, idx):
        """
        Check precision of current dataset and, if necessary, adjust precision to accommodate new value.

        :param idx: candidate end offset to encode

        Returns:
            unsigned integer encoding of idx
        """
        if idx > self.__maxval:
            while idx > self.__maxval:
                nbits = (np.log2(self.__maxval + 1) * 2)  # 8->16, 16->32, 32->64
                if nbits == 128:  # pragma: no cover
                    msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
                           'allowed for VectorIndex is uint64.')
                    raise ValueError(msg)
                self.__maxval = 2 ** nbits - 1
            self.__uint = np.dtype('uint%d' % nbits).type
            self.__adjust_precision(self.__uint)
        return self.__uint(idx)

    def __adjust_precision(self, uint):
        """
        Adjust precision of data to specified unsigned integer precision.

        :param uint: numpy unsigned integer scalar type to convert the stored offsets to
        """
        if isinstance(self.data, list):
            for i in range(len(self.data)):
                self.data[i] = uint(self.data[i])
        elif isinstance(self.data, np.ndarray):
            # use self._Data__data to work around restriction on resetting self.data
            self._Data__data = self.data.astype(uint)
        else:
            # fixed: the message was previously passed unformatted together with a
            # tuple as a second ValueError argument ("...%s to %s", (types)),
            # so the placeholders were never filled in
            raise ValueError("cannot adjust precision of type %s to %s" % (type(self.data), uint))

    def add_row(self, arg, **kwargs):
        """
        Convenience function. Same as :py:func:`add_vector`
        """
        self.add_vector(arg, **kwargs)

    def __getitem_helper(self, arg, **kwargs):
        """
        Internal helper function used by __getitem__ to retrieve a data value from self.target

        :param arg: Integer index into this VectorIndex indicating the element we want to retrieve from the target
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        # entry arg-1 is the start offset (0 for the first vector), entry arg the end
        start = 0 if arg == 0 else self.data[arg - 1]
        end = self.data[arg]
        return self.target.get(slice(start, end), **kwargs)

    def __getitem__(self, arg):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :return: Scalar or list of values retrieved
        """
        return self.get(arg)

    def get(self, arg, **kwargs):
        """
        Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData

        :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
        :param kwargs: any additional arguments to *get* method of the self.target VectorData
        :return: Scalar or list of values retrieved
        """
        if np.isscalar(arg):
            return self.__getitem_helper(arg, **kwargs)
        else:
            if isinstance(arg, slice):
                indices = list(range(*arg.indices(len(self.data))))
            else:
                if isinstance(arg[0], bool):
                    # boolean mask: convert to positions of the True entries
                    arg = np.where(arg)[0]
                indices = arg
            ret = list()
            for i in indices:
                ret.append(self.__getitem_helper(i, **kwargs))
            return ret
@register_class('ElementIdentifiers')
class ElementIdentifiers(Data):
    """
    Data container with a list of unique identifiers for values within a dataset, e.g. rows of a DynamicTable.
    """

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this ElementIdentifiers'},
            {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a 1D dataset containing identifiers',
             'default': list()},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @docval({'name': 'other', 'type': (Data, np.ndarray, list, tuple, int),
             'doc': 'List of ids to search for in this ElementIdentifer object'},
            rtype=np.ndarray,
            returns='Array with the list of indices where the elements in the list where found.'
                    'Note, the elements in the returned list are ordered in increasing index'
                    'of the found elements, rather than in the order in which the elements'
                    'where given for the search. Also the length of the result may be different from the length'
                    'of the input array. E.g., if our ids are [1,2,3] and we are search for [3,1,5] the '
                    'result would be [0,2] and NOT [2,0,None]')
    def __eq__(self, other):
        """
        Given a list of ids return the indices in the ElementIdentifiers array where the indices are found.
        """
        # Determine the ids we want to find
        search_ids = other if not isinstance(other, Data) else other.data
        if isinstance(search_ids, int):
            search_ids = [search_ids]
        # Find all matching locations. np.isin replaces the deprecated np.in1d
        # (NumPy 2.0); the result is identical for the 1D data stored here.
        # NOTE(review): defining __eq__ sets __hash__ to None on this class
        # unless a hash is restored elsewhere -- confirm hashing is not needed.
        return np.isin(self.data, search_ids).nonzero()[0]
251@register_class('DynamicTable')
252class DynamicTable(Container):
253 r"""
254 A column-based table. Columns are defined by the argument *columns*. This argument
255 must be a list/tuple of :class:`~hdmf.common.table.VectorData` and :class:`~hdmf.common.table.VectorIndex` objects
256 or a list/tuple of dicts containing the keys ``name`` and ``description`` that provide the name and description
257 of each column in the table. Additionally, the keys ``index``, ``table``, ``enum`` can be used for specifying
258 additional structure to the table columns. Setting the key ``index`` to ``True`` can be used to indicate that the
259 :class:`~hdmf.common.table.VectorData` column will store a ragged array (i.e. will be accompanied with a
260 :class:`~hdmf.common.table.VectorIndex`). Setting the key ``table`` to ``True`` can be used to indicate that the
261 column will store regions to another DynamicTable. Setting the key ``enum`` to ``True`` can be used to indicate
262 that the column data will come from a fixed set of values.
264 Columns in DynamicTable subclasses can be statically defined by specifying the class attribute *\_\_columns\_\_*,
265 rather than specifying them at runtime at the instance level. This is useful for defining a table structure
266 that will get reused. The requirements for *\_\_columns\_\_* are the same as the requirements described above
267 for specifying table columns with the *columns* argument to the DynamicTable constructor.
268 """
    # Container fields managed by hdmf: 'id' and 'columns' are child containers
    # (parent/child bookkeeping handled by the Container machinery); 'colnames'
    # and 'description' are plain attributes.
    __fields__ = (
        {'name': 'id', 'child': True},
        {'name': 'columns', 'child': True},
        'colnames',
        'description'
    )

    # Predefined column specifications; subclasses override this to declare a
    # fixed table structure (merged with base-class specs by __gather_columns).
    __columns__ = tuple()
279 @ExtenderMeta.pre_init
280 def __gather_columns(cls, name, bases, classdict):
281 r"""
282 Gather columns from the *\_\_columns\_\_* class attribute and add them to the class.
284 This classmethod will be called during class declaration in the metaclass to automatically
285 include all columns declared in subclasses.
286 """
287 if not isinstance(cls.__columns__, tuple):
288 msg = "'__columns__' must be of type tuple, found %s" % type(cls.__columns__)
289 raise TypeError(msg)
291 if (len(bases) and 'DynamicTable' in globals() and issubclass(bases[-1], Container)
292 and bases[-1].__columns__ is not cls.__columns__):
293 new_columns = list(cls.__columns__)
294 new_columns[0:0] = bases[-1].__columns__ # prepend superclass columns to new_columns
295 cls.__columns__ = tuple(new_columns)
    @docval({'name': 'name', 'type': str, 'doc': 'the name of this table'},  # noqa: C901
            {'name': 'description', 'type': str, 'doc': 'a description of what is in this table'},
            {'name': 'id', 'type': ('array_data', 'data', ElementIdentifiers), 'doc': 'the identifiers for this table',
             'default': None},
            {'name': 'columns', 'type': (tuple, list), 'doc': 'the columns in this table', 'default': None},
            {'name': 'colnames', 'type': 'array_data',
             'doc': 'the ordered names of the columns in this table. columns must also be provided.',
             'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):  # noqa: C901
        """
        Initialize the table: validate and order the given columns, set up the
        ``id`` column, expose columns as attributes, and register any predefined
        (``__columns__``) columns.
        """
        id, columns, desc, colnames = popargs('id', 'columns', 'description', 'colnames', kwargs)
        super().__init__(**kwargs)
        self.description = desc

        # hold names of optional columns that are defined in __columns__ that are not yet initialized
        # map name to column specification
        self.__uninit_cols = dict()

        # All tables must have ElementIdentifiers (i.e. a primary key column)
        # Here, we figure out what to do for that
        if id is not None:
            if not isinstance(id, ElementIdentifiers):
                # raw array of ids given; wrap it
                id = ElementIdentifiers(name='id', data=id)
        else:
            id = ElementIdentifiers(name='id')

        if columns is not None and len(columns) > 0:
            # If columns have been passed in, check them over and process accordingly
            if isinstance(columns[0], dict):
                # dict specifications: construct the column objects from them
                columns = self.__build_columns(columns)
            elif not all(isinstance(c, VectorData) for c in columns):
                raise ValueError("'columns' must be a list of dict, VectorData, DynamicTableRegion, or VectorIndex")

            all_names = [c.name for c in columns]
            if len(all_names) != len(set(all_names)):
                raise ValueError("'columns' contains columns with duplicate names: %s" % all_names)

            all_targets = [c.target.name for c in columns if isinstance(c, VectorIndex)]
            if len(all_targets) != len(set(all_targets)):
                raise ValueError("'columns' contains index columns with the same target: %s" % all_targets)

            # TODO: check columns against __columns__
            # mismatches should raise an error (e.g., a VectorData cannot be passed in with the same name as a
            # prespecified table region column)

            # check column lengths against each other and id length
            # set ids if non-zero cols are provided and ids is empty
            colset = {c.name: c for c in columns}
            for c in columns:  # remove all VectorData objects that have an associated VectorIndex from colset
                if isinstance(c, VectorIndex):
                    if c.target.name in colset:
                        colset.pop(c.target.name)
                    else:
                        raise ValueError("Found VectorIndex '%s' but not its target '%s'" % (c.name, c.target.name))
                elif isinstance(c, EnumData):
                    if c.elements.name in colset:
                        colset.pop(c.elements.name)
                # columns backed by a data chunk iterator have no usable length
                # yet, so exclude them from the length check below
                _data = c.data
                if isinstance(_data, DataIO):
                    _data = _data.data
                if isinstance(_data, AbstractDataChunkIterator):
                    colset.pop(c.name, None)
            lens = [len(c) for c in colset.values()]
            if not all(i == lens[0] for i in lens):
                raise ValueError("columns must be the same length")
            if len(lens) > 0 and lens[0] != len(id):
                # the first part of this conditional is needed in the
                # event that all columns are AbstractDataChunkIterators
                if len(id) > 0:
                    raise ValueError("must provide same number of ids as length of columns")
                else:  # set ids to: 0 to length of columns - 1
                    id.data.extend(range(lens[0]))

        self.id = id

        # NOTE: self.colnames and self.columns are always tuples
        # if kwarg colnames is an h5dataset, self.colnames is still a tuple
        if colnames is None or len(colnames) == 0:
            if columns is None:
                # make placeholder for columns if nothing was given
                self.colnames = tuple()
                self.columns = tuple()
            else:
                # Figure out column names if columns were given
                tmp = OrderedDict()
                skip = set()
                for col in columns:
                    if col.name in skip:
                        continue
                    if isinstance(col, VectorIndex):
                        # index columns are not named columns of the table
                        continue
                    if isinstance(col, EnumData):
                        # the EnumData column, not its elements, is the table column
                        skip.add(col.elements.name)
                        tmp.pop(col.elements.name, None)
                    tmp[col.name] = None
                self.colnames = tuple(tmp)
                self.columns = tuple(columns)
        else:
            # Calculate the order of column names
            if columns is None:
                raise ValueError("Must supply 'columns' if specifying 'colnames'")
            else:
                # order the columns according to the column names, which does not include indices
                self.colnames = tuple(pystr(c) for c in colnames)
                col_dict = {col.name: col for col in columns}
                # map from vectordata name to list of vectorindex objects where target of last vectorindex is vectordata
                indices = dict()
                # determine which columns are indexed by another column
                for col in columns:
                    if isinstance(col, VectorIndex):
                        # loop through nested indices to get to non-index column
                        tmp_indices = [col]
                        curr_col = col
                        while isinstance(curr_col.target, VectorIndex):
                            curr_col = curr_col.target
                            tmp_indices.append(curr_col)
                        # make sure the indices values has the full index chain, so replace existing value if it is
                        # shorter
                        if len(tmp_indices) > len(indices.get(curr_col.target.name, [])):
                            indices[curr_col.target.name] = tmp_indices
                    elif isinstance(col, EnumData):
                        # EnumData is the indexing column, so it should go first
                        if col.name not in indices:
                            indices[col.name] = [col]  # EnumData is the indexing object
                            col_dict[col.name] = col.elements  # EnumData.elements is the column with values
                    else:
                        if col.name in indices:
                            continue
                        indices[col.name] = []
                # put columns in order of colnames, with indices before the target vectordata
                tmp = []
                for name in self.colnames:
                    tmp.extend(indices[name])
                    tmp.append(col_dict[name])
                self.columns = tuple(tmp)

        # to make generating DataFrames and Series easier
        col_dict = dict()
        self.__indices = dict()
        for col in self.columns:
            if isinstance(col, VectorIndex):
                # if index has already been added because it is part of a nested index chain, ignore this column
                if col.name in self.__indices:
                    continue
                self.__indices[col.name] = col

                # loop through nested indices to get to non-index column
                curr_col = col
                self.__set_table_attr(curr_col)
                while isinstance(curr_col.target, VectorIndex):
                    curr_col = curr_col.target
                    # check if index has been added. if not, add it
                    if not hasattr(self, curr_col.name):
                        self.__set_table_attr(curr_col)
                        self.__indices[curr_col.name] = col

                # use target vectordata name at end of indexing chain as key to get to the top level index
                col_dict[curr_col.target.name] = col
                if not hasattr(self, curr_col.target.name):
                    self.__set_table_attr(curr_col.target)
            else:  # this is a regular VectorData or EnumData
                # if we added this column using its index, ignore this column
                if col.name in col_dict:
                    continue
                else:
                    col_dict[col.name] = col
                    self.__set_table_attr(col)

        # __df_cols holds [id, col_for_colnames[0], col_for_colnames[1], ...];
        # for indexed columns this is the top-level VectorIndex
        self.__df_cols = [self.id] + [col_dict[name] for name in self.colnames]

        # self.__colids maps the column name to an index starting at 1
        self.__colids = {name: i + 1 for i, name in enumerate(self.colnames)}
        self._init_class_columns()
471 def __set_table_attr(self, col):
472 if hasattr(self, col.name) and col.name not in self.__uninit_cols:
473 msg = ("An attribute '%s' already exists on %s '%s' so this column cannot be accessed as an attribute, "
474 "e.g., table.%s; it can only be accessed using other methods, e.g., table['%s']."
475 % (col.name, self.__class__.__name__, self.name, col.name, col.name))
476 warn(msg)
477 else:
478 setattr(self, col.name, col)
480 __reserved_colspec_keys = ['name', 'description', 'index', 'table', 'required', 'class']
482 def _init_class_columns(self):
483 """
484 Process all predefined columns specified in class variable __columns__.
485 Optional columns are not tracked but not added.
486 """
487 for col in self.__columns__:
488 if col['name'] not in self.__colids: # if column has not been added in __init__
489 if col.get('required', False):
490 self.add_column(name=col['name'],
491 description=col['description'],
492 index=col.get('index', False),
493 table=col.get('table', False),
494 col_cls=col.get('class', VectorData),
495 # Pass through extra kwargs for add_column that subclasses may have added
496 **{k: col[k] for k in col.keys()
497 if k not in DynamicTable.__reserved_colspec_keys})
498 else:
499 # track the not yet initialized optional predefined columns
500 self.__uninit_cols[col['name']] = col
502 # set the table attributes for not yet init optional predefined columns
503 setattr(self, col['name'], None)
504 index = col.get('index', False)
505 if index is not False:
506 if index is True:
507 index = 1
508 if isinstance(index, int): 508 ↛ 515line 508 didn't jump to line 515, because the condition on line 508 was never false
509 assert index > 0, ValueError("integer index value must be greater than 0")
510 index_name = col['name']
511 for i in range(index):
512 index_name = index_name + '_index'
513 self.__uninit_cols[index_name] = col
514 setattr(self, index_name, None)
515 if col.get('enum', False):
516 self.__uninit_cols[col['name'] + '_elements'] = col
517 setattr(self, col['name'] + '_elements', None)
519 @staticmethod
520 def __build_columns(columns, df=None):
521 """
522 Build column objects according to specifications
523 """
524 tmp = list()
525 for d in columns:
526 name = d['name']
527 desc = d.get('description', 'no description')
528 col_cls = d.get('class', VectorData)
529 data = None
530 if df is not None:
531 data = list(df[name].values)
532 index = d.get('index', False)
533 if index is not False: 533 ↛ 534line 533 didn't jump to line 534, because the condition on line 533 was never true
534 if isinstance(index, int) and index > 1:
535 raise ValueError('Creating nested index columns using this method is not yet supported. Use '
536 'add_column or define the columns using __columns__ instead.')
537 index_data = None
538 if data is not None:
539 index_data = [len(data[0])]
540 for i in range(1, len(data)):
541 index_data.append(len(data[i]) + index_data[i - 1])
542 # assume data came in through a DataFrame, so we need
543 # to concatenate it
544 tmp_data = list()
545 for d in data:
546 tmp_data.extend(d)
547 data = tmp_data
548 vdata = col_cls(name=name, description=desc, data=data)
549 vindex = VectorIndex(name="%s_index" % name, data=index_data, target=vdata)
550 tmp.append(vindex)
551 tmp.append(vdata)
552 elif d.get('enum', False): 552 ↛ 554line 552 didn't jump to line 554, because the condition on line 552 was never true
553 # EnumData is the indexing column, so it should go first
554 if data is not None:
555 elements, data = np.unique(data, return_inverse=True)
556 tmp.append(EnumData(name, desc, data=data, elements=elements))
557 else:
558 tmp.append(EnumData(name, desc, data=data))
559 # EnumData handles constructing the VectorData object that contains EnumData.elements
560 # --> use this functionality (rather than creating here) for consistency and less code/complexity
561 tmp.append(tmp[-1].elements)
562 else:
563 if data is None:
564 data = list()
565 if d.get('table', False): 565 ↛ 566line 565 didn't jump to line 566, because the condition on line 565 was never true
566 col_cls = DynamicTableRegion
567 tmp.append(col_cls(name=name, description=desc, data=data))
568 return tmp
    def __len__(self):
        """Number of rows in the table (the length of the ``id`` column)."""
        return len(self.id)
    @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None},
            {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
            {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
             'default': False},
            allow_extra=True)
    def add_row(self, **kwargs):
        """
        Add a row to the table. If *id* is not provided, it will auto-increment.

        Row values may be given either via the *data* dict or as extra keyword
        arguments (one per column). Values for columns backed by a term set are
        validated before anything is appended. Keys naming not-yet-added
        predefined (``__columns__``) columns cause those columns to be added on
        the fly; any remaining unknown or missing keys raise a ValueError.
        """
        data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
        # extra keyword arguments serve as the row data when no dict was given
        data = data if data is not None else kwargs

        extra_columns = set(list(data.keys())) - set(list(self.__colids.keys()))
        missing_columns = set(list(self.__colids.keys())) - set(list(data.keys()))

        # validate all term-set-backed values up front so the row is rejected
        # before any column is mutated
        bad_data = []
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            col = self.__df_cols[colnum]
            if isinstance(col, VectorIndex):
                continue
            else:
                if col.term_set is not None:
                    if col.term_set.validate(term=data[colname]):
                        continue
                    else:
                        bad_data.append(data[colname])

        if len(bad_data)!=0:
            msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
            raise ValueError(msg)

        # check to see if any of the extra columns just need to be added
        if extra_columns:
            for col in self.__columns__:
                if col['name'] in extra_columns:
                    if data[col['name']] is not None:
                        self.add_column(col['name'], col['description'],
                                        index=col.get('index', False),
                                        table=col.get('table', False),
                                        enum=col.get('enum', False),
                                        col_cls=col.get('class', VectorData),
                                        # Pass through extra keyword arguments for add_column that
                                        # subclasses may have added
                                        **{k: col[k] for k in col.keys()
                                           if k not in DynamicTable.__reserved_colspec_keys})
                    extra_columns.remove(col['name'])

        if extra_columns or missing_columns:
            raise ValueError(
                '\n'.join([
                    'row data keys don\'t match available columns',
                    'you supplied {} extra keys: {}'.format(len(extra_columns), extra_columns),
                    'and were missing {} keys: {}'.format(len(missing_columns), missing_columns)
                ])
            )
        # resolve the row id: explicit argument, then 'id' key in data, then auto-increment
        if row_id is None:
            row_id = data.pop('id', None)
        if row_id is None:
            row_id = len(self)
        if enforce_unique_id:
            if row_id in self.id:
                raise ValueError("id %i already in the table" % row_id)
        self.id.append(row_id)

        # append the value for each column; ragged columns go through their index
        for colname, colnum in self.__colids.items():
            if colname not in data:
                raise ValueError("column '%s' missing" % colname)
            c = self.__df_cols[colnum]
            if isinstance(c, VectorIndex):
                c.add_vector(data[colname])
            else:
                c.add_row(data[colname])
649 def __eq__(self, other):
650 """Compare if the two DynamicTables contain the same data.
652 First this returns False if the other DynamicTable has a different name or
653 description. Then, this table and the other table are converted to pandas
654 dataframes and the equality of the two tables is returned.
656 :param other: DynamicTable to compare to
658 :return: Bool indicating whether the two DynamicTables contain the same data
659 """
660 if other is self:
661 return True
662 if not isinstance(other, DynamicTable):
663 return False
664 if self.name != other.name or self.description != other.description:
665 return False
666 return self.to_dataframe().equals(other.to_dataframe())
668 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, # noqa: C901
669 {'name': 'description', 'type': str, 'doc': 'a description for this column'},
670 {'name': 'data', 'type': ('array_data', 'data'),
671 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
672 {'name': 'table', 'type': (bool, 'DynamicTable'),
673 'doc': 'whether or not this is a table region or the table the region applies to', 'default': False},
674 {'name': 'index', 'type': (bool, VectorIndex, 'array_data', int),
675 'doc': ' * ``False`` (default): do not generate a VectorIndex\n\n'
676 ' * ``True``: generate one empty VectorIndex \n\n'
677 ' * ``VectorIndex``: Use the supplied VectorIndex \n\n'
678 ' * array-like of ints: Create a VectorIndex and use these values as the data \n\n'
679 ' * ``int``: Recursively create `n` VectorIndex objects for a multi-ragged array \n',
680 'default': False},
681 {'name': 'enum', 'type': (bool, 'array_data'), 'default': False,
682 'doc': ('whether or not this column contains data from a fixed set of elements')},
683 {'name': 'term_set', 'type': TermSet, 'doc': 'the set of terms used to validate data on add',
684 'default': None},
685 {'name': 'col_cls', 'type': type, 'default': VectorData,
686 'doc': ('class to use to represent the column data. If table=True, this field is ignored and a '
687 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData '
688 'object is used.')},
689 allow_extra=True)
690 def add_column(self, **kwargs): # noqa: C901
691 """
692 Add a column to this table.
694 If data is provided, it must contain the same number of rows as the current state of the table.
696 Extra keyword arguments will be passed to the constructor of the column class ("col_cls").
698 :raises ValueError: if the column has already been added to the table
699 """
700 name, data = getargs('name', 'data', kwargs)
701 index, table, enum, col_cls, term_set= popargs('index', 'table', 'enum', 'col_cls', 'term_set', kwargs)
703 if term_set is not None:
704 bad_data = []
705 for val in data:
706 if term_set.validate(term=val):
707 continue
708 else:
709 bad_data.append(val)
710 if len(bad_data)!=0:
711 bad_data_string = str(bad_data)[1:-1]
712 msg = ("%s is not in the term set." % bad_data_string)
713 raise ValueError(msg)
715 if isinstance(index, VectorIndex):
716 warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
717 "deprecated in a future version of HDMF.", FutureWarning)
719 if name in self.__colids: # column has already been added
720 msg = "column '%s' already exists in %s '%s'" % (name, self.__class__.__name__, self.name)
721 raise ValueError(msg)
723 if name in self.__uninit_cols: # column is a predefined optional column from the spec
724 # check the given values against the predefined optional column spec. if they do not match, raise a warning
725 # and ignore the given arguments. users should not be able to override these values
726 table_bool = table or not isinstance(table, bool)
727 spec_table = self.__uninit_cols[name].get('table', False)
728 if table_bool != spec_table:
729 msg = ("Column '%s' is predefined in %s with table=%s which does not match the entered "
730 "table argument. The predefined table spec will be ignored. "
731 "Please ensure the new column complies with the spec. "
732 "This will raise an error in a future version of HDMF."
733 % (name, self.__class__.__name__, spec_table))
734 warn(msg)
736 index_bool = index or not isinstance(index, bool)
737 spec_index = self.__uninit_cols[name].get('index', False)
738 if index_bool != spec_index:
739 msg = ("Column '%s' is predefined in %s with index=%s which does not match the entered "
740 "index argument. The predefined index spec will be ignored. "
741 "Please ensure the new column complies with the spec. "
742 "This will raise an error in a future version of HDMF."
743 % (name, self.__class__.__name__, spec_index))
744 warn(msg)
746 spec_col_cls = self.__uninit_cols[name].get('class', VectorData)
747 if col_cls != spec_col_cls:
748 msg = ("Column '%s' is predefined in %s with class=%s which does not match the entered "
749 "col_cls argument. The predefined class spec will be ignored. "
750 "Please ensure the new column complies with the spec. "
751 "This will raise an error in a future version of HDMF."
752 % (name, self.__class__.__name__, spec_col_cls))
753 warn(msg)
755 ckwargs = dict(kwargs)
757 # Add table if it's been specified
758 if table and enum: 758 ↛ 759line 758 didn't jump to line 759, because the condition on line 758 was never true
759 raise ValueError("column '%s' cannot be both a table region "
760 "and come from an enumerable set of elements" % name)
761 if table is not False:
762 col_cls = DynamicTableRegion
763 if isinstance(table, DynamicTable):
764 ckwargs['table'] = table
765 if enum is not False:
766 col_cls = EnumData
767 if isinstance(enum, (list, tuple, np.ndarray, VectorData)): 767 ↛ 768line 767 didn't jump to line 768, because the condition on line 767 was never true
768 ckwargs['elements'] = enum
770 # If the user provided a list of lists that needs to be indexed, then we now need to flatten the data
771 # We can only create the index actual VectorIndex once we have the VectorData column so we compute
772 # the index and flatten the data here and then create the VectorIndex later from create_vector_index
773 # once we have created the column
774 create_vector_index = None
775 if ckwargs.get('data', None) is not None: 775 ↛ 799line 775 didn't jump to line 799, because the condition on line 775 was never false
776 # Check that we are asked to create an index
777 if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
778 # Iteratively flatten the data we use for the column based on the depth of the index to generate.
779 # Also, for each level compute the data for the VectorIndex for that level
780 flatten_data = data
781 create_vector_index = []
782 for i in range(index):
783 try:
784 create_vector_index.append(np.cumsum([len(c) for c in flatten_data]).tolist())
785 except TypeError as e:
786 raise ValueError("Cannot automatically construct VectorIndex for nested array. "
787 "Invalid data array element found.") from e
788 flatten_data = list(itertools.chain.from_iterable(flatten_data))
789 # if our data still is an array (e.g., a list or numpy array) then warn that the index parameter
790 # may be incorrect.
791 if len(flatten_data) > 0 and isinstance(flatten_data[0], (np.ndarray, list, tuple)):
792 raise ValueError("Cannot automatically construct VectorIndex for nested array. "
793 "Column data contains arrays as cell values. Please check the 'data' and 'index' "
794 "parameters. 'index=%s' may be too small for the given data." % str(index))
795 # overwrite the data to be used for the VectorData column with the flattened data
796 ckwargs['data'] = flatten_data
798 # Create the VectorData column
799 col = col_cls(**ckwargs)
800 col.parent = self
801 columns = [col]
802 self.__set_table_attr(col)
803 if col in self.__uninit_cols: 803 ↛ 804line 803 didn't jump to line 804, because the condition on line 803 was never true
804 self.__uninit_cols.pop(col)
806 if col_cls is EnumData:
807 columns.append(col.elements)
808 col.elements.parent = self
810 # Add index if it's been specified
811 if index is not False:
812 if isinstance(index, VectorIndex):
813 col_index = index
814 self.__add_column_index_helper(col_index)
815 elif isinstance(index, bool):
816 # create empty index for empty column
817 if create_vector_index is None:
818 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
819 col_index = VectorIndex(name=name + "_index", data=list(), target=col)
820 # create single-level VectorIndex from the data based on the create_vector_index we computed earlier
821 else:
822 col_index = VectorIndex(name=name + "_index", data=create_vector_index[0], target=col)
823 # add the column with the index
824 self.__add_column_index_helper(col_index)
825 elif isinstance(index, int):
826 if create_vector_index is None:
827 assert index > 0, ValueError("integer index value must be greater than 0")
828 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
829 index_name = name
830 for i in range(index):
831 index_name = index_name + "_index"
832 col_index = VectorIndex(name=index_name, data=list(), target=col)
833 self.__add_column_index_helper(col_index)
834 if i < index - 1:
835 columns.insert(0, col_index)
836 col = col_index
837 # Create the nested VectorIndex from the create_vector_index we computed above
838 else:
839 index_name = name
840 for i in range(index):
841 index_name = index_name + "_index"
842 col_index = VectorIndex(name=index_name, data=create_vector_index[-(i+1)], target=col)
843 self.__add_column_index_helper(col_index)
844 if i < index - 1:
845 columns.insert(0, col_index)
846 col = col_index
847 else: # make VectorIndex with supplied data
848 assert len(col) > 0, ValueError("cannot pass non-empty index with empty data to index")
849 col_index = VectorIndex(name=name + "_index", data=index, target=col)
850 self.__add_column_index_helper(col_index)
851 columns.insert(0, col_index)
852 col = col_index
854 if len(col) != len(self.id):
855 raise ValueError("column must have the same number of rows as 'id'")
856 self.__colids[name] = len(self.__df_cols)
857 self.fields['colnames'] = tuple(list(self.colnames) + [name])
858 self.fields['columns'] = tuple(list(self.columns) + columns)
859 self.__df_cols.append(col)
    def __add_column_index_helper(self, col_index):
        """Register a newly created VectorIndex with this table.

        Sets the index's parent (when it does not already have a Container parent),
        records it in the internal name->index map, and exposes it as a table attribute.
        """
        if not isinstance(col_index.parent, Container):
            col_index.parent = self
        # else, the ObjectMapper will create a link from self (parent) to col_index (child with existing parent)
        self.__indices[col_index.name] = col_index
        self.__set_table_attr(col_index)
        if col_index in self.__uninit_cols:
            # the index corresponds to a predefined optional column -- it is now initialized
            self.__uninit_cols.pop(col_index)
870 @docval({'name': 'name', 'type': str, 'doc': 'the name of the DynamicTableRegion object'},
871 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the indices of the table'},
872 {'name': 'description', 'type': str, 'doc': 'a brief description of what the region is'})
873 def create_region(self, **kwargs):
874 """
875 Create a DynamicTableRegion selecting a region (i.e., rows) in this DynamicTable.
877 :raises: IndexError if the provided region contains invalid indices
879 """
880 region = getargs('region', kwargs)
881 if isinstance(region, slice):
882 if (region.start is not None and region.start < 0) or (region.stop is not None and region.stop > len(self)):
883 msg = 'region slice %s is out of range for this DynamicTable of length %d' % (str(region), len(self))
884 raise IndexError(msg)
885 region = list(range(*region.indices(len(self))))
886 else:
887 for idx in region:
888 if idx < 0 or idx >= len(self):
889 raise IndexError('The index ' + str(idx) +
890 ' is out of range for this DynamicTable of length '
891 + str(len(self)))
892 desc = getargs('description', kwargs)
893 name = getargs('name', kwargs)
894 return DynamicTableRegion(name=name, data=region, description=desc, table=self)
896 def __getitem__(self, key):
897 ret = self.get(key)
898 if ret is None:
899 raise KeyError(key)
900 return ret
    def get(self, key, default=None, df=True, index=True, **kwargs):
        """Select a subset from the table.

        If the table includes a DynamicTableRegion column, then by default,
        the index/indices of the DynamicTableRegion will be returned. If ``df=True`` and ``index=False``,
        then the returned pandas DataFrame will contain a nested DataFrame in each row of the
        DynamicTableRegion column. If ``df=False`` and ``index=True``, then a list of lists will be returned
        where the list containing the DynamicTableRegion column contains the indices of the DynamicTableRegion.
        Note that in this case, the DynamicTable referenced by the DynamicTableRegion can be accessed through
        the ``table`` attribute of the DynamicTableRegion object. ``df=False`` and ``index=False`` is
        not yet supported.

        :param key: Key defining which elements of the table to select. This may be one of the following:

            1) string with the name of the column to select
            2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
               column to select by name
            3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
               scalars are returned for each column that has a single value. If a list, array, or slice is used and
               df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
               single row.

        :return: 1) If key is a string, then return the VectorData object representing the column with the string name
                 2) If key is a tuple of (int, str), then return the scalar value of the selected cell
                 3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame or lists
                    consisting of one or more rows

        :raises: KeyError
        """
        ret = None
        if not df and not index:
            # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
            raise ValueError('DynamicTable.get() with df=False and index=False is not yet supported.')
        if isinstance(key, tuple):
            # index by row and column --> return specific cell
            arg1 = key[0]
            arg2 = key[1]
            if isinstance(arg2, str):
                # resolve the column name to its positional index
                arg2 = self.__colids[arg2]
            ret = self.__df_cols[arg2][arg1]
        elif isinstance(key, str):
            # index by one string --> return column
            if key == 'id':
                return self.id
            elif key in self.__colids:
                ret = self.__df_cols[self.__colids[key]]
            elif key in self.__indices:
                ret = self.__indices[key]
            else:
                # unknown column name: mirror dict.get() semantics rather than raising
                return default
        else:
            # index by int, list, np.ndarray, or slice -->
            # return pandas Dataframe or lists consisting of one or more rows
            sel = self.__get_selection_as_dict(key, df, index, **kwargs)
            if df:
                # reformat objects to fit into a pandas DataFrame
                if np.isscalar(key):
                    ret = self.__get_selection_as_df_single_row(sel)
                else:
                    ret = self.__get_selection_as_df(sel)
            else:
                ret = list(sel.values())
        return ret
    def __get_selection_as_dict(self, arg, df, index, exclude=None, **kwargs):
        """Return a dict mapping column names to values (lists/arrays or dataframes) for the given selection.
        Uses each column's get() method, passing kwargs as necessary.

        :param arg: key passed to get() to return one or more rows
        :type arg: int, list, np.ndarray, or slice
        """
        if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))):
            raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg)))
        if isinstance(arg, np.ndarray) and arg.ndim != 1:
            raise ValueError("Cannot index DynamicTable with multiple dimensions")
        if exclude is None:
            exclude = set([])
        ret = OrderedDict()
        try:
            # index with a python slice or single int to select one or multiple rows
            ret['id'] = self.id[arg]
            for name in self.colnames:
                if name in exclude:
                    continue
                col = self.__df_cols[self.__colids[name]]
                if index and (isinstance(col, DynamicTableRegion) or
                              (isinstance(col, VectorIndex) and isinstance(col.target, DynamicTableRegion))):
                    # return indices (in list, array, etc.) for DTR and ragged DTR
                    ret[name] = col.get(arg, df=False, index=True, **kwargs)
                else:
                    ret[name] = col.get(arg, df=df, index=index, **kwargs)
            return ret
        # if index is out of range, different errors can be generated depending on the dtype of the column
        # but despite the differences, raise an IndexError from that error
        except ValueError as ve:
            # in h5py <2, if the column is an h5py.Dataset, a ValueError was raised
            # in h5py 3+, this became an IndexError
            x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve))
            if x:
                msg = ("Row index %s out of range for %s '%s' (length %d)."
                       % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg) from ve
            else:  # pragma: no cover
                raise ve
        except IndexError as ie:
            x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie))
            if x:
                # NOTE(review): unlike the other branches, this raise does not chain `from ie` -- confirm intent
                msg = ("Row index %s out of range for %s '%s' (length %d)."
                       % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg)
            elif str(ie) == 'list index out of range':
                msg = ("Row index out of range for %s '%s' (length %d)."
                       % (self.__class__.__name__, self.name, len(self)))
                raise IndexError(msg) from ie
            else:  # pragma: no cover
                raise ie
1020 def __get_selection_as_df_single_row(self, coldata):
1021 """Return a pandas dataframe for the given row and columns with the id column as the index.
1023 This is a special case of __get_selection_as_df where a single row was requested.
1025 :param coldata: dict mapping column names to values (list/arrays or dataframes)
1026 :type coldata: dict
1027 """
1028 id_index_orig = coldata.pop('id')
1029 id_index = [id_index_orig]
1030 df_input = OrderedDict()
1031 for k in coldata: # for each column
1032 if isinstance(coldata[k], (np.ndarray, list, tuple, pd.DataFrame)):
1033 # wrap in a list because coldata[k] may be an array/list/tuple with multiple elements (ragged or
1034 # multi-dim column) and pandas needs to have one element per index row (=1 in this case)
1035 df_input[k] = [coldata[k]]
1036 else: # scalar, don't wrap
1037 df_input[k] = coldata[k]
1038 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64))
1039 ret.name = self.name
1040 return ret
1042 def __get_selection_as_df(self, coldata):
1043 """Return a pandas dataframe for the given rows and columns with the id column as the index.
1045 This is used when multiple row indices are selected (or a list/array/slice of a single index is passed to get).
1046 __get_selection_as_df_single_row should be used if a single index is passed to get.
1048 :param coldata: dict mapping column names to values (list/arrays or dataframes)
1049 :type coldata: dict
1050 """
1051 id_index = coldata.pop('id')
1052 df_input = OrderedDict()
1053 for k in coldata: # for each column
1054 if isinstance(coldata[k], np.ndarray) and coldata[k].ndim > 1:
1055 df_input[k] = list(coldata[k]) # convert multi-dim array to list of inner arrays
1056 elif isinstance(coldata[k], pd.DataFrame):
1057 # multiple rows were selected and collapsed into a dataframe
1058 # split up the rows of the df into a list of dataframes, one per row
1059 # TODO make this more efficient
1060 df_input[k] = [coldata[k].iloc[[i]] for i in range(len(coldata[k]))]
1061 else:
1062 df_input[k] = coldata[k]
1063 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64))
1064 ret.name = self.name
1065 return ret
1067 def __contains__(self, val):
1068 """
1069 Check if the given value (i.e., column) exists in this table
1070 """
1071 return val in self.__colids or val in self.__indices
1073 def get_foreign_columns(self):
1074 """
1075 Determine the names of all columns that link to another DynamicTable, i.e.,
1076 find all DynamicTableRegion type columns. Similar to a foreign key in a
1077 database, a DynamicTableRegion column references elements in another table.
1079 :returns: List of strings with the column names
1080 """
1081 col_names = []
1082 for col_index, col in enumerate(self.columns):
1083 if isinstance(col, DynamicTableRegion):
1084 col_names.append(col.name)
1085 return col_names
1087 def has_foreign_columns(self):
1088 """
1089 Does the table contain DynamicTableRegion columns
1091 :returns: True if the table contains a DynamicTableRegion column, else False
1092 """
1093 for col_index, col in enumerate(self.columns):
1094 if isinstance(col, DynamicTableRegion):
1095 return True
1096 return False
1098 @docval({'name': 'other_tables', 'type': (list, tuple, set),
1099 'doc': "List of additional tables to consider in the search. Usually this "
1100 "parameter is used for internal purposes, e.g., when we need to "
1101 "consider AlignedDynamicTable", 'default': None},
1102 allow_extra=False)
1103 def get_linked_tables(self, **kwargs):
1104 """
1105 Get a list of the full list of all tables that are being linked to directly or indirectly
1106 from this table via foreign DynamicTableColumns included in this table or in any table that
1107 can be reached through DynamicTableRegion columns
1109 Returns: List of NamedTuple objects with:
1110 * 'source_table' : The source table containing the DynamicTableRegion column
1111 * 'source_column' : The relevant DynamicTableRegion column in the 'source_table'
1112 * 'target_table' : The target DynamicTable; same as source_column.table.
1113 """
1114 link_type = NamedTuple('DynamicTableLink',
1115 [('source_table', DynamicTable),
1116 ('source_column', Union[DynamicTableRegion, VectorIndex]),
1117 ('target_table', DynamicTable)])
1118 curr_tables = [self, ] # Set of tables
1119 other_tables = getargs('other_tables', kwargs)
1120 if other_tables is not None:
1121 curr_tables += other_tables
1122 curr_index = 0
1123 foreign_cols = []
1124 while curr_index < len(curr_tables):
1125 for col_index, col in enumerate(curr_tables[curr_index].columns):
1126 if isinstance(col, DynamicTableRegion):
1127 foreign_cols.append(link_type(source_table=curr_tables[curr_index],
1128 source_column=col,
1129 target_table=col.table))
1130 curr_table_visited = False
1131 for t in curr_tables:
1132 if t is col.table:
1133 curr_table_visited = True
1134 if not curr_table_visited:
1135 curr_tables.append(col.table)
1136 curr_index += 1
1137 return foreign_cols
    @docval({'name': 'exclude', 'type': set, 'doc': 'Set of column names to exclude from the dataframe',
             'default': None},
            {'name': 'index', 'type': bool,
             'doc': ('Whether to return indices for a DynamicTableRegion column. If False, nested dataframes will be '
                     'returned.'),
             'default': False}
            )
    def to_dataframe(self, **kwargs):
        """
        Produce a pandas DataFrame containing this table's data.

        If this table contains a DynamicTableRegion, by default nested DataFrames are
        returned for that column (``index`` defaults to False); pass ``index=True`` to
        get the row indices of the DynamicTableRegion instead.

        If exclude is None, this is equivalent to table.get(slice(None, None, None), index=False).
        """
        arg = slice(None, None, None)  # select all rows
        sel = self.__get_selection_as_dict(arg, df=True, **kwargs)
        ret = self.__get_selection_as_df(sel)
        return ret
    @classmethod
    @docval(
        {'name': 'df', 'type': pd.DataFrame, 'doc': 'source DataFrame'},
        {'name': 'name', 'type': str, 'doc': 'the name of this table'},
        {
            'name': 'index_column',
            'type': str,
            'doc': 'if provided, this column will become the table\'s index',
            'default': None
        },
        {
            'name': 'table_description',
            'type': str,
            'doc': 'a description of what is in the resulting table',
            'default': ''
        },
        {
            'name': 'columns',
            'type': (list, tuple),
            'doc': 'a list/tuple of dictionaries specifying columns in the table',
            'default': None
        },
        allow_extra=True
    )
    def from_dataframe(cls, **kwargs):
        '''
        Construct an instance of DynamicTable (or a subclass) from a pandas DataFrame.

        The columns of the resulting table are defined by the columns of the
        dataframe and the index by the dataframe's index (make sure it has a
        name!) or by a column whose name is supplied to the index_column
        parameter. We recommend that you supply *columns* - a list/tuple of
        dictionaries containing the name and description of the column- to help
        others understand the contents of your table. See
        :py:class:`~hdmf.common.table.DynamicTable` for more details on *columns*.
        '''

        columns = kwargs.pop('columns')
        df = kwargs.pop('df')
        name = kwargs.pop('name')
        index_column = kwargs.pop('index_column')
        table_description = kwargs.pop('table_description')
        # undocumented extra kwarg (permitted via allow_extra): maps column name -> description string
        column_descriptions = kwargs.pop('column_descriptions', dict())

        supplied_columns = dict()
        if columns:
            supplied_columns = {x['name']: x for x in columns}

        # column specs predefined on the class (cls.__columns__) take precedence over user-supplied specs
        class_cols = {x['name']: x for x in cls.__columns__}
        required_cols = set(x['name'] for x in cls.__columns__ if 'required' in x and x['required'])
        df_cols = df.columns
        if required_cols - set(df_cols):
            raise ValueError('missing required cols: ' + str(required_cols - set(df_cols)))
        if set(supplied_columns.keys()) - set(df_cols):
            raise ValueError('cols specified but not provided: ' + str(set(supplied_columns.keys()) - set(df_cols)))
        columns = []
        for col_name in df_cols:
            if col_name in class_cols:
                columns.append(class_cols[col_name])
            elif col_name in supplied_columns:
                columns.append(supplied_columns[col_name])
            else:
                columns.append({'name': col_name,
                                'description': column_descriptions.get(col_name, 'no description')})
                # NOTE(review): assumes df has at least one row -- .iloc[0] raises IndexError on an empty DataFrame
                if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
                    # cells hold sequences: if the per-row lengths differ, mark the column as ragged (indexed)
                    lengths = [len(x) for x in df[col_name]]
                    if not lengths[1:] == lengths[:-1]:
                        columns[-1].update(index=True)

        if index_column is not None:
            ids = ElementIdentifiers(name=index_column, data=df[index_column].values.tolist())
        else:
            # fall back to the DataFrame's own index; use 'id' when the index is unnamed
            index_name = df.index.name if df.index.name is not None else 'id'
            ids = ElementIdentifiers(name=index_name, data=df.index.values.tolist())

        columns = cls.__build_columns(columns, df=df)

        return cls(name=name, id=ids, columns=columns, description=table_description, **kwargs)
1238 def copy(self):
1239 """
1240 Return a copy of this DynamicTable.
1241 This is useful for linking.
1242 """
1243 kwargs = dict(name=self.name, id=self.id, columns=self.columns, description=self.description,
1244 colnames=self.colnames)
1245 return self.__class__(**kwargs)
@register_class('DynamicTableRegion')
class DynamicTableRegion(VectorData):
    """
    DynamicTableRegion provides a link from one table to an index or region of another. The `table`
    attribute is another `DynamicTable`, indicating which table is referenced. The data is int(s)
    indicating the row(s) (0-indexed) of the target array. `DynamicTableRegion`s can be used to
    associate multiple rows with the same meta-data without data duplication. They can also be used to
    create hierarchical relationships between multiple `DynamicTable`s. `DynamicTableRegion` objects
    may be paired with a `VectorIndex` object to create ragged references, so a single cell of a
    `DynamicTable` can reference many rows of another `DynamicTable`.
    """

    __fields__ = (
        'table',
    )

    @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'},
            {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'},
            {'name': 'table', 'type': DynamicTable,
             'doc': 'the DynamicTable this region applies to', 'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        t = popargs('table', kwargs)
        super().__init__(**kwargs)
        # assign through the property so the "already set" guard applies
        self.table = t

    @property
    def table(self):
        """The DynamicTable this DynamicTableRegion is pointing to"""
        return self.fields.get('table')

    @table.setter
    def table(self, val):
        """
        Set the table this DynamicTableRegion should be pointing to

        :param val: The DynamicTable this DynamicTableRegion should be pointing to

        :raises: AttributeError if table is already in fields
        :raises: IndexError if the current indices are out of bounds for the new table given by val
        """
        if val is None:
            return
        if 'table' in self.fields:
            msg = "can't set attribute 'table' -- already set"
            raise AttributeError(msg)
        dat = self.data
        if isinstance(dat, DataIO):
            dat = dat.data
        # NOTE(review): `dat` is unwrapped above but never used afterwards, and no IndexError is
        # raised here despite the docstring's claim -- looks like leftover/removed validation; confirm
        self.fields['table'] = val

    def __getitem__(self, arg):
        # item access delegates to get() with its defaults (index=False, df=True)
        return self.get(arg)

    def get(self, arg, index=False, df=True, **kwargs):
        """
        Subset the DynamicTableRegion

        :param arg: Key defining which elements of the table to select. This may be one of the following:

            1) string with the name of the column to select
            2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
               column to select by name
            3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
               scalars are returned for each column that has a single value. If a list, array, or slice is used and
               df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
               single row.

        :param index: Boolean indicating whether to return indices of the DTR (default False)
        :param df: Boolean indicating whether to return the result as a pandas DataFrame (default True)

        :return: Result from self.table[...] with the appropriate selection based on the
                 rows selected by this DynamicTableRegion
        """
        if not df and not index:
            # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
            raise ValueError('DynamicTableRegion.get() with df=False and index=False is not yet supported.')
        # treat the list of indices as data that can be indexed. then pass the
        # result to the table to get the data
        if isinstance(arg, tuple):
            # (row, column) --> dereference the row number through this region's data
            arg1 = arg[0]
            arg2 = arg[1]
            return self.table[self.data[arg1], arg2]
        elif isinstance(arg, str):
            # column name --> return the column of the referenced table
            return self.table[arg]
        elif np.issubdtype(type(arg), np.integer):
            if arg >= len(self.data):
                raise IndexError('index {} out of bounds for data of length {}'.format(arg, len(self.data)))
            ret = self.data[arg]
            if not index:
                ret = self.table.get(ret, df=df, index=index, **kwargs)
            return ret
        elif isinstance(arg, (list, slice, np.ndarray)):
            idx = arg

            # get the data at the specified indices
            if isinstance(self.data, (tuple, list)) and isinstance(idx, (list, np.ndarray)):
                ret = [self.data[i] for i in idx]
            else:
                ret = self.data[idx]

            # dereference them if necessary
            if not index:
                # These lines are needed because indexing Dataset with a list/ndarray
                # of ints requires the list to be sorted.
                #
                # First get the unique elements, retrieve them from the table, and then
                # reorder the result according to the original index that the user passed in.
                #
                # When not returning a DataFrame, we need to recursively sort the subelements
                # of the list we are returning. This is carried out by the recursive method _index_lol
                uniq = np.unique(ret)
                lut = {val: i for i, val in enumerate(uniq)}
                values = self.table.get(uniq, df=df, index=index, **kwargs)
                if df:
                    ret = values.iloc[[lut[i] for i in ret]]
                else:
                    ret = self._index_lol(values, ret, lut)
            return ret
        else:
            raise ValueError("unrecognized argument: '%s'" % arg)

    def _index_lol(self, result, index, lut):
        """
        This is a helper function for indexing a list of lists/ndarrays. When not returning a
        DataFrame, indexing a DynamicTable will return a list of lists and ndarrays. To sort
        the result of a DynamicTable index according to the order of the indices passed in by the
        user, we have to recursively sort the sub-lists/sub-ndarrays.
        """
        ret = list()
        for col in result:
            if isinstance(col, list):
                if isinstance(col[0], list):
                    # list of columns that need to be sorted
                    ret.append(self._index_lol(col, index, lut))
                else:
                    # list of elements, one for each row to return
                    ret.append([col[lut[i]] for i in index])
            elif isinstance(col, np.ndarray):
                ret.append(np.array([col[lut[i]] for i in index], dtype=col.dtype))
            else:
                raise ValueError('unrecognized column type: %s. Expected list or np.ndarray' % type(col))
        return ret

    def to_dataframe(self, **kwargs):
        """
        Convert the whole DynamicTableRegion to a pandas dataframe.

        Keyword arguments are passed through to the to_dataframe method of DynamicTable that
        is being referenced (i.e., self.table). This allows specification of the 'exclude'
        parameter and any other parameters of DynamicTable.to_dataframe.
        """
        # self.data[:] materializes the stored row indices before selecting with .iloc
        return self.table.to_dataframe(**kwargs).iloc[self.data[:]]

    @property
    def shape(self):
        """
        Define the shape, i.e., (num_rows, num_columns) of the selected table region
        :return: Shape tuple with two integers indicating the number of rows and number of columns
        """
        return (len(self.data), len(self.table.columns))

    def __repr__(self):
        """
        :return: Human-readable string representation of the DynamicTableRegion
        """
        cls = self.__class__
        template = "%s %s.%s at 0x%d\n" % (self.name, cls.__module__, cls.__name__, id(self))
        template += "    Target table: %s %s.%s at 0x%d\n" % (self.table.name,
                                                              self.table.__class__.__module__,
                                                              self.table.__class__.__name__,
                                                              id(self.table))
        return template
def _uint_precision(elements):
    """ Calculate the uint precision needed to encode a set of elements """
    # accept either a collection (use its length) or a raw element count
    n_elements = len(elements) if hasattr(elements, '__len__') else elements
    # number of bytes: 1, 2, 4, ... chosen so that 8 * n_bytes bits can index every element
    n_bytes = max(1, int(2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))
    return np.dtype('uint%d' % (8 * n_bytes)).type
def _map_elements(uint, elements):
    """ Map CV terms to their uint index """
    # invert the element ordering: term -> its position, cast to the given uint type
    return {term: uint(idx) for idx, term in enumerate(elements)}
1438@register_class('EnumData', EXP_NAMESPACE)
1439class EnumData(VectorData):
1440 """
1441 A n-dimensional dataset that can contain elements from fixed set of elements.
1442 """
1444 __fields__ = ('elements', )
    @docval({'name': 'name', 'type': str, 'doc': 'the name of this column'},
            {'name': 'description', 'type': str, 'doc': 'a description for this column'},
            {'name': 'data', 'type': ('array_data', 'data'),
             'doc': 'integers that index into elements for the value of each row', 'default': list()},
            {'name': 'elements', 'type': ('array_data', 'data', VectorData), 'default': list(),
             'doc': 'lookup values for each integer in ``data``'},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):
        elements = popargs('elements', kwargs)
        super().__init__(**kwargs)
        # wrap a plain array of elements in a VectorData so it can be stored like any other column
        if not isinstance(elements, VectorData):
            elements = VectorData(name='%s_elements' % self.name, data=elements,
                                  description='fixed set of elements referenced by %s' % self.name)
        self.elements = elements
        if len(self.elements) > 0:
            # precompute the encoding precision and the term -> index lookup for the existing elements
            self.__uint = _uint_precision(self.elements.data)
            self.__revidx = _map_elements(self.__uint, self.elements.data)
        else:
            self.__revidx = dict()  # a map from term to index
            self.__uint = None  # the precision needed to encode all terms
1467 def __add_term(self, term):
1468 """
1469 Add a new CV term, and return it's corresponding index
1471 Returns:
1472 The index of the term
1473 """
1474 if term not in self.__revidx:
1475 # get minimum uint precision needed for elements
1476 self.elements.append(term)
1477 uint = _uint_precision(self.elements)
1478 if self.__uint is uint:
1479 # add the new term to the index-term map
1480 self.__revidx[term] = self.__uint(len(self.elements) - 1)
1481 else:
1482 # remap terms to their uint and bump the precision of existing data
1483 self.__uint = uint
1484 self.__revidx = _map_elements(self.__uint, self.elements)
1485 for i in range(len(self.data)): 1485 ↛ 1486line 1485 didn't jump to line 1486, because the loop on line 1485 never started
1486 self.data[i] = self.__uint(self.data[i])
1487 return self.__revidx[term]
1489 def __getitem__(self, arg):
1490 return self.get(arg, index=False)
1492 def _get_helper(self, idx, index=False, join=False, **kwargs):
1493 """
1494 A helper function for getting elements elements
1496 This helper function contains the post-processing of retrieve indices. By separating this,
1497 it allows customizing processing of indices before resolving the elements elements
1498 """
1499 if index:
1500 return idx
1501 if not np.isscalar(idx):
1502 idx = np.asarray(idx)
1503 ret = np.asarray(self.elements.get(idx.ravel(), **kwargs)).reshape(idx.shape)
1504 if join:
1505 ret = ''.join(ret.ravel())
1506 else:
1507 ret = self.elements.get(idx, **kwargs)
1508 return ret
1510 def get(self, arg, index=False, join=False, **kwargs):
1511 """
1512 Return elements elements for the given argument.
1514 Args:
1515 index (bool): Return indices, do not return CV elements
1516 join (bool): Concatenate elements together into a single string
1518 Returns:
1519 CV elements if *join* is False or a concatenation of all selected
1520 elements if *join* is True.
1521 """
1522 idx = self.data[arg]
1523 return self._get_helper(idx, index=index, join=join, **kwargs)
1525 @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'},
1526 {'name': 'index', 'type': bool, 'doc': 'whether or not the value being added is an index',
1527 'default': False})
1528 def add_row(self, **kwargs):
1529 """Append a data value to this EnumData column
1531 If an element is provided for *val* (i.e. *index* is False), the correct
1532 index value will be determined. Otherwise, *val* will be added as provided.
1533 """
1534 val, index = getargs('val', 'index', kwargs)
1535 if not index:
1536 val = self.__add_term(val)
1537 super().append(val)