Coverage for src/hdmf/common/table.py: 89%
768 statements
coverage.py v7.3.2, created at 2023-10-04 02:57 +0000
1"""
2Collection of Container classes for interacting with data types related to
3the storage and use of dynamic data tables as part of the hdmf-common schema
4"""
6import re
7from collections import OrderedDict
8from typing import NamedTuple, Union
9from warnings import warn
11import numpy as np
12import pandas as pd
13import itertools
15from . import register_class, EXP_NAMESPACE
16from ..container import Container, Data
17from ..data_utils import DataIO, AbstractDataChunkIterator
18from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional
19from ..term_set import TermSetWrapper
22@register_class('VectorData')
23class VectorData(Data):
24 """
25 An n-dimensional dataset representing a column of a DynamicTable.
26 If used without an accompanying VectorIndex, first dimension is
27 along the rows of the DynamicTable and each step along the first
28 dimension is a cell of the larger table. VectorData can also be
29 used to represent a ragged array if paired with a VectorIndex.
30 This allows for storing arrays of varying length in a single cell
31 of the DynamicTable by indexing into this VectorData. The first
32 vector is at VectorData[0:VectorIndex[0]]. The second vector is at
33 VectorData[VectorIndex[0]:VectorIndex[1]], and so on.
34 """
36 __fields__ = ("description",)
38 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
39 {'name': 'description', 'type': str, 'doc': 'a description for this column'},
40 {'name': 'data', 'type': ('array_data', 'data'),
41 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
42 allow_positional=AllowPositional.WARNING)
43 def __init__(self, **kwargs):
44 description = popargs('description', kwargs)
45 super().__init__(**kwargs)
46 self.description = description
48 @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'})
49 def add_row(self, **kwargs):
50 """Append a data value to this VectorData column"""
51 val = getargs('val', kwargs)
52 self.append(val)
54 def get(self, key, **kwargs):
55 """
56 Retrieve elements from this VectorData
58 :param key: Selection of the elements
59 :param kwargs: Ignored
60 """
61 return super().get(key)
63 def extend(self, ar, **kwargs):
64 """Add all elements of the iterable arg to the end of this VectorData.
66 Each subclass of VectorData should have its own extend method to ensure functionality and efficiency.
68 :param arg: The iterable to add to the end of this VectorData
69 """
70 #################################################################################
71 # Each subclass of VectorData should have its own extend method to ensure
72 # functionality AND efficiency of the extend operation. However, because currently
73 # they do not all have one of these methods, the only way to ensure functionality
74 # is with calls to add_row. Because that is inefficient for basic VectorData,
75 # this check is added to ensure we always call extend on a basic VectorData.
76 if self.__class__.__mro__[0] == VectorData:
77 super().extend(ar)
78 else:
79 for i in ar:
80 self.add_row(i, **kwargs)
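# Editor's note: a minimal usage sketch (not part of the original source) of the
# ragged-array pattern described in the VectorData docstring above, using only the
# classes defined in this module. With index data [2, 5], the first vector is the
# target's elements [0:2] and the second is [2:5]:
#
#     >>> letters = VectorData(name='letters', description='demo column',
#     ...                      data=['a', 'b', 'c', 'd', 'e'])
#     >>> letters_index = VectorIndex(name='letters_index', data=[2, 5],
#     ...                             target=letters)
#     >>> letters_index[0]
#     ['a', 'b']
#     >>> letters_index[1]
#     ['c', 'd', 'e']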
83@register_class('VectorIndex')
84class VectorIndex(VectorData):
85 """
86 When paired with a VectorData, this allows for storing arrays of varying
87 length in a single cell of the DynamicTable by indexing into this VectorData.
88 The first vector is at VectorData[0:VectorIndex[0]]. The second vector is at
89 VectorData[VectorIndex[0]:VectorIndex[1]], and so on.
90 """
92 __fields__ = ("target",)
94 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorIndex'},
95 {'name': 'data', 'type': ('array_data', 'data'),
96 'doc': 'a 1D dataset containing indexes that apply to the target VectorData object'},
97 {'name': 'target', 'type': VectorData,
98 'doc': 'the target dataset that this index applies to'},
99 allow_positional=AllowPositional.WARNING)
100 def __init__(self, **kwargs):
101 target = popargs('target', kwargs)
102 kwargs['description'] = "Index for VectorData '%s'" % target.name
103 super().__init__(**kwargs)
104 self.target = target
105 self.__uint = np.uint8
106 self.__maxval = 255
107 if isinstance(self.data, (list, np.ndarray)):
108 if len(self.data) > 0:
109 self.__check_precision(len(self.target))
110 # adjust precision for types that we can adjust precision for
111 self.__adjust_precision(self.__uint)
113 def add_vector(self, arg, **kwargs):
114 """
115 Add the given data value to the target VectorData and append the corresponding index to this VectorIndex
116 :param arg: The data value to be added to self.target
117 """
118 if isinstance(self.target, VectorIndex):
119 for a in arg:
120 self.target.add_vector(a)
121 else:
122 self.target.extend(arg, **kwargs)
123 self.append(self.__check_precision(len(self.target)))
125 def __check_precision(self, idx):
126 """
127 Check precision of current dataset and, if necessary, adjust precision to accommodate new value.
129 Returns:
130 unsigned integer encoding of idx
131 """
132 if idx > self.__maxval:
133 while idx > self.__maxval:
134 nbits = (np.log2(self.__maxval + 1) * 2) # 8->16, 16->32, 32->64
135 if nbits == 128: # pragma: no cover
136 msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
137 'allowed for VectorIndex is uint64.')
138 raise ValueError(msg)
139 self.__maxval = 2 ** nbits - 1
140 self.__uint = np.dtype('uint%d' % nbits).type
141 self.__adjust_precision(self.__uint)
142 return self.__uint(idx)
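# Editor's note, a worked example of the doubling above: the index dtype starts
# as uint8 (maxval 255). Appending an index value of 300 triggers one doubling:
# nbits = log2(255 + 1) * 2 = 16, so maxval becomes 2**16 - 1 = 65535 and the
# existing data are re-cast to uint16. A value of 70000 would loop once more to
# uint32, and so on up to uint64.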
144 def __adjust_precision(self, uint):
145 """
146 Adjust precision of data to specified unsigned integer precision.
147 """
148 if isinstance(self.data, list):
149 for i in range(len(self.data)):
150 self.data[i] = uint(self.data[i])
151 elif isinstance(self.data, np.ndarray): 151 ↛ 155: line 151 didn't jump to line 155, because the condition on line 151 was never false
152 # use self._Data__data to work around restriction on resetting self.data
153 self._Data__data = self.data.astype(uint)
154 else:
155 raise ValueError("cannot adjust precision of type %s to %s" % (type(self.data), uint))
157 def add_row(self, arg, **kwargs):
158 """
159 Convenience function. Same as :py:func:`add_vector`
160 """
161 self.add_vector(arg, **kwargs)
163 def __getitem_helper(self, arg, **kwargs):
164 """
165 Internal helper function used by __getitem__ to retrieve a data value from self.target
167 :param arg: Integer index into this VectorIndex indicating the element we want to retrieve from the target
168 :param kwargs: any additional arguments to *get* method of the self.target VectorData
169 :return: Scalar or list of values retrieved
170 """
171 start = 0 if arg == 0 else self.data[arg - 1]
172 end = self.data[arg]
173 return self.target.get(slice(start, end), **kwargs)
175 def __getitem__(self, arg):
176 """
177 Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData
179 :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
180 :return: Scalar or list of values retrieved
181 """
182 return self.get(arg)
184 def get(self, arg, **kwargs):
185 """
186 Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData
188 :param arg: slice or integer index indicating the elements we want to select in this VectorIndex
189 :param kwargs: any additional arguments to *get* method of the self.target VectorData
190 :return: Scalar or list of values retrieved
191 """
192 if np.isscalar(arg):
193 return self.__getitem_helper(arg, **kwargs)
194 else:
195 if isinstance(arg, slice):
196 indices = list(range(*arg.indices(len(self.data))))
197 else:
198 if isinstance(arg[0], bool): 198 ↛ 199: line 198 didn't jump to line 199, because the condition on line 198 was never true
199 arg = np.where(arg)[0]
200 indices = arg
201 ret = list()
202 for i in indices:
203 ret.append(self.__getitem_helper(i, **kwargs))
204 return ret
207@register_class('ElementIdentifiers')
208class ElementIdentifiers(Data):
209 """
210 Data container with a list of unique identifiers for values within a dataset, e.g. rows of a DynamicTable.
211 """
213 @docval({'name': 'name', 'type': str, 'doc': 'the name of this ElementIdentifiers'},
214 {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a 1D dataset containing identifiers',
215 'default': list()},
216 allow_positional=AllowPositional.WARNING)
217 def __init__(self, **kwargs):
218 super().__init__(**kwargs)
220 @docval({'name': 'other', 'type': (Data, np.ndarray, list, tuple, int),
221 'doc': 'List of ids to search for in this ElementIdentifiers object'},
222 rtype=np.ndarray,
223 returns='Array with the list of indices where the elements in the list were found. '
224 'Note, the elements in the returned list are ordered in increasing index '
225 'of the found elements, rather than in the order in which the elements '
226 'were given for the search. Also, the length of the result may be different from the length '
227 'of the input array. E.g., if our ids are [1,2,3] and we search for [3,1,5] the '
228 'result would be [0,2] and NOT [2,0,None]')
229 def __eq__(self, other):
230 """
231 Given a list of ids return the indices in the ElementIdentifiers array where the indices are found.
232 """
233 # Determine the ids we want to find
234 search_ids = other if not isinstance(other, Data) else other.data
235 if isinstance(search_ids, int):
236 search_ids = [search_ids]
237 # Find all matching locations
238 return np.in1d(self.data, search_ids).nonzero()[0]
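# Editor's note: a sketch (not part of the original source) of the search
# semantics documented above, matching the example in the docval returns doc:
#
#     >>> ids = ElementIdentifiers(name='id', data=[1, 2, 3])
#     >>> ids == [3, 1, 5]
#     array([0, 2])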
241@register_class('DynamicTable')
242class DynamicTable(Container):
243 r"""
244 A column-based table. Columns are defined by the argument *columns*. This argument
245 must be a list/tuple of :class:`~hdmf.common.table.VectorData` and :class:`~hdmf.common.table.VectorIndex` objects
246 or a list/tuple of dicts containing the keys ``name`` and ``description`` that provide the name and description
247 of each column in the table. Additionally, the keys ``index``, ``table``, ``enum`` can be used for specifying
248 additional structure to the table columns. Setting the key ``index`` to ``True`` can be used to indicate that the
249 :class:`~hdmf.common.table.VectorData` column will store a ragged array (i.e. will be accompanied with a
250 :class:`~hdmf.common.table.VectorIndex`). Setting the key ``table`` to ``True`` can be used to indicate that the
251 column will store regions to another DynamicTable. Setting the key ``enum`` to ``True`` can be used to indicate
252 that the column data will come from a fixed set of values.
254 Columns in DynamicTable subclasses can be statically defined by specifying the class attribute *\_\_columns\_\_*,
255 rather than specifying them at runtime at the instance level. This is useful for defining a table structure
256 that will get reused. The requirements for *\_\_columns\_\_* are the same as the requirements described above
257 for specifying table columns with the *columns* argument to the DynamicTable constructor.
258 """
260 __fields__ = (
261 {'name': 'id', 'child': True},
262 {'name': 'columns', 'child': True},
263 'colnames',
264 'description'
265 )
267 __columns__ = tuple()
269 @ExtenderMeta.pre_init
270 def __gather_columns(cls, name, bases, classdict):
271 r"""
272 Gather columns from the *\_\_columns\_\_* class attribute and add them to the class.
274 This classmethod will be called during class declaration in the metaclass to automatically
275 include all columns declared in subclasses.
276 """
277 if not isinstance(cls.__columns__, tuple):
278 msg = "'__columns__' must be of type tuple, found %s" % type(cls.__columns__)
279 raise TypeError(msg)
281 if (len(bases) and 'DynamicTable' in globals() and issubclass(bases[-1], Container)
282 and bases[-1].__columns__ is not cls.__columns__):
283 new_columns = list(cls.__columns__)
284 new_columns[0:0] = bases[-1].__columns__ # prepend superclass columns to new_columns
285 cls.__columns__ = tuple(new_columns)
287 @docval({'name': 'name', 'type': str, 'doc': 'the name of this table'}, # noqa: C901
288 {'name': 'description', 'type': str, 'doc': 'a description of what is in this table'},
289 {'name': 'id', 'type': ('array_data', 'data', ElementIdentifiers), 'doc': 'the identifiers for this table',
290 'default': None},
291 {'name': 'columns', 'type': (tuple, list), 'doc': 'the columns in this table', 'default': None},
292 {'name': 'colnames', 'type': 'array_data',
293 'doc': 'the ordered names of the columns in this table. columns must also be provided.',
294 'default': None},
295 allow_positional=AllowPositional.WARNING)
296 def __init__(self, **kwargs): # noqa: C901
297 id, columns, desc, colnames = popargs('id', 'columns', 'description', 'colnames', kwargs)
298 super().__init__(**kwargs)
299 self.description = desc
301 # hold names of optional columns that are defined in __columns__ that are not yet initialized
302 # map name to column specification
303 self.__uninit_cols = dict()
305 # All tables must have ElementIdentifiers (i.e. a primary key column)
306 # Here, we figure out what to do for that
307 user_provided_ids = (id is not None)
308 if user_provided_ids:
309 if not isinstance(id, ElementIdentifiers):
310 id = ElementIdentifiers(name='id', data=id)
311 else:
312 id = ElementIdentifiers(name='id')
314 if columns is not None and len(columns) > 0:
315 # If columns have been passed in, check them over and process accordingly
316 if isinstance(columns[0], dict):
317 columns = self.__build_columns(columns)
318 elif not all(isinstance(c, VectorData) for c in columns):
319 raise ValueError("'columns' must be a list of dict, VectorData, DynamicTableRegion, or VectorIndex")
321 all_names = [c.name for c in columns]
322 if len(all_names) != len(set(all_names)):
323 raise ValueError("'columns' contains columns with duplicate names: %s" % all_names)
325 all_targets = [c.target.name for c in columns if isinstance(c, VectorIndex)]
326 if len(all_targets) != len(set(all_targets)):
327 raise ValueError("'columns' contains index columns with the same target: %s" % all_targets)
329 # TODO: check columns against __columns__
330 # mismatches should raise an error (e.g., a VectorData cannot be passed in with the same name as a
331 # prespecified table region column)
333 # check column lengths against each other and id length
334 # set ids if non-zero cols are provided and ids is empty
335 colset = {c.name: c for c in columns}
336 for c in columns: # remove all VectorData objects that have an associated VectorIndex from colset
337 if isinstance(c, VectorIndex):
338 if c.target.name in colset:
339 colset.pop(c.target.name)
340 else:
341 raise ValueError("Found VectorIndex '%s' but not its target '%s'" % (c.name, c.target.name))
342 elif isinstance(c, EnumData):
343 if c.elements.name in colset: 343 ↛ 345: line 343 didn't jump to line 345, because the condition on line 343 was never false
344 colset.pop(c.elements.name)
345 _data = c.data
346 if isinstance(_data, DataIO):
347 _data = _data.data
348 if isinstance(_data, AbstractDataChunkIterator):
349 colset.pop(c.name, None)
350 lens = [len(c) for c in colset.values()]
351 all_columns_are_iterators = (len(lens) == 0)
353 if not all(i == lens[0] for i in lens):
354 raise ValueError("Columns must be the same length")
355 # If we have columns given, but all columns are AbstractDataChunkIterator's, then we
356 # cannot determine how many elements the id column will need. I.e., in this case the
357 # user needs to provide the id's as otherwise we may create an invalid table with an
358 # empty Id column but data in the rows. See: https://github.com/hdmf-dev/hdmf/issues/952
359 if all_columns_are_iterators and not user_provided_ids:
360 raise ValueError("Cannot determine row id's for table. Must provide ids with same length "
361 "as the columns when all columns are specified via DataChunkIterator objects.")
362 # If we have columns with a known length but the length (i.e., number of rows)
363 # does not match the number of id's then initialize the id's
364 if not all_columns_are_iterators and lens[0] != len(id):
365 if user_provided_ids and len(id) > 0:
366 raise ValueError("Must provide same number of ids as length of columns")
367 else: # set ids to: 0 to length of columns - 1
368 id.data.extend(range(lens[0]))
370 self.id = id
372 # NOTE: self.colnames and self.columns are always tuples
373 # if kwarg colnames is an h5dataset, self.colnames is still a tuple
374 if colnames is None or len(colnames) == 0:
375 if columns is None:
376 # make placeholder for columns if nothing was given
377 self.colnames = tuple()
378 self.columns = tuple()
379 else:
380 # Figure out column names if columns were given
381 tmp = OrderedDict()
382 skip = set()
383 for col in columns:
384 if col.name in skip: 384 ↛ 385: line 384 didn't jump to line 385, because the condition on line 384 was never true
385 continue
386 if isinstance(col, VectorIndex):
387 continue
388 if isinstance(col, EnumData): 388 ↛ 389: line 388 didn't jump to line 389, because the condition on line 388 was never true
389 skip.add(col.elements.name)
390 tmp.pop(col.elements.name, None)
391 tmp[col.name] = None
392 self.colnames = tuple(tmp)
393 self.columns = tuple(columns)
394 else:
395 # Calculate the order of column names
396 if columns is None:
397 raise ValueError("Must supply 'columns' if specifying 'colnames'")
398 else:
399 # order the columns according to the column names, which does not include indices
400 self.colnames = tuple(pystr(c) for c in colnames)
401 col_dict = {col.name: col for col in columns}
402 # map from vectordata name to list of vectorindex objects where target of last vectorindex is vectordata
403 indices = dict()
404 # determine which columns are indexed by another column
405 for col in columns:
406 if isinstance(col, VectorIndex):
407 # loop through nested indices to get to non-index column
408 tmp_indices = [col]
409 curr_col = col
410 while isinstance(curr_col.target, VectorIndex):
411 curr_col = curr_col.target
412 tmp_indices.append(curr_col)
413 # make sure the indices value has the full index chain, so replace the existing value if it is
414 # shorter
415 if len(tmp_indices) > len(indices.get(curr_col.target.name, [])):
416 indices[curr_col.target.name] = tmp_indices
417 elif isinstance(col, EnumData):
418 # EnumData is the indexing column, so it should go first
419 if col.name not in indices: 419 ↛ 405: line 419 didn't jump to line 405, because the condition on line 419 was never false
420 indices[col.name] = [col] # EnumData is the indexing object
421 col_dict[col.name] = col.elements # EnumData.elements is the column with values
422 else:
423 if col.name in indices:
424 continue
425 indices[col.name] = []
426 # put columns in order of colnames, with indices before the target vectordata
427 tmp = []
428 for name in self.colnames:
429 tmp.extend(indices[name])
430 tmp.append(col_dict[name])
431 self.columns = tuple(tmp)
433 # to make generating DataFrames and Series easier
434 col_dict = dict()
435 self.__indices = dict()
436 for col in self.columns:
437 if isinstance(col, VectorIndex):
438 # if index has already been added because it is part of a nested index chain, ignore this column
439 if col.name in self.__indices:
440 continue
441 self.__indices[col.name] = col
443 # loop through nested indices to get to non-index column
444 curr_col = col
445 self.__set_table_attr(curr_col)
446 while isinstance(curr_col.target, VectorIndex):
447 curr_col = curr_col.target
448 # check if index has been added. if not, add it
449 if not hasattr(self, curr_col.name):
450 self.__set_table_attr(curr_col)
451 self.__indices[curr_col.name] = col
453 # use target vectordata name at end of indexing chain as key to get to the top level index
454 col_dict[curr_col.target.name] = col
455 if not hasattr(self, curr_col.target.name):
456 self.__set_table_attr(curr_col.target)
457 else: # this is a regular VectorData or EnumData
458 # if we added this column using its index, ignore this column
459 if col.name in col_dict:
460 continue
461 else:
462 col_dict[col.name] = col
463 self.__set_table_attr(col)
465 self.__df_cols = [self.id] + [col_dict[name] for name in self.colnames]
467 # self.__colids maps the column name to an index starting at 1
468 self.__colids = {name: i + 1 for i, name in enumerate(self.colnames)}
469 self._init_class_columns()
471 def __set_table_attr(self, col):
472 if hasattr(self, col.name) and col.name not in self.__uninit_cols:
473 msg = ("An attribute '%s' already exists on %s '%s' so this column cannot be accessed as an attribute, "
474 "e.g., table.%s; it can only be accessed using other methods, e.g., table['%s']."
475 % (col.name, self.__class__.__name__, self.name, col.name, col.name))
476 warn(msg)
477 else:
478 setattr(self, col.name, col)
480 __reserved_colspec_keys = ['name', 'description', 'index', 'table', 'required', 'class']
482 def _init_class_columns(self):
483 """
484 Process all predefined columns specified in class variable __columns__.
485 Optional columns are tracked but not added.
486 """
487 for col in self.__columns__:
488 if col['name'] not in self.__colids: # if column has not been added in __init__
489 if col.get('required', False):
490 self.add_column(name=col['name'],
491 description=col['description'],
492 index=col.get('index', False),
493 table=col.get('table', False),
494 col_cls=col.get('class', VectorData),
495 # Pass through extra kwargs for add_column that subclasses may have added
496 **{k: col[k] for k in col.keys()
497 if k not in DynamicTable.__reserved_colspec_keys})
498 else:
499 # track the not yet initialized optional predefined columns
500 self.__uninit_cols[col['name']] = col
502 # set the table attributes for not yet init optional predefined columns
503 setattr(self, col['name'], None)
504 index = col.get('index', False)
505 if index is not False:
506 if index is True:
507 index = 1
508 if isinstance(index, int): 508 ↛ 515: line 508 didn't jump to line 515, because the condition on line 508 was never false
509 assert index > 0, ValueError("integer index value must be greater than 0")
510 index_name = col['name']
511 for i in range(index):
512 index_name = index_name + '_index'
513 self.__uninit_cols[index_name] = col
514 setattr(self, index_name, None)
515 if col.get('enum', False):
516 self.__uninit_cols[col['name'] + '_elements'] = col
517 setattr(self, col['name'] + '_elements', None)
519 @staticmethod
520 def __build_columns(columns, df=None):
521 """
522 Build column objects according to specifications
523 """
524 tmp = list()
525 for d in columns:
526 name = d['name']
527 desc = d.get('description', 'no description')
528 col_cls = d.get('class', VectorData)
529 data = None
530 if df is not None:
531 data = list(df[name].values)
532 index = d.get('index', False)
533 if index is not False: 533 ↛ 534: line 533 didn't jump to line 534, because the condition on line 533 was never true
534 if isinstance(index, int) and index > 1:
535 raise ValueError('Creating nested index columns using this method is not yet supported. Use '
536 'add_column or define the columns using __columns__ instead.')
537 index_data = None
538 if data is not None:
539 index_data = [len(data[0])]
540 for i in range(1, len(data)):
541 index_data.append(len(data[i]) + index_data[i - 1])
542 # assume data came in through a DataFrame, so we need
543 # to concatenate it
544 tmp_data = list()
545 for d in data:
546 tmp_data.extend(d)
547 data = tmp_data
548 vdata = col_cls(name=name, description=desc, data=data)
549 vindex = VectorIndex(name="%s_index" % name, data=index_data, target=vdata)
550 tmp.append(vindex)
551 tmp.append(vdata)
552 elif d.get('enum', False): 552 ↛ 554: line 552 didn't jump to line 554, because the condition on line 552 was never true
553 # EnumData is the indexing column, so it should go first
554 if data is not None:
555 elements, data = np.unique(data, return_inverse=True)
556 tmp.append(EnumData(name, desc, data=data, elements=elements))
557 else:
558 tmp.append(EnumData(name, desc, data=data))
559 # EnumData handles constructing the VectorData object that contains EnumData.elements
560 # --> use this functionality (rather than creating here) for consistency and less code/complexity
561 tmp.append(tmp[-1].elements)
562 else:
563 if data is None:
564 data = list()
565 if d.get('table', False): 565 ↛ 566: line 565 didn't jump to line 566, because the condition on line 565 was never true
566 col_cls = DynamicTableRegion
567 tmp.append(col_cls(name=name, description=desc, data=data))
568 return tmp
570 def __len__(self):
571 """Number of rows in the table"""
572 return len(self.id)
574 @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None},
575 {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
576 {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
577 'default': False},
578 allow_extra=True)
579 def add_row(self, **kwargs):
580 """
581 Add a row to the table. If *id* is not provided, it will auto-increment.
582 """
583 data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
584 data = data if data is not None else kwargs
586 bad_data = []
587 extra_columns = set(list(data.keys())) - set(list(self.__colids.keys()))
588 missing_columns = set(list(self.__colids.keys())) - set(list(data.keys()))
590 for colname, colnum in self.__colids.items():
591 if colname not in data:
592 raise ValueError("column '%s' missing" % colname)
593 col = self.__df_cols[colnum]
594 if isinstance(col, VectorIndex):
595 continue
596 else:
597 if isinstance(col.data, TermSetWrapper):
598 if col.data.termset.validate(term=data[colname]):
599 continue
600 else:
601 bad_data.append(data[colname])
603 if len(bad_data) != 0:
604 msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data]))
605 raise ValueError(msg)
607 # check to see if any of the extra columns just need to be added
608 if extra_columns:
609 for col in self.__columns__:
610 if col['name'] in extra_columns:
611 if data[col['name']] is not None: 611 ↛ 621: line 611 didn't jump to line 621, because the condition on line 611 was never false
612 self.add_column(col['name'], col['description'],
613 index=col.get('index', False),
614 table=col.get('table', False),
615 enum=col.get('enum', False),
616 col_cls=col.get('class', VectorData),
617 # Pass through extra keyword arguments for add_column that
618 # subclasses may have added
619 **{k: col[k] for k in col.keys()
620 if k not in DynamicTable.__reserved_colspec_keys})
621 extra_columns.remove(col['name'])
623 if extra_columns or missing_columns:
624 raise ValueError(
625 '\n'.join([
626 'row data keys don\'t match available columns',
627 'you supplied {} extra keys: {}'.format(len(extra_columns), extra_columns),
628 'and were missing {} keys: {}'.format(len(missing_columns), missing_columns)
629 ])
630 )
631 if row_id is None:
632 row_id = data.pop('id', None)
633 if row_id is None:
634 row_id = len(self)
635 if enforce_unique_id:
636 if row_id in self.id:
637 raise ValueError("id %i already in the table" % row_id)
638 self.id.append(row_id)
640 for colname, colnum in self.__colids.items():
641 if colname not in data: 641 ↛ 642: line 641 didn't jump to line 642, because the condition on line 641 was never true
642 raise ValueError("column '%s' missing" % colname)
643 c = self.__df_cols[colnum]
644 if isinstance(c, VectorIndex):
645 c.add_vector(data[colname])
646 else:
647 c.add_row(data[colname])
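# Editor's note: a sketch (not part of the original source) continuing the
# hypothetical 'trials' table from the class docstring example. Row ids
# auto-increment from 0 when not supplied:
#
#     >>> table.add_row(start=3.0, tags=['d', 'e'])
#     >>> table.id.data
#     [0, 1, 2]
#     >>> table.add_row(start=3.0, tags=[], id=2, enforce_unique_id=True)
#     Traceback (most recent call last):
#     ...
#     ValueError: id 2 already in the table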
649 def __eq__(self, other):
650 """Compare if the two DynamicTables contain the same data.
652 First this returns False if the other DynamicTable has a different name or
653 description. Then, this table and the other table are converted to pandas
654 dataframes and the equality of the two tables is returned.
656 :param other: DynamicTable to compare to
658 :return: Bool indicating whether the two DynamicTables contain the same data
659 """
660 if other is self:
661 return True
662 if not isinstance(other, DynamicTable):
663 return False
664 if self.name != other.name or self.description != other.description:
665 return False
666 return self.to_dataframe().equals(other.to_dataframe())
668 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, # noqa: C901
669 {'name': 'description', 'type': str, 'doc': 'a description for this column'},
670 {'name': 'data', 'type': ('array_data', 'data'),
671 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()},
672 {'name': 'table', 'type': (bool, 'DynamicTable'),
673 'doc': 'whether or not this is a table region or the table the region applies to', 'default': False},
674 {'name': 'index', 'type': (bool, VectorIndex, 'array_data', int),
675 'doc': ' * ``False`` (default): do not generate a VectorIndex\n\n'
676 ' * ``True``: generate one empty VectorIndex \n\n'
677 ' * ``VectorIndex``: Use the supplied VectorIndex \n\n'
678 ' * array-like of ints: Create a VectorIndex and use these values as the data \n\n'
679 ' * ``int``: Recursively create `n` VectorIndex objects for a multi-ragged array \n',
680 'default': False},
681 {'name': 'enum', 'type': (bool, 'array_data'), 'default': False,
682 'doc': ('whether or not this column contains data from a fixed set of elements')},
683 {'name': 'col_cls', 'type': type, 'default': VectorData,
684 'doc': ('class to use to represent the column data. If table=True, this field is ignored and a '
685 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData '
686 'object is used.')},
687 allow_extra=True)
688 def add_column(self, **kwargs): # noqa: C901
689 """
690 Add a column to this table.
692 If data is provided, it must contain the same number of rows as the current state of the table.
694 Extra keyword arguments will be passed to the constructor of the column class ("col_cls").
696 :raises ValueError: if the column has already been added to the table
697 """
698 name, data = getargs('name', 'data', kwargs)
699 index, table, enum, col_cls = popargs('index', 'table', 'enum', 'col_cls', kwargs)
701 if isinstance(index, VectorIndex):
702 warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
703 "deprecated in a future version of HDMF.", FutureWarning)
705 if name in self.__colids: # column has already been added
706 msg = "column '%s' already exists in %s '%s'" % (name, self.__class__.__name__, self.name)
707 raise ValueError(msg)
709 if name in self.__uninit_cols: # column is a predefined optional column from the spec
710 # check the given values against the predefined optional column spec. if they do not match, raise a warning
711 # and ignore the given arguments. users should not be able to override these values
712 table_bool = table or not isinstance(table, bool)
713 spec_table = self.__uninit_cols[name].get('table', False)
714 if table_bool != spec_table:
715 msg = ("Column '%s' is predefined in %s with table=%s which does not match the entered "
716 "table argument. The predefined table spec will be ignored. "
717 "Please ensure the new column complies with the spec. "
718 "This will raise an error in a future version of HDMF."
719 % (name, self.__class__.__name__, spec_table))
720 warn(msg)
722 index_bool = index or not isinstance(index, bool)
723 spec_index = self.__uninit_cols[name].get('index', False)
724 if index_bool != spec_index:
725 msg = ("Column '%s' is predefined in %s with index=%s which does not match the entered "
726 "index argument. The predefined index spec will be ignored. "
727 "Please ensure the new column complies with the spec. "
728 "This will raise an error in a future version of HDMF."
729 % (name, self.__class__.__name__, spec_index))
730 warn(msg)
732 spec_col_cls = self.__uninit_cols[name].get('class', VectorData)
733 if col_cls != spec_col_cls:
734 msg = ("Column '%s' is predefined in %s with class=%s which does not match the entered "
735 "col_cls argument. The predefined class spec will be ignored. "
736 "Please ensure the new column complies with the spec. "
737 "This will raise an error in a future version of HDMF."
738 % (name, self.__class__.__name__, spec_col_cls))
739 warn(msg)
741 ckwargs = dict(kwargs)
743 # Add table if it's been specified
744 if table and enum: 744 ↛ 745: line 744 didn't jump to line 745, because the condition on line 744 was never true
745 raise ValueError("column '%s' cannot be both a table region "
746 "and come from an enumerable set of elements" % name)
747 if table is not False:
748 col_cls = DynamicTableRegion
749 if isinstance(table, DynamicTable):
750 ckwargs['table'] = table
751 if enum is not False:
752 col_cls = EnumData
753 if isinstance(enum, (list, tuple, np.ndarray, VectorData)): 753 ↛ 754: line 753 didn't jump to line 754, because the condition on line 753 was never true
754 ckwargs['elements'] = enum
756 # If the user provided a list of lists that needs to be indexed, then we now need to flatten the data
757 # We can only create the actual VectorIndex once we have the VectorData column, so we compute
758 # the index and flatten the data here, and then create the VectorIndex later from create_vector_index
759 # once we have created the column
760 create_vector_index = None
761 if ckwargs.get('data', None) is not None: 761 ↛ 785: line 761 didn't jump to line 785, because the condition on line 761 was never false
762 # Check that we are asked to create an index
763 if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
764 # Iteratively flatten the data we use for the column based on the depth of the index to generate.
765 # Also, for each level compute the data for the VectorIndex for that level
766 flatten_data = data
767 create_vector_index = []
768 for i in range(index):
769 try:
770 create_vector_index.append(np.cumsum([len(c) for c in flatten_data]).tolist())
771 except TypeError as e:
772 raise ValueError("Cannot automatically construct VectorIndex for nested array. "
773 "Invalid data array element found.") from e
774 flatten_data = list(itertools.chain.from_iterable(flatten_data))
775 # if our data is still an array (e.g., a list or numpy array) after flattening, then raise an error
776 # because the index parameter may be too small for the given data
777 if len(flatten_data) > 0 and isinstance(flatten_data[0], (np.ndarray, list, tuple)):
778 raise ValueError("Cannot automatically construct VectorIndex for nested array. "
779 "Column data contains arrays as cell values. Please check the 'data' and 'index' "
780 "parameters. 'index=%s' may be too small for the given data." % str(index))
781 # overwrite the data to be used for the VectorData column with the flattened data
782 ckwargs['data'] = flatten_data
784 # Create the VectorData column
785 col = col_cls(**ckwargs)
786 col.parent = self
787 columns = [col]
788 self.__set_table_attr(col)
789 if col in self.__uninit_cols: 789 ↛ 790: line 789 didn't jump to line 790, because the condition on line 789 was never true
790 self.__uninit_cols.pop(col)
792 if col_cls is EnumData:
793 columns.append(col.elements)
794 col.elements.parent = self
796 # Add index if it's been specified
797 if index is not False:
798 if isinstance(index, VectorIndex):
799 col_index = index
800 self.__add_column_index_helper(col_index)
801 elif isinstance(index, bool):
802 # create empty index for empty column
803 if create_vector_index is None:
804 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
805 col_index = VectorIndex(name=name + "_index", data=list(), target=col)
806 # create single-level VectorIndex from the data based on the create_vector_index we computed earlier
807 else:
808 col_index = VectorIndex(name=name + "_index", data=create_vector_index[0], target=col)
809 # add the column with the index
810 self.__add_column_index_helper(col_index)
811 elif isinstance(index, int):
812 if create_vector_index is None:
813 assert index > 0, ValueError("integer index value must be greater than 0")
814 assert len(col) == 0, ValueError("cannot pass empty index with non-empty data to index")
815 index_name = name
816 for i in range(index):
817 index_name = index_name + "_index"
818 col_index = VectorIndex(name=index_name, data=list(), target=col)
819 self.__add_column_index_helper(col_index)
820 if i < index - 1:
821 columns.insert(0, col_index)
822 col = col_index
823 # Create the nested VectorIndex from the create_vector_index we computed above
824 else:
825 index_name = name
826 for i in range(index):
827 index_name = index_name + "_index"
828 col_index = VectorIndex(name=index_name, data=create_vector_index[-(i+1)], target=col)
829 self.__add_column_index_helper(col_index)
830 if i < index - 1:
831 columns.insert(0, col_index)
832 col = col_index
833 else: # make VectorIndex with supplied data
834 assert len(col) > 0, ValueError("cannot pass non-empty index with empty data to index")
835 col_index = VectorIndex(name=name + "_index", data=index, target=col)
836 self.__add_column_index_helper(col_index)
837 columns.insert(0, col_index)
838 col = col_index
840 if len(col) != len(self.id):
841 raise ValueError("column must have the same number of rows as 'id'")
842 self.__colids[name] = len(self.__df_cols)
843 self.fields['colnames'] = tuple(list(self.colnames) + [name])
844 self.fields['columns'] = tuple(list(self.columns) + columns)
845 self.__df_cols.append(col)
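# Editor's note: a sketch (not part of the original source) of the automatic
# index construction above. Passing nested data with index=True flattens the
# data and builds the VectorIndex from the cumulative lengths ('t' is a
# hypothetical table that already has two rows):
#
#     >>> t.add_column(name='words', description='ragged column', index=True,
#     ...              data=[['a', 'b'], ['c', 'd', 'e']])
#     >>> list(t.words.data)                      # flattened column data
#     ['a', 'b', 'c', 'd', 'e']
#     >>> [int(i) for i in t.words_index.data]    # cumulative lengths
#     [2, 5]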
847 def __add_column_index_helper(self, col_index):
848 if not isinstance(col_index.parent, Container): 848 ↛ 851: line 848 didn't jump to line 851, because the condition on line 848 was never false
849 col_index.parent = self
850 # else, the ObjectMapper will create a link from self (parent) to col_index (child with existing parent)
851 self.__indices[col_index.name] = col_index
852 self.__set_table_attr(col_index)
853 if col_index in self.__uninit_cols: 853 ↛ 854: line 853 didn't jump to line 854, because the condition on line 853 was never true
854 self.__uninit_cols.pop(col_index)
856 @docval({'name': 'name', 'type': str, 'doc': 'the name of the DynamicTableRegion object'},
857 {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the indices of the table'},
858 {'name': 'description', 'type': str, 'doc': 'a brief description of what the region is'})
859 def create_region(self, **kwargs):
860 """
861 Create a DynamicTableRegion selecting a region (i.e., rows) in this DynamicTable.
863 :raises: IndexError if the provided region contains invalid indices
865 """
866 region = getargs('region', kwargs)
867 if isinstance(region, slice):
868 if (region.start is not None and region.start < 0) or (region.stop is not None and region.stop > len(self)):
869 msg = 'region slice %s is out of range for this DynamicTable of length %d' % (str(region), len(self))
870 raise IndexError(msg)
871 region = list(range(*region.indices(len(self))))
872 else:
873 for idx in region:
874 if idx < 0 or idx >= len(self):
875 raise IndexError('The index ' + str(idx) +
876 ' is out of range for this DynamicTable of length '
877 + str(len(self)))
878 desc = getargs('description', kwargs)
879 name = getargs('name', kwargs)
880 return DynamicTableRegion(name=name, data=region, description=desc, table=self)
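# Editor's note: a usage sketch (not part of the original source); 'table' is a
# hypothetical DynamicTable with at least three rows:
#
#     >>> region = table.create_region(name='subset', region=[0, 2],
#     ...                              description='every other row')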
882 def __getitem__(self, key):
883 ret = self.get(key)
884 if ret is None:
885 raise KeyError(key)
886 return ret
888 def get(self, key, default=None, df=True, index=True, **kwargs):
889 """Select a subset from the table.
891 If the table includes a DynamicTableRegion column, then by default,
892 the index/indices of the DynamicTableRegion will be returned. If ``df=True`` and ``index=False``,
893 then the returned pandas DataFrame will contain a nested DataFrame in each row of the
894 DynamicTableRegion column. If ``df=False`` and ``index=True``, then a list of lists will be returned
895 where the list containing the DynamicTableRegion column contains the indices of the DynamicTableRegion.
896 Note that in this case, the DynamicTable referenced by the DynamicTableRegion can be accessed through
897 the ``table`` attribute of the DynamicTableRegion object. ``df=False`` and ``index=False`` is
898 not yet supported.
900 :param key: Key defining which elements of the table to select. This may be one of the following:
902 1) string with the name of the column to select
903 2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
904 column to select by name
905 3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
906 scalars are returned for each column that has a single value. If a list, array, or slice is used and
907 df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
908 single row.
910 :return: 1) If key is a string, then return the VectorData object representing the column with the string name
911 2) If key is a tuple of (int, str), then return the scalar value of the selected cell
912 3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame or lists
913 consisting of one or more rows
915 :raises: KeyError
916 """
917 ret = None
918 if not df and not index:
919 # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
920 raise ValueError('DynamicTable.get() with df=False and index=False is not yet supported.')
921 if isinstance(key, tuple):
922 # index by row and column --> return specific cell
923 arg1 = key[0]
924 arg2 = key[1]
925 if isinstance(arg2, str):
926 arg2 = self.__colids[arg2]
927 ret = self.__df_cols[arg2][arg1]
928 elif isinstance(key, str):
929 # index by one string --> return column
930 if key == 'id': 930 ↛ 931: line 930 didn't jump to line 931, because the condition on line 930 was never true
931 return self.id
932 elif key in self.__colids:
933 ret = self.__df_cols[self.__colids[key]]
934 elif key in self.__indices:
935 ret = self.__indices[key]
936 else:
937 return default
938 else:
939 # index by int, list, np.ndarray, or slice -->
940 # return pandas Dataframe or lists consisting of one or more rows
941 sel = self.__get_selection_as_dict(key, df, index, **kwargs)
942 if df:
943 # reformat objects to fit into a pandas DataFrame
944 if np.isscalar(key):
945 ret = self.__get_selection_as_df_single_row(sel)
946 else:
947 ret = self.__get_selection_as_df(sel)
948 else:
949 ret = list(sel.values())
951 return ret
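# Editor's note: a sketch (not part of the original source) of the three key
# forms accepted above, for a hypothetical table 'trials' with a 'start' column:
#
#     >>> trials['start']                     # 1) column by name -> VectorData
#     >>> trials[0, 'start']                  # 2) (row, column) -> scalar cell value
#     >>> trials[0:2]                         # 3) rows -> pandas.DataFrame
#     >>> trials.get(slice(0, 2), df=False)   # rows as a list of column value lists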
953 def __get_selection_as_dict(self, arg, df, index, exclude=None, **kwargs):
954 """Return a dict mapping column names to values (lists/arrays or dataframes) for the given selection.
955 Uses each column's get() method, passing kwargs as necessary.
957 :param arg: key passed to get() to return one or more rows
958 :type arg: int, list, np.ndarray, or slice
959 """
960 if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))):
961 raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg)))
962 if isinstance(arg, np.ndarray) and arg.ndim != 1: 962 ↛ 963: line 962 didn't jump to line 963, because the condition on line 962 was never true
963 raise ValueError("Cannot index DynamicTable with multiple dimensions")
964 if exclude is None:
965 exclude = set([])
966 ret = OrderedDict()
967 try:
968 # index with a python slice or single int to select one or multiple rows
969 ret['id'] = self.id[arg]
970 for name in self.colnames:
971 if name in exclude:
972 continue
973 col = self.__df_cols[self.__colids[name]]
974 if index and (isinstance(col, DynamicTableRegion) or
975 (isinstance(col, VectorIndex) and isinstance(col.target, DynamicTableRegion))):
976 # return indices (in list, array, etc.) for DTR and ragged DTR
977 ret[name] = col.get(arg, df=False, index=True, **kwargs)
978 else:
979 ret[name] = col.get(arg, df=df, index=index, **kwargs)
980 return ret
981 # if index is out of range, different errors can be generated depending on the dtype of the column
982 # but despite the differences, raise an IndexError from that error
983 except ValueError as ve: 983 ↛ 986: line 983 didn't jump to line 986, because the exception caught by line 983 didn't happen
984 # in h5py <2, if the column is an h5py.Dataset, a ValueError was raised
985 # in h5py 3+, this became an IndexError
986 x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve))
987 if x:
988 msg = ("Row index %s out of range for %s '%s' (length %d)."
989 % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
990 raise IndexError(msg) from ve
991 else: # pragma: no cover
992 raise ve
993 except IndexError as ie:
994 x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie))
995 if x:
996 msg = ("Row index %s out of range for %s '%s' (length %d)."
997 % (x.groups()[0], self.__class__.__name__, self.name, len(self)))
998 raise IndexError(msg) from ie
999 elif str(ie) == 'list index out of range':
1000 msg = ("Row index out of range for %s '%s' (length %d)."
1001 % (self.__class__.__name__, self.name, len(self)))
1002 raise IndexError(msg) from ie
1003 else: # pragma: no cover
1004 raise ie
1006 def __get_selection_as_df_single_row(self, coldata):
1007 """Return a pandas dataframe for the given row and columns with the id column as the index.
1009 This is a special case of __get_selection_as_df where a single row was requested.
1011 :param coldata: dict mapping column names to values (list/arrays or dataframes)
1012 :type coldata: dict
1013 """
1014 id_index_orig = coldata.pop('id')
1015 id_index = [id_index_orig]
1016 df_input = OrderedDict()
1017 for k in coldata: # for each column
1018 if isinstance(coldata[k], (np.ndarray, list, tuple, pd.DataFrame)):
1019 # wrap in a list because coldata[k] may be an array/list/tuple with multiple elements (ragged or
1020 # multi-dim column) and pandas needs to have one element per index row (=1 in this case)
1021 df_input[k] = [coldata[k]]
1022 else: # scalar, don't wrap
1023 df_input[k] = coldata[k]
1024 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64))
1025 ret.name = self.name
1026 return ret
1028 def __get_selection_as_df(self, coldata):
1029 """Return a pandas dataframe for the given rows and columns with the id column as the index.
1031 This is used when multiple row indices are selected (or a list/array/slice of a single index is passed to get).
1032 __get_selection_as_df_single_row should be used if a single index is passed to get.
1034 :param coldata: dict mapping column names to values (list/arrays or dataframes)
1035 :type coldata: dict
1036 """
1037 id_index = coldata.pop('id')
1038 df_input = OrderedDict()
1039 for k in coldata: # for each column
1040 if isinstance(coldata[k], np.ndarray) and coldata[k].ndim > 1:
1041 df_input[k] = list(coldata[k]) # convert multi-dim array to list of inner arrays
1042 elif isinstance(coldata[k], pd.DataFrame):
1043 # multiple rows were selected and collapsed into a dataframe
1044 # split up the rows of the df into a list of dataframes, one per row
1045 # TODO make this more efficient
1046 df_input[k] = [coldata[k].iloc[[i]] for i in range(len(coldata[k]))]
1047 else:
1048 df_input[k] = coldata[k]
1049 ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index, dtype=np.int64))
1050 ret.name = self.name
1051 return ret
1053 def __contains__(self, val):
1054 """
1055 Check if the given value (i.e., column) exists in this table
1056 """
1057 return val in self.__colids or val in self.__indices
1059 def get_foreign_columns(self):
1060 """
1061 Determine the names of all columns that link to another DynamicTable, i.e.,
1062 find all DynamicTableRegion type columns. Similar to a foreign key in a
1063 database, a DynamicTableRegion column references elements in another table.
1065 :returns: List of strings with the column names
1066 """
1067 col_names = []
1068 for col_index, col in enumerate(self.columns):
1069 if isinstance(col, DynamicTableRegion):
1070 col_names.append(col.name)
1071 return col_names
1073 def has_foreign_columns(self):
1074 """
1075 Does the table contain DynamicTableRegion columns?
1077 :returns: True if the table contains a DynamicTableRegion column, else False
1078 """
1079 for col_index, col in enumerate(self.columns):
1080 if isinstance(col, DynamicTableRegion):
1081 return True
1082 return False
1084 @docval({'name': 'other_tables', 'type': (list, tuple, set),
1085 'doc': "List of additional tables to consider in the search. Usually this "
1086 "parameter is used for internal purposes, e.g., when we need to "
1087 "consider AlignedDynamicTable", 'default': None},
1088 allow_extra=False)
1089 def get_linked_tables(self, **kwargs):
1090 """
1091 Get the full list of all tables that are linked to, directly or indirectly,
1092 from this table via DynamicTableRegion columns included in this table or in any table that
1093 can be reached through DynamicTableRegion columns
1095 Returns: List of NamedTuple objects with:
1096 * 'source_table' : The source table containing the DynamicTableRegion column
1097 * 'source_column' : The relevant DynamicTableRegion column in the 'source_table'
1098 * 'target_table' : The target DynamicTable; same as source_column.table.
1099 """
1100 link_type = NamedTuple('DynamicTableLink',
1101 [('source_table', DynamicTable),
1102 ('source_column', Union[DynamicTableRegion, VectorIndex]),
1103 ('target_table', DynamicTable)])
1104 curr_tables = [self, ] # list of tables to process; also tracks which tables have been visited
1105 other_tables = getargs('other_tables', kwargs)
1106 if other_tables is not None:
1107 curr_tables += other_tables
1108 curr_index = 0
1109 foreign_cols = []
1110 while curr_index < len(curr_tables):
1111 for col_index, col in enumerate(curr_tables[curr_index].columns):
1112 if isinstance(col, DynamicTableRegion):
1113 foreign_cols.append(link_type(source_table=curr_tables[curr_index],
1114 source_column=col,
1115 target_table=col.table))
1116 curr_table_visited = False
1117 for t in curr_tables:
1118 if t is col.table:
1119 curr_table_visited = True
1120 if not curr_table_visited:
1121 curr_tables.append(col.table)
1122 curr_index += 1
1123 return foreign_cols
1125 @docval({'name': 'exclude', 'type': set, 'doc': 'Set of column names to exclude from the dataframe',
1126 'default': None},
1127 {'name': 'index', 'type': bool,
1128 'doc': ('Whether to return indices for a DynamicTableRegion column. If False, nested dataframes will be '
1129 'returned.'),
1130 'default': False}
1131 )
1132 def to_dataframe(self, **kwargs):
1133 """
1134 Produce a pandas DataFrame containing this table's data.
1136 If this table contains a DynamicTableRegion column, then by default nested DataFrames will be returned for it; pass index=True to return the region's row indices instead.
1138 If exclude is None, this is equivalent to table.get(slice(None, None, None), index=False).
1139 """
1140 arg = slice(None, None, None) # select all rows
1141 sel = self.__get_selection_as_dict(arg, df=True, **kwargs)
1142 ret = self.__get_selection_as_df(sel)
1143 return ret
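# Editor's note: a usage sketch (not part of the original source); 'table' is a
# hypothetical DynamicTable with a DynamicTableRegion column named 'group':
#
#     >>> df = table.to_dataframe()                   # 'group' as nested DataFrames
#     >>> df = table.to_dataframe(index=True)         # 'group' as row indices
#     >>> df = table.to_dataframe(exclude={'group'})  # drop the column entirely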
1145 @classmethod
1146 @docval(
1147 {'name': 'df', 'type': pd.DataFrame, 'doc': 'source DataFrame'},
1148 {'name': 'name', 'type': str, 'doc': 'the name of this table'},
1149 {
1150 'name': 'index_column',
1151 'type': str,
1152 'doc': 'if provided, this column will become the table\'s index',
1153 'default': None
1154 },
1155 {
1156 'name': 'table_description',
1157 'type': str,
1158 'doc': 'a description of what is in the resulting table',
1159 'default': ''
1160 },
1161 {
1162 'name': 'columns',
1163 'type': (list, tuple),
1164 'doc': 'a list/tuple of dictionaries specifying columns in the table',
1165 'default': None
1166 },
1167 allow_extra=True
1168 )
1169 def from_dataframe(cls, **kwargs):
1170 '''
1171 Construct an instance of DynamicTable (or a subclass) from a pandas DataFrame.
1173 The columns of the resulting table are defined by the columns of the
1174 dataframe and the index by the dataframe's index (make sure it has a
1175 name!) or by a column whose name is supplied to the index_column
1176 parameter. We recommend that you supply *columns*, a list/tuple of
1177 dictionaries containing the name and description of each column, to help
1178 others understand the contents of your table. See
1179 :py:class:`~hdmf.common.table.DynamicTable` for more details on *columns*.
1180 '''
1182 columns = kwargs.pop('columns')
1183 df = kwargs.pop('df')
1184 name = kwargs.pop('name')
1185 index_column = kwargs.pop('index_column')
1186 table_description = kwargs.pop('table_description')
1187 column_descriptions = kwargs.pop('column_descriptions', dict())
1189 supplied_columns = dict()
1190 if columns: 1190 ↛ 1191: line 1190 didn't jump to line 1191, because the condition on line 1190 was never true
1191 supplied_columns = {x['name']: x for x in columns}
1193 class_cols = {x['name']: x for x in cls.__columns__}
1194 required_cols = set(x['name'] for x in cls.__columns__ if 'required' in x and x['required'])
1195 df_cols = df.columns
1196 if required_cols - set(df_cols): 1196 ↛ 1197: line 1196 didn't jump to line 1197, because the condition on line 1196 was never true
1197 raise ValueError('missing required cols: ' + str(required_cols - set(df_cols)))
1198 if set(supplied_columns.keys()) - set(df_cols): 1198 ↛ 1199: line 1198 didn't jump to line 1199, because the condition on line 1198 was never true
1199 raise ValueError('cols specified but not provided: ' + str(set(supplied_columns.keys()) - set(df_cols)))
1200 columns = []
1201 for col_name in df_cols:
1202 if col_name in class_cols: 1202 ↛ 1203: line 1202 didn't jump to line 1203, because the condition on line 1202 was never true
1203 columns.append(class_cols[col_name])
1204 elif col_name in supplied_columns: 1204 ↛ 1205: line 1204 didn't jump to line 1205, because the condition on line 1204 was never true
1205 columns.append(supplied_columns[col_name])
1206 else:
1207 columns.append({'name': col_name,
1208 'description': column_descriptions.get(col_name, 'no description')})
1209 if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
1210 lengths = [len(x) for x in df[col_name]]
1211 if not lengths[1:] == lengths[:-1]: 1211 ↛ 1212: line 1211 didn't jump to line 1212, because the condition on line 1211 was never true
1212 columns[-1].update(index=True)
1214 if index_column is not None: 1214 ↛ 1215: line 1214 didn't jump to line 1215, because the condition on line 1214 was never true
1215 ids = ElementIdentifiers(name=index_column, data=df[index_column].values.tolist())
1216 else:
1217 index_name = df.index.name if df.index.name is not None else 'id'
1218 ids = ElementIdentifiers(name=index_name, data=df.index.values.tolist())
1220 columns = cls.__build_columns(columns, df=df)
1222 return cls(name=name, id=ids, columns=columns, description=table_description, **kwargs)
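# Editor's note: a construction sketch (not part of the original source);
# 'trials' and 'start' are hypothetical names:
#
#     >>> df = pd.DataFrame({'start': [1.0, 2.5]},
#     ...                   index=pd.Index([0, 1], name='id'))
#     >>> trials = DynamicTable.from_dataframe(
#     ...     df=df, name='trials', table_description='demo table',
#     ...     columns=[{'name': 'start', 'description': 'start time'}])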
1224 def copy(self):
1225 """
1226 Return a copy of this DynamicTable.
1227 This is useful for linking.
1228 """
1229 kwargs = dict(name=self.name, id=self.id, columns=self.columns, description=self.description,
1230 colnames=self.colnames)
1231 return self.__class__(**kwargs)
1234@register_class('DynamicTableRegion')
1235class DynamicTableRegion(VectorData):
1236 """
1237 DynamicTableRegion provides a link from one table to an index or region of another. The `table`
1238 attribute is another `DynamicTable`, indicating which table is referenced. The data is int(s)
1239 indicating the row(s) (0-indexed) of the target array. `DynamicTableRegion`s can be used to
1240 associate multiple rows with the same meta-data without data duplication. They can also be used to
1241 create hierarchical relationships between multiple `DynamicTable`s. `DynamicTableRegion` objects
1242 may be paired with a `VectorIndex` object to create ragged references, so a single cell of a
1243 `DynamicTable` can reference many rows of another `DynamicTable`.
1244 """
1246 __fields__ = (
1247 'table',
1248 )
1250 @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
1251 {'name': 'data', 'type': ('array_data', 'data'),
1252 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'},
1253 {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'},
1254 {'name': 'table', 'type': DynamicTable,
1255 'doc': 'the DynamicTable this region applies to', 'default': None},
1256 allow_positional=AllowPositional.WARNING)
1257 def __init__(self, **kwargs):
1258 t = popargs('table', kwargs)
1259 super().__init__(**kwargs)
1260 self.table = t
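# Editor's note: a sketch (not part of the original source). Several rows of one
# table can share a row of another table ('groups' is a hypothetical
# DynamicTable with at least two rows):
#
#     >>> dtr = DynamicTableRegion(name='group', data=[0, 0, 1],
#     ...                          description='group per trial', table=groups)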
1262 @property
1263 def table(self):
1264 """The DynamicTable this DynamicTableRegion is pointing to"""
1265 return self.fields.get('table')
1267 @table.setter
1268 def table(self, val):
1269 """
1270 Set the table this DynamicTableRegion should be pointing to
1272 :param val: The DynamicTable this DynamicTableRegion should be pointing to
1274 :raises: AttributeError if the table attribute has already been set
1276 """
1277 if val is None:
1278 return
1279 if 'table' in self.fields:
1280 msg = "can't set attribute 'table' -- already set"
1281 raise AttributeError(msg)
1285 self.fields['table'] = val
1287 def __getitem__(self, arg):
1288 return self.get(arg)
1290 def get(self, arg, index=False, df=True, **kwargs):
1291 """
1292 Subset the DynamicTableRegion
1294 :param arg: Key defining which elements of the table to select. This may be one of the following:
1296 1) string with the name of the column to select
1297 2) a tuple consisting of (int, str) where the int selects the row and the string identifies the
1298 column to select by name
1299 3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then
1300 scalars are returned for each column that has a single value. If a list, array, or slice is used and
1301 df=False, then lists are returned for each column, even if the list, array, or slice resolves to a
1302 single row.
1304 :param index: Boolean indicating whether to return indices of the DTR (default False)
1305 :param df: Boolean indicating whether to return the result as a pandas DataFrame (default True)
1307 :return: Result from self.table[...] with the appropriate selection based on the
1308 rows selected by this DynamicTableRegion
1309 """
1310 if not df and not index:
1311 # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported
1312 raise ValueError('DynamicTableRegion.get() with df=False and index=False is not yet supported.')
1313 # treat the list of indices as data that can be indexed. then pass the
1314 # result to the table to get the data
1315 if isinstance(arg, tuple):
1316 arg1 = arg[0]
1317 arg2 = arg[1]
1318 return self.table[self.data[arg1], arg2]
1319 elif isinstance(arg, str):
1320 return self.table[arg]
1321 elif np.issubdtype(type(arg), np.integer):
1322 if arg >= len(self.data):
1323 raise IndexError('index {} out of bounds for data of length {}'.format(arg, len(self.data)))
1324 ret = self.data[arg]
1325 if not index:
1326 ret = self.table.get(ret, df=df, index=index, **kwargs)
1327 return ret
1328 elif isinstance(arg, (list, slice, np.ndarray)):
1329 idx = arg
1331 # get the data at the specified indices
1332 if isinstance(self.data, (tuple, list)) and isinstance(idx, (list, np.ndarray)):
1333 ret = [self.data[i] for i in idx]
1334 else:
1335 ret = self.data[idx]
1337 # dereference them if necessary
1338 if not index:
1339 # These lines are needed because indexing an HDF5 Dataset with a list/ndarray
1340 # of ints requires the list to be sorted.
1341 #
1342 # First get the unique elements, retrieve them from the table, and then
1343 # reorder the result according to the original index that the user passed in.
1344 #
1345 # When not returning a DataFrame, we need to recursively sort the subelements
1346 # of the list we are returning. This is carried out by the recursive method _index_lol
1347 uniq = np.unique(ret)
1348 lut = {val: i for i, val in enumerate(uniq)}
1349 values = self.table.get(uniq, df=df, index=index, **kwargs)
1350 if df:
1351 ret = values.iloc[[lut[i] for i in ret]]
1352 else:
1353 ret = self._index_lol(values, ret, lut)
1354 return ret
1355 else:
1356 raise ValueError("unrecognized argument: '%s'" % arg)
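# Usage sketch (illustrative; not part of table.py), one call per branch of
# get() above, using the hypothetical `region` from the earlier sketch:
#
#   >>> region.get([0, 2], index=True)    # raw stored indices, not dereferenced
#   [0, 0]
#   >>> region.get(1)                     # one dereferenced row, as a DataFrame
#   >>> region.get((1, 'letter'))         # row 1 of the region, column 'letter'
#   'b'
#   >>> region.get('letter')              # the 'letter' column of the target table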
1358 def _index_lol(self, result, index, lut):
1359 """
1360 This is a helper function for indexing a list of lists/ndarrays. When not returning a
1361 DataFrame, indexing a DynamicTable will return a list of lists and ndarrays. To sort
1362 the result of a DynamicTable index according to the order of the indices passed in by the
1363 user, we have to recursively sort the sub-lists/sub-ndarrays.
1364 """
1365 ret = list()
1366 for col in result:
1367 if isinstance(col, list):
1368 if isinstance(col[0], list):
1369 # list of columns that need to be sorted
1370 ret.append(self._index_lol(col, index, lut))
1371 else:
1372 # list of elements, one for each row to return
1373 ret.append([col[lut[i]] for i in index])
1374 elif isinstance(col, np.ndarray):
1375 ret.append(np.array([col[lut[i]] for i in index], dtype=col.dtype))
1376 else:
1377 raise ValueError('unrecognized column type: %s. Expected list or np.ndarray' % type(col))
1378 return ret
1380 def to_dataframe(self, **kwargs):
1381 """
1382 Convert the whole DynamicTableRegion to a pandas dataframe.
1384 Keyword arguments are passed through to the to_dataframe method of DynamicTable that
1385 is being referenced (i.e., self.table). This allows specification of the 'exclude'
1386 parameter and any other parameters of DynamicTable.to_dataframe.
1387 """
1388 return self.table.to_dataframe(**kwargs).iloc[self.data[:]]
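# Usage sketch (illustrative; not part of table.py): keyword arguments pass
# through to DynamicTable.to_dataframe, e.g. to exclude a column by name:
#
#   >>> region.to_dataframe(exclude={'count'})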
1390 @property
1391 def shape(self):
1392 """
1393 Define the shape, i.e., (num_rows, num_columns) of the selected table region
1394 :return: Shape tuple with two integers indicating the number of rows and number of columns
1395 """
1396 return (len(self.data), len(self.table.columns))
1398 def __repr__(self):
1399 """
1400 :return: Human-readable string representation of the DynamicTableRegion
1401 """
1402 cls = self.__class__
1403 template = "%s %s.%s at 0x%x\n" % (self.name, cls.__module__, cls.__name__, id(self))
1404 template += " Target table: %s %s.%s at 0x%x\n" % (self.table.name,
1405 self.table.__class__.__module__,
1406 self.table.__class__.__name__,
1407 id(self.table))
1408 return template
1410 def _validate_on_set_parent(self):
1411 # when this DynamicTableRegion is added to a parent, check:
1412 # 1) if the table was read from a written file, no need to validate further
1413 p = self.table
1414 while p is not None:
1415 if p.container_source is not None:
1416 return super()._validate_on_set_parent()
1417 p = p.parent
1419 # 2) if none of the ancestors are ancestors of the linked-to table, then when this is written, the table
1420 # field will point to a table that is not in the file
1421 table_ancestor_ids = [id(x) for x in self.table.get_ancestors()]
1422 self_ancestor_ids = [id(x) for x in self.get_ancestors()]
1424 if set(table_ancestor_ids).isdisjoint(self_ancestor_ids):
1425 msg = (f"The linked table for DynamicTableRegion '{self.name}' does not share an ancestor with the "
1426 "DynamicTableRegion.")
1427 warn(msg)
1428 return super()._validate_on_set_parent()
1431def _uint_precision(elements):
1432 """ Calculate the uint precision needed to encode a set of elements """
1433 n_elements = elements
1434 if hasattr(elements, '__len__'):
1435 n_elements = len(elements)
1436 return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type
1439def _map_elements(uint, elements):
1440 """ Map CV terms to their uint index """
1441 return {t[1]: uint(t[0]) for t in enumerate(elements)}
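# Worked check (illustrative; not part of table.py) of the two helpers above.
# ceil(log2(300)) = 9 bits rounds up to 2 bytes, i.e. uint16, while 3 elements
# fit within the 1-byte minimum, i.e. uint8:
#
#   >>> _uint_precision(range(3)) is np.uint8
#   True
#   >>> _uint_precision(range(300)) is np.uint16
#   True
#   >>> _map_elements(np.uint8, ['a', 'b']) == {'a': 0, 'b': 1}
#   True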
1444@register_class('EnumData', EXP_NAMESPACE)
1445class EnumData(VectorData):
1446 """
1447 An n-dimensional dataset whose values are drawn from a fixed set of elements.
1448 """
1450 __fields__ = ('elements', )
1452 @docval({'name': 'name', 'type': str, 'doc': 'the name of this column'},
1453 {'name': 'description', 'type': str, 'doc': 'a description for this column'},
1454 {'name': 'data', 'type': ('array_data', 'data'),
1455 'doc': 'integers that index into elements for the value of each row', 'default': list()},
1456 {'name': 'elements', 'type': ('array_data', 'data', VectorData), 'default': list(),
1457 'doc': 'lookup values for each integer in ``data``'},
1458 allow_positional=AllowPositional.WARNING)
1459 def __init__(self, **kwargs):
1460 elements = popargs('elements', kwargs)
1461 super().__init__(**kwargs)
1462 if not isinstance(elements, VectorData):
1463 elements = VectorData(name='%s_elements' % self.name, data=elements,
1464 description='fixed set of elements referenced by %s' % self.name)
1465 self.elements = elements
1466 if len(self.elements) > 0:
1467 self.__uint = _uint_precision(self.elements.data)
1468 self.__revidx = _map_elements(self.__uint, self.elements.data)
1469 else:
1470 self.__revidx = dict() # a map from term to index
1471 self.__uint = None # the precision needed to encode all terms
1473 def __add_term(self, term):
1474 """
1475 Add a new CV term and return its corresponding index
1477 Returns:
1478 The index of the term
1479 """
1480 if term not in self.__revidx:
1481 # get minimum uint precision needed for elements
1482 self.elements.append(term)
1483 uint = _uint_precision(self.elements)
1484 if self.__uint is uint:
1485 # add the new term to the index-term map
1486 self.__revidx[term] = self.__uint(len(self.elements) - 1)
1487 else:
1488 # remap terms to their uint and bump the precision of existing data
1489 self.__uint = uint
1490 self.__revidx = _map_elements(self.__uint, self.elements)
1491 for i in range(len(self.data)):
1492 self.data[i] = self.__uint(self.data[i])
1493 return self.__revidx[term]
1495 def __getitem__(self, arg):
1496 return self.get(arg, index=False)
1498 def _get_helper(self, idx, index=False, join=False, **kwargs):
1499 """
1500 A helper function for resolving element values from indices
1502 This helper function contains the post-processing of retrieved indices. Separating it
1503 allows customizing how indices are processed before the element values are resolved
1504 """
1505 if index:
1506 return idx
1507 if not np.isscalar(idx):
1508 idx = np.asarray(idx)
1509 ret = np.asarray(self.elements.get(idx.ravel(), **kwargs)).reshape(idx.shape)
1510 if join:
1511 ret = ''.join(ret.ravel())
1512 else:
1513 ret = self.elements.get(idx, **kwargs)
1514 return ret
1516 def get(self, arg, index=False, join=False, **kwargs):
1517 """
1518 Return the element values for the given argument.
1520 Args:
1521 index (bool): Return the indices rather than resolving them to element values
1522 join (bool): Concatenate the selected elements into a single string
1524 Returns:
1525 The selected element values if *join* is False, or a concatenation of all
1526 selected elements if *join* is True.
1527 """
1528 idx = self.data[arg]
1529 return self._get_helper(idx, index=index, join=join, **kwargs)
1531 @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'},
1532 {'name': 'index', 'type': bool, 'doc': 'whether or not the value being added is an index',
1533 'default': False})
1534 def add_row(self, **kwargs):
1535 """Append a data value to this EnumData column
1537 If an element is provided for *val* (i.e. *index* is False), the correct
1538 index value will be determined. Otherwise, *val* will be added as provided.
1539 """
1540 val, index = getargs('val', 'index', kwargs)
1541 if not index:
1542 val = self.__add_term(val)
1543 super().append(val)
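# Usage sketch (illustrative; not part of table.py). EnumData stores each row
# as an integer code into `elements`, and add_row() resolves new values to
# codes automatically:
#
#   >>> from hdmf.common.table import EnumData
#   >>> cell_type = EnumData(
#   ...     name='cell_type', description='the type of each cell',
#   ...     data=[0, 1, 1], elements=['pyramidal', 'granule'])
#   >>> cell_type[0]
#   'pyramidal'
#   >>> cell_type.add_row('granule')       # reuses the existing code 1
#   >>> cell_type.add_row(0, index=True)   # appends a pre-resolved code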