Coverage for src/hdmf/common/alignedtable.py: 99%

171 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-07-25 05:02 +0000

1""" 

2Collection of Container classes for interacting with aligned and hierarchical dynamic tables 

3""" 

4from collections import OrderedDict 

5 

6import numpy as np 

7import pandas as pd 

8 

9from . import register_class 

10from .table import DynamicTable 

11from ..utils import docval, getargs, popargs, get_docval, AllowPositional 

12 

13 

@register_class('AlignedDynamicTable')
class AlignedDynamicTable(DynamicTable):
    """
    DynamicTable container that supports storing a collection of subtables. Each sub-table is a
    DynamicTable itself that is aligned with the main table by row index. I.e., all
    DynamicTables stored in this group MUST have the same number of rows. This type effectively
    defines a 2-level table in which the main data is stored in the main table implemented by this type
    and additional columns of the table are grouped into categories, with each category being
    represented by a separate DynamicTable stored within the group.

    NOTE: To remain compatible with DynamicTable, the attribute colnames represents only the
          columns of the main table (not including the category tables). To get the full list of
          column names, use the get_colnames() function instead.
    """
    __fields__ = ({'name': 'category_tables', 'child': True}, )

    @docval(*get_docval(DynamicTable.__init__),
            {'name': 'category_tables', 'type': list,
             'doc': 'List of DynamicTables to be added to the container. NOTE: Only regular '
                    'DynamicTables are allowed. Using AlignedDynamicTable as a category for '
                    'AlignedDynamicTable is currently not supported.', 'default': None},
            {'name': 'categories', 'type': 'array_data',
             'doc': 'List of names with the ordering of category tables', 'default': None},
            allow_positional=AllowPositional.WARNING)
    def __init__(self, **kwargs):  # noqa: C901
        """
        Validate the category inputs, initialize the main table, and attach the category tables.

        :raises: ValueError if a category table is not a regular DynamicTable, if ``categories`` is given
                 without ``category_tables``, if the two inputs have different lengths, if a table name is
                 missing from or duplicated in ``categories``, or if a category table does not have the
                 same number of rows as the main table.
        """
        in_category_tables = popargs('category_tables', kwargs)
        in_categories = popargs('categories', kwargs)
        if in_category_tables is not None:
            # Error check to make sure that all category_table are regular DynamicTable
            for i, v in enumerate(in_category_tables):
                if not isinstance(v, DynamicTable):
                    raise ValueError("Category table with index %i is not a DynamicTable" % i)
                if isinstance(v, AlignedDynamicTable):
                    raise ValueError("Category table with index %i is an AlignedDynamicTable. "
                                     "Nesting of AlignedDynamicTable is currently not supported." % i)
        # set in_categories from the in_category_tables if it is empty
        if in_categories is None and in_category_tables is not None:
            in_categories = [tab.name for tab in in_category_tables]
        # check that if categories is given that we also have category_tables
        if in_categories is not None and in_category_tables is None:
            raise ValueError("Categories provided but no category_tables given")
        # at this point both in_categories and in_category_tables should either both be None or both be a list
        if in_categories is not None:
            if len(in_categories) != len(in_category_tables):
                raise ValueError("%s category_tables given but %s categories specified" %
                                 (len(in_category_tables), len(in_categories)))
        # Initialize the main dynamic table
        super().__init__(**kwargs)
        # Create and set all sub-categories
        dts = OrderedDict()
        # Add the custom categories given as inputs
        if in_category_tables is not None:
            # We may need to resize our main table when adding categories as the user may not have set ids
            if len(in_category_tables) > 0:
                # We have categories to process
                if len(self.id) == 0:
                    # The user did not initialize our main table id's nor set columns for our main table,
                    # so we generate one id per row of the first category table
                    for row_index, _ in enumerate(range(len(in_category_tables[0]))):
                        self.id.append(row_index)
            # Add the user-provided categories in the correct order as described by the categories.
            # This is necessary, because we do not store the categories explicitly but we maintain them
            # as the order of our self.category_tables. This makes sure look-ups are consistent.
            lookup_index = OrderedDict([(k, -1) for k in in_categories])
            for i, v in enumerate(in_category_tables):
                # Error check that the name of the table is in our categories list
                if v.name not in lookup_index:
                    raise ValueError("DynamicTable %s does not appear in categories %s" % (v.name, str(in_categories)))
                # Error check to make sure no two tables with the same name are given
                if lookup_index[v.name] >= 0:
                    raise ValueError("Duplicate table name %s found in input dynamic_tables" % v.name)
                lookup_index[v.name] = i
            for table_name, table_index in lookup_index.items():
                # This error case should not be able to occur since the length of the in_categories and
                # in_category_tables must match and we made sure that each DynamicTable we added had its
                # name in the in_categories list. We, therefore, exclude this check from coverage testing
                # but we leave it in just as a backup trigger in case something unexpected happens
                if table_index < 0:  # pragma: no cover
                    raise ValueError("DynamicTable %s listed in categories but does not appear in category_tables" %
                                     table_name)  # pragma: no cover
                # Test that all category tables have the correct number of rows
                category = in_category_tables[table_index]
                if len(category) != len(self):
                    raise ValueError('Category DynamicTable %s does not align, it has %i rows expected %i' %
                                     (category.name, len(category), len(self)))
                # Add the category table to our category_tables.
                dts[category.name] = category
        # Set the self.category_tables attribute, which will set the parent/child relationships for the category_tables
        self.category_tables = dts

    def __contains__(self, val):
        """
        Check if the given value (i.e., column or category) exists in this table

        :param val: If val is a string then check if a category or a column of the main table with that
                    name exists. If val is a tuple of two strings (category, colname) then check for the
                    given category if the given colname exists. Any other type yields False.

        :raises: ValueError if val is a tuple that does not have exactly two elements
        :raises: KeyError if val is a tuple whose category is neither a known category nor the main table
        """
        if isinstance(val, str):
            return val in self.category_tables or val in self.colnames
        elif isinstance(val, tuple):
            if len(val) != 2:
                raise ValueError("Expected tuple of strings of length 2 got tuple of length %i" % len(val))
            return val[1] in self.get_category(val[0])
        else:
            return False

    @property
    def categories(self):
        """
        Get the list of names of the categories

        Short-hand for list(self.category_tables.keys())
        """
        return list(self.category_tables.keys())

    @docval({'name': 'category', 'type': DynamicTable, 'doc': 'Add a new DynamicTable category'},)
    def add_category(self, **kwargs):
        """
        Add a new DynamicTable to the AlignedDynamicTable to create a new category in the table.

        NOTE: The table must align with (i.e, have the same number of rows as) the main data table (and
        other category tables). I.e., if the AlignedDynamicTable is already populated with data
        then we have to populate the new category with the corresponding data before adding it.

        :raises: ValueError is raised if the input table does not have the same number of rows as the main table.
                 ValueError is raised if a category with the same name already exists.
                 ValueError is raised if the table is an AlignedDynamicTable instead of regular DynamicTable.
        """
        category = getargs('category', kwargs)
        if len(category) != len(self):
            raise ValueError('New category DynamicTable does not align, it has %i rows expected %i' %
                             (len(category), len(self)))
        if category.name in self.category_tables:
            raise ValueError("Category %s already in the table" % category.name)
        if isinstance(category, AlignedDynamicTable):
            raise ValueError("Category is an AlignedDynamicTable. Nesting of AlignedDynamicTable "
                             "is currently not supported.")
        self.category_tables[category.name] = category
        category.parent = self

    @docval({'name': 'name', 'type': str, 'doc': 'Name of the category we want to retrieve', 'default': None})
    def get_category(self, **kwargs):
        """
        Retrieve a category table by name.

        :returns: This table itself if name is None, or if name equals ``self.name`` and no category
                  with that name exists. Otherwise the category DynamicTable with the given name.

        :raises: KeyError if the name does not match any category (nor the main table)
        """
        name = popargs('name', kwargs)
        if name is None or (name not in self.category_tables and name == self.name):
            return self
        else:
            return self.category_tables[name]

    @docval(*get_docval(DynamicTable.add_column),
            {'name': 'category', 'type': str, 'doc': 'The category the column should be added to',
             'default': None})
    def add_column(self, **kwargs):
        """
        Add a column to the table

        If no category is given the column is added to the main table, otherwise it is added to
        the category table with the given name.

        :raises: KeyError if the category does not exist

        """
        category_name = popargs('category', kwargs)
        if category_name is None:
            # Add the column to our main table
            super().add_column(**kwargs)
        else:
            # Add the column to a sub-category table
            try:
                category = self.get_category(category_name)
            except KeyError:
                raise KeyError("Category %s not in table" % category_name)
            category.add_column(**kwargs)

    @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None},
            {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
            {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
             'default': False},
            allow_extra=True)
    def add_row(self, **kwargs):
        """
        We can either provide the row data as a single dict or by specifying a dict for each category

        The row data must contain one entry per category (keyed by the category name) whose value is
        a dict with the row data for that category table. All remaining entries are used as the row
        data of the main table.

        :raises: KeyError if the row data does not contain an entry for every category
        """
        data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
        # Work on a shallow copy of a user-provided dict: the entries are popped and extended below
        # and the caller's dictionary must not be modified as a side effect of adding a row
        data = dict(data) if data is not None else kwargs

        # extract the category data
        category_data = {k: data.pop(k) for k in self.categories if k in data}

        # Check that we have the appropriate categories provided
        missing_categories = set(self.categories) - set(category_data)
        if missing_categories:
            raise KeyError(
                '\n'.join([
                    'row data keys do not match available categories',
                    'missing {} category keys: {}'.format(len(missing_categories), missing_categories)
                ])
            )
        # Add the data to our main dynamic table
        data['id'] = row_id
        data['enforce_unique_id'] = enforce_unique_id
        super().add_row(**data)

        # Add the data to all our dynamic table categories
        for category, values in category_data.items():
            self.category_tables[category].add_row(**values)

    @docval({'name': 'include_category_tables', 'type': bool,
             'doc': "Include the columns of the sub-category tables in addition to the main table",
             'default': False},
            {'name': 'ignore_category_ids', 'type': bool,
             'doc': "Ignore id columns of sub-category tables", 'default': False})
    def get_colnames(self, **kwargs):
        """Get the full list of names of columns for this table

        :returns: List of tuples (str, str) where the first string is the name of the DynamicTable
                  that contains the column and the second string is the name of the column. If
                  include_category_tables is False, then a list of column names is returned.
        """
        if not getargs('include_category_tables', kwargs):
            return self.colnames
        else:
            ignore_category_ids = getargs('ignore_category_ids', kwargs)
            columns = [(self.name, c) for c in self.colnames]
            for category in self.category_tables.values():
                if not ignore_category_ids:
                    columns += [(category.name, 'id'), ]
                columns += [(category.name, c) for c in category.colnames]
            return columns

    @docval({'name': 'ignore_category_ids', 'type': bool,
             'doc': "Ignore id columns of sub-category tables", 'default': False})
    def to_dataframe(self, **kwargs):
        """Convert the collection of tables to a single pandas DataFrame

        The frames of the main table and of all category tables are concatenated column-wise, keyed
        by table name, and the result is indexed by the ``(self.name, 'id')`` column.
        """
        dfs = [super().to_dataframe().reset_index(), ]
        if getargs('ignore_category_ids', kwargs):
            dfs += [category.to_dataframe() for category in self.category_tables.values()]
        else:
            dfs += [category.to_dataframe().reset_index() for category in self.category_tables.values()]
        names = [self.name, ] + list(self.category_tables.keys())
        res = pd.concat(dfs, axis=1, keys=names)
        res.set_index((self.name, 'id'), drop=True, inplace=True)
        return res

    def __getitem__(self, item):
        """
        Called to implement standard array slicing syntax.

        Same as ``self.get(item)``. See :py:meth:`~hdmf.common.alignedtable.AlignedDynamicTable.get` for details.
        """
        return self.get(item)

    def get(self, item, **kwargs):
        """
        Access elements (rows, columns, category tables etc.) from the table. Instead of calling
        this function directly, the class also implements standard array slicing syntax
        via :py:meth:`~hdmf.common.alignedtable.AlignedDynamicTable.__getitem__`
        (which calls this function). For example, instead of calling
        ``self.get(item=slice(2,5))`` we may use the often more convenient form of ``self[2:5]`` instead.

        :param item: Selection defining the items of interest. This may be either a:

            * **int, numpy integer, list, array, slice** : Return one or multiple row of the table as a
              pandas.DataFrame. For example:
                * ``self[0]`` : Select the first row of the table
                * ``self[[0,3]]`` : Select the first and fourth row of the table
                * ``self[1:4]`` : Select the rows with index 1,2,3 from the table

            * **string** : Return a column from the main table or a category table. For example:
                * ``self['column']`` : Return the column from the main table.
                * ``self['my_category']`` : Returns a DataFrame of the ``my_category`` category table.
                  This is a shorthand for ``self.get_category('my_category').to_dataframe()``.

            * **tuple**: Get a column, row, or cell from a particular category table. The tuple is expected to
              consist of the following elements:

                * ``category``: string with the name of the category. To select from the main
                  table use ``self.name`` or ``None``.
                * ``column``: string with the name of the column, and
                * ``row``: integer index of the row.

              The tuple itself then may take the following forms:

                * Select a single column from a table via:
                    * ``self[category, column]``
                * Select a single full row of a given category table via:
                    * ``self[row, category]`` (recommended, for consistency with DynamicTable)
                    * ``self[category, row]``
                * Select a single cell via:
                    * ``self[row, (category, column)]`` (recommended, for consistency with DynamicTable)
                    * ``self[row, category, column]``
                    * ``self[category, column, row]``

        :param kwargs: Additional keyword arguments forwarded to the ``get`` method of the main table
                       when selecting rows or a main-table column.

        :returns: Depending on the type of selection the function returns a:

            * **pandas.DataFrame**: when retrieving a row or category table
            * **array** : when retrieving a single column
            * **single value** : when retrieving a single cell. The data type and shape will depend on the
              data type and shape of the cell/column.

            For an ``item`` of any other type the function returns None.

        :raises: ValueError if item is a tuple that does not have two or three elements
        """
        if isinstance(item, np.integer):
            # numpy integer scalars are not instances of int. Normalize them so that they select a
            # row exactly like a Python int (the tuple forms below already accept np.integer rows
            # and rely on this row selection).
            item = int(item)
        if isinstance(item, (int, list, np.ndarray, slice)):
            # get a single full row from all tables
            dfs = ([super().get(item, **kwargs).reset_index(), ] +
                   [category[item].reset_index() for category in self.category_tables.values()])
            names = [self.name, ] + list(self.category_tables.keys())
            res = pd.concat(dfs, axis=1, keys=names)
            res.set_index((self.name, 'id'), drop=True, inplace=True)
            return res
        elif isinstance(item, str) or item is None:
            if item in self.colnames:
                # get a specific column
                return super().get(item, **kwargs)
            else:
                # get a single category
                return self.get_category(item).to_dataframe()
        elif isinstance(item, tuple):
            if len(item) == 2:
                # DynamicTable allows selection of cells via the syntax [int, str], i.e,. [row_index, columnname]
                # We support this syntax here as well with the additional caveat that in AlignedDynamicTable
                # columns are identified by tuples of strings. As such [int, str] refers not to a cell but
                # a single row in a particular category table (i.e., [row_index, category]). To select a cell
                # the second part of the item then is a tuple of strings, i.e., [row_index, (category, column)]
                if isinstance(item[0], (int, np.integer)):
                    # Select a single cell or row of a sub-table based on row-index(item[0])
                    # and the category (if item[1] is a string) or column (if item[1] is a tuple of (category, column)
                    res = self[item[0]][item[1]]
                    # res is a pandas.Series or pandas.Dataframe. If we selected a single cell
                    # (i.e., item[1] was a tuple defining a particular column) then return the value of the cell
                    if res.size == 1:
                        res = res.values[0]
                        # If we selected a single cell from a ragged column then we need to change the list to a tuple
                        if isinstance(res, list):
                            res = tuple(res)
                    # We selected a row of a whole table (i.e., item[1] identified only the category table,
                    # but not a particular column).
                    # Change the result from a pandas.Series to a pandas.DataFrame for consistency with DynamicTable
                    if isinstance(res, pd.Series):
                        res = res.to_frame()
                    return res
                else:
                    return self.get_category(item[0])[item[1]]
            elif len(item) == 3:
                if isinstance(item[0], (int, np.integer)):
                    return self.get_category(item[1])[item[2]][item[0]]
                else:
                    return self.get_category(item[0])[item[1]][item[2]]
            else:
                raise ValueError("Expected tuple of length 2 of the form [category, column], [row, category], "
                                 "[row, (category, column)] or a tuple of length 3 of the form "
                                 "[category, column, row], [row, category, column]")

    @docval({'name': 'ignore_category_tables', 'type': bool,
             'doc': "Ignore the category tables and only check in the main table columns", 'default': False},
            allow_extra=False)
    def has_foreign_columns(self, **kwargs):
        """
        Does the table contain DynamicTableRegion columns

        :returns: True if the table or any of the category tables contains a DynamicTableRegion column, else False
        """
        ignore_category_tables = getargs('ignore_category_tables', kwargs)
        if super().has_foreign_columns():
            return True
        if not ignore_category_tables:
            for table in self.category_tables.values():
                if table.has_foreign_columns():
                    return True
        return False

    @docval({'name': 'ignore_category_tables', 'type': bool,
             'doc': "Ignore the category tables and only check in the main table columns", 'default': False},
            allow_extra=False)
    def get_foreign_columns(self, **kwargs):
        """
        Determine the names of all columns that link to another DynamicTable, i.e.,
        find all DynamicTableRegion type columns. Similar to a foreign key in a
        database, a DynamicTableRegion column references elements in another table.

        :returns: List of tuples (str, str) where the first string is the name of the
                  category table (or None if the column is in the main table) and the
                  second string is the column name.
        """
        ignore_category_tables = getargs('ignore_category_tables', kwargs)
        col_names = [(None, col_name) for col_name in super().get_foreign_columns()]
        if not ignore_category_tables:
            for table in self.category_tables.values():
                col_names += [(table.name, col_name) for col_name in table.get_foreign_columns()]
        return col_names

    @docval(*get_docval(DynamicTable.get_linked_tables),
            {'name': 'ignore_category_tables', 'type': bool,
             'doc': "Ignore the category tables and only check in the main table columns", 'default': False},
            allow_extra=False)
    def get_linked_tables(self, **kwargs):
        """
        Get a list of the full list of all tables that are being linked to directly or indirectly
        from this table via foreign DynamicTableColumns included in this table or in any table that
        can be reached through DynamicTableRegion columns

        Unless ``ignore_category_tables`` is set, the category tables of this table are added to
        any ``other_tables`` supplied by the caller before the search is delegated to the main table.

        Returns: List of dicts with the following keys:
                * 'source_table' : The source table containing the DynamicTableRegion column
                * 'source_column' : The relevant DynamicTableRegion column in the 'source_table'
                * 'target_table' : The target DynamicTable; same as source_column.table.

        """
        ignore_category_tables = getargs('ignore_category_tables', kwargs)
        # The signature inherits 'other_tables' from DynamicTable.get_linked_tables. Honor tables
        # explicitly requested by the caller instead of silently replacing them.
        requested_tables = kwargs.get('other_tables')
        other_tables = None if requested_tables is None else list(requested_tables)
        if not ignore_category_tables:
            other_tables = (other_tables or []) + list(self.category_tables.values())
        return super().get_linked_tables(other_tables=other_tables)
414 ignore_category_tables = getargs('ignore_category_tables', kwargs) 

415 other_tables = None if ignore_category_tables else list(self.category_tables.values()) 

416 return super().get_linked_tables(other_tables=other_tables)