
1""" 

2High level interface to PyTables for reading and writing pandas data structures 

3to disk 

4""" 

5 

6import copy 

7from datetime import date, tzinfo 

8import itertools 

9import os 

10import re 

11from typing import ( 

12 TYPE_CHECKING, 

13 Any, 

14 Dict, 

15 Hashable, 

16 List, 

17 Optional, 

18 Tuple, 

19 Type, 

20 Union, 

21) 

22import warnings 

23 

24import numpy as np 

25 

26from pandas._config import config, get_option 

27 

28from pandas._libs import lib, writers as libwriters 

29from pandas._libs.tslibs import timezones 

30from pandas._typing import ArrayLike, FrameOrSeries 

31from pandas.compat._optional import import_optional_dependency 

32from pandas.errors import PerformanceWarning 

33from pandas.util._decorators import cache_readonly 

34 

35from pandas.core.dtypes.common import ( 

36 ensure_object, 

37 is_categorical_dtype, 

38 is_complex_dtype, 

39 is_datetime64_dtype, 

40 is_datetime64tz_dtype, 

41 is_extension_array_dtype, 

42 is_list_like, 

43 is_string_dtype, 

44 is_timedelta64_dtype, 

45) 

46from pandas.core.dtypes.generic import ABCExtensionArray 

47from pandas.core.dtypes.missing import array_equivalent 

48 

49from pandas import ( 

50 DataFrame, 

51 DatetimeIndex, 

52 Index, 

53 Int64Index, 

54 MultiIndex, 

55 PeriodIndex, 

56 Series, 

57 TimedeltaIndex, 

58 concat, 

59 isna, 

60) 

61from pandas.core.arrays.categorical import Categorical 

62import pandas.core.common as com 

63from pandas.core.computation.pytables import PyTablesExpr, maybe_expression 

64from pandas.core.indexes.api import ensure_index 

65 

66from pandas.io.common import stringify_path 

67from pandas.io.formats.printing import adjoin, pprint_thing 

68 

69if TYPE_CHECKING: 

70 from tables import File, Node, Col # noqa:F401 

71 

72 

73# versioning attribute 

74_version = "0.15.2" 

75 

76# encoding 

77_default_encoding = "UTF-8" 

78 

79 

80def _ensure_decoded(s): 

81 """ if we have bytes, decode them to unicode """ 

82 if isinstance(s, np.bytes_): 

83 s = s.decode("UTF-8") 

84 return s 

85 

86 

87def _ensure_encoding(encoding): 

88 # set the encoding if we need 

89 if encoding is None: 

90 encoding = _default_encoding 

91 

92 return encoding 

93 

94 

95def _ensure_str(name): 

96 """ 

97 Ensure that an index / column name is a str (python 3); otherwise it 

98 may be np.string_ dtype. Non-string dtypes are passed through unchanged. 

99 

100 https://github.com/pandas-dev/pandas/issues/13492 

101 """ 

102 if isinstance(name, str): 

103 name = str(name) 

104 return name 

105 

106 

107Term = PyTablesExpr 

108 

109 

110def _ensure_term(where, scope_level: int): 

111 """ 

112 ensure that the where is a Term or a list of Term 

113 this makes sure that we are capturing the scope of variables 

114 that are passed 

115 create the terms here with a frame_level=2 (we are 2 levels down) 

116 """ 

117 

118 # only consider list/tuple here as an ndarray is automatically a coordinate 

119 # list 

120 level = scope_level + 1 

121 if isinstance(where, (list, tuple)): 

122 wlist = [] 

123 for w in filter(lambda x: x is not None, where): 

124 if not maybe_expression(w): 

125 wlist.append(w) 

126 else: 

127 wlist.append(Term(w, scope_level=level)) 

128 where = wlist 

129 elif maybe_expression(where): 

130 where = Term(where, scope_level=level) 

131 return where if where is None or len(where) else None 

132 

133 

134class PossibleDataLossError(Exception): 

135 pass 

136 

137 

138class ClosedFileError(Exception): 

139 pass 

140 

141 

142class IncompatibilityWarning(Warning): 

143 pass 

144 

145 

146incompatibility_doc = """ 

147where criteria is being ignored as this version [%s] is too old (or 

148not-defined), read the file in and write it out to a new file to upgrade (with 

149the copy method) 

150""" 

151 

152 

153class AttributeConflictWarning(Warning): 

154 pass 

155 

156 

157attribute_conflict_doc = """ 

158the [%s] attribute of the existing index is [%s] which conflicts with the new 

159[%s], resetting the attribute to None 

160""" 

161 

162 

163class DuplicateWarning(Warning): 

164 pass 

165 

166 

167duplicate_doc = """ 

168duplicate entries in table, taking most recently appended 

169""" 

170 

171performance_doc = """ 

172your performance may suffer as PyTables will pickle object types that it cannot 

173map directly to c-types [inferred_type->%s,key->%s] [items->%s] 

174""" 

175 

176# formats 

177_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} 

178 

179# axes map 

180_AXES_MAP = {DataFrame: [0]} 

181 

182# register our configuration options 

183dropna_doc = """ 

184: boolean 

185 drop ALL nan rows when appending to a table 

186""" 

187format_doc = """ 

188: format 

189 default format writing format, if None, then 

190 put will default to 'fixed' and append will default to 'table' 

191""" 

192 

193with config.config_prefix("io.hdf"): 

194 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) 

195 config.register_option( 

196 "default_format", 

197 None, 

198 format_doc, 

199 validator=config.is_one_of_factory(["fixed", "table", None]), 

200 ) 

201 
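# Usage sketch (illustrative addition, not part of the original module): the
# two options registered above are reachable through the normal pandas
# options machinery, e.g.
#
# >>> pd.set_option("io.hdf.default_format", "table")
# >>> pd.set_option("io.hdf.dropna_table", True)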

202# oh the troubles to reduce import time 

203_table_mod = None 

204_table_file_open_policy_is_strict = False 

205 

206 

207def _tables(): 

208 global _table_mod 

209 global _table_file_open_policy_is_strict 

210 if _table_mod is None: 

211 import tables 

212 

213 _table_mod = tables 

214 

215 # set the file open policy 

216 # return the file open policy; this changes as of pytables 3.1 

217 # depending on the HDF5 version 

218 try: 

219 _table_file_open_policy_is_strict = ( 

220 tables.file._FILE_OPEN_POLICY == "strict" 

221 ) 

222 except AttributeError: 

223 pass 

224 

225 return _table_mod 

226 

227 

228# interface to/from ### 

229 

230 

231def to_hdf( 

232 path_or_buf, 

233 key: str, 

234 value: FrameOrSeries, 

235 mode: str = "a", 

236 complevel: Optional[int] = None, 

237 complib: Optional[str] = None, 

238 append: bool = False, 

239 format: Optional[str] = None, 

240 index: bool = True, 

241 min_itemsize: Optional[Union[int, Dict[str, int]]] = None, 

242 nan_rep=None, 

243 dropna: Optional[bool] = None, 

244 data_columns: Optional[List[str]] = None, 

245 errors: str = "strict", 

246 encoding: str = "UTF-8", 

247): 

248 """ store this object, close it if we opened it """ 

249 

250 if append: 

251 f = lambda store: store.append( 

252 key, 

253 value, 

254 format=format, 

255 index=index, 

256 min_itemsize=min_itemsize, 

257 nan_rep=nan_rep, 

258 dropna=dropna, 

259 data_columns=data_columns, 

260 errors=errors, 

261 encoding=encoding, 

262 ) 

263 else: 

264 # NB: dropna is not passed to `put` 

265 f = lambda store: store.put( 

266 key, 

267 value, 

268 format=format, 

269 index=index, 

270 min_itemsize=min_itemsize, 

271 nan_rep=nan_rep, 

272 data_columns=data_columns, 

273 errors=errors, 

274 encoding=encoding, 

275 ) 

276 

277 path_or_buf = stringify_path(path_or_buf) 

278 if isinstance(path_or_buf, str): 

279 with HDFStore( 

280 path_or_buf, mode=mode, complevel=complevel, complib=complib 

281 ) as store: 

282 f(store) 

283 else: 

284 f(path_or_buf) 

285 
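# Usage sketch (illustrative, not part of the original module): the public
# entry points are ``DataFrame.to_hdf`` / ``Series.to_hdf``, which delegate to
# this function. Assumes a writable local path "store.h5" and a small frame.
#
# >>> df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
# >>> df.to_hdf("store.h5", "df", mode="w", format="table")     # put / replace
# >>> df.to_hdf("store.h5", "df", append=True, format="table")  # append rows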

286 

287def read_hdf( 

288 path_or_buf, 

289 key=None, 

290 mode: str = "r", 

291 errors: str = "strict", 

292 where=None, 

293 start: Optional[int] = None, 

294 stop: Optional[int] = None, 

295 columns=None, 

296 iterator=False, 

297 chunksize: Optional[int] = None, 

298 **kwargs, 

299): 

300 """ 

301 Read from the store, close it if we opened it. 

302 

303 Retrieve pandas object stored in file, optionally based on where 

304 criteria 

305 

306 Parameters 

307 ---------- 

308 path_or_buf : str, path object, pandas.HDFStore or file-like object 

309 Any valid string path is acceptable. The string could be a URL. Valid 

310 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

311 expected. A local file could be: ``file://localhost/path/to/table.h5``. 

312 

313 If you want to pass in a path object, pandas accepts any 

314 ``os.PathLike``. 

315 

316 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. 

317 

318 By file-like object, we refer to objects with a ``read()`` method, 

319 such as a file handler (e.g. via builtin ``open`` function) 

320 or ``StringIO``. 

321 

322 .. versionadded:: 0.21.0 support for __fspath__ protocol. 

323 

324 key : object, optional 

325 The group identifier in the store. Can be omitted if the HDF file 

326 contains a single pandas object. 

327 mode : {'r', 'r+', 'a'}, default 'r' 

328 Mode to use when opening the file. Ignored if path_or_buf is a 

329 :class:`pandas.HDFStore`. Default is 'r'. 

330 where : list, optional 

331 A list of Term (or convertible) objects. 

332 start : int, optional 

333 Row number to start selection. 

334 stop : int, optional 

335 Row number to stop selection. 

336 columns : list, optional 

337 A list of columns names to return. 

338 iterator : bool, optional 

339 Return an iterator object. 

340 chunksize : int, optional 

341 Number of rows to include in an iteration when using an iterator. 

342 errors : str, default 'strict' 

343 Specifies how encoding and decoding errors are to be handled. 

344 See the errors argument for :func:`open` for a full list 

345 of options. 

346 **kwargs 

347 Additional keyword arguments passed to HDFStore. 

348 

349 Returns 

350 ------- 

351 item : object 

352 The selected object. Return type depends on the object stored. 

353 

354 See Also 

355 -------- 

356 DataFrame.to_hdf : Write a HDF file from a DataFrame. 

357 HDFStore : Low-level access to HDF files. 

358 

359 Examples 

360 -------- 

361 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) 

362 >>> df.to_hdf('./store.h5', 'data') 

363 >>> reread = pd.read_hdf('./store.h5') 

364 """ 

365 

366 if mode not in ["r", "r+", "a"]: 

367 raise ValueError( 

368 f"mode {mode} is not allowed while performing a read. " 

369 f"Allowed modes are r, r+ and a." 

370 ) 

371 # grab the scope 

372 if where is not None: 

373 where = _ensure_term(where, scope_level=1) 

374 

375 if isinstance(path_or_buf, HDFStore): 

376 if not path_or_buf.is_open: 

377 raise IOError("The HDFStore must be open for reading.") 

378 

379 store = path_or_buf 

380 auto_close = False 

381 else: 

382 path_or_buf = stringify_path(path_or_buf) 

383 if not isinstance(path_or_buf, str): 

384 raise NotImplementedError( 

385 "Support for generic buffers has not been implemented." 

386 ) 

387 try: 

388 exists = os.path.exists(path_or_buf) 

389 

390 # if filepath is too long 

391 except (TypeError, ValueError): 

392 exists = False 

393 

394 if not exists: 

395 raise FileNotFoundError(f"File {path_or_buf} does not exist") 

396 

397 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs) 

398 # can't auto open/close if we are using an iterator 

399 # so delegate to the iterator 

400 auto_close = True 

401 

402 try: 

403 if key is None: 

404 groups = store.groups() 

405 if len(groups) == 0: 

406 raise ValueError("No dataset in HDF5 file.") 

407 candidate_only_group = groups[0] 

408 

409 # For the HDF file to have only one dataset, all other groups 

410 # should then be metadata groups for that candidate group. (This 

411 # assumes that the groups() method enumerates parent groups 

412 # before their children.) 

413 for group_to_check in groups[1:]: 

414 if not _is_metadata_of(group_to_check, candidate_only_group): 

415 raise ValueError( 

416 "key must be provided when HDF5 file " 

417 "contains multiple datasets." 

418 ) 

419 key = candidate_only_group._v_pathname 

420 return store.select( 

421 key, 

422 where=where, 

423 start=start, 

424 stop=stop, 

425 columns=columns, 

426 iterator=iterator, 

427 chunksize=chunksize, 

428 auto_close=auto_close, 

429 ) 

430 except (ValueError, TypeError, KeyError): 

431 if not isinstance(path_or_buf, HDFStore): 

432 # if there is an error, close the store if we opened it. 

433 try: 

434 store.close() 

435 except AttributeError: 

436 pass 

437 

438 raise 

439 
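# Usage sketch (illustrative, not part of the original module): reading back
# with a ``where`` filter or in chunks. Assumes "store.h5" holds a
# table-format object under the key "df"; ``process`` is a placeholder.
#
# >>> pd.read_hdf("store.h5", "df", where="index > 2")
# >>> for chunk in pd.read_hdf("store.h5", "df", chunksize=100_000):
# ...     process(chunk)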

440 

441def _is_metadata_of(group: "Node", parent_group: "Node") -> bool: 

442 """Check if a given group is a metadata group for a given parent_group.""" 

443 if group._v_depth <= parent_group._v_depth: 

444 return False 

445 

446 current = group 

447 while current._v_depth > 1: 

448 parent = current._v_parent 

449 if parent == parent_group and current._v_name == "meta": 

450 return True 

451 current = current._v_parent 

452 return False 

453 

454 

455class HDFStore: 

456 """ 

457 Dict-like IO interface for storing pandas objects in PyTables. 

458 

459 Either Fixed or Table format. 

460 

461 Parameters 

462 ---------- 

463 path : string 

464 File path to HDF5 file 

465 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

466 

467 ``'r'`` 

468 Read-only; no data can be modified. 

469 ``'w'`` 

470 Write; a new file is created (an existing file with the same 

471 name would be deleted). 

472 ``'a'`` 

473 Append; an existing file is opened for reading and writing, 

474 and if the file does not exist it is created. 

475 ``'r+'`` 

476 It is similar to ``'a'``, but the file must already exist. 

477 complevel : int, 0-9, default None 

478 Specifies a compression level for data. 

479 A value of 0 or None disables compression. 

480 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' 

481 Specifies the compression library to be used. 

482 As of v0.20.2 these additional compressors for Blosc are supported 

483 (default if no compressor specified: 'blosc:blosclz'): 

484 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 

485 'blosc:zlib', 'blosc:zstd'}. 

486 Specifying a compression library which is not available issues 

487 a ValueError. 

488 fletcher32 : bool, default False 

489 If applying compression use the fletcher32 checksum 

490 

491 Examples 

492 -------- 

493 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

494 >>> store = pd.HDFStore('test.h5') 

495 >>> store['foo'] = bar # write to HDF5 

496 >>> bar = store['foo'] # retrieve 

497 >>> store.close() 

498 """ 

499 

500 _handle: Optional["File"] 

501 _mode: str 

502 _complevel: int 

503 _fletcher32: bool 

504 

505 def __init__( 

506 self, 

507 path, 

508 mode: str = "a", 

509 complevel: Optional[int] = None, 

510 complib=None, 

511 fletcher32: bool = False, 

512 **kwargs, 

513 ): 

514 

515 if "format" in kwargs: 

516 raise ValueError("format is not a defined argument for HDFStore") 

517 

518 tables = import_optional_dependency("tables") 

519 

520 if complib is not None and complib not in tables.filters.all_complibs: 

521 raise ValueError( 

522 f"complib only supports {tables.filters.all_complibs} compression." 

523 ) 

524 

525 if complib is None and complevel is not None: 

526 complib = tables.filters.default_complib 

527 

528 self._path = stringify_path(path) 

529 if mode is None: 

530 mode = "a" 

531 self._mode = mode 

532 self._handle = None 

533 self._complevel = complevel if complevel else 0 

534 self._complib = complib 

535 self._fletcher32 = fletcher32 

536 self._filters = None 

537 self.open(mode=mode, **kwargs) 

538 

539 def __fspath__(self): 

540 return self._path 

541 

542 @property 

543 def root(self): 

544 """ return the root node """ 

545 self._check_if_open() 

546 return self._handle.root 

547 

548 @property 

549 def filename(self): 

550 return self._path 

551 

552 def __getitem__(self, key: str): 

553 return self.get(key) 

554 

555 def __setitem__(self, key: str, value): 

556 self.put(key, value) 

557 

558 def __delitem__(self, key: str): 

559 return self.remove(key) 

560 

561 def __getattr__(self, name: str): 

562 """ allow attribute access to get stores """ 

563 try: 

564 return self.get(name) 

565 except (KeyError, ClosedFileError): 

566 pass 

567 raise AttributeError( 

568 f"'{type(self).__name__}' object has no attribute '{name}'" 

569 ) 

570 

571 def __contains__(self, key: str) -> bool: 

572 """ check for existence of this key 

573 can match the exact pathname or the pathname without the leading '/' 

574 """ 

575 node = self.get_node(key) 

576 if node is not None: 

577 name = node._v_pathname 

578 if name == key or name[1:] == key: 

579 return True 

580 return False 

581 

582 def __len__(self) -> int: 

583 return len(self.groups()) 

584 

585 def __repr__(self) -> str: 

586 pstr = pprint_thing(self._path) 

587 return f"{type(self)}\nFile path: {pstr}\n" 

588 

589 def __enter__(self): 

590 return self 

591 

592 def __exit__(self, exc_type, exc_value, traceback): 

593 self.close() 

594 

595 def keys(self) -> List[str]: 

596 """ 

597 Return a list of keys corresponding to objects stored in HDFStore. 

598 

599 Returns 

600 ------- 

601 list 

602 List of ABSOLUTE path-names (i.e. they have the leading '/'). 

603 """ 

604 return [n._v_pathname for n in self.groups()] 

605 

606 def __iter__(self): 

607 return iter(self.keys()) 

608 

609 def items(self): 

610 """ 

611 iterate on key->group 

612 """ 

613 for g in self.groups(): 

614 yield g._v_pathname, g 

615 

616 iteritems = items 

617 

618 def open(self, mode: str = "a", **kwargs): 

619 """ 

620 Open the file in the specified mode 

621 

622 Parameters 

623 ---------- 

624 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

625 See HDFStore docstring or tables.open_file for info about modes 

626 """ 

627 tables = _tables() 

628 

629 if self._mode != mode: 

630 

631 # if we are changing a write mode to read, ok 

632 if self._mode in ["a", "w"] and mode in ["r", "r+"]: 

633 pass 

634 elif mode in ["w"]: 

635 

636 # this would truncate, raise here 

637 if self.is_open: 

638 raise PossibleDataLossError( 

639 f"Re-opening the file [{self._path}] with mode [{self._mode}] " 

640 "will delete the current file!" 

641 ) 

642 

643 self._mode = mode 

644 

645 # close and reopen the handle 

646 if self.is_open: 

647 self.close() 

648 

649 if self._complevel and self._complevel > 0: 

650 self._filters = _tables().Filters( 

651 self._complevel, self._complib, fletcher32=self._fletcher32 

652 ) 

653 

654 try: 

655 self._handle = tables.open_file(self._path, self._mode, **kwargs) 

656 except IOError as err: # pragma: no cover 

657 if "can not be written" in str(err): 

658 print(f"Opening {self._path} in read-only mode") 

659 self._handle = tables.open_file(self._path, "r", **kwargs) 

660 else: 

661 raise 

662 

663 except ValueError as err: 

664 

665 # trap PyTables >= 3.1 FILE_OPEN_POLICY exception 

666 # to provide an updated message 

667 if "FILE_OPEN_POLICY" in str(err): 

668 hdf_version = tables.get_hdf5_version() 

669 err = ValueError( 

670 f"PyTables [{tables.__version__}] no longer supports " 

671 "opening multiple files\n" 

672 "even in read-only mode on this HDF5 version " 

673 f"[{hdf_version}]. You can accept this\n" 

674 "and not open the same file multiple times at once,\n" 

675 "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " 

676 "which allows\n" 

677 "files to be opened multiple times at once\n" 

678 ) 

679 

680 raise err 

681 

682 except Exception as err: 

683 

684 # trying to read from a non-existent file causes an error which 

685 # is not part of IOError, make it one 

686 if self._mode == "r" and "Unable to open/create file" in str(err): 

687 raise IOError(str(err)) 

688 raise 

689 

690 def close(self): 

691 """ 

692 Close the PyTables file handle 

693 """ 

694 if self._handle is not None: 

695 self._handle.close() 

696 self._handle = None 

697 

698 @property 

699 def is_open(self) -> bool: 

700 """ 

701 return a boolean indicating whether the file is open 

702 """ 

703 if self._handle is None: 

704 return False 

705 return bool(self._handle.isopen) 

706 

707 def flush(self, fsync: bool = False): 

708 """ 

709 Force all buffered modifications to be written to disk. 

710 

711 Parameters 

712 ---------- 

713 fsync : bool (default False) 

714 call ``os.fsync()`` on the file handle to force writing to disk. 

715 

716 Notes 

717 ----- 

718 Without ``fsync=True``, flushing may not guarantee that the OS writes 

719 to disk. With fsync, the operation will block until the OS claims the 

720 file has been written; however, other caching layers may still 

721 interfere. 

722 """ 

723 if self._handle is not None: 

724 self._handle.flush() 

725 if fsync: 

726 try: 

727 os.fsync(self._handle.fileno()) 

728 except OSError: 

729 pass 

730 

731 def get(self, key: str): 

732 """ 

733 Retrieve pandas object stored in file. 

734 

735 Parameters 

736 ---------- 

737 key : str 

738 

739 Returns 

740 ------- 

741 object 

742 Same type as object stored in file. 

743 """ 

744 group = self.get_node(key) 

745 if group is None: 

746 raise KeyError(f"No object named {key} in the file") 

747 return self._read_group(group) 

748 

749 def select( 

750 self, 

751 key: str, 

752 where=None, 

753 start=None, 

754 stop=None, 

755 columns=None, 

756 iterator=False, 

757 chunksize=None, 

758 auto_close: bool = False, 

759 ): 

760 """ 

761 Retrieve pandas object stored in file, optionally based on where criteria. 

762 

763 Parameters 

764 ---------- 

765 key : str 

766 Object being retrieved from file. 

767 where : list, default None 

768 List of Term (or convertible) objects, optional. 

769 start : int, default None 

770 Row number to start selection. 

771 stop : int, default None 

772 Row number to stop selection. 

773 columns : list, default None 

774 A list of columns that if not None, will limit the return columns. 

775 iterator : bool, default False 

776 Returns an iterator. 

777 chunksize : int, default None 

778 Number of rows to include in each iteration; returns an iterator. 

779 auto_close : bool, default False 

780 Should automatically close the store when finished. 

781 

782 Returns 

783 ------- 

784 object 

785 Retrieved object from file. 

786 """ 

787 group = self.get_node(key) 

788 if group is None: 

789 raise KeyError(f"No object named {key} in the file") 

790 

791 # create the storer and axes 

792 where = _ensure_term(where, scope_level=1) 

793 s = self._create_storer(group) 

794 s.infer_axes() 

795 

796 # function to call on iteration 

797 def func(_start, _stop, _where): 

798 return s.read(start=_start, stop=_stop, where=_where, columns=columns) 

799 

800 # create the iterator 

801 it = TableIterator( 

802 self, 

803 s, 

804 func, 

805 where=where, 

806 nrows=s.nrows, 

807 start=start, 

808 stop=stop, 

809 iterator=iterator, 

810 chunksize=chunksize, 

811 auto_close=auto_close, 

812 ) 

813 

814 return it.get_result() 

815 
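# Usage sketch (illustrative, not part of the original module): assumes an
# open store holding a table-format frame under "df" with data column "B".
#
# >>> with pd.HDFStore("store.h5", mode="r") as store:
# ...     subset = store.select("df", where="B == 'x'", columns=["A", "B"])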

816 def select_as_coordinates( 

817 self, 

818 key: str, 

819 where=None, 

820 start: Optional[int] = None, 

821 stop: Optional[int] = None, 

822 ): 

823 """ 

824 return the selection as an Index 

825 

826 Parameters 

827 ---------- 

828 key : str 

829 where : list of Term (or convertible) objects, optional 

830 start : integer (defaults to None), row number to start selection 

831 stop : integer (defaults to None), row number to stop selection 

832 """ 

833 where = _ensure_term(where, scope_level=1) 

834 tbl = self.get_storer(key) 

835 if not isinstance(tbl, Table): 

836 raise TypeError("can only read_coordinates with a table") 

837 return tbl.read_coordinates(where=where, start=start, stop=stop) 

838 

839 def select_column( 

840 self, 

841 key: str, 

842 column: str, 

843 start: Optional[int] = None, 

844 stop: Optional[int] = None, 

845 ): 

846 """ 

847 return a single column from the table. This is generally only useful to 

848 select an indexable 

849 

850 Parameters 

851 ---------- 

852 key : str 

853 column : str 

854 The column of interest. 

855 start : int or None, default None 

856 stop : int or None, default None 

857 

858 Raises 

859 ------ 

860 raises KeyError if the column is not found (or key is not a valid 

861 store) 

862 raises ValueError if the column can not be extracted individually (it 

863 is part of a data block) 

864 

865 """ 

866 tbl = self.get_storer(key) 

867 if not isinstance(tbl, Table): 

868 raise TypeError("can only read_column with a table") 

869 return tbl.read_column(column=column, start=start, stop=stop) 

870 

871 def select_as_multiple( 

872 self, 

873 keys, 

874 where=None, 

875 selector=None, 

876 columns=None, 

877 start=None, 

878 stop=None, 

879 iterator=False, 

880 chunksize=None, 

881 auto_close: bool = False, 

882 ): 

883 """ 

884 Retrieve pandas objects from multiple tables. 

885 

886 Parameters 

887 ---------- 

888 keys : a list of the tables 

889 selector : the table to apply the where criteria (defaults to keys[0] 

890 if not supplied) 

891 columns : the columns I want back 

892 start : integer (defaults to None), row number to start selection 

893 stop : integer (defaults to None), row number to stop selection 

894 iterator : boolean, return an iterator, default False 

895 chunksize : nrows to include in iteration, return an iterator 

896 auto_close : bool, default False 

897 Should automatically close the store when finished. 

898 

899 Raises 

900 ------ 

901 raises KeyError if keys or selector is not found or keys is empty 

902 raises TypeError if keys is not a list or tuple 

903 raises ValueError if the tables are not ALL THE SAME DIMENSIONS 

904 """ 

905 

906 # default to single select 

907 where = _ensure_term(where, scope_level=1) 

908 if isinstance(keys, (list, tuple)) and len(keys) == 1: 

909 keys = keys[0] 

910 if isinstance(keys, str): 

911 return self.select( 

912 key=keys, 

913 where=where, 

914 columns=columns, 

915 start=start, 

916 stop=stop, 

917 iterator=iterator, 

918 chunksize=chunksize, 

919 auto_close=auto_close, 

920 ) 

921 

922 if not isinstance(keys, (list, tuple)): 

923 raise TypeError("keys must be a list/tuple") 

924 

925 if not len(keys): 

926 raise ValueError("keys must have a non-zero length") 

927 

928 if selector is None: 

929 selector = keys[0] 

930 

931 # collect the tables 

932 tbls = [self.get_storer(k) for k in keys] 

933 s = self.get_storer(selector) 

934 

935 # validate rows 

936 nrows = None 

937 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): 

938 if t is None: 

939 raise KeyError(f"Invalid table [{k}]") 

940 if not t.is_table: 

941 raise TypeError( 

942 f"object [{t.pathname}] is not a table, and cannot be used in all " 

943 "select as multiple" 

944 ) 

945 

946 if nrows is None: 

947 nrows = t.nrows 

948 elif t.nrows != nrows: 

949 raise ValueError("all tables must have exactly the same nrows!") 

950 

951 # The isinstance checks here are redundant with the check above, 

952 # but necessary for mypy; see GH#29757 

953 _tbls = [x for x in tbls if isinstance(x, Table)] 

954 

955 # axis is the concatenation axis 

956 axis = list({t.non_index_axes[0][0] for t in _tbls})[0] 

957 

958 def func(_start, _stop, _where): 

959 

960 # retrieve the objs, _where is always passed as a set of 

961 # coordinates here 

962 objs = [ 

963 t.read(where=_where, columns=columns, start=_start, stop=_stop) 

964 for t in tbls 

965 ] 

966 

967 # concat and return 

968 return concat(objs, axis=axis, verify_integrity=False)._consolidate() 

969 

970 # create the iterator 

971 it = TableIterator( 

972 self, 

973 s, 

974 func, 

975 where=where, 

976 nrows=nrows, 

977 start=start, 

978 stop=stop, 

979 iterator=iterator, 

980 chunksize=chunksize, 

981 auto_close=auto_close, 

982 ) 

983 

984 return it.get_result(coordinates=True) 

985 

986 def put( 

987 self, 

988 key: str, 

989 value: FrameOrSeries, 

990 format=None, 

991 index=True, 

992 append=False, 

993 complib=None, 

994 complevel: Optional[int] = None, 

995 min_itemsize: Optional[Union[int, Dict[str, int]]] = None, 

996 nan_rep=None, 

997 data_columns: Optional[List[str]] = None, 

998 encoding=None, 

999 errors: str = "strict", 

1000 ): 

1001 """ 

1002 Store object in HDFStore. 

1003 

1004 Parameters 

1005 ---------- 

1006 key : str 

1007 value : {Series, DataFrame} 

1008 format : 'fixed(f)|table(t)', default is 'fixed' 

1009 fixed(f) : Fixed format 

1010 Fast writing/reading. Not-appendable, nor searchable. 

1011 table(t) : Table format 

1012 Write as a PyTables Table structure which may perform 

1013 worse but allow more flexible operations like searching 

1014 / selecting subsets of the data. 

1015 append : bool, default False 

1016 This will force Table format, append the input data to the 

1017 existing. 

1018 data_columns : list, default None 

1019 List of columns to create as data columns, or True to 

1020 use all columns. See `here 

1021 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1022 encoding : str, default None 

1023 Provide an encoding for strings. 

1024 dropna : bool, default False 

1025 Do not write an ALL nan row to the store; settable by the option 'io.hdf.dropna_table'. 

1026 """ 

1027 if format is None: 

1028 format = get_option("io.hdf.default_format") or "fixed" 

1029 format = self._validate_format(format) 

1030 self._write_to_group( 

1031 key, 

1032 value, 

1033 format=format, 

1034 index=index, 

1035 append=append, 

1036 complib=complib, 

1037 complevel=complevel, 

1038 min_itemsize=min_itemsize, 

1039 nan_rep=nan_rep, 

1040 data_columns=data_columns, 

1041 encoding=encoding, 

1042 errors=errors, 

1043 ) 

1044 
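# Usage sketch (illustrative, not part of the original module): assumes an
# open writable store and a small DataFrame ``df``.
#
# >>> with pd.HDFStore("store.h5") as store:
# ...     store.put("fixed_df", df)                              # default 'fixed'
# ...     store.put("table_df", df, format="table",
# ...               data_columns=["B"])                          # queryable later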

1045 def remove(self, key: str, where=None, start=None, stop=None): 

1046 """ 

1047 Remove pandas object partially by specifying the where condition 

1048 

1049 Parameters 

1050 ---------- 

1051 key : string 

1052 Node to remove or delete rows from 

1053 where : list of Term (or convertible) objects, optional 

1054 start : integer (defaults to None), row number to start selection 

1055 stop : integer (defaults to None), row number to stop selection 

1056 

1057 Returns 

1058 ------- 

1059 number of rows removed (or None if not a Table) 

1060 

1061 Raises 

1062 ------ 

1063 raises KeyError if key is not a valid store 

1064 

1065 """ 

1066 where = _ensure_term(where, scope_level=1) 

1067 try: 

1068 s = self.get_storer(key) 

1069 except KeyError: 

1070 # the key is not a valid store, re-raising KeyError 

1071 raise 

1072 except AssertionError: 

1073 # surface any assertion errors for e.g. debugging 

1074 raise 

1075 except Exception: 

1076 # In tests we get here with ClosedFileError, TypeError, and 

1077 # _table_mod.NoSuchNodeError. TODO: Catch only these? 

1078 

1079 if where is not None: 

1080 raise ValueError( 

1081 "trying to remove a node with a non-None where clause!" 

1082 ) 

1083 

1084 # we are actually trying to remove a node (with children) 

1085 node = self.get_node(key) 

1086 if node is not None: 

1087 node._f_remove(recursive=True) 

1088 return None 

1089 

1090 # remove the node 

1091 if com.all_none(where, start, stop): 

1092 s.group._f_remove(recursive=True) 

1093 

1094 # delete from the table 

1095 else: 

1096 if not s.is_table: 

1097 raise ValueError( 

1098 "can only remove with where on objects written as tables" 

1099 ) 

1100 return s.delete(where=where, start=start, stop=stop) 

1101 
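# Usage sketch (illustrative, not part of the original module): removing a
# whole node versus deleting rows from a table-format node; assumes an open
# store with the keys used below.
#
# >>> store.remove("fixed_df")                        # drop the entire node
# >>> store.remove("table_df", where="index > 10")    # row-wise delete (tables only)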

1102 def append( 

1103 self, 

1104 key: str, 

1105 value: FrameOrSeries, 

1106 format=None, 

1107 axes=None, 

1108 index=True, 

1109 append=True, 

1110 complib=None, 

1111 complevel: Optional[int] = None, 

1112 columns=None, 

1113 min_itemsize: Optional[Union[int, Dict[str, int]]] = None, 

1114 nan_rep=None, 

1115 chunksize=None, 

1116 expectedrows=None, 

1117 dropna: Optional[bool] = None, 

1118 data_columns: Optional[List[str]] = None, 

1119 encoding=None, 

1120 errors: str = "strict", 

1121 ): 

1122 """ 

1123 Append to Table in file. Node must already exist and be Table 

1124 format. 

1125 

1126 Parameters 

1127 ---------- 

1128 key : str 

1129 value : {Series, DataFrame} 

1130 format : 'table' is the default 

1131 table(t) : table format 

1132 Write as a PyTables Table structure which may perform 

1133 worse but allow more flexible operations like searching 

1134 / selecting subsets of the data. 

1135 append : bool, default True 

1136 Append the input data to the existing. 

1137 data_columns : list of columns, or True, default None 

1138 List of columns to create as indexed data columns for on-disk 

1139 queries, or True to use all columns. By default only the axes 

1140 of the object are indexed. See `here 

1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1142 min_itemsize : dict of columns that specify minimum string sizes 

1143 nan_rep : string to use as string nan representation 

1144 chunksize : size to chunk the writing 

1145 expectedrows : expected TOTAL row size of this table 

1146 encoding : default None, provide an encoding for strings 

1147 dropna : bool, default False 

1148 Do not write an ALL nan row to the store settable 

1149 by the option 'io.hdf.dropna_table'. 

1150 

1151 Notes 

1152 ----- 

1153 Does *not* check if data being appended overlaps with existing 

1154 data in the table, so be careful 

1155 """ 

1156 if columns is not None: 

1157 raise TypeError( 

1158 "columns is not a supported keyword in append, try data_columns" 

1159 ) 

1160 

1161 if dropna is None: 

1162 dropna = get_option("io.hdf.dropna_table") 

1163 if format is None: 

1164 format = get_option("io.hdf.default_format") or "table" 

1165 format = self._validate_format(format) 

1166 self._write_to_group( 

1167 key, 

1168 value, 

1169 format=format, 

1170 axes=axes, 

1171 index=index, 

1172 append=append, 

1173 complib=complib, 

1174 complevel=complevel, 

1175 min_itemsize=min_itemsize, 

1176 nan_rep=nan_rep, 

1177 chunksize=chunksize, 

1178 expectedrows=expectedrows, 

1179 dropna=dropna, 

1180 data_columns=data_columns, 

1181 encoding=encoding, 

1182 errors=errors, 

1183 ) 

1184 
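# Usage sketch (illustrative, not part of the original module): appending in
# pieces to one table while reserving room for longer strings in column "C".
# ``df1`` and ``df2`` are placeholder frames with the same columns.
#
# >>> store.append("events", df1, data_columns=["B"], min_itemsize={"C": 50})
# >>> store.append("events", df2)       # later rows reuse the existing schema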

1185 def append_to_multiple( 

1186 self, 

1187 d: Dict, 

1188 value, 

1189 selector, 

1190 data_columns=None, 

1191 axes=None, 

1192 dropna=False, 

1193 **kwargs, 

1194 ): 

1195 """ 

1196 Append to multiple tables 

1197 

1198 Parameters 

1199 ---------- 

1200 d : a dict of table_name to table_columns, None is acceptable as the 

1201 values of one node (this will get all the remaining columns) 

1202 value : a pandas object 

1203 selector : a string that designates the indexable table; all of its 

1204 columns will be designated as data_columns, unless data_columns is 

1205 passed, in which case these are used 

1206 data_columns : list of columns to create as data columns, or True to 

1207 use all columns 

1208 dropna : if evaluates to True, drop rows from all tables if any single 

1209 row in each table has all NaN. Default False. 

1210 

1211 Notes 

1212 ----- 

1213 axes parameter is currently not accepted 

1214 

1215 """ 

1216 if axes is not None: 

1217 raise TypeError( 

1218 "axes is currently not accepted as a parameter to append_to_multiple; " 

1219 "you can create the tables independently instead" 

1220 ) 

1221 

1222 if not isinstance(d, dict): 

1223 raise ValueError( 

1224 "append_to_multiple must have a dictionary specified as the " 

1225 "way to split the value" 

1226 ) 

1227 

1228 if selector not in d: 

1229 raise ValueError( 

1230 "append_to_multiple requires a selector that is in passed dict" 

1231 ) 

1232 

1233 # figure out the splitting axis (the non_index_axis) 

1234 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] 

1235 

1236 # figure out how to split the value 

1237 remain_key = None 

1238 remain_values: List = [] 

1239 for k, v in d.items(): 

1240 if v is None: 

1241 if remain_key is not None: 

1242 raise ValueError( 

1243 "append_to_multiple can only have one value in d that " 

1244 "is None" 

1245 ) 

1246 remain_key = k 

1247 else: 

1248 remain_values.extend(v) 

1249 if remain_key is not None: 

1250 ordered = value.axes[axis] 

1251 ordd = ordered.difference(Index(remain_values)) 

1252 ordd = sorted(ordered.get_indexer(ordd)) 

1253 d[remain_key] = ordered.take(ordd) 

1254 

1255 # data_columns 

1256 if data_columns is None: 

1257 data_columns = d[selector] 

1258 

1259 # ensure rows are synchronized across the tables 

1260 if dropna: 

1261 idxs = (value[cols].dropna(how="all").index for cols in d.values()) 

1262 valid_index = next(idxs) 

1263 for index in idxs: 

1264 valid_index = valid_index.intersection(index) 

1265 value = value.loc[valid_index] 

1266 

1267 # append 

1268 for k, v in d.items(): 

1269 dc = data_columns if k == selector else None 

1270 

1271 # compute the val 

1272 val = value.reindex(v, axis=axis) 

1273 

1274 self.append(k, val, data_columns=dc, **kwargs) 

1275 
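# Usage sketch (illustrative, not part of the original module): split one
# frame across two tables and query them back together; assumes an open store
# and a placeholder frame ``df`` with columns A, B, C, D.
#
# >>> store.append_to_multiple(
# ...     {"t1": ["A", "B"], "t2": None}, df, selector="t1")
# >>> store.select_as_multiple(["t1", "t2"], where="A > 0", selector="t1")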

1276 def create_table_index( 

1277 self, 

1278 key: str, 

1279 columns=None, 

1280 optlevel: Optional[int] = None, 

1281 kind: Optional[str] = None, 

1282 ): 

1283 """ 

1284 Create a pytables index on the table. 

1285 

1286 Parameters 

1287 ---------- 

1288 key : str 

1289 columns : None, bool, or listlike[str] 

1290 Indicate which columns to create an index on. 

1291 

1292 * False : Do not create any indexes. 

1293 * True : Create indexes on all columns. 

1294 * None : Create indexes on all columns. 

1295 * listlike : Create indexes on the given columns. 

1296 

1297 optlevel : int or None, default None 

1298 Optimization level, if None, pytables defaults to 6. 

1299 kind : str or None, default None 

1300 Kind of index, if None, pytables defaults to "medium". 

1301 

1302 Raises 

1303 ------ 

1304 TypeError: raises if the node is not a table 

1305 """ 

1306 

1307 # version requirements 

1308 _tables() 

1309 s = self.get_storer(key) 

1310 if s is None: 

1311 return 

1312 

1313 if not isinstance(s, Table): 

1314 raise TypeError("cannot create table index on a Fixed format store") 

1315 s.create_index(columns=columns, optlevel=optlevel, kind=kind) 

1316 
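# Usage sketch (illustrative, not part of the original module): build a
# full (CSI) PyTables index on the data column "B" of an existing table,
# deferring index creation until after the writes.
#
# >>> store.append("df", df, data_columns=["B"], index=False)
# >>> store.create_table_index("df", columns=["B"], optlevel=9, kind="full")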

1317 def groups(self): 

1318 """ 

1319 Return a list of all the top-level nodes. 

1320 

1321 Each node returned is not a pandas storage object. 

1322 

1323 Returns 

1324 ------- 

1325 list 

1326 List of objects. 

1327 """ 

1328 _tables() 

1329 self._check_if_open() 

1330 return [ 

1331 g 

1332 for g in self._handle.walk_groups() 

1333 if ( 

1334 not isinstance(g, _table_mod.link.Link) 

1335 and ( 

1336 getattr(g._v_attrs, "pandas_type", None) 

1337 or getattr(g, "table", None) 

1338 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") 

1339 ) 

1340 ) 

1341 ] 

1342 

1343 def walk(self, where="/"): 

1344 """ 

1345 Walk the pytables group hierarchy for pandas objects. 

1346 

1347 This generator will yield the group path, subgroups and pandas object 

1348 names for each group. 

1349 

1350 Any non-pandas PyTables objects that are not a group will be ignored. 

1351 

1352 The `where` group itself is listed first (preorder), then each of its 

1353 child groups (following an alphanumerical order) is also traversed, 

1354 following the same procedure. 

1355 

1356 .. versionadded:: 0.24.0 

1357 

1358 Parameters 

1359 ---------- 

1360 where : str, default "/" 

1361 Group where to start walking. 

1362 

1363 Yields 

1364 ------ 

1365 path : str 

1366 Full path to a group (without trailing '/'). 

1367 groups : list 

1368 Names (strings) of the groups contained in `path`. 

1369 leaves : list 

1370 Names (strings) of the pandas objects contained in `path`. 

1371 """ 

1372 _tables() 

1373 self._check_if_open() 

1374 for g in self._handle.walk_groups(where): 

1375 if getattr(g._v_attrs, "pandas_type", None) is not None: 

1376 continue 

1377 

1378 groups = [] 

1379 leaves = [] 

1380 for child in g._v_children.values(): 

1381 pandas_type = getattr(child._v_attrs, "pandas_type", None) 

1382 if pandas_type is None: 

1383 if isinstance(child, _table_mod.group.Group): 

1384 groups.append(child._v_name) 

1385 else: 

1386 leaves.append(child._v_name) 

1387 

1388 yield (g._v_pathname.rstrip("/"), groups, leaves) 

1389 
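# Usage sketch (illustrative, not part of the original module): listing the
# group hierarchy of an open store.
#
# >>> for path, groups, leaves in store.walk():
# ...     for leaf in leaves:
# ...         print(f"{path}/{leaf}")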

1390 def get_node(self, key: str) -> Optional["Node"]: 

1391 """ return the node with the key or None if it does not exist """ 

1392 self._check_if_open() 

1393 if not key.startswith("/"): 

1394 key = "/" + key 

1395 

1396 assert self._handle is not None 

1397 assert _table_mod is not None # for mypy 

1398 try: 

1399 node = self._handle.get_node(self.root, key) 

1400 except _table_mod.exceptions.NoSuchNodeError: 

1401 return None 

1402 

1403 assert isinstance(node, _table_mod.Node), type(node) 

1404 return node 

1405 

1406 def get_storer(self, key: str) -> Union["GenericFixed", "Table"]: 

1407 """ return the storer object for a key, raise if not in the file """ 

1408 group = self.get_node(key) 

1409 if group is None: 

1410 raise KeyError(f"No object named {key} in the file") 

1411 

1412 s = self._create_storer(group) 

1413 s.infer_axes() 

1414 return s 

1415 

1416 def copy( 

1417 self, 

1418 file, 

1419 mode="w", 

1420 propindexes: bool = True, 

1421 keys=None, 

1422 complib=None, 

1423 complevel: Optional[int] = None, 

1424 fletcher32: bool = False, 

1425 overwrite=True, 

1426 ): 

1427 """ 

1428 Copy the existing store to a new file and return the newly opened store. 

1429 

1430 Parameters 

1431 ---------- 

1432 propindexes: bool, default True 

1433 Restore indexes in copied file. 

1434 keys : list of keys to include in the copy (defaults to all) 

1435 overwrite : overwrite (remove and replace) existing nodes in the 

1436 new store (default is True) 

1437 mode, complib, complevel, fletcher32 same as in HDFStore.__init__ 

1438 

1439 Returns 

1440 ------- 

1441 open file handle of the new store 

1442 """ 

1443 new_store = HDFStore( 

1444 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 

1445 ) 

1446 if keys is None: 

1447 keys = list(self.keys()) 

1448 if not isinstance(keys, (tuple, list)): 

1449 keys = [keys] 

1450 for k in keys: 

1451 s = self.get_storer(k) 

1452 if s is not None: 

1453 

1454 if k in new_store: 

1455 if overwrite: 

1456 new_store.remove(k) 

1457 

1458 data = self.select(k) 

1459 if isinstance(s, Table): 

1460 

1461 index: Union[bool, List[str]] = False 

1462 if propindexes: 

1463 index = [a.name for a in s.axes if a.is_indexed] 

1464 new_store.append( 

1465 k, 

1466 data, 

1467 index=index, 

1468 data_columns=getattr(s, "data_columns", None), 

1469 encoding=s.encoding, 

1470 ) 

1471 else: 

1472 new_store.put(k, data, encoding=s.encoding) 

1473 

1474 return new_store 

1475 

1476 def info(self) -> str: 

1477 """ 

1478 Print detailed information on the store. 

1479 

1480 .. versionadded:: 0.21.0 

1481 

1482 Returns 

1483 ------- 

1484 str 

1485 """ 

1486 path = pprint_thing(self._path) 

1487 output = f"{type(self)}\nFile path: {path}\n" 

1488 

1489 if self.is_open: 

1490 lkeys = sorted(self.keys()) 

1491 if len(lkeys): 

1492 keys = [] 

1493 values = [] 

1494 

1495 for k in lkeys: 

1496 try: 

1497 s = self.get_storer(k) 

1498 if s is not None: 

1499 keys.append(pprint_thing(s.pathname or k)) 

1500 values.append(pprint_thing(s or "invalid_HDFStore node")) 

1501 except AssertionError: 

1502 # surface any assertion errors for e.g. debugging 

1503 raise 

1504 except Exception as detail: 

1505 keys.append(k) 

1506 dstr = pprint_thing(detail) 

1507 values.append(f"[invalid_HDFStore node: {dstr}]") 

1508 

1509 output += adjoin(12, keys, values) 

1510 else: 

1511 output += "Empty" 

1512 else: 

1513 output += "File is CLOSED" 

1514 

1515 return output 

1516 

1517 # ------------------------------------------------------------------------ 

1518 # private methods 

1519 

1520 def _check_if_open(self): 

1521 if not self.is_open: 

1522 raise ClosedFileError(f"{self._path} file is not open!") 

1523 

1524 def _validate_format(self, format: str) -> str: 

1525 """ validate / deprecate formats """ 

1526 

1527 # validate 

1528 try: 

1529 format = _FORMAT_MAP[format.lower()] 

1530 except KeyError: 

1531 raise TypeError(f"invalid HDFStore format specified [{format}]") 

1532 

1533 return format 

1534 

1535 def _create_storer( 

1536 self, 

1537 group, 

1538 format=None, 

1539 value: Optional[FrameOrSeries] = None, 

1540 encoding: str = "UTF-8", 

1541 errors: str = "strict", 

1542 ) -> Union["GenericFixed", "Table"]: 

1543 """ return a suitable class to operate """ 

1544 

1545 cls: Union[Type["GenericFixed"], Type["Table"]] 

1546 

1547 if value is not None and not isinstance(value, (Series, DataFrame)): 

1548 raise TypeError("value must be None, Series, or DataFrame") 

1549 

1550 def error(t): 

1551 # return instead of raising so mypy can tell where we are raising 

1552 return TypeError( 

1553 f"cannot properly create the storer for: [{t}] [group->" 

1554 f"{group},value->{type(value)},format->{format}" 

1555 ) 

1556 

1557 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) 

1558 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) 

1559 

1560 # infer the pt from the passed value 

1561 if pt is None: 

1562 if value is None: 

1563 

1564 _tables() 

1565 assert _table_mod is not None # for mypy 

1566 if getattr(group, "table", None) or isinstance( 

1567 group, _table_mod.table.Table 

1568 ): 

1569 pt = "frame_table" 

1570 tt = "generic_table" 

1571 else: 

1572 raise TypeError( 

1573 "cannot create a storer if the object is not existing " 

1574 "nor a value are passed" 

1575 ) 

1576 else: 

1577 _TYPE_MAP = {Series: "series", DataFrame: "frame"} 

1578 pt = _TYPE_MAP[type(value)] 

1579 

1580 # we are actually a table 

1581 if format == "table": 

1582 pt += "_table" 

1583 

1584 # a storer node 

1585 if "table" not in pt: 

1586 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} 

1587 try: 

1588 cls = _STORER_MAP[pt] 

1589 except KeyError: 

1590 raise error("_STORER_MAP") 

1591 return cls(self, group, encoding=encoding, errors=errors) 

1592 

1593 # existing node (and must be a table) 

1594 if tt is None: 

1595 

1596 # if we are a writer, determine the tt 

1597 if value is not None: 

1598 

1599 if pt == "series_table": 

1600 index = getattr(value, "index", None) 

1601 if index is not None: 

1602 if index.nlevels == 1: 

1603 tt = "appendable_series" 

1604 elif index.nlevels > 1: 

1605 tt = "appendable_multiseries" 

1606 elif pt == "frame_table": 

1607 index = getattr(value, "index", None) 

1608 if index is not None: 

1609 if index.nlevels == 1: 

1610 tt = "appendable_frame" 

1611 elif index.nlevels > 1: 

1612 tt = "appendable_multiframe" 

1613 

1614 _TABLE_MAP = { 

1615 "generic_table": GenericTable, 

1616 "appendable_series": AppendableSeriesTable, 

1617 "appendable_multiseries": AppendableMultiSeriesTable, 

1618 "appendable_frame": AppendableFrameTable, 

1619 "appendable_multiframe": AppendableMultiFrameTable, 

1620 "worm": WORMTable, 

1621 } 

1622 try: 

1623 cls = _TABLE_MAP[tt] 

1624 except KeyError: 

1625 raise error("_TABLE_MAP") 

1626 

1627 return cls(self, group, encoding=encoding, errors=errors) 

1628 

1629 def _write_to_group( 

1630 self, 

1631 key: str, 

1632 value: FrameOrSeries, 

1633 format, 

1634 axes=None, 

1635 index=True, 

1636 append=False, 

1637 complib=None, 

1638 complevel: Optional[int] = None, 

1639 fletcher32=None, 

1640 min_itemsize: Optional[Union[int, Dict[str, int]]] = None, 

1641 chunksize=None, 

1642 expectedrows=None, 

1643 dropna=False, 

1644 nan_rep=None, 

1645 data_columns=None, 

1646 encoding=None, 

1647 errors: str = "strict", 

1648 ): 

1649 group = self.get_node(key) 

1650 

1651 # we make this assertion for mypy; the get_node call will already 

1652 # have raised if this is incorrect 

1653 assert self._handle is not None 

1654 

1655 # remove the node if we are not appending 

1656 if group is not None and not append: 

1657 self._handle.remove_node(group, recursive=True) 

1658 group = None 

1659 

1660 # we don't want to store a table node at all if our object is 0-len 

1661 # as there are no dtypes 

1662 if getattr(value, "empty", None) and (format == "table" or append): 

1663 return 

1664 

1665 if group is None: 

1666 paths = key.split("/") 

1667 

1668 # recursively create the groups 

1669 path = "/" 

1670 for p in paths: 

1671 if not len(p): 

1672 continue 

1673 new_path = path 

1674 if not path.endswith("/"): 

1675 new_path += "/" 

1676 new_path += p 

1677 group = self.get_node(new_path) 

1678 if group is None: 

1679 group = self._handle.create_group(path, p) 

1680 path = new_path 

1681 

1682 s = self._create_storer(group, format, value, encoding=encoding, errors=errors) 

1683 if append: 

1684 # raise if we are trying to append to a Fixed format, 

1685 # or a table that exists (and we are putting) 

1686 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): 

1687 raise ValueError("Can only append to Tables") 

1688 if not s.is_exists: 

1689 s.set_object_info() 

1690 else: 

1691 s.set_object_info() 

1692 

1693 if not s.is_table and complib: 

1694 raise ValueError("Compression not supported on Fixed format stores") 

1695 

1696 # write the object 

1697 s.write( 

1698 obj=value, 

1699 axes=axes, 

1700 append=append, 

1701 complib=complib, 

1702 complevel=complevel, 

1703 fletcher32=fletcher32, 

1704 min_itemsize=min_itemsize, 

1705 chunksize=chunksize, 

1706 expectedrows=expectedrows, 

1707 dropna=dropna, 

1708 nan_rep=nan_rep, 

1709 data_columns=data_columns, 

1710 ) 

1711 

1712 if isinstance(s, Table) and index: 

1713 s.create_index(columns=index) 

1714 

1715 def _read_group(self, group: "Node"): 

1716 s = self._create_storer(group) 

1717 s.infer_axes() 

1718 return s.read() 

1719 

1720 

1721class TableIterator: 

1722 """ 

1723 Define the iteration interface on a table 

1724 

1725 Parameters 

1726 ---------- 

1727 store : HDFStore 

1728 s : the referred storer 

1729 func : the function to execute the query 

1730 where : the where of the query 

1731 nrows : the rows to iterate on 

1732 start : the passed start value (default is None) 

1733 stop : the passed stop value (default is None) 

1734 iterator : bool, default False 

1735 Whether to use the default iterator. 

1736 chunksize : the passed chunking value (default is 100000) 

1737 auto_close : bool, default False 

1738 Whether to automatically close the store at the end of iteration. 

1739 """ 

1740 

1741 chunksize: Optional[int] 

1742 store: HDFStore 

1743 s: Union["GenericFixed", "Table"] 

1744 

1745 def __init__( 

1746 self, 

1747 store: HDFStore, 

1748 s: Union["GenericFixed", "Table"], 

1749 func, 

1750 where, 

1751 nrows, 

1752 start=None, 

1753 stop=None, 

1754 iterator: bool = False, 

1755 chunksize: Optional[int] = None, 

1756 auto_close: bool = False, 

1757 ): 

1758 self.store = store 

1759 self.s = s 

1760 self.func = func 

1761 self.where = where 

1762 

1763 # set start/stop if they are not set if we are a table 

1764 if self.s.is_table: 

1765 if nrows is None: 

1766 nrows = 0 

1767 if start is None: 

1768 start = 0 

1769 if stop is None: 

1770 stop = nrows 

1771 stop = min(nrows, stop) 

1772 

1773 self.nrows = nrows 

1774 self.start = start 

1775 self.stop = stop 

1776 

1777 self.coordinates = None 

1778 if iterator or chunksize is not None: 

1779 if chunksize is None: 

1780 chunksize = 100000 

1781 self.chunksize = int(chunksize) 

1782 else: 

1783 self.chunksize = None 

1784 

1785 self.auto_close = auto_close 

1786 

1787 def __iter__(self): 

1788 

1789 # iterate 

1790 current = self.start 

1791 while current < self.stop: 

1792 

1793 stop = min(current + self.chunksize, self.stop) 

1794 value = self.func(None, None, self.coordinates[current:stop]) 

1795 current = stop 

1796 if value is None or not len(value): 

1797 continue 

1798 

1799 yield value 

1800 

1801 self.close() 

1802 

1803 def close(self): 

1804 if self.auto_close: 

1805 self.store.close() 

1806 

1807 def get_result(self, coordinates: bool = False): 

1808 

1809 # return the actual iterator 

1810 if self.chunksize is not None: 

1811 if not isinstance(self.s, Table): 

1812 raise TypeError("can only use an iterator or chunksize on a table") 

1813 

1814 self.coordinates = self.s.read_coordinates(where=self.where) 

1815 

1816 return self 

1817 

1818 # if specified, read via coordinates (necessary for multiple selections) 

1819 if coordinates: 

1820 if not isinstance(self.s, Table): 

1821 raise TypeError("can only read_coordinates on a table") 

1822 where = self.s.read_coordinates( 

1823 where=self.where, start=self.start, stop=self.stop 

1824 ) 

1825 else: 

1826 where = self.where 

1827 

1828 # directly return the result 

1829 results = self.func(self.start, self.stop, where) 

1830 self.close() 

1831 return results 

1832 

1833 

1834class IndexCol: 

1835 """ an index column description class 

1836 

1837 Parameters 

1838 ---------- 

1839 

1840 axis : axis which I reference 

1841 values : the ndarray like converted values 

1842 kind : a string description of this type 

1843 typ : the pytables type 

1844 pos : the position in the pytables 

1845 

1846 """ 

1847 

1848 is_an_indexable = True 

1849 is_data_indexable = True 

1850 _info_fields = ["freq", "tz", "index_name"] 

1851 

1852 name: str 

1853 cname: str 

1854 

1855 def __init__( 

1856 self, 

1857 name: str, 

1858 values=None, 

1859 kind=None, 

1860 typ=None, 

1861 cname: Optional[str] = None, 

1862 axis=None, 

1863 pos=None, 

1864 freq=None, 

1865 tz=None, 

1866 index_name=None, 

1867 ordered=None, 

1868 table=None, 

1869 meta=None, 

1870 metadata=None, 

1871 ): 

1872 

1873 if not isinstance(name, str): 

1874 raise ValueError("`name` must be a str.") 

1875 

1876 self.values = values 

1877 self.kind = kind 

1878 self.typ = typ 

1879 self.name = name 

1880 self.cname = cname or name 

1881 self.axis = axis 

1882 self.pos = pos 

1883 self.freq = freq 

1884 self.tz = tz 

1885 self.index_name = index_name 

1886 self.ordered = ordered 

1887 self.table = table 

1888 self.meta = meta 

1889 self.metadata = metadata 

1890 

1891 if pos is not None: 

1892 self.set_pos(pos) 

1893 

1894 # These are ensured as long as the passed arguments match the 

1895 # constructor annotations. 

1896 assert isinstance(self.name, str) 

1897 assert isinstance(self.cname, str) 

1898 

1899 @property 

1900 def itemsize(self) -> int: 

1901 # Assumes self.typ has already been initialized 

1902 return self.typ.itemsize 

1903 

1904 @property 

1905 def kind_attr(self) -> str: 

1906 return f"{self.name}_kind" 

1907 

1908 def set_pos(self, pos: int): 

1909 """ set the position of this column in the Table """ 

1910 self.pos = pos 

1911 if pos is not None and self.typ is not None: 

1912 self.typ._v_pos = pos 

1913 

1914 def __repr__(self) -> str: 

1915 temp = tuple( 

1916 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) 

1917 ) 

1918 return ",".join( 

1919 ( 

1920 f"{key}->{value}" 

1921 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) 

1922 ) 

1923 ) 

1924 

1925 def __eq__(self, other: Any) -> bool: 

1926 """ compare 2 col items """ 

1927 return all( 

1928 getattr(self, a, None) == getattr(other, a, None) 

1929 for a in ["name", "cname", "axis", "pos"] 

1930 ) 

1931 

1932 def __ne__(self, other) -> bool: 

1933 return not self.__eq__(other) 

1934 

1935 @property 

1936 def is_indexed(self) -> bool: 

1937 """ return whether I am an indexed column """ 

1938 if not hasattr(self.table, "cols"): 

1939 # e.g. if infer hasn't been called yet, self.table will be None. 

1940 return False 

1941 # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute 

1942 # 'error: "None" has no attribute "cols"' 

1943 return getattr(self.table.cols, self.cname).is_indexed # type: ignore 

1944 

1945 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

1946 """ 

1947 Convert the data from this selection to the appropriate pandas type. 

1948 """ 

1949 assert isinstance(values, np.ndarray), type(values) 

1950 

1951 # values is a recarray 

1952 if values.dtype.fields is not None: 

1953 values = values[self.cname] 

1954 

1955 val_kind = _ensure_decoded(self.kind) 

1956 values = _maybe_convert(values, val_kind, encoding, errors) 

1957 

1958 kwargs = dict() 

1959 kwargs["name"] = _ensure_decoded(self.index_name) 

1960 

1961 if self.freq is not None: 

1962 kwargs["freq"] = _ensure_decoded(self.freq) 

1963 

1964 # making an Index instance could throw a number of different errors 

1965 try: 

1966 new_pd_index = Index(values, **kwargs) 

1967 except ValueError: 

1968 # if the output freq is different than what we recorded, 

1969 # it should be None (see also 'doc example part 2') 

1970 if "freq" in kwargs: 

1971 kwargs["freq"] = None 

1972 new_pd_index = Index(values, **kwargs) 

1973 

1974 new_pd_index = _set_tz(new_pd_index, self.tz) 

1975 return new_pd_index, new_pd_index 

1976 

1977 def take_data(self): 

1978 """ return the values""" 

1979 return self.values 

1980 

1981 @property 

1982 def attrs(self): 

1983 return self.table._v_attrs 

1984 

1985 @property 

1986 def description(self): 

1987 return self.table.description 

1988 

1989 @property 

1990 def col(self): 

1991 """ return my current col description """ 

1992 return getattr(self.description, self.cname, None) 

1993 

1994 @property 

1995 def cvalues(self): 

1996 """ return my cython values """ 

1997 return self.values 

1998 

1999 def __iter__(self): 

2000 return iter(self.values) 

2001 

2002 def maybe_set_size(self, min_itemsize=None): 

2003 """ maybe set a string col itemsize: 

2004 min_itemsize can be an integer or a dict with this column's name 

2005 with an integer size """ 

2006 if _ensure_decoded(self.kind) == "string": 

2007 

2008 if isinstance(min_itemsize, dict): 

2009 min_itemsize = min_itemsize.get(self.name) 

2010 

2011 if min_itemsize is not None and self.typ.itemsize < min_itemsize: 

2012 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) 
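# Hedged example of the resizing logic above (column name and sizes are
# assumptions): for a string column named "ticker" currently typed as a
# 10-byte StringCol,
#   col.maybe_set_size(min_itemsize={"ticker": 30})
# replaces col.typ with a 30-byte StringCol, while
#   col.maybe_set_size(min_itemsize=5)
# leaves it unchanged because 5 < 10.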

2013 

2014 def validate_names(self): 

2015 pass 

2016 

2017 def validate_and_set(self, handler: "AppendableTable", append: bool): 

2018 self.table = handler.table 

2019 self.validate_col() 

2020 self.validate_attr(append) 

2021 self.validate_metadata(handler) 

2022 self.write_metadata(handler) 

2023 self.set_attr() 

2024 

2025 def validate_col(self, itemsize=None): 

2026 """ validate this column: return the compared against itemsize """ 

2027 

2028 # validate this column for string truncation (or reset to the max size) 

2029 if _ensure_decoded(self.kind) == "string": 

2030 c = self.col 

2031 if c is not None: 

2032 if itemsize is None: 

2033 itemsize = self.itemsize 

2034 if c.itemsize < itemsize: 

2035 raise ValueError( 

2036 f"Trying to store a string with len [{itemsize}] in " 

2037 f"[{self.cname}] column but\nthis column has a limit of " 

2038 f"[{c.itemsize}]!\nConsider using min_itemsize to " 

2039 "preset the sizes on these columns" 

2040 ) 

2041 return c.itemsize 

2042 

2043 return None 

2044 

2045 def validate_attr(self, append: bool): 

2046 # check for backwards incompatibility 

2047 if append: 

2048 existing_kind = getattr(self.attrs, self.kind_attr, None) 

2049 if existing_kind is not None and existing_kind != self.kind: 

2050 raise TypeError( 

2051 f"incompatible kind in col [{existing_kind} - {self.kind}]" 

2052 ) 

2053 

2054 def update_info(self, info): 

2055 """ set/update the info for this indexable with the key/value 

2056 if there is a conflict, raise/warn as needed """ 

2057 

2058 for key in self._info_fields: 

2059 

2060 value = getattr(self, key, None) 

2061 idx = info.setdefault(self.name, {}) 

2062 

2063 existing_value = idx.get(key) 

2064 if key in idx and value is not None and existing_value != value: 

2065 

2066 # frequency/name just warn 

2067 if key in ["freq", "index_name"]: 

2068 ws = attribute_conflict_doc % (key, existing_value, value) 

2069 warnings.warn(ws, AttributeConflictWarning, stacklevel=6) 

2070 

2071 # reset 

2072 idx[key] = None 

2073 setattr(self, key, None) 

2074 

2075 else: 

2076 raise ValueError( 

2077 f"invalid info for [{self.name}] for [{key}], " 

2078 f"existing_value [{existing_value}] conflicts with " 

2079 f"new value [{value}]" 

2080 ) 

2081 else: 

2082 if value is not None or existing_value is not None: 

2083 idx[key] = value 
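# Minimal sketch of how `info` evolves (keys and values are assumptions): for
# a datetime index column named "index" with freq "D" and no tz,
#   info = {}
#   col.update_info(info)
#   # info -> {"index": {"freq": "D"}}   (None-valued fields are left out)
# A later append with a conflicting freq or index_name warns and resets the
# stored value to None, per the branch above; conflicts on other fields raise.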

2084 

2085 def set_info(self, info): 

2086 """ set my state from the passed info """ 

2087 idx = info.get(self.name) 

2088 if idx is not None: 

2089 self.__dict__.update(idx) 

2090 

2091 def set_attr(self): 

2092 """ set the kind for this column """ 

2093 setattr(self.attrs, self.kind_attr, self.kind) 

2094 

2095 def validate_metadata(self, handler: "AppendableTable"): 

2096 """ validate that kind=category does not change the categories """ 

2097 if self.meta == "category": 

2098 new_metadata = self.metadata 

2099 cur_metadata = handler.read_metadata(self.cname) 

2100 if ( 

2101 new_metadata is not None 

2102 and cur_metadata is not None 

2103 and not array_equivalent(new_metadata, cur_metadata) 

2104 ): 

2105 raise ValueError( 

2106 "cannot append a categorical with " 

2107 "different categories to the existing" 

2108 ) 

2109 

2110 def write_metadata(self, handler: "AppendableTable"): 

2111 """ set the meta data """ 

2112 if self.metadata is not None: 

2113 handler.write_metadata(self.cname, self.metadata) 

2114 

2115 

2116class GenericIndexCol(IndexCol): 

2117 """ an index which is not represented in the data of the table """ 

2118 

2119 @property 

2120 def is_indexed(self) -> bool: 

2121 return False 

2122 

2123 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

2124 """ 

2125 Convert the data from this selection to the appropriate pandas type. 

2126 

2127 Parameters 

2128 ---------- 

2129 values : np.ndarray 

2130 nan_rep : str 

2131 encoding : str 

2132 errors : str 

2133 """ 

2134 assert isinstance(values, np.ndarray), type(values) 

2135 

2136 values = Int64Index(np.arange(len(values))) 

2137 return values, values 
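# In other words, a GenericIndexCol ignores the stored values and produces a
# default 0..n-1 integer index, e.g. (hypothetical input):
#   idx, _ = col.convert(np.array([10, 20, 30]), None, "UTF-8", "strict")
#   # idx -> Int64Index([0, 1, 2])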

2138 

2139 def set_attr(self): 

2140 pass 

2141 

2142 

2143class DataCol(IndexCol): 

2144 """ a data holding column, by definition this is not indexable 

2145 

2146 Parameters 

2147 ---------- 

2148 

2149 data : the actual data 

2150 cname : the column name in the table to hold the data (typically 

2151 values) 

2152 meta : a string description of the metadata 

2153 metadata : the actual metadata 

2154 """ 

2155 

2156 is_an_indexable = False 

2157 is_data_indexable = False 

2158 _info_fields = ["tz", "ordered"] 

2159 

2160 def __init__( 

2161 self, 

2162 name: str, 

2163 values=None, 

2164 kind=None, 

2165 typ=None, 

2166 cname=None, 

2167 pos=None, 

2168 tz=None, 

2169 ordered=None, 

2170 table=None, 

2171 meta=None, 

2172 metadata=None, 

2173 dtype=None, 

2174 data=None, 

2175 ): 

2176 super().__init__( 

2177 name=name, 

2178 values=values, 

2179 kind=kind, 

2180 typ=typ, 

2181 pos=pos, 

2182 cname=cname, 

2183 tz=tz, 

2184 ordered=ordered, 

2185 table=table, 

2186 meta=meta, 

2187 metadata=metadata, 

2188 ) 

2189 self.dtype = dtype 

2190 self.data = data 

2191 

2192 @property 

2193 def dtype_attr(self) -> str: 

2194 return f"{self.name}_dtype" 

2195 

2196 @property 

2197 def meta_attr(self) -> str: 

2198 return f"{self.name}_meta" 

2199 

2200 def __repr__(self) -> str: 

2201 temp = tuple( 

2202 map( 

2203 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) 

2204 ) 

2205 ) 

2206 return ",".join( 

2207 ( 

2208 f"{key}->{value}" 

2209 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) 

2210 ) 

2211 ) 

2212 

2213 def __eq__(self, other: Any) -> bool: 

2214 """ compare 2 col items """ 

2215 return all( 

2216 getattr(self, a, None) == getattr(other, a, None) 

2217 for a in ["name", "cname", "dtype", "pos"] 

2218 ) 

2219 

2220 def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): 

2221 assert data is not None 

2222 assert self.dtype is None 

2223 

2224 data, dtype_name = _get_data_and_dtype_name(data) 

2225 

2226 self.data = data 

2227 self.dtype = dtype_name 

2228 self.kind = _dtype_to_kind(dtype_name) 

2229 

2230 def take_data(self): 

2231 """ return the data """ 

2232 return self.data 

2233 

2234 @classmethod 

2235 def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col": 

2236 """ 

2237 Get an appropriately typed and shaped pytables.Col object for values. 

2238 """ 

2239 

2240 dtype = values.dtype 

2241 itemsize = dtype.itemsize 

2242 

2243 shape = values.shape 

2244 if values.ndim == 1: 

2245 # EA, use block shape pretending it is 2D 

2246 shape = (1, values.size) 

2247 

2248 if is_categorical_dtype(dtype): 

2249 codes = values.codes 

2250 atom = cls.get_atom_data(shape, kind=codes.dtype.name) 

2251 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): 

2252 atom = cls.get_atom_datetime64(shape) 

2253 elif is_timedelta64_dtype(dtype): 

2254 atom = cls.get_atom_timedelta64(shape) 

2255 elif is_complex_dtype(dtype): 

2256 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) 

2257 

2258 elif is_string_dtype(dtype): 

2259 atom = cls.get_atom_string(shape, itemsize) 

2260 

2261 else: 

2262 atom = cls.get_atom_data(shape, kind=dtype.name) 

2263 

2264 return atom 

2265 

2266 @classmethod 

2267 def get_atom_string(cls, shape, itemsize): 

2268 return _tables().StringCol(itemsize=itemsize, shape=shape[0]) 

2269 

2270 @classmethod 

2271 def get_atom_coltype(cls, kind: str) -> Type["Col"]: 

2272 """ return the PyTables column class for this column """ 

2273 if kind.startswith("uint"): 

2274 k4 = kind[4:] 

2275 col_name = f"UInt{k4}Col" 

2276 elif kind.startswith("period"): 

2277 # we store as integer 

2278 col_name = "Int64Col" 

2279 else: 

2280 kcap = kind.capitalize() 

2281 col_name = f"{kcap}Col" 

2282 

2283 return getattr(_tables(), col_name) 
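# Sketch of the kind -> PyTables column-class mapping above (assuming the
# usual PyTables classes are available via _tables()):
#   cls.get_atom_coltype(kind="uint32")   # -> tables.UInt32Col
#   cls.get_atom_coltype(kind="float64")  # -> tables.Float64Col
#   cls.get_atom_coltype(kind="period")   # -> tables.Int64Col (periods stored as i8)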

2284 

2285 @classmethod 

2286 def get_atom_data(cls, shape, kind: str) -> "Col": 

2287 return cls.get_atom_coltype(kind=kind)(shape=shape[0]) 

2288 

2289 @classmethod 

2290 def get_atom_datetime64(cls, shape): 

2291 return _tables().Int64Col(shape=shape[0]) 

2292 

2293 @classmethod 

2294 def get_atom_timedelta64(cls, shape): 

2295 return _tables().Int64Col(shape=shape[0]) 

2296 

2297 @property 

2298 def shape(self): 

2299 return getattr(self.data, "shape", None) 

2300 

2301 @property 

2302 def cvalues(self): 

2303 """ return my cython values """ 

2304 return self.data 

2305 

2306 def validate_attr(self, append): 

2307 """validate that we have the same order as the existing & same dtype""" 

2308 if append: 

2309 existing_fields = getattr(self.attrs, self.kind_attr, None) 

2310 if existing_fields is not None and existing_fields != list(self.values): 

2311 raise ValueError("appended items do not match existing items in table!") 

2312 

2313 existing_dtype = getattr(self.attrs, self.dtype_attr, None) 

2314 if existing_dtype is not None and existing_dtype != self.dtype: 

2315 raise ValueError( 

2316 "appended items dtype do not match existing " 

2317 "items dtype in table!" 

2318 ) 

2319 

2320 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

2321 """ 

2322 Convert the data from this selection to the appropriate pandas type. 

2323 

2324 Parameters 

2325 ---------- 

2326 values : np.ndarray 

2327 nan_rep : 

2328 encoding : str 

2329 errors : str 

2330 

2331 Returns 

2332 ------- 

2333 index : listlike to become an Index 

2334 data : ndarraylike to become a column 

2335 """ 

2336 assert isinstance(values, np.ndarray), type(values) 

2337 

2338 # values is a recarray 

2339 if values.dtype.fields is not None: 

2340 values = values[self.cname] 

2341 

2342 assert self.typ is not None 

2343 if self.dtype is None: 

2344 # Note: in tests we never have timedelta64 or datetime64, 

2345 # so the _get_data_and_dtype_name may be unnecessary 

2346 converted, dtype_name = _get_data_and_dtype_name(values) 

2347 kind = _dtype_to_kind(dtype_name) 

2348 else: 

2349 converted = values 

2350 dtype_name = self.dtype 

2351 kind = self.kind 

2352 

2353 assert isinstance(converted, np.ndarray) # for mypy 

2354 

2355 # use the meta if needed 

2356 meta = _ensure_decoded(self.meta) 

2357 metadata = self.metadata 

2358 ordered = self.ordered 

2359 tz = self.tz 

2360 

2361 assert dtype_name is not None 

2362 # convert to the correct dtype 

2363 dtype = _ensure_decoded(dtype_name) 

2364 

2365 # reverse converts 

2366 if dtype == "datetime64": 

2367 

2368 # recreate with tz if indicated 

2369 converted = _set_tz(converted, tz, coerce=True) 

2370 

2371 elif dtype == "timedelta64": 

2372 converted = np.asarray(converted, dtype="m8[ns]") 

2373 elif dtype == "date": 

2374 try: 

2375 converted = np.asarray( 

2376 [date.fromordinal(v) for v in converted], dtype=object 

2377 ) 

2378 except ValueError: 

2379 converted = np.asarray( 

2380 [date.fromtimestamp(v) for v in converted], dtype=object 

2381 ) 

2382 

2383 elif meta == "category": 

2384 

2385 # we have a categorical 

2386 categories = metadata 

2387 codes = converted.ravel() 

2388 

2389 # if we have stored a NaN in the categories 

2390 # then strip it; in theory we could have BOTH 

2391 # -1s in the codes and nulls :< 

2392 if categories is None: 

2393 # Handle case of NaN-only categorical columns in which case 

2394 # the categories are an empty array; when this is stored, 

2395 # pytables cannot write a zero-len array, so on readback 

2396 # the categories would be None and `read_hdf()` would fail. 

2397 categories = Index([], dtype=np.float64) 

2398 else: 

2399 mask = isna(categories) 

2400 if mask.any(): 

2401 categories = categories[~mask] 

2402 codes[codes != -1] -= mask.astype(int).cumsum().values 

2403 

2404 converted = Categorical.from_codes( 

2405 codes, categories=categories, ordered=ordered 

2406 ) 

2407 

2408 else: 

2409 

2410 try: 

2411 converted = converted.astype(dtype, copy=False) 

2412 except TypeError: 

2413 converted = converted.astype("O", copy=False) 

2414 

2415 # convert nans / decode 

2416 if _ensure_decoded(kind) == "string": 

2417 converted = _unconvert_string_array( 

2418 converted, nan_rep=nan_rep, encoding=encoding, errors=errors 

2419 ) 

2420 

2421 return self.values, converted 

2422 

2423 def set_attr(self): 

2424 """ set the data for this column """ 

2425 setattr(self.attrs, self.kind_attr, self.values) 

2426 setattr(self.attrs, self.meta_attr, self.meta) 

2427 assert self.dtype is not None 

2428 setattr(self.attrs, self.dtype_attr, self.dtype) 

2429 

2430 

2431class DataIndexableCol(DataCol): 

2432 """ represent a data column that can be indexed """ 

2433 

2434 is_data_indexable = True 

2435 

2436 def validate_names(self): 

2437 if not Index(self.values).is_object(): 

2438 # TODO: should the message here be more specifically non-str? 

2439 raise ValueError("cannot have non-object label DataIndexableCol") 

2440 

2441 @classmethod 

2442 def get_atom_string(cls, shape, itemsize): 

2443 return _tables().StringCol(itemsize=itemsize) 

2444 

2445 @classmethod 

2446 def get_atom_data(cls, shape, kind: str) -> "Col": 

2447 return cls.get_atom_coltype(kind=kind)() 

2448 

2449 @classmethod 

2450 def get_atom_datetime64(cls, shape): 

2451 return _tables().Int64Col() 

2452 

2453 @classmethod 

2454 def get_atom_timedelta64(cls, shape): 

2455 return _tables().Int64Col() 

2456 

2457 

2458class GenericDataIndexableCol(DataIndexableCol): 

2459 """ represent a generic pytables data column """ 

2460 

2461 pass 

2462 

2463 

2464class Fixed: 

2465 """ represent an object in my store 

2466 facilitate read/write of various types of objects 

2467 this is an abstract base class 

2468 

2469 Parameters 

2470 ---------- 

2471 parent : HDFStore 

2472 group : Node 

2473 The group node where the table resides. 

2474 """ 

2475 

2476 pandas_kind: str 

2477 format_type: str = "fixed" # GH#30962 needed by dask 

2478 obj_type: Type[Union[DataFrame, Series]] 

2479 ndim: int 

2480 encoding: str 

2481 parent: HDFStore 

2482 group: "Node" 

2483 errors: str 

2484 is_table = False 

2485 

2486 def __init__( 

2487 self, 

2488 parent: HDFStore, 

2489 group: "Node", 

2490 encoding: str = "UTF-8", 

2491 errors: str = "strict", 

2492 ): 

2493 assert isinstance(parent, HDFStore), type(parent) 

2494 assert _table_mod is not None # needed for mypy 

2495 assert isinstance(group, _table_mod.Node), type(group) 

2496 self.parent = parent 

2497 self.group = group 

2498 self.encoding = _ensure_encoding(encoding) 

2499 self.errors = errors 

2500 

2501 @property 

2502 def is_old_version(self) -> bool: 

2503 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 

2504 

2505 @property 

2506 def version(self) -> Tuple[int, int, int]: 

2507 """ compute and set our version """ 

2508 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) 

2509 try: 

2510 version = tuple(int(x) for x in version.split(".")) 

2511 if len(version) == 2: 

2512 version = version + (0,) 

2513 except AttributeError: 

2514 version = (0, 0, 0) 

2515 return version 
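# Examples of the parsing above (attribute values are hypothetical):
#   pandas_version = "0.15.2"  -> (0, 15, 2)
#   pandas_version = "0.10"    -> (0, 10, 0)
#   attribute missing (None)   -> (0, 0, 0) via the AttributeError fallback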

2516 

2517 @property 

2518 def pandas_type(self): 

2519 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) 

2520 

2521 def __repr__(self) -> str: 

2522 """ return a pretty representation of myself """ 

2523 self.infer_axes() 

2524 s = self.shape 

2525 if s is not None: 

2526 if isinstance(s, (list, tuple)): 

2527 jshape = ",".join(pprint_thing(x) for x in s) 

2528 s = f"[{jshape}]" 

2529 return f"{self.pandas_type:12.12} (shape->{s})" 

2530 return self.pandas_type 

2531 

2532 def set_object_info(self): 

2533 """ set my pandas type & version """ 

2534 self.attrs.pandas_type = str(self.pandas_kind) 

2535 self.attrs.pandas_version = str(_version) 

2536 

2537 def copy(self): 

2538 new_self = copy.copy(self) 

2539 return new_self 

2540 

2541 @property 

2542 def shape(self): 

2543 return self.nrows 

2544 

2545 @property 

2546 def pathname(self): 

2547 return self.group._v_pathname 

2548 

2549 @property 

2550 def _handle(self): 

2551 return self.parent._handle 

2552 

2553 @property 

2554 def _filters(self): 

2555 return self.parent._filters 

2556 

2557 @property 

2558 def _complevel(self) -> int: 

2559 return self.parent._complevel 

2560 

2561 @property 

2562 def _fletcher32(self) -> bool: 

2563 return self.parent._fletcher32 

2564 

2565 @property 

2566 def attrs(self): 

2567 return self.group._v_attrs 

2568 

2569 def set_attrs(self): 

2570 """ set our object attributes """ 

2571 pass 

2572 

2573 def get_attrs(self): 

2574 """ get our object attributes """ 

2575 pass 

2576 

2577 @property 

2578 def storable(self): 

2579 """ return my storable """ 

2580 return self.group 

2581 

2582 @property 

2583 def is_exists(self) -> bool: 

2584 return False 

2585 

2586 @property 

2587 def nrows(self): 

2588 return getattr(self.storable, "nrows", None) 

2589 

2590 def validate(self, other): 

2591 """ validate against an existing storable """ 

2592 if other is None: 

2593 return 

2594 return True 

2595 

2596 def validate_version(self, where=None): 

2597 """ are we trying to operate on an old version? """ 

2598 return True 

2599 

2600 def infer_axes(self): 

2601 """ infer the axes of my storer 

2602 return a boolean indicating whether we have a valid storer or not """ 

2603 

2604 s = self.storable 

2605 if s is None: 

2606 return False 

2607 self.get_attrs() 

2608 return True 

2609 

2610 def read( 

2611 self, 

2612 where=None, 

2613 columns=None, 

2614 start: Optional[int] = None, 

2615 stop: Optional[int] = None, 

2616 ): 

2617 raise NotImplementedError( 

2618 "cannot read on an abstract storer: subclasses should implement" 

2619 ) 

2620 

2621 def write(self, **kwargs): 

2622 raise NotImplementedError( 

2623 "cannot write on an abstract storer: subclasses should implement" 

2624 ) 

2625 

2626 def delete( 

2627 self, where=None, start: Optional[int] = None, stop: Optional[int] = None 

2628 ): 

2629 """ 

2630 support fully deleting the node in its entirety (only) - where 

2631 specification must be None 

2632 """ 

2633 if com.all_none(where, start, stop): 

2634 self._handle.remove_node(self.group, recursive=True) 

2635 return None 

2636 

2637 raise TypeError("cannot delete on an abstract storer") 

2638 

2639 

2640class GenericFixed(Fixed): 

2641 """ a generified fixed version """ 

2642 

2643 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} 

2644 _reverse_index_map = {v: k for k, v in _index_type_map.items()} 

2645 attributes: List[str] = [] 

2646 

2647 # indexer helpers 

2648 def _class_to_alias(self, cls) -> str: 

2649 return self._index_type_map.get(cls, "") 

2650 

2651 def _alias_to_class(self, alias): 

2652 if isinstance(alias, type): # pragma: no cover 

2653 # compat: for a short period of time master stored types 

2654 return alias 

2655 return self._reverse_index_map.get(alias, Index) 

2656 

2657 def _get_index_factory(self, klass): 

2658 if klass == DatetimeIndex: 

2659 

2660 def f(values, freq=None, tz=None): 

2661 # data are already in UTC, localize and convert if tz present 

2662 result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) 

2663 if tz is not None: 

2664 result = result.tz_localize("UTC").tz_convert(tz) 

2665 return result 

2666 

2667 return f 

2668 elif klass == PeriodIndex: 

2669 

2670 def f(values, freq=None, tz=None): 

2671 return PeriodIndex._simple_new(values, name=None, freq=freq) 

2672 

2673 return f 

2674 

2675 return klass 

2676 

2677 def validate_read(self, columns, where): 

2678 """ 

2679 raise if any keywords are passed which are not None 

2680 """ 

2681 if columns is not None: 

2682 raise TypeError( 

2683 "cannot pass a column specification when reading " 

2684 "a Fixed format store. this store must be " 

2685 "selected in its entirety" 

2686 ) 

2687 if where is not None: 

2688 raise TypeError( 

2689 "cannot pass a where specification when reading " 

2690 "from a Fixed format store. this store must be " 

2691 "selected in its entirety" 

2692 ) 

2693 

2694 @property 

2695 def is_exists(self) -> bool: 

2696 return True 

2697 

2698 def set_attrs(self): 

2699 """ set our object attributes """ 

2700 self.attrs.encoding = self.encoding 

2701 self.attrs.errors = self.errors 

2702 

2703 def get_attrs(self): 

2704 """ retrieve our attributes """ 

2705 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

2706 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

2707 for n in self.attributes: 

2708 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) 

2709 

2710 def write(self, obj, **kwargs): 

2711 self.set_attrs() 

2712 

2713 def read_array( 

2714 self, key: str, start: Optional[int] = None, stop: Optional[int] = None 

2715 ): 

2716 """ read an array for the specified node (off of group """ 

2717 import tables 

2718 

2719 node = getattr(self.group, key) 

2720 attrs = node._v_attrs 

2721 

2722 transposed = getattr(attrs, "transposed", False) 

2723 

2724 if isinstance(node, tables.VLArray): 

2725 ret = node[0][start:stop] 

2726 else: 

2727 dtype = getattr(attrs, "value_type", None) 

2728 shape = getattr(attrs, "shape", None) 

2729 

2730 if shape is not None: 

2731 # length 0 axis 

2732 ret = np.empty(shape, dtype=dtype) 

2733 else: 

2734 ret = node[start:stop] 

2735 

2736 if dtype == "datetime64": 

2737 

2738 # reconstruct a timezone if indicated 

2739 tz = getattr(attrs, "tz", None) 

2740 ret = _set_tz(ret, tz, coerce=True) 

2741 

2742 elif dtype == "timedelta64": 

2743 ret = np.asarray(ret, dtype="m8[ns]") 

2744 

2745 if transposed: 

2746 return ret.T 

2747 else: 

2748 return ret 

2749 

2750 def read_index( 

2751 self, key: str, start: Optional[int] = None, stop: Optional[int] = None 

2752 ) -> Index: 

2753 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) 

2754 

2755 if variety == "multi": 

2756 return self.read_multi_index(key, start=start, stop=stop) 

2757 elif variety == "regular": 

2758 node = getattr(self.group, key) 

2759 index = self.read_index_node(node, start=start, stop=stop) 

2760 return index 

2761 else: # pragma: no cover 

2762 raise TypeError(f"unrecognized index variety: {variety}") 

2763 

2764 def write_index(self, key: str, index: Index): 

2765 if isinstance(index, MultiIndex): 

2766 setattr(self.attrs, f"{key}_variety", "multi") 

2767 self.write_multi_index(key, index) 

2768 else: 

2769 setattr(self.attrs, f"{key}_variety", "regular") 

2770 converted = _convert_index("index", index, self.encoding, self.errors) 

2771 

2772 self.write_array(key, converted.values) 

2773 

2774 node = getattr(self.group, key) 

2775 node._v_attrs.kind = converted.kind 

2776 node._v_attrs.name = index.name 

2777 

2778 if isinstance(index, (DatetimeIndex, PeriodIndex)): 

2779 node._v_attrs.index_class = self._class_to_alias(type(index)) 

2780 

2781 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): 

2782 node._v_attrs.freq = index.freq 

2783 

2784 if isinstance(index, DatetimeIndex) and index.tz is not None: 

2785 node._v_attrs.tz = _get_tz(index.tz) 

2786 

2787 def write_multi_index(self, key: str, index: MultiIndex): 

2788 setattr(self.attrs, f"{key}_nlevels", index.nlevels) 

2789 

2790 for i, (lev, level_codes, name) in enumerate( 

2791 zip(index.levels, index.codes, index.names) 

2792 ): 

2793 # write the level 

2794 if is_extension_array_dtype(lev): 

2795 raise NotImplementedError( 

2796 "Saving a MultiIndex with an extension dtype is not supported." 

2797 ) 

2798 level_key = f"{key}_level{i}" 

2799 conv_level = _convert_index(level_key, lev, self.encoding, self.errors) 

2800 self.write_array(level_key, conv_level.values) 

2801 node = getattr(self.group, level_key) 

2802 node._v_attrs.kind = conv_level.kind 

2803 node._v_attrs.name = name 

2804 

2805 # write the name 

2806 setattr(node._v_attrs, f"{key}_name{name}", name) 

2807 

2808 # write the labels 

2809 label_key = f"{key}_label{i}" 

2810 self.write_array(label_key, level_codes) 
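# Rough layout written above for a two-level MultiIndex stored under key
# "axis0" (node names follow the f-strings above; values are assumptions):
#   attrs:  axis0_nlevels = 2
#   arrays: axis0_level0, axis0_label0, axis0_level1, axis0_label1
# with each level node also carrying its kind and name in _v_attrs.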

2811 

2812 def read_multi_index( 

2813 self, key: str, start: Optional[int] = None, stop: Optional[int] = None 

2814 ) -> MultiIndex: 

2815 nlevels = getattr(self.attrs, f"{key}_nlevels") 

2816 

2817 levels = [] 

2818 codes = [] 

2819 names: List[Optional[Hashable]] = [] 

2820 for i in range(nlevels): 

2821 level_key = f"{key}_level{i}" 

2822 node = getattr(self.group, level_key) 

2823 lev = self.read_index_node(node, start=start, stop=stop) 

2824 levels.append(lev) 

2825 names.append(lev.name) 

2826 

2827 label_key = f"{key}_label{i}" 

2828 level_codes = self.read_array(label_key, start=start, stop=stop) 

2829 codes.append(level_codes) 

2830 

2831 return MultiIndex( 

2832 levels=levels, codes=codes, names=names, verify_integrity=True 

2833 ) 

2834 

2835 def read_index_node( 

2836 self, node: "Node", start: Optional[int] = None, stop: Optional[int] = None 

2837 ) -> Index: 

2838 data = node[start:stop] 

2839 # If the index was an empty array write_array_empty() will 

2840 # have written a sentinel. Here we replace it with the original. 

2841 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: 

2842 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) 

2843 kind = _ensure_decoded(node._v_attrs.kind) 

2844 name = None 

2845 

2846 if "name" in node._v_attrs: 

2847 name = _ensure_str(node._v_attrs.name) 

2848 name = _ensure_decoded(name) 

2849 

2850 index_class = self._alias_to_class( 

2851 _ensure_decoded(getattr(node._v_attrs, "index_class", "")) 

2852 ) 

2853 factory = self._get_index_factory(index_class) 

2854 

2855 kwargs = {} 

2856 if "freq" in node._v_attrs: 

2857 kwargs["freq"] = node._v_attrs["freq"] 

2858 

2859 if "tz" in node._v_attrs: 

2860 if isinstance(node._v_attrs["tz"], bytes): 

2861 # created by python2 

2862 kwargs["tz"] = node._v_attrs["tz"].decode("utf-8") 

2863 else: 

2864 # created by python3 

2865 kwargs["tz"] = node._v_attrs["tz"] 

2866 

2867 if kind == "date": 

2868 index = factory( 

2869 _unconvert_index( 

2870 data, kind, encoding=self.encoding, errors=self.errors 

2871 ), 

2872 dtype=object, 

2873 **kwargs, 

2874 ) 

2875 else: 

2876 index = factory( 

2877 _unconvert_index( 

2878 data, kind, encoding=self.encoding, errors=self.errors 

2879 ), 

2880 **kwargs, 

2881 ) 

2882 

2883 index.name = name 

2884 

2885 return index 

2886 

2887 def write_array_empty(self, key: str, value: ArrayLike): 

2888 """ write a 0-len array """ 

2889 

2890 # ugly hack for length 0 axes 

2891 arr = np.empty((1,) * value.ndim) 

2892 self._handle.create_array(self.group, key, arr) 

2893 node = getattr(self.group, key) 

2894 node._v_attrs.value_type = str(value.dtype) 

2895 node._v_attrs.shape = value.shape 

2896 

2897 def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None): 

2898 # TODO: we only have one test that gets here, the only EA 

2899 # that gets passed is DatetimeArray, and we never have 

2900 # both self._filters and EA 

2901 assert isinstance(value, (np.ndarray, ABCExtensionArray)), type(value) 

2902 

2903 if key in self.group: 

2904 self._handle.remove_node(self.group, key) 

2905 

2906 # Transform needed to interface with pytables row/col notation 

2907 empty_array = value.size == 0 

2908 transposed = False 

2909 

2910 if is_categorical_dtype(value): 

2911 raise NotImplementedError( 

2912 "Cannot store a category dtype in " 

2913 "a HDF5 dataset that uses format=" 

2914 '"fixed". Use format="table".' 

2915 ) 

2916 if not empty_array: 

2917 if hasattr(value, "T"): 

2918 # ExtensionArrays (1d) may not have transpose. 

2919 value = value.T 

2920 transposed = True 

2921 

2922 atom = None 

2923 if self._filters is not None: 

2924 try: 

2925 # get the atom for this datatype 

2926 atom = _tables().Atom.from_dtype(value.dtype) 

2927 except ValueError: 

2928 pass 

2929 

2930 if atom is not None: 

2931 # We only get here if self._filters is non-None and 

2932 # the Atom.from_dtype call succeeded 

2933 

2934 # create an empty chunked array and fill it from value 

2935 if not empty_array: 

2936 ca = self._handle.create_carray( 

2937 self.group, key, atom, value.shape, filters=self._filters 

2938 ) 

2939 ca[:] = value 

2940 

2941 else: 

2942 self.write_array_empty(key, value) 

2943 

2944 elif value.dtype.type == np.object_: 

2945 

2946 # infer the type, warn if we have a non-string type here (for 

2947 # performance) 

2948 inferred_type = lib.infer_dtype(value.ravel(), skipna=False) 

2949 if empty_array: 

2950 pass 

2951 elif inferred_type == "string": 

2952 pass 

2953 else: 

2954 ws = performance_doc % (inferred_type, key, items) 

2955 warnings.warn(ws, PerformanceWarning, stacklevel=7) 

2956 

2957 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) 

2958 vlarr.append(value) 

2959 

2960 elif empty_array: 

2961 self.write_array_empty(key, value) 

2962 elif is_datetime64_dtype(value.dtype): 

2963 self._handle.create_array(self.group, key, value.view("i8")) 

2964 getattr(self.group, key)._v_attrs.value_type = "datetime64" 

2965 elif is_datetime64tz_dtype(value.dtype): 

2966 # store as UTC 

2967 # with a zone 

2968 self._handle.create_array(self.group, key, value.asi8) 

2969 

2970 node = getattr(self.group, key) 

2971 node._v_attrs.tz = _get_tz(value.tz) 

2972 node._v_attrs.value_type = "datetime64" 

2973 elif is_timedelta64_dtype(value.dtype): 

2974 self._handle.create_array(self.group, key, value.view("i8")) 

2975 getattr(self.group, key)._v_attrs.value_type = "timedelta64" 

2976 else: 

2977 self._handle.create_array(self.group, key, value) 

2978 

2979 getattr(self.group, key)._v_attrs.transposed = transposed 

2980 

2981 

2982class SeriesFixed(GenericFixed): 

2983 pandas_kind = "series" 

2984 attributes = ["name"] 

2985 

2986 name: Optional[Hashable] 

2987 

2988 @property 

2989 def shape(self): 

2990 try: 

2991 return (len(self.group.values),) 

2992 except (TypeError, AttributeError): 

2993 return None 

2994 

2995 def read( 

2996 self, 

2997 where=None, 

2998 columns=None, 

2999 start: Optional[int] = None, 

3000 stop: Optional[int] = None, 

3001 ): 

3002 self.validate_read(columns, where) 

3003 index = self.read_index("index", start=start, stop=stop) 

3004 values = self.read_array("values", start=start, stop=stop) 

3005 return Series(values, index=index, name=self.name) 
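# A fixed-format Series is therefore stored as two sibling arrays under the
# group, "index" and "values", plus a "name" attribute; read() simply
# reassembles them. Hedged usage sketch via the public API (file name is an
# assumption):
#   store = HDFStore("data.h5")
#   store.put("s", Series([1, 2, 3]), format="fixed")
#   store.get("s")   # -> the Series round-tripped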

3006 

3007 def write(self, obj, **kwargs): 

3008 super().write(obj, **kwargs) 

3009 self.write_index("index", obj.index) 

3010 self.write_array("values", obj.values) 

3011 self.attrs.name = obj.name 

3012 

3013 

3014class BlockManagerFixed(GenericFixed): 

3015 attributes = ["ndim", "nblocks"] 

3016 

3017 nblocks: int 

3018 

3019 @property 

3020 def shape(self): 

3021 try: 

3022 ndim = self.ndim 

3023 

3024 # items 

3025 items = 0 

3026 for i in range(self.nblocks): 

3027 node = getattr(self.group, f"block{i}_items") 

3028 shape = getattr(node, "shape", None) 

3029 if shape is not None: 

3030 items += shape[0] 

3031 

3032 # data shape 

3033 node = self.group.block0_values 

3034 shape = getattr(node, "shape", None) 

3035 if shape is not None: 

3036 shape = list(shape[0 : (ndim - 1)]) 

3037 else: 

3038 shape = [] 

3039 

3040 shape.append(items) 

3041 

3042 return shape 

3043 except AttributeError: 

3044 return None 

3045 

3046 def read( 

3047 self, 

3048 where=None, 

3049 columns=None, 

3050 start: Optional[int] = None, 

3051 stop: Optional[int] = None, 

3052 ): 

3053 # start, stop applied to rows, so 0th axis only 

3054 self.validate_read(columns, where) 

3055 select_axis = self.obj_type()._get_block_manager_axis(0) 

3056 

3057 axes = [] 

3058 for i in range(self.ndim): 

3059 

3060 _start, _stop = (start, stop) if i == select_axis else (None, None) 

3061 ax = self.read_index(f"axis{i}", start=_start, stop=_stop) 

3062 axes.append(ax) 

3063 

3064 items = axes[0] 

3065 dfs = [] 

3066 

3067 for i in range(self.nblocks): 

3068 

3069 blk_items = self.read_index(f"block{i}_items") 

3070 values = self.read_array(f"block{i}_values", start=_start, stop=_stop) 

3071 

3072 columns = items[items.get_indexer(blk_items)] 

3073 df = DataFrame(values.T, columns=columns, index=axes[1]) 

3074 dfs.append(df) 

3075 

3076 if len(dfs) > 0: 

3077 out = concat(dfs, axis=1) 

3078 out = out.reindex(columns=items, copy=False) 

3079 return out 

3080 

3081 return DataFrame(columns=axes[0], index=axes[1]) 

3082 

3083 def write(self, obj, **kwargs): 

3084 super().write(obj, **kwargs) 

3085 data = obj._data 

3086 if not data.is_consolidated(): 

3087 data = data.consolidate() 

3088 

3089 self.attrs.ndim = data.ndim 

3090 for i, ax in enumerate(data.axes): 

3091 if i == 0: 

3092 if not ax.is_unique: 

3093 raise ValueError("Columns index has to be unique for fixed format") 

3094 self.write_index(f"axis{i}", ax) 

3095 

3096 # Supporting mixed-type DataFrame objects...nontrivial 

3097 self.attrs.nblocks = len(data.blocks) 

3098 for i, blk in enumerate(data.blocks): 

3099 # I have no idea why, but writing values before items fixed #2299 

3100 blk_items = data.items.take(blk.mgr_locs) 

3101 self.write_array(f"block{i}_values", blk.values, items=blk_items) 

3102 self.write_index(f"block{i}_items", blk_items) 

3103 

3104 

3105class FrameFixed(BlockManagerFixed): 

3106 pandas_kind = "frame" 

3107 obj_type = DataFrame 

3108 

3109 

3110class Table(Fixed): 

3111 """ represent a table: 

3112 facilitate read/write of various types of tables 

3113 

3114 Attrs in Table Node 

3115 ------------------- 

3116 These are attributes that are stored in the main table node; they are 

3117 necessary to recreate these tables when read back in. 

3118 

3119 index_axes : a list of tuples of the (original indexing axis and 

3120 index column) 

3121 non_index_axes: a list of tuples of the (original index axis and 

3122 columns on a non-indexing axis) 

3123 values_axes : a list of the columns which comprise the data of this 

3124 table 

3125 data_columns : a list of the columns that we are allowing indexing 

3126 (these become single columns in values_axes), or True to force all 

3127 columns 

3128 nan_rep : the string to use for nan representations for string 

3129 objects 

3130 levels : the names of levels 

3131 metadata : the names of the metadata columns 

3132 

3133 """ 
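# Hedged sketch of what these attrs might look like for a small frame with
# data_columns=["A"] (values are illustrative, not read from a real file):
#   index_cols     : [(0, "index")]
#   non_index_axes : [(1, ["A", "B"])]
#   values_cols    : ["values_block_0", "A"]
#   data_columns   : ["A"]
#   nan_rep        : "nan"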

3134 

3135 pandas_kind = "wide_table" 

3136 format_type: str = "table" # GH#30962 needed by dask 

3137 table_type: str 

3138 levels = 1 

3139 is_table = True 

3140 

3141 index_axes: List[IndexCol] 

3142 non_index_axes: List[Tuple[int, Any]] 

3143 values_axes: List[DataCol] 

3144 data_columns: List 

3145 metadata: List 

3146 info: Dict 

3147 

3148 def __init__( 

3149 self, 

3150 parent: HDFStore, 

3151 group: "Node", 

3152 encoding=None, 

3153 errors: str = "strict", 

3154 index_axes=None, 

3155 non_index_axes=None, 

3156 values_axes=None, 

3157 data_columns=None, 

3158 info=None, 

3159 nan_rep=None, 

3160 ): 

3161 super().__init__(parent, group, encoding=encoding, errors=errors) 

3162 self.index_axes = index_axes or [] 

3163 self.non_index_axes = non_index_axes or [] 

3164 self.values_axes = values_axes or [] 

3165 self.data_columns = data_columns or [] 

3166 self.info = info or dict() 

3167 self.nan_rep = nan_rep 

3168 

3169 @property 

3170 def table_type_short(self) -> str: 

3171 return self.table_type.split("_")[0] 

3172 

3173 def __repr__(self) -> str: 

3174 """ return a pretty representation of myself """ 

3175 self.infer_axes() 

3176 jdc = ",".join(self.data_columns) if len(self.data_columns) else "" 

3177 dc = f",dc->[{jdc}]" 

3178 

3179 ver = "" 

3180 if self.is_old_version: 

3181 jver = ".".join(str(x) for x in self.version) 

3182 ver = f"[{jver}]" 

3183 

3184 jindex_axes = ",".join(a.name for a in self.index_axes) 

3185 return ( 

3186 f"{self.pandas_type:12.12}{ver} " 

3187 f"(typ->{self.table_type_short},nrows->{self.nrows}," 

3188 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" 

3189 ) 

3190 

3191 def __getitem__(self, c: str): 

3192 """ return the axis for c """ 

3193 for a in self.axes: 

3194 if c == a.name: 

3195 return a 

3196 return None 

3197 

3198 def validate(self, other): 

3199 """ validate against an existing table """ 

3200 if other is None: 

3201 return 

3202 

3203 if other.table_type != self.table_type: 

3204 raise TypeError( 

3205 "incompatible table_type with existing " 

3206 f"[{other.table_type} - {self.table_type}]" 

3207 ) 

3208 

3209 for c in ["index_axes", "non_index_axes", "values_axes"]: 

3210 sv = getattr(self, c, None) 

3211 ov = getattr(other, c, None) 

3212 if sv != ov: 

3213 

3214 # show the error for the specific axes 

3215 for i, sax in enumerate(sv): 

3216 oax = ov[i] 

3217 if sax != oax: 

3218 raise ValueError( 

3219 f"invalid combination of [{c}] on appending data " 

3220 f"[{sax}] vs current table [{oax}]" 

3221 ) 

3222 

3223 # should never get here 

3224 raise Exception( 

3225 f"invalid combination of [{c}] on appending data [{sv}] vs " 

3226 f"current table [{ov}]" 

3227 ) 

3228 

3229 @property 

3230 def is_multi_index(self) -> bool: 

3231 """the levels attribute is 1 or a list in the case of a multi-index""" 

3232 return isinstance(self.levels, list) 

3233 

3234 def validate_multiindex(self, obj): 

3235 """validate that we can store the multi-index; reset and return the 

3236 new object 

3237 """ 

3238 levels = [ 

3239 l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) 

3240 ] 

3241 try: 

3242 return obj.reset_index(), levels 

3243 except ValueError: 

3244 raise ValueError( 

3245 "duplicate names/columns in the multi-index when storing as a table" 

3246 ) 

3247 

3248 @property 

3249 def nrows_expected(self) -> int: 

3250 """ based on our axes, compute the expected nrows """ 

3251 return np.prod([i.cvalues.shape[0] for i in self.index_axes]) 

3252 

3253 @property 

3254 def is_exists(self) -> bool: 

3255 """ has this table been created """ 

3256 return "table" in self.group 

3257 

3258 @property 

3259 def storable(self): 

3260 return getattr(self.group, "table", None) 

3261 

3262 @property 

3263 def table(self): 

3264 """ return the table group (this is my storable) """ 

3265 return self.storable 

3266 

3267 @property 

3268 def dtype(self): 

3269 return self.table.dtype 

3270 

3271 @property 

3272 def description(self): 

3273 return self.table.description 

3274 

3275 @property 

3276 def axes(self): 

3277 return itertools.chain(self.index_axes, self.values_axes) 

3278 

3279 @property 

3280 def ncols(self) -> int: 

3281 """ the number of total columns in the values axes """ 

3282 return sum(len(a.values) for a in self.values_axes) 

3283 

3284 @property 

3285 def is_transposed(self) -> bool: 

3286 return False 

3287 

3288 @property 

3289 def data_orientation(self): 

3290 """return a tuple of my permutated axes, non_indexable at the front""" 

3291 return tuple( 

3292 itertools.chain( 

3293 [int(a[0]) for a in self.non_index_axes], 

3294 [int(a.axis) for a in self.index_axes], 

3295 ) 

3296 ) 

3297 

3298 def queryables(self) -> Dict[str, Any]: 

3299 """ return a dict of the kinds allowable columns for this object """ 

3300 

3301 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here 

3302 axis_names = {0: "index", 1: "columns"} 

3303 

3304 # compute the values_axes queryables 

3305 d1 = [(a.cname, a) for a in self.index_axes] 

3306 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] 

3307 d3 = [ 

3308 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) 

3309 ] 

3310 

3311 return dict(d1 + d2 + d3) # type: ignore 

3312 # error: List comprehension has incompatible type 

3313 # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]] 
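# For a stored frame with data_columns=["A"], the mapping above would look
# roughly like (illustrative):
#   {"index": <IndexCol>, "columns": None, "A": <DataIndexableCol>}
# i.e. only the index and the explicit data columns can appear in a `where`.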

3314 

3315 def index_cols(self): 

3316 """ return a list of my index cols """ 

3317 # Note: each `i.cname` below is assured to be a str. 

3318 return [(i.axis, i.cname) for i in self.index_axes] 

3319 

3320 def values_cols(self) -> List[str]: 

3321 """ return a list of my values cols """ 

3322 return [i.cname for i in self.values_axes] 

3323 

3324 def _get_metadata_path(self, key: str) -> str: 

3325 """ return the metadata pathname for this key """ 

3326 group = self.group._v_pathname 

3327 return f"{group}/meta/{key}/meta" 
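# e.g. for a table stored at "/df" and a categorical column "A", the metadata
# (the categories) lives at "/df/meta/A/meta" (path shown as an assumption).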

3328 

3329 def write_metadata(self, key: str, values: np.ndarray): 

3330 """ 

3331 Write out a metadata array to the key as a fixed-format Series. 

3332 

3333 Parameters 

3334 ---------- 

3335 key : str 

3336 values : ndarray 

3337 """ 

3338 values = Series(values) 

3339 self.parent.put( 

3340 self._get_metadata_path(key), 

3341 values, 

3342 format="table", 

3343 encoding=self.encoding, 

3344 errors=self.errors, 

3345 nan_rep=self.nan_rep, 

3346 ) 

3347 

3348 def read_metadata(self, key: str): 

3349 """ return the meta data array for this key """ 

3350 if getattr(getattr(self.group, "meta", None), key, None) is not None: 

3351 return self.parent.select(self._get_metadata_path(key)) 

3352 return None 

3353 

3354 def set_attrs(self): 

3355 """ set our table type & indexables """ 

3356 self.attrs.table_type = str(self.table_type) 

3357 self.attrs.index_cols = self.index_cols() 

3358 self.attrs.values_cols = self.values_cols() 

3359 self.attrs.non_index_axes = self.non_index_axes 

3360 self.attrs.data_columns = self.data_columns 

3361 self.attrs.nan_rep = self.nan_rep 

3362 self.attrs.encoding = self.encoding 

3363 self.attrs.errors = self.errors 

3364 self.attrs.levels = self.levels 

3365 self.attrs.info = self.info 

3366 

3367 def get_attrs(self): 

3368 """ retrieve our attributes """ 

3369 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] 

3370 self.data_columns = getattr(self.attrs, "data_columns", None) or [] 

3371 self.info = getattr(self.attrs, "info", None) or dict() 

3372 self.nan_rep = getattr(self.attrs, "nan_rep", None) 

3373 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

3374 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

3375 self.levels = getattr(self.attrs, "levels", None) or [] 

3376 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

3377 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

3378 

3379 def validate_version(self, where=None): 

3380 """ are we trying to operate on an old version? """ 

3381 if where is not None: 

3382 if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: 

3383 ws = incompatibility_doc % ".".join([str(x) for x in self.version]) 

3384 warnings.warn(ws, IncompatibilityWarning) 

3385 

3386 def validate_min_itemsize(self, min_itemsize): 

3387 """validate the min_itemsize doesn't contain items that are not in the 

3388 axes; this needs data_columns to be defined 

3389 """ 

3390 if min_itemsize is None: 

3391 return 

3392 if not isinstance(min_itemsize, dict): 

3393 return 

3394 

3395 q = self.queryables() 

3396 for k, v in min_itemsize.items(): 

3397 

3398 # ok, apply generally 

3399 if k == "values": 

3400 continue 

3401 if k not in q: 

3402 raise ValueError( 

3403 f"min_itemsize has the key [{k}] which is not an axis or " 

3404 "data_column" 

3405 ) 

3406 

3407 @cache_readonly 

3408 def indexables(self): 

3409 """ create/cache the indexables if they don't exist """ 

3410 _indexables = [] 

3411 

3412 desc = self.description 

3413 table_attrs = self.table.attrs 

3414 

3415 # Note: each of the `name` kwargs below are str, ensured 

3416 # by the definition in index_cols. 

3417 # index columns 

3418 for i, (axis, name) in enumerate(self.attrs.index_cols): 

3419 atom = getattr(desc, name) 

3420 md = self.read_metadata(name) 

3421 meta = "category" if md is not None else None 

3422 

3423 kind_attr = f"{name}_kind" 

3424 kind = getattr(table_attrs, kind_attr, None) 

3425 

3426 index_col = IndexCol( 

3427 name=name, 

3428 axis=axis, 

3429 pos=i, 

3430 kind=kind, 

3431 typ=atom, 

3432 table=self.table, 

3433 meta=meta, 

3434 metadata=md, 

3435 ) 

3436 _indexables.append(index_col) 

3437 

3438 # values columns 

3439 dc = set(self.data_columns) 

3440 base_pos = len(_indexables) 

3441 

3442 def f(i, c): 

3443 assert isinstance(c, str) 

3444 klass = DataCol 

3445 if c in dc: 

3446 klass = DataIndexableCol 

3447 

3448 atom = getattr(desc, c) 

3449 adj_name = _maybe_adjust_name(c, self.version) 

3450 

3451 # TODO: why kind_attr here? 

3452 values = getattr(table_attrs, f"{adj_name}_kind", None) 

3453 dtype = getattr(table_attrs, f"{adj_name}_dtype", None) 

3454 kind = _dtype_to_kind(dtype) 

3455 

3456 md = self.read_metadata(c) 

3457 # TODO: figure out why these two versions of `meta` don't always match. 

3458 # meta = "category" if md is not None else None 

3459 meta = getattr(table_attrs, f"{adj_name}_meta", None) 

3460 

3461 obj = klass( 

3462 name=adj_name, 

3463 cname=c, 

3464 values=values, 

3465 kind=kind, 

3466 pos=base_pos + i, 

3467 typ=atom, 

3468 table=self.table, 

3469 meta=meta, 

3470 metadata=md, 

3471 dtype=dtype, 

3472 ) 

3473 return obj 

3474 

3475 # Note: the definition of `values_cols` ensures that each 

3476 # `c` below is a str. 

3477 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) 

3478 

3479 return _indexables 

3480 

3481 def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): 

3482 """ 

3483 Create a pytables index on the specified columns. 

3484 

3485 Parameters 

3486 ---------- 

3487 columns : None, bool, or listlike[str] 

3488 Indicate which columns to create an index on. 

3489 

3490 * False : Do not create any indexes. 

3491 * True : Create indexes on all columns. 

3492 * None : Create indexes on all columns. 

3493 * listlike : Create indexes on the given columns. 

3494 

3495 optlevel : int or None, default None 

3496 Optimization level, if None, pytables defaults to 6. 

3497 kind : str or None, default None 

3498 Kind of index, if None, pytables defaults to "medium". 

3499 

3500 Raises 

3501 ------ 

3502 TypeError if trying to create an index on a complex-type column. 

3503 

3504 Notes 

3505 ----- 

3506 Cannot index Time64Col or ComplexCol. 

3507 Pytables must be >= 3.0. 

3508 """ 
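# Hedged usage sketch through the public HDFStore API (key and column names
# are assumptions):
#   store.append("df", df, data_columns=["A"], index=False)
#   store.create_table_index("df", columns=["A"], optlevel=9, kind="full")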

3509 

3510 if not self.infer_axes(): 

3511 return 

3512 if columns is False: 

3513 return 

3514 

3515 # index all indexables and data_columns 

3516 if columns is None or columns is True: 

3517 columns = [a.cname for a in self.axes if a.is_data_indexable] 

3518 if not isinstance(columns, (tuple, list)): 

3519 columns = [columns] 

3520 

3521 kw = dict() 

3522 if optlevel is not None: 

3523 kw["optlevel"] = optlevel 

3524 if kind is not None: 

3525 kw["kind"] = kind 

3526 

3527 table = self.table 

3528 for c in columns: 

3529 v = getattr(table.cols, c, None) 

3530 if v is not None: 

3531 

3532 # remove the index if the kind/optlevel have changed 

3533 if v.is_indexed: 

3534 index = v.index 

3535 cur_optlevel = index.optlevel 

3536 cur_kind = index.kind 

3537 

3538 if kind is not None and cur_kind != kind: 

3539 v.remove_index() 

3540 else: 

3541 kw["kind"] = cur_kind 

3542 

3543 if optlevel is not None and cur_optlevel != optlevel: 

3544 v.remove_index() 

3545 else: 

3546 kw["optlevel"] = cur_optlevel 

3547 

3548 # create the index 

3549 if not v.is_indexed: 

3550 if v.type.startswith("complex"): 

3551 raise TypeError( 

3552 "Columns containing complex values can be stored but " 

3553 "cannot be indexed when using table format. Either use " 

3554 "fixed format, set index=False, or do not include " 

3555 "the columns containing complex values to " 

3556 "data_columns when initializing the table." 

3557 ) 

3558 v.create_index(**kw) 

3559 

3560 def _read_axes( 

3561 self, where, start: Optional[int] = None, stop: Optional[int] = None 

3562 ) -> List[Tuple[ArrayLike, ArrayLike]]: 

3563 """ 

3564 Create the axes sniffed from the table. 

3565 

3566 Parameters 

3567 ---------- 

3568 where : ??? 

3569 start : int or None, default None 

3570 stop : int or None, default None 

3571 

3572 Returns 

3573 ------- 

3574 List[Tuple[index_values, column_values]] 

3575 """ 

3576 

3577 # create the selection 

3578 selection = Selection(self, where=where, start=start, stop=stop) 

3579 values = selection.select() 

3580 

3581 results = [] 

3582 # convert the data 

3583 for a in self.axes: 

3584 a.set_info(self.info) 

3585 res = a.convert( 

3586 values, 

3587 nan_rep=self.nan_rep, 

3588 encoding=self.encoding, 

3589 errors=self.errors, 

3590 ) 

3591 results.append(res) 

3592 

3593 return results 

3594 

3595 @classmethod 

3596 def get_object(cls, obj, transposed: bool): 

3597 """ return the data for this obj """ 

3598 return obj 

3599 

3600 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): 

3601 """take the input data_columns and min_itemize and create a data 

3602 columns spec 

3603 """ 

3604 

3605 if not len(non_index_axes): 

3606 return [] 

3607 

3608 axis, axis_labels = non_index_axes[0] 

3609 info = self.info.get(axis, dict()) 

3610 if info.get("type") == "MultiIndex" and data_columns: 

3611 raise ValueError( 

3612 f"cannot use a multi-index on axis [{axis}] with " 

3613 f"data_columns {data_columns}" 

3614 ) 

3615 

3616 # evaluate the passed data_columns, True == use all columns 

3617 # take only valid axis labels 

3618 if data_columns is True: 

3619 data_columns = list(axis_labels) 

3620 elif data_columns is None: 

3621 data_columns = [] 

3622 

3623 # if min_itemsize is a dict, add the keys (exclude 'values') 

3624 if isinstance(min_itemsize, dict): 

3625 

3626 existing_data_columns = set(data_columns) 

3627 data_columns = list(data_columns) # ensure we do not modify 

3628 data_columns.extend( 

3629 [ 

3630 k 

3631 for k in min_itemsize.keys() 

3632 if k != "values" and k not in existing_data_columns 

3633 ] 

3634 ) 

3635 

3636 # return valid columns in the order of our axis 

3637 return [c for c in data_columns if c in axis_labels] 
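# Worked example of the rules above (labels are assumptions): with axis
# labels ["A", "B", "C"], data_columns=["B"] and
# min_itemsize={"C": 20, "values": 50}, the result is ["B", "C"]: the
# min_itemsize keys (other than "values") are promoted to data columns, and
# anything not present in the axis labels would be dropped.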

3638 

3639 def _create_axes( 

3640 self, 

3641 axes, 

3642 obj: DataFrame, 

3643 validate: bool = True, 

3644 nan_rep=None, 

3645 data_columns=None, 

3646 min_itemsize=None, 

3647 ): 

3648 """ 

3649 Create and return the axes. 

3650 

3651 Parameters 

3652 ---------- 

3653 axes: list or None 

3654 The names or numbers of the axes to create. 

3655 obj : DataFrame 

3656 The object to create axes on. 

3657 validate: bool, default True 

3658 Whether to validate the obj against an existing object already written. 

3659 nan_rep : 

3660 A value to use for string column nan_rep. 

3661 data_columns : List[str], True, or None, default None 

3662 Specify the columns that we want to create to allow indexing on. 

3663 

3664 * True : Use all available columns. 

3665 * None : Use no columns. 

3666 * List[str] : Use the specified columns. 

3667 

3668 min_itemsize: Dict[str, int] or None, default None 

3669 The min itemsize for a column in bytes. 

3670 """ 

3671 

3672 if not isinstance(obj, DataFrame): 

3673 group = self.group._v_name 

3674 raise TypeError( 

3675 f"cannot properly create the storer for: [group->{group}," 

3676 f"value->{type(obj)}]" 

3677 ) 

3678 

3679 # set the default axes if needed 

3680 if axes is None: 

3681 axes = [0] 

3682 

3683 # map axes to numbers 

3684 axes = [obj._get_axis_number(a) for a in axes] 

3685 

3686 # do we have an existing table (if so, use its axes & data_columns) 

3687 if self.infer_axes(): 

3688 table_exists = True 

3689 axes = [a.axis for a in self.index_axes] 

3690 data_columns = list(self.data_columns) 

3691 nan_rep = self.nan_rep 

3692 # TODO: do we always have validate=True here? 

3693 else: 

3694 table_exists = False 

3695 

3696 new_info = self.info 

3697 

3698 assert self.ndim == 2 # with next check, we must have len(axes) == 1 

3699 # currently support on ndim-1 axes 

3700 if len(axes) != self.ndim - 1: 

3701 raise ValueError( 

3702 "currently only support ndim-1 indexers in an AppendableTable" 

3703 ) 

3704 

3705 # create according to the new data 

3706 new_non_index_axes: List = [] 

3707 

3708 # nan_representation 

3709 if nan_rep is None: 

3710 nan_rep = "nan" 

3711 

3712 # We construct the non-index-axis first, since that alters new_info 

3713 idx = [x for x in [0, 1] if x not in axes][0] 

3714 

3715 a = obj.axes[idx] 

3716 # we might be able to change the axes on the appending data if necessary 

3717 append_axis = list(a) 

3718 if table_exists: 

3719 indexer = len(new_non_index_axes) # i.e. 0 

3720 exist_axis = self.non_index_axes[indexer][1] 

3721 if not array_equivalent(np.array(append_axis), np.array(exist_axis)): 

3722 

3723 # ahah! -> reindex 

3724 if array_equivalent( 

3725 np.array(sorted(append_axis)), np.array(sorted(exist_axis)) 

3726 ): 

3727 append_axis = exist_axis 

3728 

3729 # the non_index_axes info 

3730 info = new_info.setdefault(idx, {}) 

3731 info["names"] = list(a.names) 

3732 info["type"] = type(a).__name__ 

3733 

3734 new_non_index_axes.append((idx, append_axis)) 

3735 

3736 # Now we can construct our new index axis 

3737 idx = axes[0] 

3738 a = obj.axes[idx] 

3739 axis_name = obj._AXIS_NAMES[idx] 

3740 new_index = _convert_index(axis_name, a, self.encoding, self.errors) 

3741 new_index.axis = idx 

3742 

3743 # Because we are always 2D, there is only one new_index, so 

3744 # we know it will have pos=0 

3745 new_index.set_pos(0) 

3746 new_index.update_info(new_info) 

3747 new_index.maybe_set_size(min_itemsize) # check for column conflicts 

3748 

3749 new_index_axes = [new_index] 

3750 j = len(new_index_axes) # i.e. 1 

3751 assert j == 1 

3752 

3753 # reindex by our non_index_axes & compute data_columns 

3754 assert len(new_non_index_axes) == 1 

3755 for a in new_non_index_axes: 

3756 obj = _reindex_axis(obj, a[0], a[1]) 

3757 

3758 def get_blk_items(mgr, blocks): 

3759 return [mgr.items.take(blk.mgr_locs) for blk in blocks] 

3760 

3761 transposed = new_index.axis == 1 

3762 

3763 # figure out data_columns and get out blocks 

3764 data_columns = self.validate_data_columns( 

3765 data_columns, min_itemsize, new_non_index_axes 

3766 ) 

3767 

3768 block_obj = self.get_object(obj, transposed)._consolidate() 

3769 

3770 blocks, blk_items = self._get_blocks_and_items( 

3771 block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns 

3772 ) 

3773 

3774 # add my values 

3775 vaxes = [] 

3776 for i, (b, b_items) in enumerate(zip(blocks, blk_items)): 

3777 

3778 # the shape of the data columns is given by the indexable axes

3779 klass = DataCol 

3780 name = None 

3781 

3782 # we have a data_column 

3783 if data_columns and len(b_items) == 1 and b_items[0] in data_columns: 

3784 klass = DataIndexableCol 

3785 name = b_items[0] 

3786 if not (name is None or isinstance(name, str)): 

3787 # TODO: should the message here be more specifically non-str? 

3788 raise ValueError("cannot have non-object label DataIndexableCol") 

3789 

3790 # make sure that we match up the existing columns 

3791 # if we have an existing table 

3792 existing_col: Optional[DataCol] 

3793 

3794 if table_exists and validate: 

3795 try: 

3796 existing_col = self.values_axes[i] 

3797 except (IndexError, KeyError): 

3798 raise ValueError( 

3799 f"Incompatible appended table [{blocks}] "

3800 f"with existing table [{self.values_axes}]" 

3801 ) 

3802 else: 

3803 existing_col = None 

3804 

3805 new_name = name or f"values_block_{i}" 

3806 data_converted = _maybe_convert_for_string_atom( 

3807 new_name, 

3808 b, 

3809 existing_col=existing_col, 

3810 min_itemsize=min_itemsize, 

3811 nan_rep=nan_rep, 

3812 encoding=self.encoding, 

3813 errors=self.errors, 

3814 ) 

3815 adj_name = _maybe_adjust_name(new_name, self.version) 

3816 

3817 typ = klass._get_atom(data_converted) 

3818 kind = _dtype_to_kind(data_converted.dtype.name) 

3819 tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None 

3820 

3821 meta = metadata = ordered = None 

3822 if is_categorical_dtype(data_converted): 

3823 ordered = data_converted.ordered 

3824 meta = "category" 

3825 metadata = np.array(data_converted.categories, copy=False).ravel() 

3826 

3827 data, dtype_name = _get_data_and_dtype_name(data_converted) 

3828 

3829 col = klass( 

3830 name=adj_name, 

3831 cname=new_name, 

3832 values=list(b_items), 

3833 typ=typ, 

3834 pos=j, 

3835 kind=kind, 

3836 tz=tz, 

3837 ordered=ordered, 

3838 meta=meta, 

3839 metadata=metadata, 

3840 dtype=dtype_name, 

3841 data=data, 

3842 ) 

3843 col.update_info(new_info) 

3844 

3845 vaxes.append(col) 

3846 

3847 j += 1 

3848 

3849 dcs = [col.name for col in vaxes if col.is_data_indexable] 

3850 

3851 new_table = type(self)( 

3852 parent=self.parent, 

3853 group=self.group, 

3854 encoding=self.encoding, 

3855 errors=self.errors, 

3856 index_axes=new_index_axes, 

3857 non_index_axes=new_non_index_axes, 

3858 values_axes=vaxes, 

3859 data_columns=dcs, 

3860 info=new_info, 

3861 nan_rep=nan_rep, 

3862 ) 

3863 if hasattr(self, "levels"): 

3864 # TODO: get this into constructor, only for appropriate subclass 

3865 new_table.levels = self.levels 

3866 

3867 new_table.validate_min_itemsize(min_itemsize) 

3868 

3869 if validate and table_exists: 

3870 new_table.validate(self) 

3871 

3872 return new_table 

3873 
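# Illustrative sketch (not part of the module): the data_columns / min_itemsize /
# nan_rep arguments documented above are normally supplied through the public
# HDFStore.append API, which forwards them down to _create_axes. Assuming a
# DataFrame with a string column "B":
#
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"A": range(3), "B": ["x", "yy", "zzz"]})
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     store.append("df", df, data_columns=["B"], min_itemsize={"B": 30})
#     ...     store.select("df", where="B == 'yy'")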

3874 @staticmethod 

3875 def _get_blocks_and_items( 

3876 block_obj, table_exists, new_non_index_axes, values_axes, data_columns 

3877 ): 

3878 # Helper to clarify non-state-altering parts of _create_axes 

3879 

3880 def get_blk_items(mgr, blocks): 

3881 return [mgr.items.take(blk.mgr_locs) for blk in blocks] 

3882 

3883 blocks = block_obj._data.blocks 

3884 blk_items = get_blk_items(block_obj._data, blocks) 

3885 

3886 if len(data_columns): 

3887 axis, axis_labels = new_non_index_axes[0] 

3888 new_labels = Index(axis_labels).difference(Index(data_columns)) 

3889 mgr = block_obj.reindex(new_labels, axis=axis)._data 

3890 

3891 blocks = list(mgr.blocks) 

3892 blk_items = get_blk_items(mgr, blocks) 

3893 for c in data_columns: 

3894 mgr = block_obj.reindex([c], axis=axis)._data 

3895 blocks.extend(mgr.blocks) 

3896 blk_items.extend(get_blk_items(mgr, mgr.blocks)) 

3897 

3898 # reorder the blocks in the same order as the existing table if we can 

3899 if table_exists: 

3900 by_items = { 

3901 tuple(b_items.tolist()): (b, b_items) 

3902 for b, b_items in zip(blocks, blk_items) 

3903 } 

3904 new_blocks = [] 

3905 new_blk_items = [] 

3906 for ea in values_axes: 

3907 items = tuple(ea.values) 

3908 try: 

3909 b, b_items = by_items.pop(items) 

3910 new_blocks.append(b) 

3911 new_blk_items.append(b_items) 

3912 except (IndexError, KeyError): 

3913 jitems = ",".join(pprint_thing(item) for item in items) 

3914 raise ValueError( 

3915 f"cannot match existing table structure for [{jitems}] " 

3916 "on appending data" 

3917 ) 

3918 blocks = new_blocks 

3919 blk_items = new_blk_items 

3920 

3921 return blocks, blk_items 

3922 

3923 def process_axes(self, obj, selection: "Selection", columns=None): 

3924 """ process axes filters """ 

3925 

3926 # make a copy to avoid side effects 

3927 if columns is not None: 

3928 columns = list(columns) 

3929 

3930 # make sure to include levels if we have them 

3931 if columns is not None and self.is_multi_index: 

3932 assert isinstance(self.levels, list) # assured by is_multi_index 

3933 for n in self.levels: 

3934 if n not in columns: 

3935 columns.insert(0, n) 

3936 

3937 # reorder by any non_index_axes & limit to the select columns 

3938 for axis, labels in self.non_index_axes: 

3939 obj = _reindex_axis(obj, axis, labels, columns) 

3940 

3941 # apply the selection filters (but keep in the same order) 

3942 if selection.filter is not None: 

3943 for field, op, filt in selection.filter.format(): 

3944 

3945 def process_filter(field, filt): 

3946 

3947 for axis_name in obj._AXIS_NAMES.values(): 

3948 axis_number = obj._get_axis_number(axis_name) 

3949 axis_values = obj._get_axis(axis_name) 

3950 assert axis_number is not None 

3951 

3952 # see if the field is the name of an axis 

3953 if field == axis_name: 

3954 

3955 # if we have a multi-index, then need to include 

3956 # the levels 

3957 if self.is_multi_index: 

3958 filt = filt.union(Index(self.levels)) 

3959 

3960 takers = op(axis_values, filt) 

3961 return obj.loc(axis=axis_number)[takers] 

3962 

3963 # this might be the name of a field IN an axis

3964 elif field in axis_values: 

3965 

3966 # we need to filter on this dimension 

3967 values = ensure_index(getattr(obj, field).values) 

3968 filt = ensure_index(filt) 

3969 

3970 # hack until we support reversed dim flags 

3971 if isinstance(obj, DataFrame): 

3972 axis_number = 1 - axis_number 

3973 takers = op(values, filt) 

3974 return obj.loc(axis=axis_number)[takers] 

3975 

3976 raise ValueError(f"cannot find the field [{field}] for filtering!") 

3977 

3978 obj = process_filter(field, filt) 

3979 

3980 return obj 

3981 

3982 def create_description( 

3983 self, 

3984 complib, 

3985 complevel: Optional[int], 

3986 fletcher32: bool, 

3987 expectedrows: Optional[int], 

3988 ) -> Dict[str, Any]: 

3989 """ create the description of the table from the axes & values """ 

3990 

3991 # provide a default for expectedrows if it isn't passed

3992 if expectedrows is None: 

3993 expectedrows = max(self.nrows_expected, 10000) 

3994 

3995 d = dict(name="table", expectedrows=expectedrows) 

3996 

3997 # description from the axes & values 

3998 d["description"] = {a.cname: a.typ for a in self.axes} 

3999 

4000 if complib: 

4001 if complevel is None: 

4002 complevel = self._complevel or 9 

4003 filters = _tables().Filters( 

4004 complevel=complevel, 

4005 complib=complib, 

4006 fletcher32=fletcher32 or self._fletcher32, 

4007 ) 

4008 d["filters"] = filters 

4009 elif self._filters is not None: 

4010 d["filters"] = self._filters 

4011 

4012 return d 

4013 

4014 def read_coordinates( 

4015 self, where=None, start: Optional[int] = None, stop: Optional[int] = None, 

4016 ): 

4017 """select coordinates (row numbers) from a table; return the 

4018 coordinates object 

4019 """ 

4020 

4021 # validate the version 

4022 self.validate_version(where) 

4023 

4024 # infer the data kind 

4025 if not self.infer_axes(): 

4026 return False 

4027 

4028 # create the selection 

4029 selection = Selection(self, where=where, start=start, stop=stop) 

4030 coords = selection.select_coords() 

4031 if selection.filter is not None: 

4032 for field, op, filt in selection.filter.format(): 

4033 data = self.read_column( 

4034 field, start=coords.min(), stop=coords.max() + 1 

4035 ) 

4036 coords = coords[op(data.iloc[coords - coords.min()], filt).values] 

4037 

4038 return Index(coords) 

4039 
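# Illustrative sketch (not part of the module, pandas imported as pd): read_coordinates
# backs the public HDFStore.select_as_coordinates API; the returned row numbers can be
# fed back into select as a coordinate-style "where":
#
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     coords = store.select_as_coordinates("df", "index > 0")
#     ...     subset = store.select("df", where=coords)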

4040 def read_column( 

4041 self, 

4042 column: str, 

4043 where=None, 

4044 start: Optional[int] = None, 

4045 stop: Optional[int] = None, 

4046 ): 

4047 """return a single column from the table; only the index or a declared

4048 data column can be extracted this way

4049 """

4050 

4051 # validate the version 

4052 self.validate_version() 

4053 

4054 # infer the data kind 

4055 if not self.infer_axes(): 

4056 return False 

4057 

4058 if where is not None: 

4059 raise TypeError("read_column does not currently accept a where clause") 

4060 

4061 # find the axes 

4062 for a in self.axes: 

4063 if column == a.name: 

4064 

4065 if not a.is_data_indexable: 

4066 raise ValueError( 

4067 f"column [{column}] can not be extracted individually; " 

4068 "it is not data indexable" 

4069 ) 

4070 

4071 # column must be an indexable or a data column 

4072 c = getattr(self.table.cols, column) 

4073 a.set_info(self.info) 

4074 col_values = a.convert( 

4075 c[start:stop], 

4076 nan_rep=self.nan_rep, 

4077 encoding=self.encoding, 

4078 errors=self.errors, 

4079 ) 

4080 return Series(_set_tz(col_values[1], a.tz), name=column) 

4081 

4082 raise KeyError(f"column [{column}] not found in the table") 

4083 
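# Illustrative sketch (not part of the module, pandas imported as pd): read_column
# backs the public HDFStore.select_column API. Assuming the table was appended with
# data_columns=["B"]:
#
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     b = store.select_column("df", "B")        # "B" is a data column
#     ...     idx = store.select_column("df", "index")  # the index is always indexable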

4084 

4085class WORMTable(Table): 

4086 """ a write-once read-many table: this format DOES NOT ALLOW appending to a

4087 table. Writing is a one-time operation; the data are stored in a format

4088 that allows for searching the data on disk

4089 """

4090 

4091 table_type = "worm" 

4092 

4093 def read( 

4094 self, 

4095 where=None, 

4096 columns=None, 

4097 start: Optional[int] = None, 

4098 stop: Optional[int] = None, 

4099 ): 

4100 """ read the indices and the indexing array, calculate offset rows and 

4101 return """ 

4102 raise NotImplementedError("WORMTable needs to implement read") 

4103 

4104 def write(self, **kwargs): 

4105 """ write in a format that we can search later on (but cannot append

4106 to): write out the indices and the values using _write_array

4107 (e.g. a CArray); create an indexing table so that we can search

4108 """

4109 raise NotImplementedError("WORMTable needs to implement write") 

4110 

4111 

4112class AppendableTable(Table): 

4113 """ support the new appendable table formats """ 

4114 

4115 table_type = "appendable" 

4116 

4117 def write( 

4118 self, 

4119 obj, 

4120 axes=None, 

4121 append=False, 

4122 complib=None, 

4123 complevel=None, 

4124 fletcher32=None, 

4125 min_itemsize=None, 

4126 chunksize=None, 

4127 expectedrows=None, 

4128 dropna=False, 

4129 nan_rep=None, 

4130 data_columns=None, 

4131 ): 

4132 

4133 if not append and self.is_exists: 

4134 self._handle.remove_node(self.group, "table") 

4135 

4136 # create the axes 

4137 table = self._create_axes( 

4138 axes=axes, 

4139 obj=obj, 

4140 validate=append, 

4141 min_itemsize=min_itemsize, 

4142 nan_rep=nan_rep, 

4143 data_columns=data_columns, 

4144 ) 

4145 

4146 for a in table.axes: 

4147 a.validate_names() 

4148 

4149 if not table.is_exists: 

4150 

4151 # create the table 

4152 options = table.create_description( 

4153 complib=complib, 

4154 complevel=complevel, 

4155 fletcher32=fletcher32, 

4156 expectedrows=expectedrows, 

4157 ) 

4158 

4159 # set the table attributes 

4160 table.set_attrs() 

4161 

4162 # create the table 

4163 table._handle.create_table(table.group, **options) 

4164 

4165 # update my info 

4166 table.attrs.info = table.info 

4167 

4168 # validate the axes and set the kinds 

4169 for a in table.axes: 

4170 a.validate_and_set(table, append) 

4171 

4172 # add the rows 

4173 table.write_data(chunksize, dropna=dropna) 

4174 

4175 def write_data(self, chunksize: Optional[int], dropna: bool = False): 

4176 """ we form the data into a 2-d table including indexes, values and mask,

4177 then write it chunk-by-chunk """

4178 

4179 names = self.dtype.names 

4180 nrows = self.nrows_expected 

4181 

4182 # if dropna==True, then drop rows where ALL values are nan

4183 masks = [] 

4184 if dropna: 

4185 

4186 for a in self.values_axes: 

4187 

4188 # figure the mask: only do if we can successfully process this 

4189 # column, otherwise ignore the mask 

4190 mask = isna(a.data).all(axis=0) 

4191 if isinstance(mask, np.ndarray): 

4192 masks.append(mask.astype("u1", copy=False)) 

4193 

4194 # consolidate masks 

4195 if len(masks): 

4196 mask = masks[0] 

4197 for m in masks[1:]: 

4198 mask = mask & m 

4199 mask = mask.ravel() 

4200 else: 

4201 mask = None 

4202 

4203 # broadcast the indexes if needed 

4204 indexes = [a.cvalues for a in self.index_axes] 

4205 nindexes = len(indexes) 

4206 assert nindexes == 1, nindexes # ensures we don't need to broadcast

4207 

4208 # transpose the values so first dimension is last 

4209 # reshape the values if needed 

4210 values = [a.take_data() for a in self.values_axes] 

4211 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values] 

4212 bvalues = [] 

4213 for i, v in enumerate(values): 

4214 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape 

4215 bvalues.append(values[i].reshape(new_shape)) 

4216 

4217 # write the chunks 

4218 if chunksize is None: 

4219 chunksize = 100000 

4220 

4221 rows = np.empty(min(chunksize, nrows), dtype=self.dtype) 

4222 chunks = int(nrows / chunksize) + 1 

4223 for i in range(chunks): 

4224 start_i = i * chunksize 

4225 end_i = min((i + 1) * chunksize, nrows) 

4226 if start_i >= end_i: 

4227 break 

4228 

4229 self.write_data_chunk( 

4230 rows, 

4231 indexes=[a[start_i:end_i] for a in indexes], 

4232 mask=mask[start_i:end_i] if mask is not None else None, 

4233 values=[v[start_i:end_i] for v in bvalues], 

4234 ) 

4235 

4236 def write_data_chunk( 

4237 self, 

4238 rows: np.ndarray, 

4239 indexes: List[np.ndarray], 

4240 mask: Optional[np.ndarray], 

4241 values: List[np.ndarray], 

4242 ): 

4243 """ 

4244 Parameters 

4245 ---------- 

4246 rows : a preallocated structured array that receives the chunk

4247 indexes : a list of arrays with the index values for the chunk

4248 mask : an array marking all-NaN rows to drop, or None

4249 values : a list of arrays with the column values for the chunk

4250 """ 

4251 

4252 # 0 len 

4253 for v in values: 

4254 if not np.prod(v.shape): 

4255 return 

4256 

4257 nrows = indexes[0].shape[0] 

4258 if nrows != len(rows): 

4259 rows = np.empty(nrows, dtype=self.dtype) 

4260 names = self.dtype.names 

4261 nindexes = len(indexes) 

4262 

4263 # indexes 

4264 for i, idx in enumerate(indexes): 

4265 rows[names[i]] = idx 

4266 

4267 # values 

4268 for i, v in enumerate(values): 

4269 rows[names[i + nindexes]] = v 

4270 

4271 # mask 

4272 if mask is not None: 

4273 m = ~mask.ravel().astype(bool, copy=False) 

4274 if not m.all(): 

4275 rows = rows[m] 

4276 

4277 if len(rows): 

4278 self.table.append(rows) 

4279 self.table.flush() 

4280 

4281 def delete( 

4282 self, where=None, start: Optional[int] = None, stop: Optional[int] = None, 

4283 ): 

4284 

4285 # delete all rows (and return the nrows) 

4286 if where is None or not len(where): 

4287 if start is None and stop is None: 

4288 nrows = self.nrows 

4289 self._handle.remove_node(self.group, recursive=True) 

4290 else: 

4291 # pytables<3.0 would remove a single row with stop=None 

4292 if stop is None: 

4293 stop = self.nrows 

4294 nrows = self.table.remove_rows(start=start, stop=stop) 

4295 self.table.flush() 

4296 return nrows 

4297 

4298 # infer the data kind 

4299 if not self.infer_axes(): 

4300 return None 

4301 

4302 # create the selection 

4303 table = self.table 

4304 selection = Selection(self, where, start=start, stop=stop) 

4305 values = selection.select_coords() 

4306 

4307 # delete the rows in reverse order 

4308 sorted_series = Series(values).sort_values() 

4309 ln = len(sorted_series) 

4310 

4311 if ln: 

4312 

4313 # construct groups of consecutive rows 

4314 diff = sorted_series.diff() 

4315 groups = list(diff[diff > 1].index) 

4316 

4317 # 1 group 

4318 if not len(groups): 

4319 groups = [0] 

4320 

4321 # final element 

4322 if groups[-1] != ln: 

4323 groups.append(ln) 

4324 

4325 # initial element 

4326 if groups[0] != 0: 

4327 groups.insert(0, 0) 

4328 

4329 # we must remove in reverse order! 

4330 pg = groups.pop() 

4331 for g in reversed(groups): 

4332 rows = sorted_series.take(range(g, pg)) 

4333 table.remove_rows( 

4334 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 

4335 ) 

4336 pg = g 

4337 

4338 self.table.flush() 

4339 

4340 # return the number of rows removed 

4341 return ln 

4342 
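# Illustrative sketch (not part of the module, pandas imported as pd):
# AppendableTable.delete backs the public HDFStore.remove API; with a "where" it
# removes matching rows in consecutive groups (in reverse order, as above), and
# without one it drops the whole node:
#
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     nrows_removed = store.remove("df", where="index > 0")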

4343 

4344class AppendableFrameTable(AppendableTable): 

4345 """ support the new appendable table formats """ 

4346 

4347 pandas_kind = "frame_table" 

4348 table_type = "appendable_frame" 

4349 ndim = 2 

4350 obj_type: Type[Union[DataFrame, Series]] = DataFrame 

4351 

4352 @property 

4353 def is_transposed(self) -> bool: 

4354 return self.index_axes[0].axis == 1 

4355 

4356 @classmethod 

4357 def get_object(cls, obj, transposed: bool): 

4358 """ these are written transposed """ 

4359 if transposed: 

4360 obj = obj.T 

4361 return obj 

4362 

4363 def read( 

4364 self, 

4365 where=None, 

4366 columns=None, 

4367 start: Optional[int] = None, 

4368 stop: Optional[int] = None, 

4369 ): 

4370 

4371 # validate the version 

4372 self.validate_version(where) 

4373 

4374 # infer the data kind 

4375 if not self.infer_axes(): 

4376 return None 

4377 

4378 result = self._read_axes(where=where, start=start, stop=stop) 

4379 

4380 info = ( 

4381 self.info.get(self.non_index_axes[0][0], dict()) 

4382 if len(self.non_index_axes) 

4383 else dict() 

4384 ) 

4385 

4386 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] 

4387 assert len(inds) == 1 

4388 ind = inds[0] 

4389 

4390 index = result[ind][0] 

4391 

4392 frames = [] 

4393 for i, a in enumerate(self.axes): 

4394 if a not in self.values_axes: 

4395 continue 

4396 index_vals, cvalues = result[i] 

4397 

4398 # we could have a multi-index constructor here 

4399 # ensure_index doesn't recognize our list-of-tuples here

4400 if info.get("type") == "MultiIndex": 

4401 cols = MultiIndex.from_tuples(index_vals) 

4402 else: 

4403 cols = Index(index_vals) 

4404 

4405 names = info.get("names") 

4406 if names is not None: 

4407 cols.set_names(names, inplace=True) 

4408 

4409 if self.is_transposed: 

4410 values = cvalues 

4411 index_ = cols 

4412 cols_ = Index(index, name=getattr(index, "name", None)) 

4413 else: 

4414 values = cvalues.T 

4415 index_ = Index(index, name=getattr(index, "name", None)) 

4416 cols_ = cols 

4417 

4418 # if we have a DataIndexableCol, its shape will only be 1 dim 

4419 if values.ndim == 1 and isinstance(values, np.ndarray): 

4420 values = values.reshape((1, values.shape[0])) 

4421 

4422 if isinstance(values, np.ndarray): 

4423 df = DataFrame(values.T, columns=cols_, index=index_) 

4424 elif isinstance(values, Index): 

4425 df = DataFrame(values, columns=cols_, index=index_) 

4426 else: 

4427 # Categorical 

4428 df = DataFrame([values], columns=cols_, index=index_) 

4429 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) 

4430 frames.append(df) 

4431 

4432 if len(frames) == 1: 

4433 df = frames[0] 

4434 else: 

4435 df = concat(frames, axis=1) 

4436 

4437 selection = Selection(self, where=where, start=start, stop=stop) 

4438 # apply the selection filters & axis orderings 

4439 df = self.process_axes(df, selection=selection, columns=columns) 

4440 

4441 return df 

4442 
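# Illustrative sketch (not part of the module, pandas imported as pd):
# AppendableFrameTable is the storer behind format="table" DataFrames, which
# supports appends and where-filtered, column-restricted reads:
#
#     >>> df = pd.DataFrame({"A": [1.0, 2.0, 3.0]})
#     >>> df.to_hdf("store.h5", "frames", format="table", append=True)
#     >>> pd.read_hdf("store.h5", "frames", where="index >= 1", columns=["A"])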

4443 

4444class AppendableSeriesTable(AppendableFrameTable): 

4445 """ support the new appendable table formats """ 

4446 

4447 pandas_kind = "series_table" 

4448 table_type = "appendable_series" 

4449 ndim = 2 

4450 obj_type = Series 

4451 

4452 @property 

4453 def is_transposed(self) -> bool: 

4454 return False 

4455 

4456 @classmethod 

4457 def get_object(cls, obj, transposed: bool): 

4458 return obj 

4459 

4460 def write(self, obj, data_columns=None, **kwargs): 

4461 """ we are going to write this as a frame table """ 

4462 if not isinstance(obj, DataFrame): 

4463 name = obj.name or "values" 

4464 obj = obj.to_frame(name) 

4465 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) 

4466 

4467 def read( 

4468 self, 

4469 where=None, 

4470 columns=None, 

4471 start: Optional[int] = None, 

4472 stop: Optional[int] = None, 

4473 ) -> Series: 

4474 

4475 is_multi_index = self.is_multi_index 

4476 if columns is not None and is_multi_index: 

4477 assert isinstance(self.levels, list) # needed for mypy 

4478 for n in self.levels: 

4479 if n not in columns: 

4480 columns.insert(0, n) 

4481 s = super().read(where=where, columns=columns, start=start, stop=stop) 

4482 if is_multi_index: 

4483 s.set_index(self.levels, inplace=True) 

4484 

4485 s = s.iloc[:, 0] 

4486 

4487 # remove the default name 

4488 if s.name == "values": 

4489 s.name = None 

4490 return s 

4491 

4492 

4493class AppendableMultiSeriesTable(AppendableSeriesTable): 

4494 """ support the new appendable table formats """ 

4495 

4496 pandas_kind = "series_table" 

4497 table_type = "appendable_multiseries" 

4498 

4499 def write(self, obj, **kwargs): 

4500 """ we are going to write this as a frame table """ 

4501 name = obj.name or "values" 

4502 obj, self.levels = self.validate_multiindex(obj) 

4503 cols = list(self.levels) 

4504 cols.append(name) 

4505 obj.columns = cols 

4506 return super().write(obj=obj, **kwargs) 

4507 

4508 

4509class GenericTable(AppendableFrameTable): 

4510 """ a table that reads/writes the generic pytables table format """

4511 

4512 pandas_kind = "frame_table" 

4513 table_type = "generic_table" 

4514 ndim = 2 

4515 obj_type = DataFrame 

4516 

4517 @property 

4518 def pandas_type(self) -> str: 

4519 return self.pandas_kind 

4520 

4521 @property 

4522 def storable(self): 

4523 return getattr(self.group, "table", None) or self.group 

4524 

4525 def get_attrs(self): 

4526 """ retrieve our attributes """ 

4527 self.non_index_axes = [] 

4528 self.nan_rep = None 

4529 self.levels = [] 

4530 

4531 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

4532 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

4533 self.data_columns = [a.name for a in self.values_axes] 

4534 

4535 @cache_readonly 

4536 def indexables(self): 

4537 """ create the indexables from the table description """ 

4538 d = self.description 

4539 

4540 # TODO: can we get a typ for this? AFAICT it is the only place 

4541 # where we aren't passing one 

4542 # the index columns is just a simple index 

4543 md = self.read_metadata("index") 

4544 meta = "category" if md is not None else None 

4545 index_col = GenericIndexCol( 

4546 name="index", axis=0, table=self.table, meta=meta, metadata=md 

4547 ) 

4548 

4549 _indexables = [index_col] 

4550 

4551 for i, n in enumerate(d._v_names): 

4552 assert isinstance(n, str) 

4553 

4554 atom = getattr(d, n) 

4555 md = self.read_metadata(n) 

4556 meta = "category" if md is not None else None 

4557 dc = GenericDataIndexableCol( 

4558 name=n, 

4559 pos=i, 

4560 values=[n], 

4561 typ=atom, 

4562 table=self.table, 

4563 meta=meta, 

4564 metadata=md, 

4565 ) 

4566 _indexables.append(dc) 

4567 

4568 return _indexables 

4569 

4570 def write(self, **kwargs): 

4571 raise NotImplementedError("cannot write on a generic table")

4572 

4573 

4574class AppendableMultiFrameTable(AppendableFrameTable): 

4575 """ a frame with a multi-index """ 

4576 

4577 table_type = "appendable_multiframe" 

4578 obj_type = DataFrame 

4579 ndim = 2 

4580 _re_levels = re.compile(r"^level_\d+$") 

4581 

4582 @property 

4583 def table_type_short(self) -> str: 

4584 return "appendable_multi" 

4585 

4586 def write(self, obj, data_columns=None, **kwargs): 

4587 if data_columns is None: 

4588 data_columns = [] 

4589 elif data_columns is True: 

4590 data_columns = obj.columns.tolist() 

4591 obj, self.levels = self.validate_multiindex(obj) 

4592 for n in self.levels: 

4593 if n not in data_columns: 

4594 data_columns.insert(0, n) 

4595 return super().write(obj=obj, data_columns=data_columns, **kwargs) 

4596 

4597 def read( 

4598 self, 

4599 where=None, 

4600 columns=None, 

4601 start: Optional[int] = None, 

4602 stop: Optional[int] = None, 

4603 ): 

4604 

4605 df = super().read(where=where, columns=columns, start=start, stop=stop) 

4606 df = df.set_index(self.levels) 

4607 

4608 # remove names for 'level_%d' 

4609 df.index = df.index.set_names( 

4610 [None if self._re_levels.search(l) else l for l in df.index.names] 

4611 ) 

4612 

4613 return df 

4614 
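# Illustrative sketch (not part of the module, pandas imported as pd): a MultiIndexed
# DataFrame appended in table format is flattened on write (the levels typically become
# data columns named after the levels, or "level_0", "level_1", ... when unnamed) and
# rebuilt on read, so the levels should be queryable:
#
#     >>> mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["key", "num"])
#     >>> mdf = pd.DataFrame({"val": range(4)}, index=mi)
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     store.append("mdf", mdf)
#     ...     store.select("mdf", where="key == 'a'")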

4615 

4616def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame: 

4617 ax = obj._get_axis(axis) 

4618 labels = ensure_index(labels) 

4619 

4620 # try not to reindex even if other is provided 

4621 # if it equals our current index 

4622 if other is not None: 

4623 other = ensure_index(other) 

4624 if (other is None or labels.equals(other)) and labels.equals(ax): 

4625 return obj 

4626 

4627 labels = ensure_index(labels.unique()) 

4628 if other is not None: 

4629 labels = ensure_index(other.unique()).intersection(labels, sort=False) 

4630 if not labels.equals(ax): 

4631 slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim 

4632 slicer[axis] = labels 

4633 obj = obj.loc[tuple(slicer)] 

4634 return obj 

4635 

4636 

4637# tz to/from coercion 

4638 

4639 

4640def _get_tz(tz: tzinfo) -> Union[str, tzinfo]: 

4641 """ for a tz-aware type, return an encoded zone """ 

4642 zone = timezones.get_timezone(tz) 

4643 return zone 

4644 

4645 

4646def _set_tz( 

4647 values: Union[np.ndarray, Index], 

4648 tz: Optional[Union[str, tzinfo]], 

4649 coerce: bool = False, 

4650) -> Union[np.ndarray, DatetimeIndex]: 

4651 """ 

4652 coerce the values to a DatetimeIndex if tz is set 

4653 preserve the input shape if possible 

4654 

4655 Parameters 

4656 ---------- 

4657 values : ndarray or Index 

4658 tz : str or tzinfo 

4659 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray 

4660 """ 

4661 if isinstance(values, DatetimeIndex): 

4662 # If values is tzaware, the tz gets dropped in the values.ravel() 

4663 # call below (which returns an ndarray). So we are only non-lossy 

4664 # if `tz` matches `values.tz`. 

4665 assert values.tz is None or values.tz == tz 

4666 

4667 if tz is not None: 

4668 name = getattr(values, "name", None) 

4669 values = values.ravel() 

4670 tz = timezones.get_timezone(_ensure_decoded(tz)) 

4671 values = DatetimeIndex(values, name=name) 

4672 values = values.tz_localize("UTC").tz_convert(tz) 

4673 elif coerce: 

4674 values = np.asarray(values, dtype="M8[ns]") 

4675 

4676 return values 

4677 

4678 

4679def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: 

4680 assert isinstance(name, str) 

4681 

4682 index_name = index.name 

4683 converted, dtype_name = _get_data_and_dtype_name(index) 

4684 kind = _dtype_to_kind(dtype_name) 

4685 atom = DataIndexableCol._get_atom(converted) 

4686 

4687 if isinstance(index, Int64Index): 

4688 # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, 

4689 # in which case "kind" is "integer", "integer", "datetime64", 

4690 # "timedelta64", and "integer", respectively. 

4691 return IndexCol( 

4692 name, 

4693 values=converted, 

4694 kind=kind, 

4695 typ=atom, 

4696 freq=getattr(index, "freq", None), 

4697 tz=getattr(index, "tz", None), 

4698 index_name=index_name, 

4699 ) 

4700 

4701 if isinstance(index, MultiIndex): 

4702 raise TypeError("MultiIndex not supported here!") 

4703 

4704 inferred_type = lib.infer_dtype(index, skipna=False) 

4705 # we won't get inferred_type of "datetime64" or "timedelta64" as these

4706 # would go through the DatetimeIndex/TimedeltaIndex paths above 

4707 

4708 values = np.asarray(index) 

4709 

4710 if inferred_type == "date": 

4711 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) 

4712 return IndexCol( 

4713 name, converted, "date", _tables().Time32Col(), index_name=index_name, 

4714 ) 

4715 elif inferred_type == "string": 

4716 

4717 converted = _convert_string_array(values, encoding, errors) 

4718 itemsize = converted.dtype.itemsize 

4719 return IndexCol( 

4720 name, 

4721 converted, 

4722 "string", 

4723 _tables().StringCol(itemsize), 

4724 index_name=index_name, 

4725 ) 

4726 

4727 elif inferred_type in ["integer", "floating"]: 

4728 return IndexCol( 

4729 name, values=converted, kind=kind, typ=atom, index_name=index_name, 

4730 ) 

4731 else: 

4732 assert isinstance(converted, np.ndarray) and converted.dtype == object 

4733 assert kind == "object", kind 

4734 atom = _tables().ObjectAtom() 

4735 return IndexCol(name, converted, kind, atom, index_name=index_name,) 

4736 

4737 

4738def _unconvert_index( 

4739 data, kind: str, encoding: str, errors: str 

4740) -> Union[np.ndarray, Index]: 

4741 index: Union[Index, np.ndarray] 

4742 

4743 if kind == "datetime64": 

4744 index = DatetimeIndex(data) 

4745 elif kind == "timedelta64": 

4746 index = TimedeltaIndex(data) 

4747 elif kind == "date": 

4748 try: 

4749 index = np.asarray([date.fromordinal(v) for v in data], dtype=object) 

4750 except ValueError:

4751 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) 

4752 elif kind in ("integer", "float"): 

4753 index = np.asarray(data) 

4754 elif kind == "string":

4755 index = _unconvert_string_array( 

4756 data, nan_rep=None, encoding=encoding, errors=errors 

4757 ) 

4758 elif kind == "object": 

4759 index = np.asarray(data[0]) 

4760 else: # pragma: no cover 

4761 raise ValueError(f"unrecognized index type {kind}") 

4762 return index 

4763 

4764 

4765def _maybe_convert_for_string_atom( 

4766 name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors 

4767): 

4768 

4769 if not block.is_object: 

4770 return block.values 

4771 

4772 dtype_name = block.dtype.name 

4773 inferred_type = lib.infer_dtype(block.values, skipna=False) 

4774 

4775 if inferred_type == "date": 

4776 raise TypeError("[date] is not implemented as a table column") 

4777 elif inferred_type == "datetime": 

4778 # after GH#8260 

4779 # this only would be hit for a multi-timezone dtype which is an error 

4780 raise TypeError( 

4781 "too many timezones in this block, create separate data columns" 

4782 ) 

4783 

4784 elif not (inferred_type == "string" or dtype_name == "object"): 

4785 return block.values 

4786 

4787 block = block.fillna(nan_rep, downcast=False) 

4788 if isinstance(block, list): 

4789 # Note: because block is always object dtype, fillna goes 

4790 # through a path such that the result is always a 1-element list 

4791 block = block[0] 

4792 data = block.values 

4793 

4794 # see if we have a valid string type 

4795 inferred_type = lib.infer_dtype(data.ravel(), skipna=False) 

4796 if inferred_type != "string": 

4797 

4798 # we cannot serialize this data, so report an exception on a column 

4799 # by column basis 

4800 for i in range(block.shape[0]):

4801 

4802 col = block.iget(i) 

4803 inferred_type = lib.infer_dtype(col.ravel(), skipna=False) 

4804 if inferred_type != "string": 

4805 iloc = block.mgr_locs.indexer[i] 

4806 raise TypeError( 

4807 f"Cannot serialize the column [{iloc}] because\n" 

4808 f"its data contents are [{inferred_type}] object dtype" 

4809 ) 

4810 

4811 # itemsize is the maximum length of a string (along any dimension) 

4812 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) 

4813 assert data_converted.shape == block.shape, (data_converted.shape, block.shape) 

4814 itemsize = data_converted.itemsize 

4815 

4816 # specified min_itemsize? 

4817 if isinstance(min_itemsize, dict): 

4818 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) 

4819 itemsize = max(min_itemsize or 0, itemsize) 

4820 

4821 # check for column in the values conflicts 

4822 if existing_col is not None: 

4823 eci = existing_col.validate_col(itemsize) 

4824 if eci > itemsize: 

4825 itemsize = eci 

4826 

4827 data_converted = data_converted.astype(f"|S{itemsize}", copy=False) 

4828 return data_converted 

4829 

4830 

4831def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: 

4832 """ 

4833 Take a string-like that is object dtype and coerce to a fixed size string type. 

4834 

4835 Parameters 

4836 ---------- 

4837 data : np.ndarray[object] 

4838 encoding : str 

4839 errors : str 

4840 Handler for encoding errors. 

4841 

4842 Returns 

4843 ------- 

4844 np.ndarray[fixed-length-string] 

4845 """ 

4846 

4847 # encode if needed 

4848 if len(data): 

4849 data = ( 

4850 Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) 

4851 ) 

4852 

4853 # create the sized dtype 

4854 ensured = ensure_object(data.ravel()) 

4855 itemsize = max(1, libwriters.max_len_string_array(ensured)) 

4856 

4857 data = np.asarray(data, dtype=f"S{itemsize}") 

4858 return data 

4859 
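# Illustrative sketch: object-dtype strings are encoded and then coerced to a
# fixed-width bytes dtype sized by the longest element, which is what PyTables'
# StringCol expects:
#
#     >>> _convert_string_array(np.array(["a", "bb"], dtype=object), "UTF-8", "strict")
#     array([b'a', b'bb'], dtype='|S2')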

4860 

4861def _unconvert_string_array( 

4862 data: np.ndarray, nan_rep, encoding: str, errors: str 

4863) -> np.ndarray: 

4864 """ 

4865 Inverse of _convert_string_array. 

4866 

4867 Parameters 

4868 ---------- 

4869 data : np.ndarray[fixed-length-string] 

4870 nan_rep : the storage repr of NaN 

4871 encoding : str 

4872 errors : str 

4873 Handler for encoding errors. 

4874 

4875 Returns 

4876 ------- 

4877 np.ndarray[object] 

4878 Decoded data. 

4879 """ 

4880 shape = data.shape 

4881 data = np.asarray(data.ravel(), dtype=object) 

4882 

4883 if len(data): 

4884 

4885 itemsize = libwriters.max_len_string_array(ensure_object(data)) 

4886 dtype = f"U{itemsize}" 

4887 

4888 if isinstance(data[0], bytes): 

4889 data = Series(data).str.decode(encoding, errors=errors).values 

4890 else: 

4891 data = data.astype(dtype, copy=False).astype(object, copy=False) 

4892 

4893 if nan_rep is None: 

4894 nan_rep = "nan" 

4895 

4896 data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) 

4897 return data.reshape(shape) 

4898 

4899 

4900def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): 

4901 assert isinstance(val_kind, str), type(val_kind) 

4902 if _need_convert(val_kind): 

4903 conv = _get_converter(val_kind, encoding, errors) 

4904 values = conv(values) 

4905 return values 

4906 

4907 

4908def _get_converter(kind: str, encoding: str, errors: str): 

4909 if kind == "datetime64": 

4910 return lambda x: np.asarray(x, dtype="M8[ns]") 

4911 elif kind == "string": 

4912 return lambda x: _unconvert_string_array( 

4913 x, nan_rep=None, encoding=encoding, errors=errors 

4914 ) 

4915 else: # pragma: no cover 

4916 raise ValueError(f"invalid kind {kind}") 

4917 

4918 

4919def _need_convert(kind: str) -> bool: 

4920 if kind in ("datetime64", "string"): 

4921 return True 

4922 return False 

4923 

4924 

4925def _maybe_adjust_name(name: str, version) -> str: 

4926 """ 

4927 Prior to 0.10.1, values blocks were named values_0 rather than

4928 values_block_0; adjust the given name if necessary.

4929 

4930 Parameters 

4931 ---------- 

4932 name : str 

4933 version : Tuple[int, int, int] 

4934 

4935 Returns 

4936 ------- 

4937 str 

4938 """ 

4939 try: 

4940 if version[0] == 0 and version[1] <= 10 and version[2] == 0: 

4941 m = re.search(r"values_block_(\d+)", name) 

4942 if m: 

4943 grp = m.groups()[0] 

4944 name = f"values_{grp}" 

4945 except IndexError: 

4946 pass 

4947 return name 

4948 
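# Illustrative sketch: for legacy files written by pandas 0.10.0 the stored block
# names lack the "_block" infix, so the lookup name is rewritten; files written by
# newer versions are returned unchanged:
#
#     >>> _maybe_adjust_name("values_block_2", version=(0, 10, 0))
#     'values_2'
#     >>> _maybe_adjust_name("values_block_2", version=(0, 15, 2))
#     'values_block_2'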

4949 

4950def _dtype_to_kind(dtype_str: str) -> str: 

4951 """ 

4952 Find the "kind" string describing the given dtype name. 

4953 """ 

4954 dtype_str = _ensure_decoded(dtype_str) 

4955 

4956 if dtype_str.startswith("string") or dtype_str.startswith("bytes"): 

4957 kind = "string" 

4958 elif dtype_str.startswith("float"): 

4959 kind = "float" 

4960 elif dtype_str.startswith("complex"): 

4961 kind = "complex" 

4962 elif dtype_str.startswith("int") or dtype_str.startswith("uint"): 

4963 kind = "integer" 

4964 elif dtype_str.startswith("datetime64"): 

4965 kind = "datetime64" 

4966 elif dtype_str.startswith("timedelta"): 

4967 kind = "timedelta64" 

4968 elif dtype_str.startswith("bool"): 

4969 kind = "bool" 

4970 elif dtype_str.startswith("category"): 

4971 kind = "category" 

4972 elif dtype_str.startswith("period"): 

4973 # We store the `freq` attr so we can restore from integers 

4974 kind = "integer" 

4975 elif dtype_str == "object": 

4976 kind = "object" 

4977 else: 

4978 raise ValueError(f"cannot interpret dtype of [{dtype_str}]") 

4979 

4980 return kind 

4981 
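# Illustrative sketch: a few dtype-name to "kind" mappings produced by the helper
# above (the stored kind is what drives conversion back on read):
#
#     >>> _dtype_to_kind("int64")
#     'integer'
#     >>> _dtype_to_kind("datetime64")
#     'datetime64'
#     >>> _dtype_to_kind("object")
#     'object'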

4982 

4983def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): 

4984 """ 

4985 Convert the passed data into a storable form and a dtype string. 

4986 """ 

4987 if is_categorical_dtype(data.dtype): 

4988 data = data.codes 

4989 

4990 # For datetime64tz we need to drop the TZ in tests TODO: why? 

4991 dtype_name = data.dtype.name.split("[")[0] 

4992 

4993 if data.dtype.kind in ["m", "M"]: 

4994 data = np.asarray(data.view("i8")) 

4995 # TODO: we used to reshape for the dt64tz case, but no longer 

4996 # doing that doesn't seem to break anything. why? 

4997 

4998 elif isinstance(data, PeriodIndex): 

4999 data = data.asi8 

5000 

5001 data = np.asarray(data) 

5002 return data, dtype_name 

5003 

5004 

5005class Selection: 

5006 """ 

5007 Carries out a selection operation on a tables.Table object. 

5008 

5009 Parameters 

5010 ---------- 

5011 table : a Table object 

5012 where : list of Terms (or convertible to) 

5013 start, stop: indices to start and/or stop selection 

5014 

5015 """ 

5016 

5017 def __init__( 

5018 self, 

5019 table: Table, 

5020 where=None, 

5021 start: Optional[int] = None, 

5022 stop: Optional[int] = None, 

5023 ): 

5024 self.table = table 

5025 self.where = where 

5026 self.start = start 

5027 self.stop = stop 

5028 self.condition = None 

5029 self.filter = None 

5030 self.terms = None 

5031 self.coordinates = None 

5032 

5033 if is_list_like(where): 

5034 

5035 # see if we have a passed coordinate like 

5036 try: 

5037 inferred = lib.infer_dtype(where, skipna=False) 

5038 if inferred == "integer" or inferred == "boolean": 

5039 where = np.asarray(where) 

5040 if where.dtype == np.bool_: 

5041 start, stop = self.start, self.stop 

5042 if start is None: 

5043 start = 0 

5044 if stop is None: 

5045 stop = self.table.nrows 

5046 self.coordinates = np.arange(start, stop)[where] 

5047 elif issubclass(where.dtype.type, np.integer): 

5048 if (self.start is not None and (where < self.start).any()) or ( 

5049 self.stop is not None and (where >= self.stop).any() 

5050 ): 

5051 raise ValueError( 

5052 "where must have index locations >= start and < stop" 

5053 ) 

5054 self.coordinates = where 

5055 

5056 except ValueError: 

5057 pass 

5058 

5059 if self.coordinates is None: 

5060 

5061 self.terms = self.generate(where) 

5062 

5063 # create the numexpr & the filter 

5064 if self.terms is not None: 

5065 self.condition, self.filter = self.terms.evaluate() 

5066 

5067 def generate(self, where): 

5068 """ where can be a dict, list, tuple, or string """

5069 if where is None: 

5070 return None 

5071 

5072 q = self.table.queryables() 

5073 try: 

5074 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) 

5075 except NameError: 

5076 # raise a nice message, suggesting that the user should use 

5077 # data_columns 

5078 qkeys = ",".join(q.keys()) 

5079 raise ValueError( 

5080 f"The passed where expression: {where}\n" 

5081 " contains an invalid variable reference\n" 

5082 " all of the variable references must be a " 

5083 "reference to\n" 

5084 " an axis (e.g. 'index' or 'columns'), or a " 

5085 "data_column\n" 

5086 f" The currently defined references are: {qkeys}\n" 

5087 ) 

5088 

5089 def select(self): 

5090 """ 

5091 generate the selection 

5092 """ 

5093 if self.condition is not None: 

5094 return self.table.table.read_where( 

5095 self.condition.format(), start=self.start, stop=self.stop 

5096 ) 

5097 elif self.coordinates is not None: 

5098 return self.table.table.read_coordinates(self.coordinates) 

5099 return self.table.table.read(start=self.start, stop=self.stop) 

5100 

5101 def select_coords(self): 

5102 """ 

5103 generate the selection 

5104 """ 

5105 start, stop = self.start, self.stop 

5106 nrows = self.table.nrows 

5107 if start is None: 

5108 start = 0 

5109 elif start < 0: 

5110 start += nrows 

5111 if self.stop is None: 

5112 stop = nrows 

5113 elif stop < 0: 

5114 stop += nrows 

5115 

5116 if self.condition is not None: 

5117 return self.table.table.get_where_list( 

5118 self.condition.format(), start=start, stop=stop, sort=True 

5119 ) 

5120 elif self.coordinates is not None: 

5121 return self.coordinates 

5122 

5123 return np.arange(start, stop)
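# Illustrative sketch (not part of the module, pandas imported as pd): string "where"
# clauses are compiled into a numexpr condition evaluated by PyTables plus an optional
# in-memory filter (e.g. a columns= restriction), as wired up in Selection above:
#
#     >>> with pd.HDFStore("store.h5") as store:
#     ...     store.select("df", where="index > 0 & columns=['A']")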