Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/pytables.py : 18%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
6import copy
7from datetime import date, tzinfo
8import itertools
9import os
10import re
11from typing import (
12 TYPE_CHECKING,
13 Any,
14 Dict,
15 Hashable,
16 List,
17 Optional,
18 Tuple,
19 Type,
20 Union,
21)
22import warnings
24import numpy as np
26from pandas._config import config, get_option
28from pandas._libs import lib, writers as libwriters
29from pandas._libs.tslibs import timezones
30from pandas._typing import ArrayLike, FrameOrSeries
31from pandas.compat._optional import import_optional_dependency
32from pandas.errors import PerformanceWarning
33from pandas.util._decorators import cache_readonly
35from pandas.core.dtypes.common import (
36 ensure_object,
37 is_categorical_dtype,
38 is_complex_dtype,
39 is_datetime64_dtype,
40 is_datetime64tz_dtype,
41 is_extension_array_dtype,
42 is_list_like,
43 is_string_dtype,
44 is_timedelta64_dtype,
45)
46from pandas.core.dtypes.generic import ABCExtensionArray
47from pandas.core.dtypes.missing import array_equivalent
49from pandas import (
50 DataFrame,
51 DatetimeIndex,
52 Index,
53 Int64Index,
54 MultiIndex,
55 PeriodIndex,
56 Series,
57 TimedeltaIndex,
58 concat,
59 isna,
60)
61from pandas.core.arrays.categorical import Categorical
62import pandas.core.common as com
63from pandas.core.computation.pytables import PyTablesExpr, maybe_expression
64from pandas.core.indexes.api import ensure_index
66from pandas.io.common import stringify_path
67from pandas.io.formats.printing import adjoin, pprint_thing
69if TYPE_CHECKING:
70 from tables import File, Node, Col # noqa:F401
73# versioning attribute
74_version = "0.15.2"
76# encoding
77_default_encoding = "UTF-8"
80def _ensure_decoded(s):
81 """ if we have bytes, decode them to unicode """
82 if isinstance(s, np.bytes_):
83 s = s.decode("UTF-8")
84 return s
87def _ensure_encoding(encoding):
88 # set the encoding if we need
89 if encoding is None:
90 encoding = _default_encoding
92 return encoding
95def _ensure_str(name):
96 """
97 Ensure that an index / column name is a str (python 3); otherwise they
98 may be np.string dtype. Non-string dtypes are passed through unchanged.
100 https://github.com/pandas-dev/pandas/issues/13492
101 """
102 if isinstance(name, str):
103 name = str(name)
104 return name
107Term = PyTablesExpr
110def _ensure_term(where, scope_level: int):
111 """
112 ensure that the where is a Term or a list of Term
113 this makes sure that we are capturing the scope of variables
114 that are passed
115 create the terms here with a frame_level=2 (we are 2 levels down)
116 """
118 # only consider list/tuple here as an ndarray is automatically a coordinate
119 # list
120 level = scope_level + 1
121 if isinstance(where, (list, tuple)):
122 wlist = []
123 for w in filter(lambda x: x is not None, where):
124 if not maybe_expression(w):
125 wlist.append(w)
126 else:
127 wlist.append(Term(w, scope_level=level))
128 where = wlist
129 elif maybe_expression(where):
130 where = Term(where, scope_level=level)
131 return where if where is None or len(where) else None
134class PossibleDataLossError(Exception):
135 pass
138class ClosedFileError(Exception):
139 pass
142class IncompatibilityWarning(Warning):
143 pass
146incompatibility_doc = """
147where criteria is being ignored as this version [%s] is too old (or
148not-defined), read the file in and write it out to a new file to upgrade (with
149the copy_to method)
150"""
153class AttributeConflictWarning(Warning):
154 pass
157attribute_conflict_doc = """
158the [%s] attribute of the existing index is [%s] which conflicts with the new
159[%s], resetting the attribute to None
160"""
163class DuplicateWarning(Warning):
164 pass
167duplicate_doc = """
168duplicate entries in table, taking most recently appended
169"""
171performance_doc = """
172your performance may suffer as PyTables will pickle object types that it cannot
173map directly to c-types [inferred_type->%s,key->%s] [items->%s]
174"""
176# formats
177_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
179# axes map
180_AXES_MAP = {DataFrame: [0]}
182# register our configuration options
183dropna_doc = """
184: boolean
185 drop ALL nan rows when appending to a table
186"""
187format_doc = """
188: format
189 default format writing format, if None, then
190 put will default to 'fixed' and append will default to 'table'
191"""
193with config.config_prefix("io.hdf"):
194 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
195 config.register_option(
196 "default_format",
197 None,
198 format_doc,
199 validator=config.is_one_of_factory(["fixed", "table", None]),
200 )
202# oh the troubles to reduce import time
203_table_mod = None
204_table_file_open_policy_is_strict = False
207def _tables():
208 global _table_mod
209 global _table_file_open_policy_is_strict
210 if _table_mod is None:
211 import tables
213 _table_mod = tables
215 # set the file open policy
216 # return the file open policy; this changes as of pytables 3.1
217 # depending on the HDF5 version
218 try:
219 _table_file_open_policy_is_strict = (
220 tables.file._FILE_OPEN_POLICY == "strict"
221 )
222 except AttributeError:
223 pass
225 return _table_mod
228# interface to/from ###
231def to_hdf(
232 path_or_buf,
233 key: str,
234 value: FrameOrSeries,
235 mode: str = "a",
236 complevel: Optional[int] = None,
237 complib: Optional[str] = None,
238 append: bool = False,
239 format: Optional[str] = None,
240 index: bool = True,
241 min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
242 nan_rep=None,
243 dropna: Optional[bool] = None,
244 data_columns: Optional[List[str]] = None,
245 errors: str = "strict",
246 encoding: str = "UTF-8",
247):
248 """ store this object, close it if we opened it """
250 if append:
251 f = lambda store: store.append(
252 key,
253 value,
254 format=format,
255 index=index,
256 min_itemsize=min_itemsize,
257 nan_rep=nan_rep,
258 dropna=dropna,
259 data_columns=data_columns,
260 errors=errors,
261 encoding=encoding,
262 )
263 else:
264 # NB: dropna is not passed to `put`
265 f = lambda store: store.put(
266 key,
267 value,
268 format=format,
269 index=index,
270 min_itemsize=min_itemsize,
271 nan_rep=nan_rep,
272 data_columns=data_columns,
273 errors=errors,
274 encoding=encoding,
275 )
277 path_or_buf = stringify_path(path_or_buf)
278 if isinstance(path_or_buf, str):
279 with HDFStore(
280 path_or_buf, mode=mode, complevel=complevel, complib=complib
281 ) as store:
282 f(store)
283 else:
284 f(path_or_buf)
287def read_hdf(
288 path_or_buf,
289 key=None,
290 mode: str = "r",
291 errors: str = "strict",
292 where=None,
293 start: Optional[int] = None,
294 stop: Optional[int] = None,
295 columns=None,
296 iterator=False,
297 chunksize: Optional[int] = None,
298 **kwargs,
299):
300 """
301 Read from the store, close it if we opened it.
303 Retrieve pandas object stored in file, optionally based on where
304 criteria
306 Parameters
307 ----------
308 path_or_buf : str, path object, pandas.HDFStore or file-like object
309 Any valid string path is acceptable. The string could be a URL. Valid
310 URL schemes include http, ftp, s3, and file. For file URLs, a host is
311 expected. A local file could be: ``file://localhost/path/to/table.h5``.
313 If you want to pass in a path object, pandas accepts any
314 ``os.PathLike``.
316 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
318 By file-like object, we refer to objects with a ``read()`` method,
319 such as a file handler (e.g. via builtin ``open`` function)
320 or ``StringIO``.
322 .. versionadded:: 0.21.0 support for __fspath__ protocol.
324 key : object, optional
325 The group identifier in the store. Can be omitted if the HDF file
326 contains a single pandas object.
327 mode : {'r', 'r+', 'a'}, default 'r'
328 Mode to use when opening the file. Ignored if path_or_buf is a
329 :class:`pandas.HDFStore`. Default is 'r'.
330 where : list, optional
331 A list of Term (or convertible) objects.
332 start : int, optional
333 Row number to start selection.
334 stop : int, optional
335 Row number to stop selection.
336 columns : list, optional
337 A list of columns names to return.
338 iterator : bool, optional
339 Return an iterator object.
340 chunksize : int, optional
341 Number of rows to include in an iteration when using an iterator.
342 errors : str, default 'strict'
343 Specifies how encoding and decoding errors are to be handled.
344 See the errors argument for :func:`open` for a full list
345 of options.
346 **kwargs
347 Additional keyword arguments passed to HDFStore.
349 Returns
350 -------
351 item : object
352 The selected object. Return type depends on the object stored.
354 See Also
355 --------
356 DataFrame.to_hdf : Write a HDF file from a DataFrame.
357 HDFStore : Low-level access to HDF files.
359 Examples
360 --------
361 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
362 >>> df.to_hdf('./store.h5', 'data')
363 >>> reread = pd.read_hdf('./store.h5')
364 """
366 if mode not in ["r", "r+", "a"]:
367 raise ValueError(
368 f"mode {mode} is not allowed while performing a read. "
369 f"Allowed modes are r, r+ and a."
370 )
371 # grab the scope
372 if where is not None:
373 where = _ensure_term(where, scope_level=1)
375 if isinstance(path_or_buf, HDFStore):
376 if not path_or_buf.is_open:
377 raise IOError("The HDFStore must be open for reading.")
379 store = path_or_buf
380 auto_close = False
381 else:
382 path_or_buf = stringify_path(path_or_buf)
383 if not isinstance(path_or_buf, str):
384 raise NotImplementedError(
385 "Support for generic buffers has not been implemented."
386 )
387 try:
388 exists = os.path.exists(path_or_buf)
390 # if filepath is too long
391 except (TypeError, ValueError):
392 exists = False
394 if not exists:
395 raise FileNotFoundError(f"File {path_or_buf} does not exist")
397 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
398 # can't auto open/close if we are using an iterator
399 # so delegate to the iterator
400 auto_close = True
402 try:
403 if key is None:
404 groups = store.groups()
405 if len(groups) == 0:
406 raise ValueError("No dataset in HDF5 file.")
407 candidate_only_group = groups[0]
409 # For the HDF file to have only one dataset, all other groups
410 # should then be metadata groups for that candidate group. (This
411 # assumes that the groups() method enumerates parent groups
412 # before their children.)
413 for group_to_check in groups[1:]:
414 if not _is_metadata_of(group_to_check, candidate_only_group):
415 raise ValueError(
416 "key must be provided when HDF5 file "
417 "contains multiple datasets."
418 )
419 key = candidate_only_group._v_pathname
420 return store.select(
421 key,
422 where=where,
423 start=start,
424 stop=stop,
425 columns=columns,
426 iterator=iterator,
427 chunksize=chunksize,
428 auto_close=auto_close,
429 )
430 except (ValueError, TypeError, KeyError):
431 if not isinstance(path_or_buf, HDFStore):
432 # if there is an error, close the store if we opened it.
433 try:
434 store.close()
435 except AttributeError:
436 pass
438 raise
441def _is_metadata_of(group: "Node", parent_group: "Node") -> bool:
442 """Check if a given group is a metadata group for a given parent_group."""
443 if group._v_depth <= parent_group._v_depth:
444 return False
446 current = group
447 while current._v_depth > 1:
448 parent = current._v_parent
449 if parent == parent_group and current._v_name == "meta":
450 return True
451 current = current._v_parent
452 return False
455class HDFStore:
456 """
457 Dict-like IO interface for storing pandas objects in PyTables.
459 Either Fixed or Table format.
461 Parameters
462 ----------
463 path : string
464 File path to HDF5 file
465 mode : {'a', 'w', 'r', 'r+'}, default 'a'
467 ``'r'``
468 Read-only; no data can be modified.
469 ``'w'``
470 Write; a new file is created (an existing file with the same
471 name would be deleted).
472 ``'a'``
473 Append; an existing file is opened for reading and writing,
474 and if the file does not exist it is created.
475 ``'r+'``
476 It is similar to ``'a'``, but the file must already exist.
477 complevel : int, 0-9, default None
478 Specifies a compression level for data.
479 A value of 0 or None disables compression.
480 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
481 Specifies the compression library to be used.
482 As of v0.20.2 these additional compressors for Blosc are supported
483 (default if no compressor specified: 'blosc:blosclz'):
484 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
485 'blosc:zlib', 'blosc:zstd'}.
486 Specifying a compression library which is not available issues
487 a ValueError.
488 fletcher32 : bool, default False
489 If applying compression use the fletcher32 checksum
491 Examples
492 --------
493 >>> bar = pd.DataFrame(np.random.randn(10, 4))
494 >>> store = pd.HDFStore('test.h5')
495 >>> store['foo'] = bar # write to HDF5
496 >>> bar = store['foo'] # retrieve
497 >>> store.close()
498 """
500 _handle: Optional["File"]
501 _mode: str
502 _complevel: int
503 _fletcher32: bool
505 def __init__(
506 self,
507 path,
508 mode: str = "a",
509 complevel: Optional[int] = None,
510 complib=None,
511 fletcher32: bool = False,
512 **kwargs,
513 ):
515 if "format" in kwargs:
516 raise ValueError("format is not a defined argument for HDFStore")
518 tables = import_optional_dependency("tables")
520 if complib is not None and complib not in tables.filters.all_complibs:
521 raise ValueError(
522 f"complib only supports {tables.filters.all_complibs} compression."
523 )
525 if complib is None and complevel is not None:
526 complib = tables.filters.default_complib
528 self._path = stringify_path(path)
529 if mode is None:
530 mode = "a"
531 self._mode = mode
532 self._handle = None
533 self._complevel = complevel if complevel else 0
534 self._complib = complib
535 self._fletcher32 = fletcher32
536 self._filters = None
537 self.open(mode=mode, **kwargs)
539 def __fspath__(self):
540 return self._path
542 @property
543 def root(self):
544 """ return the root node """
545 self._check_if_open()
546 return self._handle.root
548 @property
549 def filename(self):
550 return self._path
552 def __getitem__(self, key: str):
553 return self.get(key)
555 def __setitem__(self, key: str, value):
556 self.put(key, value)
558 def __delitem__(self, key: str):
559 return self.remove(key)
561 def __getattr__(self, name: str):
562 """ allow attribute access to get stores """
563 try:
564 return self.get(name)
565 except (KeyError, ClosedFileError):
566 pass
567 raise AttributeError(
568 f"'{type(self).__name__}' object has no attribute '{name}'"
569 )
571 def __contains__(self, key: str) -> bool:
572 """ check for existence of this key
573 can match the exact pathname or the pathnm w/o the leading '/'
574 """
575 node = self.get_node(key)
576 if node is not None:
577 name = node._v_pathname
578 if name == key or name[1:] == key:
579 return True
580 return False
582 def __len__(self) -> int:
583 return len(self.groups())
585 def __repr__(self) -> str:
586 pstr = pprint_thing(self._path)
587 return f"{type(self)}\nFile path: {pstr}\n"
589 def __enter__(self):
590 return self
592 def __exit__(self, exc_type, exc_value, traceback):
593 self.close()
595 def keys(self) -> List[str]:
596 """
597 Return a list of keys corresponding to objects stored in HDFStore.
599 Returns
600 -------
601 list
602 List of ABSOLUTE path-names (e.g. have the leading '/').
603 """
604 return [n._v_pathname for n in self.groups()]
606 def __iter__(self):
607 return iter(self.keys())
609 def items(self):
610 """
611 iterate on key->group
612 """
613 for g in self.groups():
614 yield g._v_pathname, g
616 iteritems = items
618 def open(self, mode: str = "a", **kwargs):
619 """
620 Open the file in the specified mode
622 Parameters
623 ----------
624 mode : {'a', 'w', 'r', 'r+'}, default 'a'
625 See HDFStore docstring or tables.open_file for info about modes
626 """
627 tables = _tables()
629 if self._mode != mode:
631 # if we are changing a write mode to read, ok
632 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
633 pass
634 elif mode in ["w"]:
636 # this would truncate, raise here
637 if self.is_open:
638 raise PossibleDataLossError(
639 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
640 "will delete the current file!"
641 )
643 self._mode = mode
645 # close and reopen the handle
646 if self.is_open:
647 self.close()
649 if self._complevel and self._complevel > 0:
650 self._filters = _tables().Filters(
651 self._complevel, self._complib, fletcher32=self._fletcher32
652 )
654 try:
655 self._handle = tables.open_file(self._path, self._mode, **kwargs)
656 except IOError as err: # pragma: no cover
657 if "can not be written" in str(err):
658 print(f"Opening {self._path} in read-only mode")
659 self._handle = tables.open_file(self._path, "r", **kwargs)
660 else:
661 raise
663 except ValueError as err:
665 # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
666 # to provide an updated message
667 if "FILE_OPEN_POLICY" in str(err):
668 hdf_version = tables.get_hdf5_version()
669 err = ValueError(
670 f"PyTables [{tables.__version__}] no longer supports "
671 "opening multiple files\n"
672 "even in read-only mode on this HDF5 version "
673 f"[{hdf_version}]. You can accept this\n"
674 "and not open the same file multiple times at once,\n"
675 "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
676 "which allows\n"
677 "files to be opened multiple times at once\n"
678 )
680 raise err
682 except Exception as err:
684 # trying to read from a non-existent file causes an error which
685 # is not part of IOError, make it one
686 if self._mode == "r" and "Unable to open/create file" in str(err):
687 raise IOError(str(err))
688 raise
690 def close(self):
691 """
692 Close the PyTables file handle
693 """
694 if self._handle is not None:
695 self._handle.close()
696 self._handle = None
698 @property
699 def is_open(self) -> bool:
700 """
701 return a boolean indicating whether the file is open
702 """
703 if self._handle is None:
704 return False
705 return bool(self._handle.isopen)
707 def flush(self, fsync: bool = False):
708 """
709 Force all buffered modifications to be written to disk.
711 Parameters
712 ----------
713 fsync : bool (default False)
714 call ``os.fsync()`` on the file handle to force writing to disk.
716 Notes
717 -----
718 Without ``fsync=True``, flushing may not guarantee that the OS writes
719 to disk. With fsync, the operation will block until the OS claims the
720 file has been written; however, other caching layers may still
721 interfere.
722 """
723 if self._handle is not None:
724 self._handle.flush()
725 if fsync:
726 try:
727 os.fsync(self._handle.fileno())
728 except OSError:
729 pass
731 def get(self, key: str):
732 """
733 Retrieve pandas object stored in file.
735 Parameters
736 ----------
737 key : str
739 Returns
740 -------
741 object
742 Same type as object stored in file.
743 """
744 group = self.get_node(key)
745 if group is None:
746 raise KeyError(f"No object named {key} in the file")
747 return self._read_group(group)
749 def select(
750 self,
751 key: str,
752 where=None,
753 start=None,
754 stop=None,
755 columns=None,
756 iterator=False,
757 chunksize=None,
758 auto_close: bool = False,
759 ):
760 """
761 Retrieve pandas object stored in file, optionally based on where criteria.
763 Parameters
764 ----------
765 key : str
766 Object being retrieved from file.
767 where : list, default None
768 List of Term (or convertible) objects, optional.
769 start : int, default None
770 Row number to start selection.
771 stop : int, default None
772 Row number to stop selection.
773 columns : list, default None
774 A list of columns that if not None, will limit the return columns.
775 iterator : bool, default False
776 Returns an iterator.
777 chunksize : int, default None
778 Number or rows to include in iteration, return an iterator.
779 auto_close : bool, default False
780 Should automatically close the store when finished.
782 Returns
783 -------
784 object
785 Retrieved object from file.
786 """
787 group = self.get_node(key)
788 if group is None:
789 raise KeyError(f"No object named {key} in the file")
791 # create the storer and axes
792 where = _ensure_term(where, scope_level=1)
793 s = self._create_storer(group)
794 s.infer_axes()
796 # function to call on iteration
797 def func(_start, _stop, _where):
798 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
800 # create the iterator
801 it = TableIterator(
802 self,
803 s,
804 func,
805 where=where,
806 nrows=s.nrows,
807 start=start,
808 stop=stop,
809 iterator=iterator,
810 chunksize=chunksize,
811 auto_close=auto_close,
812 )
814 return it.get_result()
816 def select_as_coordinates(
817 self,
818 key: str,
819 where=None,
820 start: Optional[int] = None,
821 stop: Optional[int] = None,
822 ):
823 """
824 return the selection as an Index
826 Parameters
827 ----------
828 key : str
829 where : list of Term (or convertible) objects, optional
830 start : integer (defaults to None), row number to start selection
831 stop : integer (defaults to None), row number to stop selection
832 """
833 where = _ensure_term(where, scope_level=1)
834 tbl = self.get_storer(key)
835 if not isinstance(tbl, Table):
836 raise TypeError("can only read_coordinates with a table")
837 return tbl.read_coordinates(where=where, start=start, stop=stop)
839 def select_column(
840 self,
841 key: str,
842 column: str,
843 start: Optional[int] = None,
844 stop: Optional[int] = None,
845 ):
846 """
847 return a single column from the table. This is generally only useful to
848 select an indexable
850 Parameters
851 ----------
852 key : str
853 column : str
854 The column of interest.
855 start : int or None, default None
856 stop : int or None, default None
858 Raises
859 ------
860 raises KeyError if the column is not found (or key is not a valid
861 store)
862 raises ValueError if the column can not be extracted individually (it
863 is part of a data block)
865 """
866 tbl = self.get_storer(key)
867 if not isinstance(tbl, Table):
868 raise TypeError("can only read_column with a table")
869 return tbl.read_column(column=column, start=start, stop=stop)
871 def select_as_multiple(
872 self,
873 keys,
874 where=None,
875 selector=None,
876 columns=None,
877 start=None,
878 stop=None,
879 iterator=False,
880 chunksize=None,
881 auto_close: bool = False,
882 ):
883 """
884 Retrieve pandas objects from multiple tables.
886 Parameters
887 ----------
888 keys : a list of the tables
889 selector : the table to apply the where criteria (defaults to keys[0]
890 if not supplied)
891 columns : the columns I want back
892 start : integer (defaults to None), row number to start selection
893 stop : integer (defaults to None), row number to stop selection
894 iterator : boolean, return an iterator, default False
895 chunksize : nrows to include in iteration, return an iterator
896 auto_close : bool, default False
897 Should automatically close the store when finished.
899 Raises
900 ------
901 raises KeyError if keys or selector is not found or keys is empty
902 raises TypeError if keys is not a list or tuple
903 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
904 """
906 # default to single select
907 where = _ensure_term(where, scope_level=1)
908 if isinstance(keys, (list, tuple)) and len(keys) == 1:
909 keys = keys[0]
910 if isinstance(keys, str):
911 return self.select(
912 key=keys,
913 where=where,
914 columns=columns,
915 start=start,
916 stop=stop,
917 iterator=iterator,
918 chunksize=chunksize,
919 auto_close=auto_close,
920 )
922 if not isinstance(keys, (list, tuple)):
923 raise TypeError("keys must be a list/tuple")
925 if not len(keys):
926 raise ValueError("keys must have a non-zero length")
928 if selector is None:
929 selector = keys[0]
931 # collect the tables
932 tbls = [self.get_storer(k) for k in keys]
933 s = self.get_storer(selector)
935 # validate rows
936 nrows = None
937 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
938 if t is None:
939 raise KeyError(f"Invalid table [{k}]")
940 if not t.is_table:
941 raise TypeError(
942 f"object [{t.pathname}] is not a table, and cannot be used in all "
943 "select as multiple"
944 )
946 if nrows is None:
947 nrows = t.nrows
948 elif t.nrows != nrows:
949 raise ValueError("all tables must have exactly the same nrows!")
951 # The isinstance checks here are redundant with the check above,
952 # but necessary for mypy; see GH#29757
953 _tbls = [x for x in tbls if isinstance(x, Table)]
955 # axis is the concentration axes
956 axis = list({t.non_index_axes[0][0] for t in _tbls})[0]
958 def func(_start, _stop, _where):
960 # retrieve the objs, _where is always passed as a set of
961 # coordinates here
962 objs = [
963 t.read(where=_where, columns=columns, start=_start, stop=_stop)
964 for t in tbls
965 ]
967 # concat and return
968 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
970 # create the iterator
971 it = TableIterator(
972 self,
973 s,
974 func,
975 where=where,
976 nrows=nrows,
977 start=start,
978 stop=stop,
979 iterator=iterator,
980 chunksize=chunksize,
981 auto_close=auto_close,
982 )
984 return it.get_result(coordinates=True)
986 def put(
987 self,
988 key: str,
989 value: FrameOrSeries,
990 format=None,
991 index=True,
992 append=False,
993 complib=None,
994 complevel: Optional[int] = None,
995 min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
996 nan_rep=None,
997 data_columns: Optional[List[str]] = None,
998 encoding=None,
999 errors: str = "strict",
1000 ):
1001 """
1002 Store object in HDFStore.
1004 Parameters
1005 ----------
1006 key : str
1007 value : {Series, DataFrame}
1008 format : 'fixed(f)|table(t)', default is 'fixed'
1009 fixed(f) : Fixed format
1010 Fast writing/reading. Not-appendable, nor searchable.
1011 table(t) : Table format
1012 Write as a PyTables Table structure which may perform
1013 worse but allow more flexible operations like searching
1014 / selecting subsets of the data.
1015 append : bool, default False
1016 This will force Table format, append the input data to the
1017 existing.
1018 data_columns : list, default None
1019 List of columns to create as data columns, or True to
1020 use all columns. See `here
1021 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1022 encoding : str, default None
1023 Provide an encoding for strings.
1024 dropna : bool, default False, do not write an ALL nan row to
1025 The store settable by the option 'io.hdf.dropna_table'.
1026 """
1027 if format is None:
1028 format = get_option("io.hdf.default_format") or "fixed"
1029 format = self._validate_format(format)
1030 self._write_to_group(
1031 key,
1032 value,
1033 format=format,
1034 index=index,
1035 append=append,
1036 complib=complib,
1037 complevel=complevel,
1038 min_itemsize=min_itemsize,
1039 nan_rep=nan_rep,
1040 data_columns=data_columns,
1041 encoding=encoding,
1042 errors=errors,
1043 )
1045 def remove(self, key: str, where=None, start=None, stop=None):
1046 """
1047 Remove pandas object partially by specifying the where condition
1049 Parameters
1050 ----------
1051 key : string
1052 Node to remove or delete rows from
1053 where : list of Term (or convertible) objects, optional
1054 start : integer (defaults to None), row number to start selection
1055 stop : integer (defaults to None), row number to stop selection
1057 Returns
1058 -------
1059 number of rows removed (or None if not a Table)
1061 Raises
1062 ------
1063 raises KeyError if key is not a valid store
1065 """
1066 where = _ensure_term(where, scope_level=1)
1067 try:
1068 s = self.get_storer(key)
1069 except KeyError:
1070 # the key is not a valid store, re-raising KeyError
1071 raise
1072 except AssertionError:
1073 # surface any assertion errors for e.g. debugging
1074 raise
1075 except Exception:
1076 # In tests we get here with ClosedFileError, TypeError, and
1077 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1079 if where is not None:
1080 raise ValueError(
1081 "trying to remove a node with a non-None where clause!"
1082 )
1084 # we are actually trying to remove a node (with children)
1085 node = self.get_node(key)
1086 if node is not None:
1087 node._f_remove(recursive=True)
1088 return None
1090 # remove the node
1091 if com.all_none(where, start, stop):
1092 s.group._f_remove(recursive=True)
1094 # delete from the table
1095 else:
1096 if not s.is_table:
1097 raise ValueError(
1098 "can only remove with where on objects written as tables"
1099 )
1100 return s.delete(where=where, start=start, stop=stop)
1102 def append(
1103 self,
1104 key: str,
1105 value: FrameOrSeries,
1106 format=None,
1107 axes=None,
1108 index=True,
1109 append=True,
1110 complib=None,
1111 complevel: Optional[int] = None,
1112 columns=None,
1113 min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
1114 nan_rep=None,
1115 chunksize=None,
1116 expectedrows=None,
1117 dropna: Optional[bool] = None,
1118 data_columns: Optional[List[str]] = None,
1119 encoding=None,
1120 errors: str = "strict",
1121 ):
1122 """
1123 Append to Table in file. Node must already exist and be Table
1124 format.
1126 Parameters
1127 ----------
1128 key : str
1129 value : {Series, DataFrame}
1130 format : 'table' is the default
1131 table(t) : table format
1132 Write as a PyTables Table structure which may perform
1133 worse but allow more flexible operations like searching
1134 / selecting subsets of the data.
1135 append : bool, default True
1136 Append the input data to the existing.
1137 data_columns : list of columns, or True, default None
1138 List of columns to create as indexed data columns for on-disk
1139 queries, or True to use all columns. By default only the axes
1140 of the object are indexed. See `here
1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1142 min_itemsize : dict of columns that specify minimum string sizes
1143 nan_rep : string to use as string nan representation
1144 chunksize : size to chunk the writing
1145 expectedrows : expected TOTAL row size of this table
1146 encoding : default None, provide an encoding for strings
1147 dropna : bool, default False
1148 Do not write an ALL nan row to the store settable
1149 by the option 'io.hdf.dropna_table'.
1151 Notes
1152 -----
1153 Does *not* check if data being appended overlaps with existing
1154 data in the table, so be careful
1155 """
1156 if columns is not None:
1157 raise TypeError(
1158 "columns is not a supported keyword in append, try data_columns"
1159 )
1161 if dropna is None:
1162 dropna = get_option("io.hdf.dropna_table")
1163 if format is None:
1164 format = get_option("io.hdf.default_format") or "table"
1165 format = self._validate_format(format)
1166 self._write_to_group(
1167 key,
1168 value,
1169 format=format,
1170 axes=axes,
1171 index=index,
1172 append=append,
1173 complib=complib,
1174 complevel=complevel,
1175 min_itemsize=min_itemsize,
1176 nan_rep=nan_rep,
1177 chunksize=chunksize,
1178 expectedrows=expectedrows,
1179 dropna=dropna,
1180 data_columns=data_columns,
1181 encoding=encoding,
1182 errors=errors,
1183 )
1185 def append_to_multiple(
1186 self,
1187 d: Dict,
1188 value,
1189 selector,
1190 data_columns=None,
1191 axes=None,
1192 dropna=False,
1193 **kwargs,
1194 ):
1195 """
1196 Append to multiple tables
1198 Parameters
1199 ----------
1200 d : a dict of table_name to table_columns, None is acceptable as the
1201 values of one node (this will get all the remaining columns)
1202 value : a pandas object
1203 selector : a string that designates the indexable table; all of its
1204 columns will be designed as data_columns, unless data_columns is
1205 passed, in which case these are used
1206 data_columns : list of columns to create as data columns, or True to
1207 use all columns
1208 dropna : if evaluates to True, drop rows from all tables if any single
1209 row in each table has all NaN. Default False.
1211 Notes
1212 -----
1213 axes parameter is currently not accepted
1215 """
1216 if axes is not None:
1217 raise TypeError(
1218 "axes is currently not accepted as a parameter to append_to_multiple; "
1219 "you can create the tables independently instead"
1220 )
1222 if not isinstance(d, dict):
1223 raise ValueError(
1224 "append_to_multiple must have a dictionary specified as the "
1225 "way to split the value"
1226 )
1228 if selector not in d:
1229 raise ValueError(
1230 "append_to_multiple requires a selector that is in passed dict"
1231 )
1233 # figure out the splitting axis (the non_index_axis)
1234 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1236 # figure out how to split the value
1237 remain_key = None
1238 remain_values: List = []
1239 for k, v in d.items():
1240 if v is None:
1241 if remain_key is not None:
1242 raise ValueError(
1243 "append_to_multiple can only have one value in d that "
1244 "is None"
1245 )
1246 remain_key = k
1247 else:
1248 remain_values.extend(v)
1249 if remain_key is not None:
1250 ordered = value.axes[axis]
1251 ordd = ordered.difference(Index(remain_values))
1252 ordd = sorted(ordered.get_indexer(ordd))
1253 d[remain_key] = ordered.take(ordd)
1255 # data_columns
1256 if data_columns is None:
1257 data_columns = d[selector]
1259 # ensure rows are synchronized across the tables
1260 if dropna:
1261 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1262 valid_index = next(idxs)
1263 for index in idxs:
1264 valid_index = valid_index.intersection(index)
1265 value = value.loc[valid_index]
1267 # append
1268 for k, v in d.items():
1269 dc = data_columns if k == selector else None
1271 # compute the val
1272 val = value.reindex(v, axis=axis)
1274 self.append(k, val, data_columns=dc, **kwargs)
1276 def create_table_index(
1277 self,
1278 key: str,
1279 columns=None,
1280 optlevel: Optional[int] = None,
1281 kind: Optional[str] = None,
1282 ):
1283 """
1284 Create a pytables index on the table.
1286 Parameters
1287 ----------
1288 key : str
1289 columns : None, bool, or listlike[str]
1290 Indicate which columns to create an index on.
1292 * False : Do not create any indexes.
1293 * True : Create indexes on all columns.
1294 * None : Create indexes on all columns.
1295 * listlike : Create indexes on the given columns.
1297 optlevel : int or None, default None
1298 Optimization level, if None, pytables defaults to 6.
1299 kind : str or None, default None
1300 Kind of index, if None, pytables defaults to "medium".
1302 Raises
1303 ------
1304 TypeError: raises if the node is not a table
1305 """
1307 # version requirements
1308 _tables()
1309 s = self.get_storer(key)
1310 if s is None:
1311 return
1313 if not isinstance(s, Table):
1314 raise TypeError("cannot create table index on a Fixed format store")
1315 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1317 def groups(self):
1318 """
1319 Return a list of all the top-level nodes.
1321 Each node returned is not a pandas storage object.
1323 Returns
1324 -------
1325 list
1326 List of objects.
1327 """
1328 _tables()
1329 self._check_if_open()
1330 return [
1331 g
1332 for g in self._handle.walk_groups()
1333 if (
1334 not isinstance(g, _table_mod.link.Link)
1335 and (
1336 getattr(g._v_attrs, "pandas_type", None)
1337 or getattr(g, "table", None)
1338 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1339 )
1340 )
1341 ]
1343 def walk(self, where="/"):
1344 """
1345 Walk the pytables group hierarchy for pandas objects.
1347 This generator will yield the group path, subgroups and pandas object
1348 names for each group.
1350 Any non-pandas PyTables objects that are not a group will be ignored.
1352 The `where` group itself is listed first (preorder), then each of its
1353 child groups (following an alphanumerical order) is also traversed,
1354 following the same procedure.
1356 .. versionadded:: 0.24.0
1358 Parameters
1359 ----------
1360 where : str, default "/"
1361 Group where to start walking.
1363 Yields
1364 ------
1365 path : str
1366 Full path to a group (without trailing '/').
1367 groups : list
1368 Names (strings) of the groups contained in `path`.
1369 leaves : list
1370 Names (strings) of the pandas objects contained in `path`.
1371 """
1372 _tables()
1373 self._check_if_open()
1374 for g in self._handle.walk_groups(where):
1375 if getattr(g._v_attrs, "pandas_type", None) is not None:
1376 continue
1378 groups = []
1379 leaves = []
1380 for child in g._v_children.values():
1381 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1382 if pandas_type is None:
1383 if isinstance(child, _table_mod.group.Group):
1384 groups.append(child._v_name)
1385 else:
1386 leaves.append(child._v_name)
1388 yield (g._v_pathname.rstrip("/"), groups, leaves)
1390 def get_node(self, key: str) -> Optional["Node"]:
1391 """ return the node with the key or None if it does not exist """
1392 self._check_if_open()
1393 if not key.startswith("/"):
1394 key = "/" + key
1396 assert self._handle is not None
1397 assert _table_mod is not None # for mypy
1398 try:
1399 node = self._handle.get_node(self.root, key)
1400 except _table_mod.exceptions.NoSuchNodeError:
1401 return None
1403 assert isinstance(node, _table_mod.Node), type(node)
1404 return node
1406 def get_storer(self, key: str) -> Union["GenericFixed", "Table"]:
1407 """ return the storer object for a key, raise if not in the file """
1408 group = self.get_node(key)
1409 if group is None:
1410 raise KeyError(f"No object named {key} in the file")
1412 s = self._create_storer(group)
1413 s.infer_axes()
1414 return s
1416 def copy(
1417 self,
1418 file,
1419 mode="w",
1420 propindexes: bool = True,
1421 keys=None,
1422 complib=None,
1423 complevel: Optional[int] = None,
1424 fletcher32: bool = False,
1425 overwrite=True,
1426 ):
1427 """
1428 Copy the existing store to a new file, updating in place.
1430 Parameters
1431 ----------
1432 propindexes: bool, default True
1433 Restore indexes in copied file.
1434 keys : list of keys to include in the copy (defaults to all)
1435 overwrite : overwrite (remove and replace) existing nodes in the
1436 new store (default is True)
1437 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1439 Returns
1440 -------
1441 open file handle of the new store
1442 """
1443 new_store = HDFStore(
1444 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1445 )
1446 if keys is None:
1447 keys = list(self.keys())
1448 if not isinstance(keys, (tuple, list)):
1449 keys = [keys]
1450 for k in keys:
1451 s = self.get_storer(k)
1452 if s is not None:
1454 if k in new_store:
1455 if overwrite:
1456 new_store.remove(k)
1458 data = self.select(k)
1459 if isinstance(s, Table):
1461 index: Union[bool, List[str]] = False
1462 if propindexes:
1463 index = [a.name for a in s.axes if a.is_indexed]
1464 new_store.append(
1465 k,
1466 data,
1467 index=index,
1468 data_columns=getattr(s, "data_columns", None),
1469 encoding=s.encoding,
1470 )
1471 else:
1472 new_store.put(k, data, encoding=s.encoding)
1474 return new_store
1476 def info(self) -> str:
1477 """
1478 Print detailed information on the store.
1480 .. versionadded:: 0.21.0
1482 Returns
1483 -------
1484 str
1485 """
1486 path = pprint_thing(self._path)
1487 output = f"{type(self)}\nFile path: {path}\n"
1489 if self.is_open:
1490 lkeys = sorted(self.keys())
1491 if len(lkeys):
1492 keys = []
1493 values = []
1495 for k in lkeys:
1496 try:
1497 s = self.get_storer(k)
1498 if s is not None:
1499 keys.append(pprint_thing(s.pathname or k))
1500 values.append(pprint_thing(s or "invalid_HDFStore node"))
1501 except AssertionError:
1502 # surface any assertion errors for e.g. debugging
1503 raise
1504 except Exception as detail:
1505 keys.append(k)
1506 dstr = pprint_thing(detail)
1507 values.append(f"[invalid_HDFStore node: {dstr}]")
1509 output += adjoin(12, keys, values)
1510 else:
1511 output += "Empty"
1512 else:
1513 output += "File is CLOSED"
1515 return output
1517 # ------------------------------------------------------------------------
1518 # private methods
1520 def _check_if_open(self):
1521 if not self.is_open:
1522 raise ClosedFileError(f"{self._path} file is not open!")
1524 def _validate_format(self, format: str) -> str:
1525 """ validate / deprecate formats """
1527 # validate
1528 try:
1529 format = _FORMAT_MAP[format.lower()]
1530 except KeyError:
1531 raise TypeError(f"invalid HDFStore format specified [{format}]")
1533 return format
1535 def _create_storer(
1536 self,
1537 group,
1538 format=None,
1539 value: Optional[FrameOrSeries] = None,
1540 encoding: str = "UTF-8",
1541 errors: str = "strict",
1542 ) -> Union["GenericFixed", "Table"]:
1543 """ return a suitable class to operate """
1545 cls: Union[Type["GenericFixed"], Type["Table"]]
1547 if value is not None and not isinstance(value, (Series, DataFrame)):
1548 raise TypeError("value must be None, Series, or DataFrame")
1550 def error(t):
1551 # return instead of raising so mypy can tell where we are raising
1552 return TypeError(
1553 f"cannot properly create the storer for: [{t}] [group->"
1554 f"{group},value->{type(value)},format->{format}"
1555 )
1557 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1558 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1560 # infer the pt from the passed value
1561 if pt is None:
1562 if value is None:
1564 _tables()
1565 assert _table_mod is not None # for mypy
1566 if getattr(group, "table", None) or isinstance(
1567 group, _table_mod.table.Table
1568 ):
1569 pt = "frame_table"
1570 tt = "generic_table"
1571 else:
1572 raise TypeError(
1573 "cannot create a storer if the object is not existing "
1574 "nor a value are passed"
1575 )
1576 else:
1577 _TYPE_MAP = {Series: "series", DataFrame: "frame"}
1578 pt = _TYPE_MAP[type(value)]
1580 # we are actually a table
1581 if format == "table":
1582 pt += "_table"
1584 # a storer node
1585 if "table" not in pt:
1586 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1587 try:
1588 cls = _STORER_MAP[pt]
1589 except KeyError:
1590 raise error("_STORER_MAP")
1591 return cls(self, group, encoding=encoding, errors=errors)
1593 # existing node (and must be a table)
1594 if tt is None:
1596 # if we are a writer, determine the tt
1597 if value is not None:
1599 if pt == "series_table":
1600 index = getattr(value, "index", None)
1601 if index is not None:
1602 if index.nlevels == 1:
1603 tt = "appendable_series"
1604 elif index.nlevels > 1:
1605 tt = "appendable_multiseries"
1606 elif pt == "frame_table":
1607 index = getattr(value, "index", None)
1608 if index is not None:
1609 if index.nlevels == 1:
1610 tt = "appendable_frame"
1611 elif index.nlevels > 1:
1612 tt = "appendable_multiframe"
1614 _TABLE_MAP = {
1615 "generic_table": GenericTable,
1616 "appendable_series": AppendableSeriesTable,
1617 "appendable_multiseries": AppendableMultiSeriesTable,
1618 "appendable_frame": AppendableFrameTable,
1619 "appendable_multiframe": AppendableMultiFrameTable,
1620 "worm": WORMTable,
1621 }
1622 try:
1623 cls = _TABLE_MAP[tt]
1624 except KeyError:
1625 raise error("_TABLE_MAP")
1627 return cls(self, group, encoding=encoding, errors=errors)
1629 def _write_to_group(
1630 self,
1631 key: str,
1632 value: FrameOrSeries,
1633 format,
1634 axes=None,
1635 index=True,
1636 append=False,
1637 complib=None,
1638 complevel: Optional[int] = None,
1639 fletcher32=None,
1640 min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
1641 chunksize=None,
1642 expectedrows=None,
1643 dropna=False,
1644 nan_rep=None,
1645 data_columns=None,
1646 encoding=None,
1647 errors: str = "strict",
1648 ):
1649 group = self.get_node(key)
1651 # we make this assertion for mypy; the get_node call will already
1652 # have raised if this is incorrect
1653 assert self._handle is not None
1655 # remove the node if we are not appending
1656 if group is not None and not append:
1657 self._handle.remove_node(group, recursive=True)
1658 group = None
1660 # we don't want to store a table node at all if our object is 0-len
1661 # as there are not dtypes
1662 if getattr(value, "empty", None) and (format == "table" or append):
1663 return
1665 if group is None:
1666 paths = key.split("/")
1668 # recursively create the groups
1669 path = "/"
1670 for p in paths:
1671 if not len(p):
1672 continue
1673 new_path = path
1674 if not path.endswith("/"):
1675 new_path += "/"
1676 new_path += p
1677 group = self.get_node(new_path)
1678 if group is None:
1679 group = self._handle.create_group(path, p)
1680 path = new_path
1682 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1683 if append:
1684 # raise if we are trying to append to a Fixed format,
1685 # or a table that exists (and we are putting)
1686 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1687 raise ValueError("Can only append to Tables")
1688 if not s.is_exists:
1689 s.set_object_info()
1690 else:
1691 s.set_object_info()
1693 if not s.is_table and complib:
1694 raise ValueError("Compression not supported on Fixed format stores")
1696 # write the object
1697 s.write(
1698 obj=value,
1699 axes=axes,
1700 append=append,
1701 complib=complib,
1702 complevel=complevel,
1703 fletcher32=fletcher32,
1704 min_itemsize=min_itemsize,
1705 chunksize=chunksize,
1706 expectedrows=expectedrows,
1707 dropna=dropna,
1708 nan_rep=nan_rep,
1709 data_columns=data_columns,
1710 )
1712 if isinstance(s, Table) and index:
1713 s.create_index(columns=index)
1715 def _read_group(self, group: "Node"):
1716 s = self._create_storer(group)
1717 s.infer_axes()
1718 return s.read()
1721class TableIterator:
1722 """
1723 Define the iteration interface on a table
1725 Parameters
1726 ----------
1727 store : HDFStore
1728 s : the referred storer
1729 func : the function to execute the query
1730 where : the where of the query
1731 nrows : the rows to iterate on
1732 start : the passed start value (default is None)
1733 stop : the passed stop value (default is None)
1734 iterator : bool, default False
1735 Whether to use the default iterator.
1736 chunksize : the passed chunking value (default is 100000)
1737 auto_close : bool, default False
1738 Whether to automatically close the store at the end of iteration.
1739 """
1741 chunksize: Optional[int]
1742 store: HDFStore
1743 s: Union["GenericFixed", "Table"]
1745 def __init__(
1746 self,
1747 store: HDFStore,
1748 s: Union["GenericFixed", "Table"],
1749 func,
1750 where,
1751 nrows,
1752 start=None,
1753 stop=None,
1754 iterator: bool = False,
1755 chunksize: Optional[int] = None,
1756 auto_close: bool = False,
1757 ):
1758 self.store = store
1759 self.s = s
1760 self.func = func
1761 self.where = where
1763 # set start/stop if they are not set if we are a table
1764 if self.s.is_table:
1765 if nrows is None:
1766 nrows = 0
1767 if start is None:
1768 start = 0
1769 if stop is None:
1770 stop = nrows
1771 stop = min(nrows, stop)
1773 self.nrows = nrows
1774 self.start = start
1775 self.stop = stop
1777 self.coordinates = None
1778 if iterator or chunksize is not None:
1779 if chunksize is None:
1780 chunksize = 100000
1781 self.chunksize = int(chunksize)
1782 else:
1783 self.chunksize = None
1785 self.auto_close = auto_close
1787 def __iter__(self):
1789 # iterate
1790 current = self.start
1791 while current < self.stop:
1793 stop = min(current + self.chunksize, self.stop)
1794 value = self.func(None, None, self.coordinates[current:stop])
1795 current = stop
1796 if value is None or not len(value):
1797 continue
1799 yield value
1801 self.close()
1803 def close(self):
1804 if self.auto_close:
1805 self.store.close()
1807 def get_result(self, coordinates: bool = False):
1809 # return the actual iterator
1810 if self.chunksize is not None:
1811 if not isinstance(self.s, Table):
1812 raise TypeError("can only use an iterator or chunksize on a table")
1814 self.coordinates = self.s.read_coordinates(where=self.where)
1816 return self
1818 # if specified read via coordinates (necessary for multiple selections
1819 if coordinates:
1820 if not isinstance(self.s, Table):
1821 raise TypeError("can only read_coordinates on a table")
1822 where = self.s.read_coordinates(
1823 where=self.where, start=self.start, stop=self.stop
1824 )
1825 else:
1826 where = self.where
1828 # directly return the result
1829 results = self.func(self.start, self.stop, where)
1830 self.close()
1831 return results
1834class IndexCol:
1835 """ an index column description class
1837 Parameters
1838 ----------
1840 axis : axis which I reference
1841 values : the ndarray like converted values
1842 kind : a string description of this type
1843 typ : the pytables type
1844 pos : the position in the pytables
1846 """
1848 is_an_indexable = True
1849 is_data_indexable = True
1850 _info_fields = ["freq", "tz", "index_name"]
1852 name: str
1853 cname: str
1855 def __init__(
1856 self,
1857 name: str,
1858 values=None,
1859 kind=None,
1860 typ=None,
1861 cname: Optional[str] = None,
1862 axis=None,
1863 pos=None,
1864 freq=None,
1865 tz=None,
1866 index_name=None,
1867 ordered=None,
1868 table=None,
1869 meta=None,
1870 metadata=None,
1871 ):
1873 if not isinstance(name, str):
1874 raise ValueError("`name` must be a str.")
1876 self.values = values
1877 self.kind = kind
1878 self.typ = typ
1879 self.name = name
1880 self.cname = cname or name
1881 self.axis = axis
1882 self.pos = pos
1883 self.freq = freq
1884 self.tz = tz
1885 self.index_name = index_name
1886 self.ordered = ordered
1887 self.table = table
1888 self.meta = meta
1889 self.metadata = metadata
1891 if pos is not None:
1892 self.set_pos(pos)
1894 # These are ensured as long as the passed arguments match the
1895 # constructor annotations.
1896 assert isinstance(self.name, str)
1897 assert isinstance(self.cname, str)
1899 @property
1900 def itemsize(self) -> int:
1901 # Assumes self.typ has already been initialized
1902 return self.typ.itemsize
1904 @property
1905 def kind_attr(self) -> str:
1906 return f"{self.name}_kind"
1908 def set_pos(self, pos: int):
1909 """ set the position of this column in the Table """
1910 self.pos = pos
1911 if pos is not None and self.typ is not None:
1912 self.typ._v_pos = pos
1914 def __repr__(self) -> str:
1915 temp = tuple(
1916 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
1917 )
1918 return ",".join(
1919 (
1920 f"{key}->{value}"
1921 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
1922 )
1923 )
1925 def __eq__(self, other: Any) -> bool:
1926 """ compare 2 col items """
1927 return all(
1928 getattr(self, a, None) == getattr(other, a, None)
1929 for a in ["name", "cname", "axis", "pos"]
1930 )
1932 def __ne__(self, other) -> bool:
1933 return not self.__eq__(other)
1935 @property
1936 def is_indexed(self) -> bool:
1937 """ return whether I am an indexed column """
1938 if not hasattr(self.table, "cols"):
1939 # e.g. if infer hasn't been called yet, self.table will be None.
1940 return False
1941 # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute
1942 # 'error: "None" has no attribute "cols"'
1943 return getattr(self.table.cols, self.cname).is_indexed # type: ignore
1945 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
1946 """
1947 Convert the data from this selection to the appropriate pandas type.
1948 """
1949 assert isinstance(values, np.ndarray), type(values)
1951 # values is a recarray
1952 if values.dtype.fields is not None:
1953 values = values[self.cname]
1955 val_kind = _ensure_decoded(self.kind)
1956 values = _maybe_convert(values, val_kind, encoding, errors)
1958 kwargs = dict()
1959 kwargs["name"] = _ensure_decoded(self.index_name)
1961 if self.freq is not None:
1962 kwargs["freq"] = _ensure_decoded(self.freq)
1964 # making an Index instance could throw a number of different errors
1965 try:
1966 new_pd_index = Index(values, **kwargs)
1967 except ValueError:
1968 # if the output freq is different that what we recorded,
1969 # it should be None (see also 'doc example part 2')
1970 if "freq" in kwargs:
1971 kwargs["freq"] = None
1972 new_pd_index = Index(values, **kwargs)
1974 new_pd_index = _set_tz(new_pd_index, self.tz)
1975 return new_pd_index, new_pd_index
1977 def take_data(self):
1978 """ return the values"""
1979 return self.values
1981 @property
1982 def attrs(self):
1983 return self.table._v_attrs
1985 @property
1986 def description(self):
1987 return self.table.description
1989 @property
1990 def col(self):
1991 """ return my current col description """
1992 return getattr(self.description, self.cname, None)
1994 @property
1995 def cvalues(self):
1996 """ return my cython values """
1997 return self.values
1999 def __iter__(self):
2000 return iter(self.values)
2002 def maybe_set_size(self, min_itemsize=None):
2003 """ maybe set a string col itemsize:
2004 min_itemsize can be an integer or a dict with this columns name
2005 with an integer size """
2006 if _ensure_decoded(self.kind) == "string":
2008 if isinstance(min_itemsize, dict):
2009 min_itemsize = min_itemsize.get(self.name)
2011 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2012 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2014 def validate_names(self):
2015 pass
2017 def validate_and_set(self, handler: "AppendableTable", append: bool):
2018 self.table = handler.table
2019 self.validate_col()
2020 self.validate_attr(append)
2021 self.validate_metadata(handler)
2022 self.write_metadata(handler)
2023 self.set_attr()
2025 def validate_col(self, itemsize=None):
2026 """ validate this column: return the compared against itemsize """
2028 # validate this column for string truncation (or reset to the max size)
2029 if _ensure_decoded(self.kind) == "string":
2030 c = self.col
2031 if c is not None:
2032 if itemsize is None:
2033 itemsize = self.itemsize
2034 if c.itemsize < itemsize:
2035 raise ValueError(
2036 f"Trying to store a string with len [{itemsize}] in "
2037 f"[{self.cname}] column but\nthis column has a limit of "
2038 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2039 "preset the sizes on these columns"
2040 )
2041 return c.itemsize
2043 return None
2045 def validate_attr(self, append: bool):
2046 # check for backwards incompatibility
2047 if append:
2048 existing_kind = getattr(self.attrs, self.kind_attr, None)
2049 if existing_kind is not None and existing_kind != self.kind:
2050 raise TypeError(
2051 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2052 )
2054 def update_info(self, info):
2055 """ set/update the info for this indexable with the key/value
2056 if there is a conflict raise/warn as needed """
2058 for key in self._info_fields:
2060 value = getattr(self, key, None)
2061 idx = info.setdefault(self.name, {})
2063 existing_value = idx.get(key)
2064 if key in idx and value is not None and existing_value != value:
2066 # frequency/name just warn
2067 if key in ["freq", "index_name"]:
2068 ws = attribute_conflict_doc % (key, existing_value, value)
2069 warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
2071 # reset
2072 idx[key] = None
2073 setattr(self, key, None)
2075 else:
2076 raise ValueError(
2077 f"invalid info for [{self.name}] for [{key}], "
2078 f"existing_value [{existing_value}] conflicts with "
2079 f"new value [{value}]"
2080 )
2081 else:
2082 if value is not None or existing_value is not None:
2083 idx[key] = value
2085 def set_info(self, info):
2086 """ set my state from the passed info """
2087 idx = info.get(self.name)
2088 if idx is not None:
2089 self.__dict__.update(idx)
2091 def set_attr(self):
2092 """ set the kind for this column """
2093 setattr(self.attrs, self.kind_attr, self.kind)
2095 def validate_metadata(self, handler: "AppendableTable"):
2096 """ validate that kind=category does not change the categories """
2097 if self.meta == "category":
2098 new_metadata = self.metadata
2099 cur_metadata = handler.read_metadata(self.cname)
2100 if (
2101 new_metadata is not None
2102 and cur_metadata is not None
2103 and not array_equivalent(new_metadata, cur_metadata)
2104 ):
2105 raise ValueError(
2106 "cannot append a categorical with "
2107 "different categories to the existing"
2108 )
2110 def write_metadata(self, handler: "AppendableTable"):
2111 """ set the meta data """
2112 if self.metadata is not None:
2113 handler.write_metadata(self.cname, self.metadata)
2116class GenericIndexCol(IndexCol):
2117 """ an index which is not represented in the data of the table """
2119 @property
2120 def is_indexed(self) -> bool:
2121 return False
2123 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2124 """
2125 Convert the data from this selection to the appropriate pandas type.
2127 Parameters
2128 ----------
2129 values : np.ndarray
2130 nan_rep : str
2131 encoding : str
2132 errors : str
2133 """
2134 assert isinstance(values, np.ndarray), type(values)
2136 values = Int64Index(np.arange(len(values)))
2137 return values, values
2139 def set_attr(self):
2140 pass
2143class DataCol(IndexCol):
2144 """ a data holding column, by definition this is not indexable
2146 Parameters
2147 ----------
2149 data : the actual data
2150 cname : the column name in the table to hold the data (typically
2151 values)
2152 meta : a string description of the metadata
2153 metadata : the actual metadata
2154 """
2156 is_an_indexable = False
2157 is_data_indexable = False
2158 _info_fields = ["tz", "ordered"]
2160 def __init__(
2161 self,
2162 name: str,
2163 values=None,
2164 kind=None,
2165 typ=None,
2166 cname=None,
2167 pos=None,
2168 tz=None,
2169 ordered=None,
2170 table=None,
2171 meta=None,
2172 metadata=None,
2173 dtype=None,
2174 data=None,
2175 ):
2176 super().__init__(
2177 name=name,
2178 values=values,
2179 kind=kind,
2180 typ=typ,
2181 pos=pos,
2182 cname=cname,
2183 tz=tz,
2184 ordered=ordered,
2185 table=table,
2186 meta=meta,
2187 metadata=metadata,
2188 )
2189 self.dtype = dtype
2190 self.data = data
2192 @property
2193 def dtype_attr(self) -> str:
2194 return f"{self.name}_dtype"
2196 @property
2197 def meta_attr(self) -> str:
2198 return f"{self.name}_meta"
2200 def __repr__(self) -> str:
2201 temp = tuple(
2202 map(
2203 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2204 )
2205 )
2206 return ",".join(
2207 (
2208 f"{key}->{value}"
2209 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2210 )
2211 )
2213 def __eq__(self, other: Any) -> bool:
2214 """ compare 2 col items """
2215 return all(
2216 getattr(self, a, None) == getattr(other, a, None)
2217 for a in ["name", "cname", "dtype", "pos"]
2218 )
2220 def set_data(self, data: Union[np.ndarray, ABCExtensionArray]):
2221 assert data is not None
2222 assert self.dtype is None
2224 data, dtype_name = _get_data_and_dtype_name(data)
2226 self.data = data
2227 self.dtype = dtype_name
2228 self.kind = _dtype_to_kind(dtype_name)
2230 def take_data(self):
2231 """ return the data """
2232 return self.data
2234 @classmethod
2235 def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col":
2236 """
2237 Get an appropriately typed and shaped pytables.Col object for values.
2238 """
2240 dtype = values.dtype
2241 itemsize = dtype.itemsize
2243 shape = values.shape
2244 if values.ndim == 1:
2245 # EA, use block shape pretending it is 2D
2246 shape = (1, values.size)
2248 if is_categorical_dtype(dtype):
2249 codes = values.codes
2250 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2251 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2252 atom = cls.get_atom_datetime64(shape)
2253 elif is_timedelta64_dtype(dtype):
2254 atom = cls.get_atom_timedelta64(shape)
2255 elif is_complex_dtype(dtype):
2256 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2258 elif is_string_dtype(dtype):
2259 atom = cls.get_atom_string(shape, itemsize)
2261 else:
2262 atom = cls.get_atom_data(shape, kind=dtype.name)
2264 return atom
2266 @classmethod
2267 def get_atom_string(cls, shape, itemsize):
2268 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2270 @classmethod
2271 def get_atom_coltype(cls, kind: str) -> Type["Col"]:
2272 """ return the PyTables column class for this column """
2273 if kind.startswith("uint"):
2274 k4 = kind[4:]
2275 col_name = f"UInt{k4}Col"
2276 elif kind.startswith("period"):
2277 # we store as integer
2278 col_name = "Int64Col"
2279 else:
2280 kcap = kind.capitalize()
2281 col_name = f"{kcap}Col"
2283 return getattr(_tables(), col_name)
2285 @classmethod
2286 def get_atom_data(cls, shape, kind: str) -> "Col":
2287 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2289 @classmethod
2290 def get_atom_datetime64(cls, shape):
2291 return _tables().Int64Col(shape=shape[0])
2293 @classmethod
2294 def get_atom_timedelta64(cls, shape):
2295 return _tables().Int64Col(shape=shape[0])
2297 @property
2298 def shape(self):
2299 return getattr(self.data, "shape", None)
2301 @property
2302 def cvalues(self):
2303 """ return my cython values """
2304 return self.data
2306 def validate_attr(self, append):
2307 """validate that we have the same order as the existing & same dtype"""
2308 if append:
2309 existing_fields = getattr(self.attrs, self.kind_attr, None)
2310 if existing_fields is not None and existing_fields != list(self.values):
2311 raise ValueError("appended items do not match existing items in table!")
2313 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2314 if existing_dtype is not None and existing_dtype != self.dtype:
2315 raise ValueError(
2316 "appended items dtype do not match existing "
2317 "items dtype in table!"
2318 )
2320 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2321 """
2322 Convert the data from this selection to the appropriate pandas type.
2324 Parameters
2325 ----------
2326 values : np.ndarray
2327 nan_rep :
2328 encoding : str
2329 errors : str
2331 Returns
2332 -------
2333 index : listlike to become an Index
2334 data : ndarraylike to become a column
2335 """
2336 assert isinstance(values, np.ndarray), type(values)
2338 # values is a recarray
2339 if values.dtype.fields is not None:
2340 values = values[self.cname]
2342 assert self.typ is not None
2343 if self.dtype is None:
2344 # Note: in tests we never have timedelta64 or datetime64,
2345 # so the _get_data_and_dtype_name may be unnecessary
2346 converted, dtype_name = _get_data_and_dtype_name(values)
2347 kind = _dtype_to_kind(dtype_name)
2348 else:
2349 converted = values
2350 dtype_name = self.dtype
2351 kind = self.kind
2353 assert isinstance(converted, np.ndarray) # for mypy
2355 # use the meta if needed
2356 meta = _ensure_decoded(self.meta)
2357 metadata = self.metadata
2358 ordered = self.ordered
2359 tz = self.tz
2361 assert dtype_name is not None
2362 # convert to the correct dtype
2363 dtype = _ensure_decoded(dtype_name)
2365 # reverse converts
2366 if dtype == "datetime64":
2368 # recreate with tz if indicated
2369 converted = _set_tz(converted, tz, coerce=True)
2371 elif dtype == "timedelta64":
2372 converted = np.asarray(converted, dtype="m8[ns]")
2373 elif dtype == "date":
2374 try:
2375 converted = np.asarray(
2376 [date.fromordinal(v) for v in converted], dtype=object
2377 )
2378 except ValueError:
2379 converted = np.asarray(
2380 [date.fromtimestamp(v) for v in converted], dtype=object
2381 )
2383 elif meta == "category":
2385 # we have a categorical
2386 categories = metadata
2387 codes = converted.ravel()
2389 # if we have stored a NaN in the categories
2390 # then strip it; in theory we could have BOTH
2391 # -1s in the codes and nulls :<
2392 if categories is None:
2393 # Handle case of NaN-only categorical columns in which case
2394 # the categories are an empty array; when this is stored,
2395 # pytables cannot write a zero-len array, so on readback
2396 # the categories would be None and `read_hdf()` would fail.
2397 categories = Index([], dtype=np.float64)
2398 else:
2399 mask = isna(categories)
2400 if mask.any():
2401 categories = categories[~mask]
2402 codes[codes != -1] -= mask.astype(int).cumsum().values
2404 converted = Categorical.from_codes(
2405 codes, categories=categories, ordered=ordered
2406 )
2408 else:
2410 try:
2411 converted = converted.astype(dtype, copy=False)
2412 except TypeError:
2413 converted = converted.astype("O", copy=False)
2415 # convert nans / decode
2416 if _ensure_decoded(kind) == "string":
2417 converted = _unconvert_string_array(
2418 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2419 )
2421 return self.values, converted
2423 def set_attr(self):
2424 """ set the data for this column """
2425 setattr(self.attrs, self.kind_attr, self.values)
2426 setattr(self.attrs, self.meta_attr, self.meta)
2427 assert self.dtype is not None
2428 setattr(self.attrs, self.dtype_attr, self.dtype)
2431class DataIndexableCol(DataCol):
2432 """ represent a data column that can be indexed """
2434 is_data_indexable = True
2436 def validate_names(self):
2437 if not Index(self.values).is_object():
2438 # TODO: should the message here be more specifically non-str?
2439 raise ValueError("cannot have non-object label DataIndexableCol")
2441 @classmethod
2442 def get_atom_string(cls, shape, itemsize):
2443 return _tables().StringCol(itemsize=itemsize)
2445 @classmethod
2446 def get_atom_data(cls, shape, kind: str) -> "Col":
2447 return cls.get_atom_coltype(kind=kind)()
2449 @classmethod
2450 def get_atom_datetime64(cls, shape):
2451 return _tables().Int64Col()
2453 @classmethod
2454 def get_atom_timedelta64(cls, shape):
2455 return _tables().Int64Col()
2458class GenericDataIndexableCol(DataIndexableCol):
2459 """ represent a generic pytables data column """
2461 pass
2464class Fixed:
2465 """ represent an object in my store
2466 facilitate read/write of various types of objects
2467 this is an abstract base class
2469 Parameters
2470 ----------
2471 parent : HDFStore
2472 group : Node
2473 The group node where the table resides.
2474 """
2476 pandas_kind: str
2477 format_type: str = "fixed" # GH#30962 needed by dask
2478 obj_type: Type[Union[DataFrame, Series]]
2479 ndim: int
2480 encoding: str
2481 parent: HDFStore
2482 group: "Node"
2483 errors: str
2484 is_table = False
2486 def __init__(
2487 self,
2488 parent: HDFStore,
2489 group: "Node",
2490 encoding: str = "UTF-8",
2491 errors: str = "strict",
2492 ):
2493 assert isinstance(parent, HDFStore), type(parent)
2494 assert _table_mod is not None # needed for mypy
2495 assert isinstance(group, _table_mod.Node), type(group)
2496 self.parent = parent
2497 self.group = group
2498 self.encoding = _ensure_encoding(encoding)
2499 self.errors = errors
2501 @property
2502 def is_old_version(self) -> bool:
2503 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2505 @property
2506 def version(self) -> Tuple[int, int, int]:
2507 """ compute and set our version """
2508 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2509 try:
2510 version = tuple(int(x) for x in version.split("."))
2511 if len(version) == 2:
2512 version = version + (0,)
2513 except AttributeError:
2514 version = (0, 0, 0)
2515 return version
2517 @property
2518 def pandas_type(self):
2519 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2521 def __repr__(self) -> str:
2522 """ return a pretty representation of myself """
2523 self.infer_axes()
2524 s = self.shape
2525 if s is not None:
2526 if isinstance(s, (list, tuple)):
2527 jshape = ",".join(pprint_thing(x) for x in s)
2528 s = f"[{jshape}]"
2529 return f"{self.pandas_type:12.12} (shape->{s})"
2530 return self.pandas_type
2532 def set_object_info(self):
2533 """ set my pandas type & version """
2534 self.attrs.pandas_type = str(self.pandas_kind)
2535 self.attrs.pandas_version = str(_version)
2537 def copy(self):
2538 new_self = copy.copy(self)
2539 return new_self
2541 @property
2542 def shape(self):
2543 return self.nrows
2545 @property
2546 def pathname(self):
2547 return self.group._v_pathname
2549 @property
2550 def _handle(self):
2551 return self.parent._handle
2553 @property
2554 def _filters(self):
2555 return self.parent._filters
2557 @property
2558 def _complevel(self) -> int:
2559 return self.parent._complevel
2561 @property
2562 def _fletcher32(self) -> bool:
2563 return self.parent._fletcher32
2565 @property
2566 def attrs(self):
2567 return self.group._v_attrs
2569 def set_attrs(self):
2570 """ set our object attributes """
2571 pass
2573 def get_attrs(self):
2574 """ get our object attributes """
2575 pass
2577 @property
2578 def storable(self):
2579 """ return my storable """
2580 return self.group
2582 @property
2583 def is_exists(self) -> bool:
2584 return False
2586 @property
2587 def nrows(self):
2588 return getattr(self.storable, "nrows", None)
2590 def validate(self, other):
2591 """ validate against an existing storable """
2592 if other is None:
2593 return
2594 return True
2596 def validate_version(self, where=None):
2597 """ are we trying to operate on an old version? """
2598 return True
2600 def infer_axes(self):
2601 """ infer the axes of my storer
2602 return a boolean indicating if we have a valid storer or not """
2604 s = self.storable
2605 if s is None:
2606 return False
2607 self.get_attrs()
2608 return True
2610 def read(
2611 self,
2612 where=None,
2613 columns=None,
2614 start: Optional[int] = None,
2615 stop: Optional[int] = None,
2616 ):
2617 raise NotImplementedError(
2618 "cannot read on an abstract storer: subclasses should implement"
2619 )
2621 def write(self, **kwargs):
2622 raise NotImplementedError(
2623 "cannot write on an abstract storer: subclasses should implement"
2624 )
2626 def delete(
2627 self, where=None, start: Optional[int] = None, stop: Optional[int] = None
2628 ):
2629 """
2630 support fully deleting the node in its entirety (only) - where
2631 specification must be None
2632 """
2633 if com.all_none(where, start, stop):
2634 self._handle.remove_node(self.group, recursive=True)
2635 return None
2637 raise TypeError("cannot delete on an abstract storer")
2640class GenericFixed(Fixed):
2641 """ a generified fixed version """
2643 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2644 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2645 attributes: List[str] = []
2647 # indexer helpders
2648 def _class_to_alias(self, cls) -> str:
2649 return self._index_type_map.get(cls, "")
2651 def _alias_to_class(self, alias):
2652 if isinstance(alias, type): # pragma: no cover
2653 # compat: for a short period of time master stored types
2654 return alias
2655 return self._reverse_index_map.get(alias, Index)
2657 def _get_index_factory(self, klass):
2658 if klass == DatetimeIndex:
2660 def f(values, freq=None, tz=None):
2661 # data are already in UTC, localize and convert if tz present
2662 result = DatetimeIndex._simple_new(values.values, name=None, freq=freq)
2663 if tz is not None:
2664 result = result.tz_localize("UTC").tz_convert(tz)
2665 return result
2667 return f
2668 elif klass == PeriodIndex:
2670 def f(values, freq=None, tz=None):
2671 return PeriodIndex._simple_new(values, name=None, freq=freq)
2673 return f
2675 return klass
2677 def validate_read(self, columns, where):
2678 """
2679 raise if any keywords are passed which are not-None
2680 """
2681 if columns is not None:
2682 raise TypeError(
2683 "cannot pass a column specification when reading "
2684 "a Fixed format store. this store must be "
2685 "selected in its entirety"
2686 )
2687 if where is not None:
2688 raise TypeError(
2689 "cannot pass a where specification when reading "
2690 "from a Fixed format store. this store must be "
2691 "selected in its entirety"
2692 )
2694 @property
2695 def is_exists(self) -> bool:
2696 return True
2698 def set_attrs(self):
2699 """ set our object attributes """
2700 self.attrs.encoding = self.encoding
2701 self.attrs.errors = self.errors
2703 def get_attrs(self):
2704 """ retrieve our attributes """
2705 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2706 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2707 for n in self.attributes:
2708 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2710 def write(self, obj, **kwargs):
2711 self.set_attrs()
2713 def read_array(
2714 self, key: str, start: Optional[int] = None, stop: Optional[int] = None
2715 ):
2716 """ read an array for the specified node (off of group """
2717 import tables
2719 node = getattr(self.group, key)
2720 attrs = node._v_attrs
2722 transposed = getattr(attrs, "transposed", False)
2724 if isinstance(node, tables.VLArray):
2725 ret = node[0][start:stop]
2726 else:
2727 dtype = getattr(attrs, "value_type", None)
2728 shape = getattr(attrs, "shape", None)
2730 if shape is not None:
2731 # length 0 axis
2732 ret = np.empty(shape, dtype=dtype)
2733 else:
2734 ret = node[start:stop]
2736 if dtype == "datetime64":
2738 # reconstruct a timezone if indicated
2739 tz = getattr(attrs, "tz", None)
2740 ret = _set_tz(ret, tz, coerce=True)
2742 elif dtype == "timedelta64":
2743 ret = np.asarray(ret, dtype="m8[ns]")
2745 if transposed:
2746 return ret.T
2747 else:
2748 return ret
2750 def read_index(
2751 self, key: str, start: Optional[int] = None, stop: Optional[int] = None
2752 ) -> Index:
2753 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2755 if variety == "multi":
2756 return self.read_multi_index(key, start=start, stop=stop)
2757 elif variety == "regular":
2758 node = getattr(self.group, key)
2759 index = self.read_index_node(node, start=start, stop=stop)
2760 return index
2761 else: # pragma: no cover
2762 raise TypeError(f"unrecognized index variety: {variety}")
2764 def write_index(self, key: str, index: Index):
2765 if isinstance(index, MultiIndex):
2766 setattr(self.attrs, f"{key}_variety", "multi")
2767 self.write_multi_index(key, index)
2768 else:
2769 setattr(self.attrs, f"{key}_variety", "regular")
2770 converted = _convert_index("index", index, self.encoding, self.errors)
2772 self.write_array(key, converted.values)
2774 node = getattr(self.group, key)
2775 node._v_attrs.kind = converted.kind
2776 node._v_attrs.name = index.name
2778 if isinstance(index, (DatetimeIndex, PeriodIndex)):
2779 node._v_attrs.index_class = self._class_to_alias(type(index))
2781 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2782 node._v_attrs.freq = index.freq
2784 if isinstance(index, DatetimeIndex) and index.tz is not None:
2785 node._v_attrs.tz = _get_tz(index.tz)
2787 def write_multi_index(self, key: str, index: MultiIndex):
2788 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2790 for i, (lev, level_codes, name) in enumerate(
2791 zip(index.levels, index.codes, index.names)
2792 ):
2793 # write the level
2794 if is_extension_array_dtype(lev):
2795 raise NotImplementedError(
2796 "Saving a MultiIndex with an extension dtype is not supported."
2797 )
2798 level_key = f"{key}_level{i}"
2799 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2800 self.write_array(level_key, conv_level.values)
2801 node = getattr(self.group, level_key)
2802 node._v_attrs.kind = conv_level.kind
2803 node._v_attrs.name = name
2805 # write the name
2806 setattr(node._v_attrs, f"{key}_name{name}", name)
2808 # write the labels
2809 label_key = f"{key}_label{i}"
2810 self.write_array(label_key, level_codes)
2812 def read_multi_index(
2813 self, key: str, start: Optional[int] = None, stop: Optional[int] = None
2814 ) -> MultiIndex:
2815 nlevels = getattr(self.attrs, f"{key}_nlevels")
2817 levels = []
2818 codes = []
2819 names: List[Optional[Hashable]] = []
2820 for i in range(nlevels):
2821 level_key = f"{key}_level{i}"
2822 node = getattr(self.group, level_key)
2823 lev = self.read_index_node(node, start=start, stop=stop)
2824 levels.append(lev)
2825 names.append(lev.name)
2827 label_key = f"{key}_label{i}"
2828 level_codes = self.read_array(label_key, start=start, stop=stop)
2829 codes.append(level_codes)
2831 return MultiIndex(
2832 levels=levels, codes=codes, names=names, verify_integrity=True
2833 )
2835 def read_index_node(
2836 self, node: "Node", start: Optional[int] = None, stop: Optional[int] = None
2837 ) -> Index:
2838 data = node[start:stop]
2839 # If the index was an empty array write_array_empty() will
2840 # have written a sentinel. Here we relace it with the original.
2841 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
2842 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,)
2843 kind = _ensure_decoded(node._v_attrs.kind)
2844 name = None
2846 if "name" in node._v_attrs:
2847 name = _ensure_str(node._v_attrs.name)
2848 name = _ensure_decoded(name)
2850 index_class = self._alias_to_class(
2851 _ensure_decoded(getattr(node._v_attrs, "index_class", ""))
2852 )
2853 factory = self._get_index_factory(index_class)
2855 kwargs = {}
2856 if "freq" in node._v_attrs:
2857 kwargs["freq"] = node._v_attrs["freq"]
2859 if "tz" in node._v_attrs:
2860 if isinstance(node._v_attrs["tz"], bytes):
2861 # created by python2
2862 kwargs["tz"] = node._v_attrs["tz"].decode("utf-8")
2863 else:
2864 # created by python3
2865 kwargs["tz"] = node._v_attrs["tz"]
2867 if kind == "date":
2868 index = factory(
2869 _unconvert_index(
2870 data, kind, encoding=self.encoding, errors=self.errors
2871 ),
2872 dtype=object,
2873 **kwargs,
2874 )
2875 else:
2876 index = factory(
2877 _unconvert_index(
2878 data, kind, encoding=self.encoding, errors=self.errors
2879 ),
2880 **kwargs,
2881 )
2883 index.name = name
2885 return index
2887 def write_array_empty(self, key: str, value: ArrayLike):
2888 """ write a 0-len array """
2890 # ugly hack for length 0 axes
2891 arr = np.empty((1,) * value.ndim)
2892 self._handle.create_array(self.group, key, arr)
2893 node = getattr(self.group, key)
2894 node._v_attrs.value_type = str(value.dtype)
2895 node._v_attrs.shape = value.shape
2897 def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None):
2898 # TODO: we only have one test that gets here, the only EA
2899 # that gets passed is DatetimeArray, and we never have
2900 # both self._filters and EA
2901 assert isinstance(value, (np.ndarray, ABCExtensionArray)), type(value)
2903 if key in self.group:
2904 self._handle.remove_node(self.group, key)
2906 # Transform needed to interface with pytables row/col notation
2907 empty_array = value.size == 0
2908 transposed = False
2910 if is_categorical_dtype(value):
2911 raise NotImplementedError(
2912 "Cannot store a category dtype in "
2913 "a HDF5 dataset that uses format="
2914 '"fixed". Use format="table".'
2915 )
2916 if not empty_array:
2917 if hasattr(value, "T"):
2918 # ExtensionArrays (1d) may not have transpose.
2919 value = value.T
2920 transposed = True
2922 atom = None
2923 if self._filters is not None:
2924 try:
2925 # get the atom for this datatype
2926 atom = _tables().Atom.from_dtype(value.dtype)
2927 except ValueError:
2928 pass
2930 if atom is not None:
2931 # We only get here if self._filters is non-None and
2932 # the Atom.from_dtype call succeeded
2934 # create an empty chunked array and fill it from value
2935 if not empty_array:
2936 ca = self._handle.create_carray(
2937 self.group, key, atom, value.shape, filters=self._filters
2938 )
2939 ca[:] = value
2941 else:
2942 self.write_array_empty(key, value)
2944 elif value.dtype.type == np.object_:
2946 # infer the type, warn if we have a non-string type here (for
2947 # performance)
2948 inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
2949 if empty_array:
2950 pass
2951 elif inferred_type == "string":
2952 pass
2953 else:
2954 ws = performance_doc % (inferred_type, key, items)
2955 warnings.warn(ws, PerformanceWarning, stacklevel=7)
2957 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
2958 vlarr.append(value)
2960 elif empty_array:
2961 self.write_array_empty(key, value)
2962 elif is_datetime64_dtype(value.dtype):
2963 self._handle.create_array(self.group, key, value.view("i8"))
2964 getattr(self.group, key)._v_attrs.value_type = "datetime64"
2965 elif is_datetime64tz_dtype(value.dtype):
2966 # store as UTC
2967 # with a zone
2968 self._handle.create_array(self.group, key, value.asi8)
2970 node = getattr(self.group, key)
2971 node._v_attrs.tz = _get_tz(value.tz)
2972 node._v_attrs.value_type = "datetime64"
2973 elif is_timedelta64_dtype(value.dtype):
2974 self._handle.create_array(self.group, key, value.view("i8"))
2975 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
2976 else:
2977 self._handle.create_array(self.group, key, value)
2979 getattr(self.group, key)._v_attrs.transposed = transposed
2982class SeriesFixed(GenericFixed):
2983 pandas_kind = "series"
2984 attributes = ["name"]
2986 name: Optional[Hashable]
2988 @property
2989 def shape(self):
2990 try:
2991 return (len(self.group.values),)
2992 except (TypeError, AttributeError):
2993 return None
2995 def read(
2996 self,
2997 where=None,
2998 columns=None,
2999 start: Optional[int] = None,
3000 stop: Optional[int] = None,
3001 ):
3002 self.validate_read(columns, where)
3003 index = self.read_index("index", start=start, stop=stop)
3004 values = self.read_array("values", start=start, stop=stop)
3005 return Series(values, index=index, name=self.name)
3007 def write(self, obj, **kwargs):
3008 super().write(obj, **kwargs)
3009 self.write_index("index", obj.index)
3010 self.write_array("values", obj.values)
3011 self.attrs.name = obj.name
3014class BlockManagerFixed(GenericFixed):
3015 attributes = ["ndim", "nblocks"]
3017 nblocks: int
3019 @property
3020 def shape(self):
3021 try:
3022 ndim = self.ndim
3024 # items
3025 items = 0
3026 for i in range(self.nblocks):
3027 node = getattr(self.group, f"block{i}_items")
3028 shape = getattr(node, "shape", None)
3029 if shape is not None:
3030 items += shape[0]
3032 # data shape
3033 node = self.group.block0_values
3034 shape = getattr(node, "shape", None)
3035 if shape is not None:
3036 shape = list(shape[0 : (ndim - 1)])
3037 else:
3038 shape = []
3040 shape.append(items)
3042 return shape
3043 except AttributeError:
3044 return None
3046 def read(
3047 self,
3048 where=None,
3049 columns=None,
3050 start: Optional[int] = None,
3051 stop: Optional[int] = None,
3052 ):
3053 # start, stop applied to rows, so 0th axis only
3054 self.validate_read(columns, where)
3055 select_axis = self.obj_type()._get_block_manager_axis(0)
3057 axes = []
3058 for i in range(self.ndim):
3060 _start, _stop = (start, stop) if i == select_axis else (None, None)
3061 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3062 axes.append(ax)
3064 items = axes[0]
3065 dfs = []
3067 for i in range(self.nblocks):
3069 blk_items = self.read_index(f"block{i}_items")
3070 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3072 columns = items[items.get_indexer(blk_items)]
3073 df = DataFrame(values.T, columns=columns, index=axes[1])
3074 dfs.append(df)
3076 if len(dfs) > 0:
3077 out = concat(dfs, axis=1)
3078 out = out.reindex(columns=items, copy=False)
3079 return out
3081 return DataFrame(columns=axes[0], index=axes[1])
3083 def write(self, obj, **kwargs):
3084 super().write(obj, **kwargs)
3085 data = obj._data
3086 if not data.is_consolidated():
3087 data = data.consolidate()
3089 self.attrs.ndim = data.ndim
3090 for i, ax in enumerate(data.axes):
3091 if i == 0:
3092 if not ax.is_unique:
3093 raise ValueError("Columns index has to be unique for fixed format")
3094 self.write_index(f"axis{i}", ax)
3096 # Supporting mixed-type DataFrame objects...nontrivial
3097 self.attrs.nblocks = len(data.blocks)
3098 for i, blk in enumerate(data.blocks):
3099 # I have no idea why, but writing values before items fixed #2299
3100 blk_items = data.items.take(blk.mgr_locs)
3101 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3102 self.write_index(f"block{i}_items", blk_items)
3105class FrameFixed(BlockManagerFixed):
3106 pandas_kind = "frame"
3107 obj_type = DataFrame
3110class Table(Fixed):
3111 """ represent a table:
3112 facilitate read/write of various types of tables
3114 Attrs in Table Node
3115 -------------------
3116 These are attributes that are store in the main table node, they are
3117 necessary to recreate these tables when read back in.
3119 index_axes : a list of tuples of the (original indexing axis and
3120 index column)
3121 non_index_axes: a list of tuples of the (original index axis and
3122 columns on a non-indexing axis)
3123 values_axes : a list of the columns which comprise the data of this
3124 table
3125 data_columns : a list of the columns that we are allowing indexing
3126 (these become single columns in values_axes), or True to force all
3127 columns
3128 nan_rep : the string to use for nan representations for string
3129 objects
3130 levels : the names of levels
3131 metadata : the names of the metadata columns
3133 """
3135 pandas_kind = "wide_table"
3136 format_type: str = "table" # GH#30962 needed by dask
3137 table_type: str
3138 levels = 1
3139 is_table = True
3141 index_axes: List[IndexCol]
3142 non_index_axes: List[Tuple[int, Any]]
3143 values_axes: List[DataCol]
3144 data_columns: List
3145 metadata: List
3146 info: Dict
3148 def __init__(
3149 self,
3150 parent: HDFStore,
3151 group: "Node",
3152 encoding=None,
3153 errors: str = "strict",
3154 index_axes=None,
3155 non_index_axes=None,
3156 values_axes=None,
3157 data_columns=None,
3158 info=None,
3159 nan_rep=None,
3160 ):
3161 super().__init__(parent, group, encoding=encoding, errors=errors)
3162 self.index_axes = index_axes or []
3163 self.non_index_axes = non_index_axes or []
3164 self.values_axes = values_axes or []
3165 self.data_columns = data_columns or []
3166 self.info = info or dict()
3167 self.nan_rep = nan_rep
3169 @property
3170 def table_type_short(self) -> str:
3171 return self.table_type.split("_")[0]
3173 def __repr__(self) -> str:
3174 """ return a pretty representation of myself """
3175 self.infer_axes()
3176 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3177 dc = f",dc->[{jdc}]"
3179 ver = ""
3180 if self.is_old_version:
3181 jver = ".".join(str(x) for x in self.version)
3182 ver = f"[{jver}]"
3184 jindex_axes = ",".join(a.name for a in self.index_axes)
3185 return (
3186 f"{self.pandas_type:12.12}{ver} "
3187 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3188 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3189 )
3191 def __getitem__(self, c: str):
3192 """ return the axis for c """
3193 for a in self.axes:
3194 if c == a.name:
3195 return a
3196 return None
3198 def validate(self, other):
3199 """ validate against an existing table """
3200 if other is None:
3201 return
3203 if other.table_type != self.table_type:
3204 raise TypeError(
3205 "incompatible table_type with existing "
3206 f"[{other.table_type} - {self.table_type}]"
3207 )
3209 for c in ["index_axes", "non_index_axes", "values_axes"]:
3210 sv = getattr(self, c, None)
3211 ov = getattr(other, c, None)
3212 if sv != ov:
3214 # show the error for the specific axes
3215 for i, sax in enumerate(sv):
3216 oax = ov[i]
3217 if sax != oax:
3218 raise ValueError(
3219 f"invalid combination of [{c}] on appending data "
3220 f"[{sax}] vs current table [{oax}]"
3221 )
3223 # should never get here
3224 raise Exception(
3225 f"invalid combination of [{c}] on appending data [{sv}] vs "
3226 f"current table [{ov}]"
3227 )
3229 @property
3230 def is_multi_index(self) -> bool:
3231 """the levels attribute is 1 or a list in the case of a multi-index"""
3232 return isinstance(self.levels, list)
3234 def validate_multiindex(self, obj):
3235 """validate that we can store the multi-index; reset and return the
3236 new object
3237 """
3238 levels = [
3239 l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names)
3240 ]
3241 try:
3242 return obj.reset_index(), levels
3243 except ValueError:
3244 raise ValueError(
3245 "duplicate names/columns in the multi-index when storing as a table"
3246 )
3248 @property
3249 def nrows_expected(self) -> int:
3250 """ based on our axes, compute the expected nrows """
3251 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3253 @property
3254 def is_exists(self) -> bool:
3255 """ has this table been created """
3256 return "table" in self.group
3258 @property
3259 def storable(self):
3260 return getattr(self.group, "table", None)
3262 @property
3263 def table(self):
3264 """ return the table group (this is my storable) """
3265 return self.storable
3267 @property
3268 def dtype(self):
3269 return self.table.dtype
3271 @property
3272 def description(self):
3273 return self.table.description
3275 @property
3276 def axes(self):
3277 return itertools.chain(self.index_axes, self.values_axes)
3279 @property
3280 def ncols(self) -> int:
3281 """ the number of total columns in the values axes """
3282 return sum(len(a.values) for a in self.values_axes)
3284 @property
3285 def is_transposed(self) -> bool:
3286 return False
3288 @property
3289 def data_orientation(self):
3290 """return a tuple of my permutated axes, non_indexable at the front"""
3291 return tuple(
3292 itertools.chain(
3293 [int(a[0]) for a in self.non_index_axes],
3294 [int(a.axis) for a in self.index_axes],
3295 )
3296 )
3298 def queryables(self) -> Dict[str, Any]:
3299 """ return a dict of the kinds allowable columns for this object """
3301 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3302 axis_names = {0: "index", 1: "columns"}
3304 # compute the values_axes queryables
3305 d1 = [(a.cname, a) for a in self.index_axes]
3306 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3307 d3 = [
3308 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3309 ]
3311 return dict(d1 + d2 + d3) # type: ignore
3312 # error: List comprehension has incompatible type
3313 # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]]
3315 def index_cols(self):
3316 """ return a list of my index cols """
3317 # Note: each `i.cname` below is assured to be a str.
3318 return [(i.axis, i.cname) for i in self.index_axes]
3320 def values_cols(self) -> List[str]:
3321 """ return a list of my values cols """
3322 return [i.cname for i in self.values_axes]
3324 def _get_metadata_path(self, key: str) -> str:
3325 """ return the metadata pathname for this key """
3326 group = self.group._v_pathname
3327 return f"{group}/meta/{key}/meta"
3329 def write_metadata(self, key: str, values: np.ndarray):
3330 """
3331 Write out a metadata array to the key as a fixed-format Series.
3333 Parameters
3334 ----------
3335 key : str
3336 values : ndarray
3337 """
3338 values = Series(values)
3339 self.parent.put(
3340 self._get_metadata_path(key),
3341 values,
3342 format="table",
3343 encoding=self.encoding,
3344 errors=self.errors,
3345 nan_rep=self.nan_rep,
3346 )
3348 def read_metadata(self, key: str):
3349 """ return the meta data array for this key """
3350 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3351 return self.parent.select(self._get_metadata_path(key))
3352 return None
3354 def set_attrs(self):
3355 """ set our table type & indexables """
3356 self.attrs.table_type = str(self.table_type)
3357 self.attrs.index_cols = self.index_cols()
3358 self.attrs.values_cols = self.values_cols()
3359 self.attrs.non_index_axes = self.non_index_axes
3360 self.attrs.data_columns = self.data_columns
3361 self.attrs.nan_rep = self.nan_rep
3362 self.attrs.encoding = self.encoding
3363 self.attrs.errors = self.errors
3364 self.attrs.levels = self.levels
3365 self.attrs.info = self.info
3367 def get_attrs(self):
3368 """ retrieve our attributes """
3369 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3370 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3371 self.info = getattr(self.attrs, "info", None) or dict()
3372 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3373 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3374 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3375 self.levels = getattr(self.attrs, "levels", None) or []
3376 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3377 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3379 def validate_version(self, where=None):
3380 """ are we trying to operate on an old version? """
3381 if where is not None:
3382 if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
3383 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3384 warnings.warn(ws, IncompatibilityWarning)
3386 def validate_min_itemsize(self, min_itemsize):
3387 """validate the min_itemsize doesn't contain items that are not in the
3388 axes this needs data_columns to be defined
3389 """
3390 if min_itemsize is None:
3391 return
3392 if not isinstance(min_itemsize, dict):
3393 return
3395 q = self.queryables()
3396 for k, v in min_itemsize.items():
3398 # ok, apply generally
3399 if k == "values":
3400 continue
3401 if k not in q:
3402 raise ValueError(
3403 f"min_itemsize has the key [{k}] which is not an axis or "
3404 "data_column"
3405 )
3407 @cache_readonly
3408 def indexables(self):
3409 """ create/cache the indexables if they don't exist """
3410 _indexables = []
3412 desc = self.description
3413 table_attrs = self.table.attrs
3415 # Note: each of the `name` kwargs below are str, ensured
3416 # by the definition in index_cols.
3417 # index columns
3418 for i, (axis, name) in enumerate(self.attrs.index_cols):
3419 atom = getattr(desc, name)
3420 md = self.read_metadata(name)
3421 meta = "category" if md is not None else None
3423 kind_attr = f"{name}_kind"
3424 kind = getattr(table_attrs, kind_attr, None)
3426 index_col = IndexCol(
3427 name=name,
3428 axis=axis,
3429 pos=i,
3430 kind=kind,
3431 typ=atom,
3432 table=self.table,
3433 meta=meta,
3434 metadata=md,
3435 )
3436 _indexables.append(index_col)
3438 # values columns
3439 dc = set(self.data_columns)
3440 base_pos = len(_indexables)
3442 def f(i, c):
3443 assert isinstance(c, str)
3444 klass = DataCol
3445 if c in dc:
3446 klass = DataIndexableCol
3448 atom = getattr(desc, c)
3449 adj_name = _maybe_adjust_name(c, self.version)
3451 # TODO: why kind_attr here?
3452 values = getattr(table_attrs, f"{adj_name}_kind", None)
3453 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3454 kind = _dtype_to_kind(dtype)
3456 md = self.read_metadata(c)
3457 # TODO: figure out why these two versions of `meta` dont always match.
3458 # meta = "category" if md is not None else None
3459 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3461 obj = klass(
3462 name=adj_name,
3463 cname=c,
3464 values=values,
3465 kind=kind,
3466 pos=base_pos + i,
3467 typ=atom,
3468 table=self.table,
3469 meta=meta,
3470 metadata=md,
3471 dtype=dtype,
3472 )
3473 return obj
3475 # Note: the definition of `values_cols` ensures that each
3476 # `c` below is a str.
3477 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3479 return _indexables
3481 def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
3482 """
3483 Create a pytables index on the specified columns.
3485 Parameters
3486 ----------
3487 columns : None, bool, or listlike[str]
3488 Indicate which columns to create an index on.
3490 * False : Do not create any indexes.
3491 * True : Create indexes on all columns.
3492 * None : Create indexes on all columns.
3493 * listlike : Create indexes on the given columns.
3495 optlevel : int or None, default None
3496 Optimization level, if None, pytables defaults to 6.
3497 kind : str or None, default None
3498 Kind of index, if None, pytables defaults to "medium".
3500 Raises
3501 ------
3502 TypeError if trying to create an index on a complex-type column.
3504 Notes
3505 -----
3506 Cannot index Time64Col or ComplexCol.
3507 Pytables must be >= 3.0.
3508 """
3510 if not self.infer_axes():
3511 return
3512 if columns is False:
3513 return
3515 # index all indexables and data_columns
3516 if columns is None or columns is True:
3517 columns = [a.cname for a in self.axes if a.is_data_indexable]
3518 if not isinstance(columns, (tuple, list)):
3519 columns = [columns]
3521 kw = dict()
3522 if optlevel is not None:
3523 kw["optlevel"] = optlevel
3524 if kind is not None:
3525 kw["kind"] = kind
3527 table = self.table
3528 for c in columns:
3529 v = getattr(table.cols, c, None)
3530 if v is not None:
3532 # remove the index if the kind/optlevel have changed
3533 if v.is_indexed:
3534 index = v.index
3535 cur_optlevel = index.optlevel
3536 cur_kind = index.kind
3538 if kind is not None and cur_kind != kind:
3539 v.remove_index()
3540 else:
3541 kw["kind"] = cur_kind
3543 if optlevel is not None and cur_optlevel != optlevel:
3544 v.remove_index()
3545 else:
3546 kw["optlevel"] = cur_optlevel
3548 # create the index
3549 if not v.is_indexed:
3550 if v.type.startswith("complex"):
3551 raise TypeError(
3552 "Columns containing complex values can be stored but "
3553 "cannot be indexed when using table format. Either use "
3554 "fixed format, set index=False, or do not include "
3555 "the columns containing complex values to "
3556 "data_columns when initializing the table."
3557 )
3558 v.create_index(**kw)
3560 def _read_axes(
3561 self, where, start: Optional[int] = None, stop: Optional[int] = None
3562 ) -> List[Tuple[ArrayLike, ArrayLike]]:
3563 """
3564 Create the axes sniffed from the table.
3566 Parameters
3567 ----------
3568 where : ???
3569 start : int or None, default None
3570 stop : int or None, default None
3572 Returns
3573 -------
3574 List[Tuple[index_values, column_values]]
3575 """
3577 # create the selection
3578 selection = Selection(self, where=where, start=start, stop=stop)
3579 values = selection.select()
3581 results = []
3582 # convert the data
3583 for a in self.axes:
3584 a.set_info(self.info)
3585 res = a.convert(
3586 values,
3587 nan_rep=self.nan_rep,
3588 encoding=self.encoding,
3589 errors=self.errors,
3590 )
3591 results.append(res)
3593 return results
3595 @classmethod
3596 def get_object(cls, obj, transposed: bool):
3597 """ return the data for this obj """
3598 return obj
3600 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3601 """take the input data_columns and min_itemize and create a data
3602 columns spec
3603 """
3605 if not len(non_index_axes):
3606 return []
3608 axis, axis_labels = non_index_axes[0]
3609 info = self.info.get(axis, dict())
3610 if info.get("type") == "MultiIndex" and data_columns:
3611 raise ValueError(
3612 f"cannot use a multi-index on axis [{axis}] with "
3613 f"data_columns {data_columns}"
3614 )
3616 # evaluate the passed data_columns, True == use all columns
3617 # take only valide axis labels
3618 if data_columns is True:
3619 data_columns = list(axis_labels)
3620 elif data_columns is None:
3621 data_columns = []
3623 # if min_itemsize is a dict, add the keys (exclude 'values')
3624 if isinstance(min_itemsize, dict):
3626 existing_data_columns = set(data_columns)
3627 data_columns = list(data_columns) # ensure we do not modify
3628 data_columns.extend(
3629 [
3630 k
3631 for k in min_itemsize.keys()
3632 if k != "values" and k not in existing_data_columns
3633 ]
3634 )
3636 # return valid columns in the order of our axis
3637 return [c for c in data_columns if c in axis_labels]
3639 def _create_axes(
3640 self,
3641 axes,
3642 obj: DataFrame,
3643 validate: bool = True,
3644 nan_rep=None,
3645 data_columns=None,
3646 min_itemsize=None,
3647 ):
3648 """
3649 Create and return the axes.
3651 Parameters
3652 ----------
3653 axes: list or None
3654 The names or numbers of the axes to create.
3655 obj : DataFrame
3656 The object to create axes on.
3657 validate: bool, default True
3658 Whether to validate the obj against an existing object already written.
3659 nan_rep :
3660 A value to use for string column nan_rep.
3661 data_columns : List[str], True, or None, default None
3662 Specify the columns that we want to create to allow indexing on.
3664 * True : Use all available columns.
3665 * None : Use no columns.
3666 * List[str] : Use the specified columns.
3668 min_itemsize: Dict[str, int] or None, default None
3669 The min itemsize for a column in bytes.
3670 """
3672 if not isinstance(obj, DataFrame):
3673 group = self.group._v_name
3674 raise TypeError(
3675 f"cannot properly create the storer for: [group->{group},"
3676 f"value->{type(obj)}]"
3677 )
3679 # set the default axes if needed
3680 if axes is None:
3681 axes = [0]
3683 # map axes to numbers
3684 axes = [obj._get_axis_number(a) for a in axes]
3686 # do we have an existing table (if so, use its axes & data_columns)
3687 if self.infer_axes():
3688 table_exists = True
3689 axes = [a.axis for a in self.index_axes]
3690 data_columns = list(self.data_columns)
3691 nan_rep = self.nan_rep
3692 # TODO: do we always have validate=True here?
3693 else:
3694 table_exists = False
3696 new_info = self.info
3698 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3699 # currently support on ndim-1 axes
3700 if len(axes) != self.ndim - 1:
3701 raise ValueError(
3702 "currently only support ndim-1 indexers in an AppendableTable"
3703 )
3705 # create according to the new data
3706 new_non_index_axes: List = []
3708 # nan_representation
3709 if nan_rep is None:
3710 nan_rep = "nan"
3712 # We construct the non-index-axis first, since that alters new_info
3713 idx = [x for x in [0, 1] if x not in axes][0]
3715 a = obj.axes[idx]
3716 # we might be able to change the axes on the appending data if necessary
3717 append_axis = list(a)
3718 if table_exists:
3719 indexer = len(new_non_index_axes) # i.e. 0
3720 exist_axis = self.non_index_axes[indexer][1]
3721 if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3723 # ahah! -> reindex
3724 if array_equivalent(
3725 np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3726 ):
3727 append_axis = exist_axis
3729 # the non_index_axes info
3730 info = new_info.setdefault(idx, {})
3731 info["names"] = list(a.names)
3732 info["type"] = type(a).__name__
3734 new_non_index_axes.append((idx, append_axis))
3736 # Now we can construct our new index axis
3737 idx = axes[0]
3738 a = obj.axes[idx]
3739 axis_name = obj._AXIS_NAMES[idx]
3740 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3741 new_index.axis = idx
3743 # Because we are always 2D, there is only one new_index, so
3744 # we know it will have pos=0
3745 new_index.set_pos(0)
3746 new_index.update_info(new_info)
3747 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3749 new_index_axes = [new_index]
3750 j = len(new_index_axes) # i.e. 1
3751 assert j == 1
3753 # reindex by our non_index_axes & compute data_columns
3754 assert len(new_non_index_axes) == 1
3755 for a in new_non_index_axes:
3756 obj = _reindex_axis(obj, a[0], a[1])
3758 def get_blk_items(mgr, blocks):
3759 return [mgr.items.take(blk.mgr_locs) for blk in blocks]
3761 transposed = new_index.axis == 1
3763 # figure out data_columns and get out blocks
3764 data_columns = self.validate_data_columns(
3765 data_columns, min_itemsize, new_non_index_axes
3766 )
3768 block_obj = self.get_object(obj, transposed)._consolidate()
3770 blocks, blk_items = self._get_blocks_and_items(
3771 block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns
3772 )
3774 # add my values
3775 vaxes = []
3776 for i, (b, b_items) in enumerate(zip(blocks, blk_items)):
3778 # shape of the data column are the indexable axes
3779 klass = DataCol
3780 name = None
3782 # we have a data_column
3783 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3784 klass = DataIndexableCol
3785 name = b_items[0]
3786 if not (name is None or isinstance(name, str)):
3787 # TODO: should the message here be more specifically non-str?
3788 raise ValueError("cannot have non-object label DataIndexableCol")
3790 # make sure that we match up the existing columns
3791 # if we have an existing table
3792 existing_col: Optional[DataCol]
3794 if table_exists and validate:
3795 try:
3796 existing_col = self.values_axes[i]
3797 except (IndexError, KeyError):
3798 raise ValueError(
3799 f"Incompatible appended table [{blocks}]"
3800 f"with existing table [{self.values_axes}]"
3801 )
3802 else:
3803 existing_col = None
3805 new_name = name or f"values_block_{i}"
3806 data_converted = _maybe_convert_for_string_atom(
3807 new_name,
3808 b,
3809 existing_col=existing_col,
3810 min_itemsize=min_itemsize,
3811 nan_rep=nan_rep,
3812 encoding=self.encoding,
3813 errors=self.errors,
3814 )
3815 adj_name = _maybe_adjust_name(new_name, self.version)
3817 typ = klass._get_atom(data_converted)
3818 kind = _dtype_to_kind(data_converted.dtype.name)
3819 tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None
3821 meta = metadata = ordered = None
3822 if is_categorical_dtype(data_converted):
3823 ordered = data_converted.ordered
3824 meta = "category"
3825 metadata = np.array(data_converted.categories, copy=False).ravel()
3827 data, dtype_name = _get_data_and_dtype_name(data_converted)
3829 col = klass(
3830 name=adj_name,
3831 cname=new_name,
3832 values=list(b_items),
3833 typ=typ,
3834 pos=j,
3835 kind=kind,
3836 tz=tz,
3837 ordered=ordered,
3838 meta=meta,
3839 metadata=metadata,
3840 dtype=dtype_name,
3841 data=data,
3842 )
3843 col.update_info(new_info)
3845 vaxes.append(col)
3847 j += 1
3849 dcs = [col.name for col in vaxes if col.is_data_indexable]
3851 new_table = type(self)(
3852 parent=self.parent,
3853 group=self.group,
3854 encoding=self.encoding,
3855 errors=self.errors,
3856 index_axes=new_index_axes,
3857 non_index_axes=new_non_index_axes,
3858 values_axes=vaxes,
3859 data_columns=dcs,
3860 info=new_info,
3861 nan_rep=nan_rep,
3862 )
3863 if hasattr(self, "levels"):
3864 # TODO: get this into constructor, only for appropriate subclass
3865 new_table.levels = self.levels
3867 new_table.validate_min_itemsize(min_itemsize)
3869 if validate and table_exists:
3870 new_table.validate(self)
3872 return new_table
3874 @staticmethod
3875 def _get_blocks_and_items(
3876 block_obj, table_exists, new_non_index_axes, values_axes, data_columns
3877 ):
3878 # Helper to clarify non-state-altering parts of _create_axes
3880 def get_blk_items(mgr, blocks):
3881 return [mgr.items.take(blk.mgr_locs) for blk in blocks]
3883 blocks = block_obj._data.blocks
3884 blk_items = get_blk_items(block_obj._data, blocks)
3886 if len(data_columns):
3887 axis, axis_labels = new_non_index_axes[0]
3888 new_labels = Index(axis_labels).difference(Index(data_columns))
3889 mgr = block_obj.reindex(new_labels, axis=axis)._data
3891 blocks = list(mgr.blocks)
3892 blk_items = get_blk_items(mgr, blocks)
3893 for c in data_columns:
3894 mgr = block_obj.reindex([c], axis=axis)._data
3895 blocks.extend(mgr.blocks)
3896 blk_items.extend(get_blk_items(mgr, mgr.blocks))
3898 # reorder the blocks in the same order as the existing table if we can
3899 if table_exists:
3900 by_items = {
3901 tuple(b_items.tolist()): (b, b_items)
3902 for b, b_items in zip(blocks, blk_items)
3903 }
3904 new_blocks = []
3905 new_blk_items = []
3906 for ea in values_axes:
3907 items = tuple(ea.values)
3908 try:
3909 b, b_items = by_items.pop(items)
3910 new_blocks.append(b)
3911 new_blk_items.append(b_items)
3912 except (IndexError, KeyError):
3913 jitems = ",".join(pprint_thing(item) for item in items)
3914 raise ValueError(
3915 f"cannot match existing table structure for [{jitems}] "
3916 "on appending data"
3917 )
3918 blocks = new_blocks
3919 blk_items = new_blk_items
3921 return blocks, blk_items
3923 def process_axes(self, obj, selection: "Selection", columns=None):
3924 """ process axes filters """
3926 # make a copy to avoid side effects
3927 if columns is not None:
3928 columns = list(columns)
3930 # make sure to include levels if we have them
3931 if columns is not None and self.is_multi_index:
3932 assert isinstance(self.levels, list) # assured by is_multi_index
3933 for n in self.levels:
3934 if n not in columns:
3935 columns.insert(0, n)
3937 # reorder by any non_index_axes & limit to the select columns
3938 for axis, labels in self.non_index_axes:
3939 obj = _reindex_axis(obj, axis, labels, columns)
3941 # apply the selection filters (but keep in the same order)
3942 if selection.filter is not None:
3943 for field, op, filt in selection.filter.format():
3945 def process_filter(field, filt):
3947 for axis_name in obj._AXIS_NAMES.values():
3948 axis_number = obj._get_axis_number(axis_name)
3949 axis_values = obj._get_axis(axis_name)
3950 assert axis_number is not None
3952 # see if the field is the name of an axis
3953 if field == axis_name:
3955 # if we have a multi-index, then need to include
3956 # the levels
3957 if self.is_multi_index:
3958 filt = filt.union(Index(self.levels))
3960 takers = op(axis_values, filt)
3961 return obj.loc(axis=axis_number)[takers]
3963 # this might be the name of a file IN an axis
3964 elif field in axis_values:
3966 # we need to filter on this dimension
3967 values = ensure_index(getattr(obj, field).values)
3968 filt = ensure_index(filt)
3970 # hack until we support reversed dim flags
3971 if isinstance(obj, DataFrame):
3972 axis_number = 1 - axis_number
3973 takers = op(values, filt)
3974 return obj.loc(axis=axis_number)[takers]
3976 raise ValueError(f"cannot find the field [{field}] for filtering!")
3978 obj = process_filter(field, filt)
3980 return obj
3982 def create_description(
3983 self,
3984 complib,
3985 complevel: Optional[int],
3986 fletcher32: bool,
3987 expectedrows: Optional[int],
3988 ) -> Dict[str, Any]:
3989 """ create the description of the table from the axes & values """
3991 # provided expected rows if its passed
3992 if expectedrows is None:
3993 expectedrows = max(self.nrows_expected, 10000)
3995 d = dict(name="table", expectedrows=expectedrows)
3997 # description from the axes & values
3998 d["description"] = {a.cname: a.typ for a in self.axes}
4000 if complib:
4001 if complevel is None:
4002 complevel = self._complevel or 9
4003 filters = _tables().Filters(
4004 complevel=complevel,
4005 complib=complib,
4006 fletcher32=fletcher32 or self._fletcher32,
4007 )
4008 d["filters"] = filters
4009 elif self._filters is not None:
4010 d["filters"] = self._filters
4012 return d
4014 def read_coordinates(
4015 self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
4016 ):
4017 """select coordinates (row numbers) from a table; return the
4018 coordinates object
4019 """
4021 # validate the version
4022 self.validate_version(where)
4024 # infer the data kind
4025 if not self.infer_axes():
4026 return False
4028 # create the selection
4029 selection = Selection(self, where=where, start=start, stop=stop)
4030 coords = selection.select_coords()
4031 if selection.filter is not None:
4032 for field, op, filt in selection.filter.format():
4033 data = self.read_column(
4034 field, start=coords.min(), stop=coords.max() + 1
4035 )
4036 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4038 return Index(coords)
4040 def read_column(
4041 self,
4042 column: str,
4043 where=None,
4044 start: Optional[int] = None,
4045 stop: Optional[int] = None,
4046 ):
4047 """return a single column from the table, generally only indexables
4048 are interesting
4049 """
4051 # validate the version
4052 self.validate_version()
4054 # infer the data kind
4055 if not self.infer_axes():
4056 return False
4058 if where is not None:
4059 raise TypeError("read_column does not currently accept a where clause")
4061 # find the axes
4062 for a in self.axes:
4063 if column == a.name:
4065 if not a.is_data_indexable:
4066 raise ValueError(
4067 f"column [{column}] can not be extracted individually; "
4068 "it is not data indexable"
4069 )
4071 # column must be an indexable or a data column
4072 c = getattr(self.table.cols, column)
4073 a.set_info(self.info)
4074 col_values = a.convert(
4075 c[start:stop],
4076 nan_rep=self.nan_rep,
4077 encoding=self.encoding,
4078 errors=self.errors,
4079 )
4080 return Series(_set_tz(col_values[1], a.tz), name=column)
4082 raise KeyError(f"column [{column}] not found in the table")
4085class WORMTable(Table):
4086 """ a write-once read-many table: this format DOES NOT ALLOW appending to a
4087 table. writing is a one-time operation the data are stored in a format
4088 that allows for searching the data on disk
4089 """
4091 table_type = "worm"
4093 def read(
4094 self,
4095 where=None,
4096 columns=None,
4097 start: Optional[int] = None,
4098 stop: Optional[int] = None,
4099 ):
4100 """ read the indices and the indexing array, calculate offset rows and
4101 return """
4102 raise NotImplementedError("WORMTable needs to implement read")
4104 def write(self, **kwargs):
4105 """ write in a format that we can search later on (but cannot append
4106 to): write out the indices and the values using _write_array
4107 (e.g. a CArray) create an indexing table so that we can search
4108 """
4109 raise NotImplementedError("WORMTable needs to implement write")
4112class AppendableTable(Table):
4113 """ support the new appendable table formats """
4115 table_type = "appendable"
4117 def write(
4118 self,
4119 obj,
4120 axes=None,
4121 append=False,
4122 complib=None,
4123 complevel=None,
4124 fletcher32=None,
4125 min_itemsize=None,
4126 chunksize=None,
4127 expectedrows=None,
4128 dropna=False,
4129 nan_rep=None,
4130 data_columns=None,
4131 ):
4133 if not append and self.is_exists:
4134 self._handle.remove_node(self.group, "table")
4136 # create the axes
4137 table = self._create_axes(
4138 axes=axes,
4139 obj=obj,
4140 validate=append,
4141 min_itemsize=min_itemsize,
4142 nan_rep=nan_rep,
4143 data_columns=data_columns,
4144 )
4146 for a in table.axes:
4147 a.validate_names()
4149 if not table.is_exists:
4151 # create the table
4152 options = table.create_description(
4153 complib=complib,
4154 complevel=complevel,
4155 fletcher32=fletcher32,
4156 expectedrows=expectedrows,
4157 )
4159 # set the table attributes
4160 table.set_attrs()
4162 # create the table
4163 table._handle.create_table(table.group, **options)
4165 # update my info
4166 table.attrs.info = table.info
4168 # validate the axes and set the kinds
4169 for a in table.axes:
4170 a.validate_and_set(table, append)
4172 # add the rows
4173 table.write_data(chunksize, dropna=dropna)
4175 def write_data(self, chunksize: Optional[int], dropna: bool = False):
4176 """ we form the data into a 2-d including indexes,values,mask
4177 write chunk-by-chunk """
4179 names = self.dtype.names
4180 nrows = self.nrows_expected
4182 # if dropna==True, then drop ALL nan rows
4183 masks = []
4184 if dropna:
4186 for a in self.values_axes:
4188 # figure the mask: only do if we can successfully process this
4189 # column, otherwise ignore the mask
4190 mask = isna(a.data).all(axis=0)
4191 if isinstance(mask, np.ndarray):
4192 masks.append(mask.astype("u1", copy=False))
4194 # consolidate masks
4195 if len(masks):
4196 mask = masks[0]
4197 for m in masks[1:]:
4198 mask = mask & m
4199 mask = mask.ravel()
4200 else:
4201 mask = None
4203 # broadcast the indexes if needed
4204 indexes = [a.cvalues for a in self.index_axes]
4205 nindexes = len(indexes)
4206 assert nindexes == 1, nindexes # ensures we dont need to broadcast
4208 # transpose the values so first dimension is last
4209 # reshape the values if needed
4210 values = [a.take_data() for a in self.values_axes]
4211 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4212 bvalues = []
4213 for i, v in enumerate(values):
4214 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4215 bvalues.append(values[i].reshape(new_shape))
4217 # write the chunks
4218 if chunksize is None:
4219 chunksize = 100000
4221 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4222 chunks = int(nrows / chunksize) + 1
4223 for i in range(chunks):
4224 start_i = i * chunksize
4225 end_i = min((i + 1) * chunksize, nrows)
4226 if start_i >= end_i:
4227 break
4229 self.write_data_chunk(
4230 rows,
4231 indexes=[a[start_i:end_i] for a in indexes],
4232 mask=mask[start_i:end_i] if mask is not None else None,
4233 values=[v[start_i:end_i] for v in bvalues],
4234 )
4236 def write_data_chunk(
4237 self,
4238 rows: np.ndarray,
4239 indexes: List[np.ndarray],
4240 mask: Optional[np.ndarray],
4241 values: List[np.ndarray],
4242 ):
4243 """
4244 Parameters
4245 ----------
4246 rows : an empty memory space where we are putting the chunk
4247 indexes : an array of the indexes
4248 mask : an array of the masks
4249 values : an array of the values
4250 """
4252 # 0 len
4253 for v in values:
4254 if not np.prod(v.shape):
4255 return
4257 nrows = indexes[0].shape[0]
4258 if nrows != len(rows):
4259 rows = np.empty(nrows, dtype=self.dtype)
4260 names = self.dtype.names
4261 nindexes = len(indexes)
4263 # indexes
4264 for i, idx in enumerate(indexes):
4265 rows[names[i]] = idx
4267 # values
4268 for i, v in enumerate(values):
4269 rows[names[i + nindexes]] = v
4271 # mask
4272 if mask is not None:
4273 m = ~mask.ravel().astype(bool, copy=False)
4274 if not m.all():
4275 rows = rows[m]
4277 if len(rows):
4278 self.table.append(rows)
4279 self.table.flush()
4281 def delete(
4282 self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
4283 ):
4285 # delete all rows (and return the nrows)
4286 if where is None or not len(where):
4287 if start is None and stop is None:
4288 nrows = self.nrows
4289 self._handle.remove_node(self.group, recursive=True)
4290 else:
4291 # pytables<3.0 would remove a single row with stop=None
4292 if stop is None:
4293 stop = self.nrows
4294 nrows = self.table.remove_rows(start=start, stop=stop)
4295 self.table.flush()
4296 return nrows
4298 # infer the data kind
4299 if not self.infer_axes():
4300 return None
4302 # create the selection
4303 table = self.table
4304 selection = Selection(self, where, start=start, stop=stop)
4305 values = selection.select_coords()
4307 # delete the rows in reverse order
4308 sorted_series = Series(values).sort_values()
4309 ln = len(sorted_series)
4311 if ln:
4313 # construct groups of consecutive rows
4314 diff = sorted_series.diff()
4315 groups = list(diff[diff > 1].index)
4317 # 1 group
4318 if not len(groups):
4319 groups = [0]
4321 # final element
4322 if groups[-1] != ln:
4323 groups.append(ln)
4325 # initial element
4326 if groups[0] != 0:
4327 groups.insert(0, 0)
4329 # we must remove in reverse order!
4330 pg = groups.pop()
4331 for g in reversed(groups):
4332 rows = sorted_series.take(range(g, pg))
4333 table.remove_rows(
4334 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4335 )
4336 pg = g
4338 self.table.flush()
4340 # return the number of rows removed
4341 return ln
4344class AppendableFrameTable(AppendableTable):
4345 """ support the new appendable table formats """
4347 pandas_kind = "frame_table"
4348 table_type = "appendable_frame"
4349 ndim = 2
4350 obj_type: Type[Union[DataFrame, Series]] = DataFrame
4352 @property
4353 def is_transposed(self) -> bool:
4354 return self.index_axes[0].axis == 1
4356 @classmethod
4357 def get_object(cls, obj, transposed: bool):
4358 """ these are written transposed """
4359 if transposed:
4360 obj = obj.T
4361 return obj
4363 def read(
4364 self,
4365 where=None,
4366 columns=None,
4367 start: Optional[int] = None,
4368 stop: Optional[int] = None,
4369 ):
4371 # validate the version
4372 self.validate_version(where)
4374 # infer the data kind
4375 if not self.infer_axes():
4376 return None
4378 result = self._read_axes(where=where, start=start, stop=stop)
4380 info = (
4381 self.info.get(self.non_index_axes[0][0], dict())
4382 if len(self.non_index_axes)
4383 else dict()
4384 )
4386 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4387 assert len(inds) == 1
4388 ind = inds[0]
4390 index = result[ind][0]
4392 frames = []
4393 for i, a in enumerate(self.axes):
4394 if a not in self.values_axes:
4395 continue
4396 index_vals, cvalues = result[i]
4398 # we could have a multi-index constructor here
4399 # ensure_index doesn't recognized our list-of-tuples here
4400 if info.get("type") == "MultiIndex":
4401 cols = MultiIndex.from_tuples(index_vals)
4402 else:
4403 cols = Index(index_vals)
4405 names = info.get("names")
4406 if names is not None:
4407 cols.set_names(names, inplace=True)
4409 if self.is_transposed:
4410 values = cvalues
4411 index_ = cols
4412 cols_ = Index(index, name=getattr(index, "name", None))
4413 else:
4414 values = cvalues.T
4415 index_ = Index(index, name=getattr(index, "name", None))
4416 cols_ = cols
4418 # if we have a DataIndexableCol, its shape will only be 1 dim
4419 if values.ndim == 1 and isinstance(values, np.ndarray):
4420 values = values.reshape((1, values.shape[0]))
4422 if isinstance(values, np.ndarray):
4423 df = DataFrame(values.T, columns=cols_, index=index_)
4424 elif isinstance(values, Index):
4425 df = DataFrame(values, columns=cols_, index=index_)
4426 else:
4427 # Categorical
4428 df = DataFrame([values], columns=cols_, index=index_)
4429 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4430 frames.append(df)
4432 if len(frames) == 1:
4433 df = frames[0]
4434 else:
4435 df = concat(frames, axis=1)
4437 selection = Selection(self, where=where, start=start, stop=stop)
4438 # apply the selection filters & axis orderings
4439 df = self.process_axes(df, selection=selection, columns=columns)
4441 return df
4444class AppendableSeriesTable(AppendableFrameTable):
4445 """ support the new appendable table formats """
4447 pandas_kind = "series_table"
4448 table_type = "appendable_series"
4449 ndim = 2
4450 obj_type = Series
4452 @property
4453 def is_transposed(self) -> bool:
4454 return False
4456 @classmethod
4457 def get_object(cls, obj, transposed: bool):
4458 return obj
4460 def write(self, obj, data_columns=None, **kwargs):
4461 """ we are going to write this as a frame table """
4462 if not isinstance(obj, DataFrame):
4463 name = obj.name or "values"
4464 obj = obj.to_frame(name)
4465 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4467 def read(
4468 self,
4469 where=None,
4470 columns=None,
4471 start: Optional[int] = None,
4472 stop: Optional[int] = None,
4473 ) -> Series:
4475 is_multi_index = self.is_multi_index
4476 if columns is not None and is_multi_index:
4477 assert isinstance(self.levels, list) # needed for mypy
4478 for n in self.levels:
4479 if n not in columns:
4480 columns.insert(0, n)
4481 s = super().read(where=where, columns=columns, start=start, stop=stop)
4482 if is_multi_index:
4483 s.set_index(self.levels, inplace=True)
4485 s = s.iloc[:, 0]
4487 # remove the default name
4488 if s.name == "values":
4489 s.name = None
4490 return s
4493class AppendableMultiSeriesTable(AppendableSeriesTable):
4494 """ support the new appendable table formats """
4496 pandas_kind = "series_table"
4497 table_type = "appendable_multiseries"
4499 def write(self, obj, **kwargs):
4500 """ we are going to write this as a frame table """
4501 name = obj.name or "values"
4502 obj, self.levels = self.validate_multiindex(obj)
4503 cols = list(self.levels)
4504 cols.append(name)
4505 obj.columns = cols
4506 return super().write(obj=obj, **kwargs)
4509class GenericTable(AppendableFrameTable):
4510 """ a table that read/writes the generic pytables table format """
4512 pandas_kind = "frame_table"
4513 table_type = "generic_table"
4514 ndim = 2
4515 obj_type = DataFrame
4517 @property
4518 def pandas_type(self) -> str:
4519 return self.pandas_kind
4521 @property
4522 def storable(self):
4523 return getattr(self.group, "table", None) or self.group
4525 def get_attrs(self):
4526 """ retrieve our attributes """
4527 self.non_index_axes = []
4528 self.nan_rep = None
4529 self.levels = []
4531 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4532 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4533 self.data_columns = [a.name for a in self.values_axes]
4535 @cache_readonly
4536 def indexables(self):
4537 """ create the indexables from the table description """
4538 d = self.description
4540 # TODO: can we get a typ for this? AFAICT it is the only place
4541 # where we aren't passing one
4542 # the index columns is just a simple index
4543 md = self.read_metadata("index")
4544 meta = "category" if md is not None else None
4545 index_col = GenericIndexCol(
4546 name="index", axis=0, table=self.table, meta=meta, metadata=md
4547 )
4549 _indexables = [index_col]
4551 for i, n in enumerate(d._v_names):
4552 assert isinstance(n, str)
4554 atom = getattr(d, n)
4555 md = self.read_metadata(n)
4556 meta = "category" if md is not None else None
4557 dc = GenericDataIndexableCol(
4558 name=n,
4559 pos=i,
4560 values=[n],
4561 typ=atom,
4562 table=self.table,
4563 meta=meta,
4564 metadata=md,
4565 )
4566 _indexables.append(dc)
4568 return _indexables
4570 def write(self, **kwargs):
4571 raise NotImplementedError("cannot write on an generic table")
4574class AppendableMultiFrameTable(AppendableFrameTable):
4575 """ a frame with a multi-index """
4577 table_type = "appendable_multiframe"
4578 obj_type = DataFrame
4579 ndim = 2
4580 _re_levels = re.compile(r"^level_\d+$")
4582 @property
4583 def table_type_short(self) -> str:
4584 return "appendable_multi"
4586 def write(self, obj, data_columns=None, **kwargs):
4587 if data_columns is None:
4588 data_columns = []
4589 elif data_columns is True:
4590 data_columns = obj.columns.tolist()
4591 obj, self.levels = self.validate_multiindex(obj)
4592 for n in self.levels:
4593 if n not in data_columns:
4594 data_columns.insert(0, n)
4595 return super().write(obj=obj, data_columns=data_columns, **kwargs)
4597 def read(
4598 self,
4599 where=None,
4600 columns=None,
4601 start: Optional[int] = None,
4602 stop: Optional[int] = None,
4603 ):
4605 df = super().read(where=where, columns=columns, start=start, stop=stop)
4606 df = df.set_index(self.levels)
4608 # remove names for 'level_%d'
4609 df.index = df.index.set_names(
4610 [None if self._re_levels.search(l) else l for l in df.index.names]
4611 )
4613 return df
4616def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
4617 ax = obj._get_axis(axis)
4618 labels = ensure_index(labels)
4620 # try not to reindex even if other is provided
4621 # if it equals our current index
4622 if other is not None:
4623 other = ensure_index(other)
4624 if (other is None or labels.equals(other)) and labels.equals(ax):
4625 return obj
4627 labels = ensure_index(labels.unique())
4628 if other is not None:
4629 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4630 if not labels.equals(ax):
4631 slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim
4632 slicer[axis] = labels
4633 obj = obj.loc[tuple(slicer)]
4634 return obj
4637# tz to/from coercion
4640def _get_tz(tz: tzinfo) -> Union[str, tzinfo]:
4641 """ for a tz-aware type, return an encoded zone """
4642 zone = timezones.get_timezone(tz)
4643 return zone
4646def _set_tz(
4647 values: Union[np.ndarray, Index],
4648 tz: Optional[Union[str, tzinfo]],
4649 coerce: bool = False,
4650) -> Union[np.ndarray, DatetimeIndex]:
4651 """
4652 coerce the values to a DatetimeIndex if tz is set
4653 preserve the input shape if possible
4655 Parameters
4656 ----------
4657 values : ndarray or Index
4658 tz : str or tzinfo
4659 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4660 """
4661 if isinstance(values, DatetimeIndex):
4662 # If values is tzaware, the tz gets dropped in the values.ravel()
4663 # call below (which returns an ndarray). So we are only non-lossy
4664 # if `tz` matches `values.tz`.
4665 assert values.tz is None or values.tz == tz
4667 if tz is not None:
4668 name = getattr(values, "name", None)
4669 values = values.ravel()
4670 tz = timezones.get_timezone(_ensure_decoded(tz))
4671 values = DatetimeIndex(values, name=name)
4672 values = values.tz_localize("UTC").tz_convert(tz)
4673 elif coerce:
4674 values = np.asarray(values, dtype="M8[ns]")
4676 return values
4679def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4680 assert isinstance(name, str)
4682 index_name = index.name
4683 converted, dtype_name = _get_data_and_dtype_name(index)
4684 kind = _dtype_to_kind(dtype_name)
4685 atom = DataIndexableCol._get_atom(converted)
4687 if isinstance(index, Int64Index):
4688 # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4689 # in which case "kind" is "integer", "integer", "datetime64",
4690 # "timedelta64", and "integer", respectively.
4691 return IndexCol(
4692 name,
4693 values=converted,
4694 kind=kind,
4695 typ=atom,
4696 freq=getattr(index, "freq", None),
4697 tz=getattr(index, "tz", None),
4698 index_name=index_name,
4699 )
4701 if isinstance(index, MultiIndex):
4702 raise TypeError("MultiIndex not supported here!")
4704 inferred_type = lib.infer_dtype(index, skipna=False)
4705 # we wont get inferred_type of "datetime64" or "timedelta64" as these
4706 # would go through the DatetimeIndex/TimedeltaIndex paths above
4708 values = np.asarray(index)
4710 if inferred_type == "date":
4711 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4712 return IndexCol(
4713 name, converted, "date", _tables().Time32Col(), index_name=index_name,
4714 )
4715 elif inferred_type == "string":
4717 converted = _convert_string_array(values, encoding, errors)
4718 itemsize = converted.dtype.itemsize
4719 return IndexCol(
4720 name,
4721 converted,
4722 "string",
4723 _tables().StringCol(itemsize),
4724 index_name=index_name,
4725 )
4727 elif inferred_type in ["integer", "floating"]:
4728 return IndexCol(
4729 name, values=converted, kind=kind, typ=atom, index_name=index_name,
4730 )
4731 else:
4732 assert isinstance(converted, np.ndarray) and converted.dtype == object
4733 assert kind == "object", kind
4734 atom = _tables().ObjectAtom()
4735 return IndexCol(name, converted, kind, atom, index_name=index_name,)
4738def _unconvert_index(
4739 data, kind: str, encoding: str, errors: str
4740) -> Union[np.ndarray, Index]:
4741 index: Union[Index, np.ndarray]
4743 if kind == "datetime64":
4744 index = DatetimeIndex(data)
4745 elif kind == "timedelta64":
4746 index = TimedeltaIndex(data)
4747 elif kind == "date":
4748 try:
4749 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4750 except (ValueError):
4751 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4752 elif kind in ("integer", "float"):
4753 index = np.asarray(data)
4754 elif kind in ("string"):
4755 index = _unconvert_string_array(
4756 data, nan_rep=None, encoding=encoding, errors=errors
4757 )
4758 elif kind == "object":
4759 index = np.asarray(data[0])
4760 else: # pragma: no cover
4761 raise ValueError(f"unrecognized index type {kind}")
4762 return index
4765def _maybe_convert_for_string_atom(
4766 name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors
4767):
4769 if not block.is_object:
4770 return block.values
4772 dtype_name = block.dtype.name
4773 inferred_type = lib.infer_dtype(block.values, skipna=False)
4775 if inferred_type == "date":
4776 raise TypeError("[date] is not implemented as a table column")
4777 elif inferred_type == "datetime":
4778 # after GH#8260
4779 # this only would be hit for a multi-timezone dtype which is an error
4780 raise TypeError(
4781 "too many timezones in this block, create separate data columns"
4782 )
4784 elif not (inferred_type == "string" or dtype_name == "object"):
4785 return block.values
4787 block = block.fillna(nan_rep, downcast=False)
4788 if isinstance(block, list):
4789 # Note: because block is always object dtype, fillna goes
4790 # through a path such that the result is always a 1-element list
4791 block = block[0]
4792 data = block.values
4794 # see if we have a valid string type
4795 inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
4796 if inferred_type != "string":
4798 # we cannot serialize this data, so report an exception on a column
4799 # by column basis
4800 for i in range(len(block.shape[0])):
4802 col = block.iget(i)
4803 inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
4804 if inferred_type != "string":
4805 iloc = block.mgr_locs.indexer[i]
4806 raise TypeError(
4807 f"Cannot serialize the column [{iloc}] because\n"
4808 f"its data contents are [{inferred_type}] object dtype"
4809 )
4811 # itemsize is the maximum length of a string (along any dimension)
4812 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
4813 assert data_converted.shape == block.shape, (data_converted.shape, block.shape)
4814 itemsize = data_converted.itemsize
4816 # specified min_itemsize?
4817 if isinstance(min_itemsize, dict):
4818 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
4819 itemsize = max(min_itemsize or 0, itemsize)
4821 # check for column in the values conflicts
4822 if existing_col is not None:
4823 eci = existing_col.validate_col(itemsize)
4824 if eci > itemsize:
4825 itemsize = eci
4827 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
4828 return data_converted
4831def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
4832 """
4833 Take a string-like that is object dtype and coerce to a fixed size string type.
4835 Parameters
4836 ----------
4837 data : np.ndarray[object]
4838 encoding : str
4839 errors : str
4840 Handler for encoding errors.
4842 Returns
4843 -------
4844 np.ndarray[fixed-length-string]
4845 """
4847 # encode if needed
4848 if len(data):
4849 data = (
4850 Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape)
4851 )
4853 # create the sized dtype
4854 ensured = ensure_object(data.ravel())
4855 itemsize = max(1, libwriters.max_len_string_array(ensured))
4857 data = np.asarray(data, dtype=f"S{itemsize}")
4858 return data
4861def _unconvert_string_array(
4862 data: np.ndarray, nan_rep, encoding: str, errors: str
4863) -> np.ndarray:
4864 """
4865 Inverse of _convert_string_array.
4867 Parameters
4868 ----------
4869 data : np.ndarray[fixed-length-string]
4870 nan_rep : the storage repr of NaN
4871 encoding : str
4872 errors : str
4873 Handler for encoding errors.
4875 Returns
4876 -------
4877 np.ndarray[object]
4878 Decoded data.
4879 """
4880 shape = data.shape
4881 data = np.asarray(data.ravel(), dtype=object)
4883 if len(data):
4885 itemsize = libwriters.max_len_string_array(ensure_object(data))
4886 dtype = f"U{itemsize}"
4888 if isinstance(data[0], bytes):
4889 data = Series(data).str.decode(encoding, errors=errors).values
4890 else:
4891 data = data.astype(dtype, copy=False).astype(object, copy=False)
4893 if nan_rep is None:
4894 nan_rep = "nan"
4896 data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
4897 return data.reshape(shape)
4900def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
4901 assert isinstance(val_kind, str), type(val_kind)
4902 if _need_convert(val_kind):
4903 conv = _get_converter(val_kind, encoding, errors)
4904 values = conv(values)
4905 return values
4908def _get_converter(kind: str, encoding: str, errors: str):
4909 if kind == "datetime64":
4910 return lambda x: np.asarray(x, dtype="M8[ns]")
4911 elif kind == "string":
4912 return lambda x: _unconvert_string_array(
4913 x, nan_rep=None, encoding=encoding, errors=errors
4914 )
4915 else: # pragma: no cover
4916 raise ValueError(f"invalid kind {kind}")
4919def _need_convert(kind: str) -> bool:
4920 if kind in ("datetime64", "string"):
4921 return True
4922 return False
4925def _maybe_adjust_name(name: str, version) -> str:
4926 """
4927 Prior to 0.10.1, we named values blocks like: values_block_0 an the
4928 name values_0, adjust the given name if necessary.
4930 Parameters
4931 ----------
4932 name : str
4933 version : Tuple[int, int, int]
4935 Returns
4936 -------
4937 str
4938 """
4939 try:
4940 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
4941 m = re.search(r"values_block_(\d+)", name)
4942 if m:
4943 grp = m.groups()[0]
4944 name = f"values_{grp}"
4945 except IndexError:
4946 pass
4947 return name
4950def _dtype_to_kind(dtype_str: str) -> str:
4951 """
4952 Find the "kind" string describing the given dtype name.
4953 """
4954 dtype_str = _ensure_decoded(dtype_str)
4956 if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
4957 kind = "string"
4958 elif dtype_str.startswith("float"):
4959 kind = "float"
4960 elif dtype_str.startswith("complex"):
4961 kind = "complex"
4962 elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
4963 kind = "integer"
4964 elif dtype_str.startswith("datetime64"):
4965 kind = "datetime64"
4966 elif dtype_str.startswith("timedelta"):
4967 kind = "timedelta64"
4968 elif dtype_str.startswith("bool"):
4969 kind = "bool"
4970 elif dtype_str.startswith("category"):
4971 kind = "category"
4972 elif dtype_str.startswith("period"):
4973 # We store the `freq` attr so we can restore from integers
4974 kind = "integer"
4975 elif dtype_str == "object":
4976 kind = "object"
4977 else:
4978 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
4980 return kind
4983def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]):
4984 """
4985 Convert the passed data into a storable form and a dtype string.
4986 """
4987 if is_categorical_dtype(data.dtype):
4988 data = data.codes
4990 # For datetime64tz we need to drop the TZ in tests TODO: why?
4991 dtype_name = data.dtype.name.split("[")[0]
4993 if data.dtype.kind in ["m", "M"]:
4994 data = np.asarray(data.view("i8"))
4995 # TODO: we used to reshape for the dt64tz case, but no longer
4996 # doing that doesn't seem to break anything. why?
4998 elif isinstance(data, PeriodIndex):
4999 data = data.asi8
5001 data = np.asarray(data)
5002 return data, dtype_name
5005class Selection:
5006 """
5007 Carries out a selection operation on a tables.Table object.
5009 Parameters
5010 ----------
5011 table : a Table object
5012 where : list of Terms (or convertible to)
5013 start, stop: indices to start and/or stop selection
5015 """
5017 def __init__(
5018 self,
5019 table: Table,
5020 where=None,
5021 start: Optional[int] = None,
5022 stop: Optional[int] = None,
5023 ):
5024 self.table = table
5025 self.where = where
5026 self.start = start
5027 self.stop = stop
5028 self.condition = None
5029 self.filter = None
5030 self.terms = None
5031 self.coordinates = None
5033 if is_list_like(where):
5035 # see if we have a passed coordinate like
5036 try:
5037 inferred = lib.infer_dtype(where, skipna=False)
5038 if inferred == "integer" or inferred == "boolean":
5039 where = np.asarray(where)
5040 if where.dtype == np.bool_:
5041 start, stop = self.start, self.stop
5042 if start is None:
5043 start = 0
5044 if stop is None:
5045 stop = self.table.nrows
5046 self.coordinates = np.arange(start, stop)[where]
5047 elif issubclass(where.dtype.type, np.integer):
5048 if (self.start is not None and (where < self.start).any()) or (
5049 self.stop is not None and (where >= self.stop).any()
5050 ):
5051 raise ValueError(
5052 "where must have index locations >= start and < stop"
5053 )
5054 self.coordinates = where
5056 except ValueError:
5057 pass
5059 if self.coordinates is None:
5061 self.terms = self.generate(where)
5063 # create the numexpr & the filter
5064 if self.terms is not None:
5065 self.condition, self.filter = self.terms.evaluate()
5067 def generate(self, where):
5068 """ where can be a : dict,list,tuple,string """
5069 if where is None:
5070 return None
5072 q = self.table.queryables()
5073 try:
5074 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5075 except NameError:
5076 # raise a nice message, suggesting that the user should use
5077 # data_columns
5078 qkeys = ",".join(q.keys())
5079 raise ValueError(
5080 f"The passed where expression: {where}\n"
5081 " contains an invalid variable reference\n"
5082 " all of the variable references must be a "
5083 "reference to\n"
5084 " an axis (e.g. 'index' or 'columns'), or a "
5085 "data_column\n"
5086 f" The currently defined references are: {qkeys}\n"
5087 )
5089 def select(self):
5090 """
5091 generate the selection
5092 """
5093 if self.condition is not None:
5094 return self.table.table.read_where(
5095 self.condition.format(), start=self.start, stop=self.stop
5096 )
5097 elif self.coordinates is not None:
5098 return self.table.table.read_coordinates(self.coordinates)
5099 return self.table.table.read(start=self.start, stop=self.stop)
5101 def select_coords(self):
5102 """
5103 generate the selection
5104 """
5105 start, stop = self.start, self.stop
5106 nrows = self.table.nrows
5107 if start is None:
5108 start = 0
5109 elif start < 0:
5110 start += nrows
5111 if self.stop is None:
5112 stop = nrows
5113 elif stop < 0:
5114 stop += nrows
5116 if self.condition is not None:
5117 return self.table.table.get_where_list(
5118 self.condition.format(), start=start, stop=stop, sort=True
5119 )
5120 elif self.coordinates is not None:
5121 return self.coordinates
5123 return np.arange(start, stop)