Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/frame.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
9labeling information
10"""
11import collections
12from collections import abc
13from io import StringIO
14import itertools
15import sys
16from textwrap import dedent
17from typing import (
18 IO,
19 TYPE_CHECKING,
20 Any,
21 FrozenSet,
22 Hashable,
23 Iterable,
24 List,
25 Optional,
26 Sequence,
27 Set,
28 Tuple,
29 Type,
30 Union,
31 cast,
32)
33import warnings
35import numpy as np
36import numpy.ma as ma
38from pandas._config import get_option
40from pandas._libs import algos as libalgos, lib
41from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
42from pandas.compat import PY37
43from pandas.compat._optional import import_optional_dependency
44from pandas.compat.numpy import function as nv
45from pandas.util._decorators import (
46 Appender,
47 Substitution,
48 deprecate_kwarg,
49 rewrite_axis_style_signature,
50)
51from pandas.util._validators import (
52 validate_axis_style_args,
53 validate_bool_kwarg,
54 validate_percentile,
55)
57from pandas.core.dtypes.cast import (
58 cast_scalar_to_array,
59 coerce_to_dtypes,
60 find_common_type,
61 infer_dtype_from_scalar,
62 invalidate_string_dtypes,
63 maybe_cast_to_datetime,
64 maybe_convert_platform,
65 maybe_downcast_to_dtype,
66 maybe_infer_to_datetimelike,
67 maybe_upcast,
68 maybe_upcast_putmask,
69)
70from pandas.core.dtypes.common import (
71 ensure_float64,
72 ensure_int64,
73 ensure_platform_int,
74 infer_dtype_from_object,
75 is_bool_dtype,
76 is_dict_like,
77 is_dtype_equal,
78 is_extension_array_dtype,
79 is_float_dtype,
80 is_hashable,
81 is_integer,
82 is_integer_dtype,
83 is_iterator,
84 is_list_like,
85 is_named_tuple,
86 is_object_dtype,
87 is_scalar,
88 is_sequence,
89 needs_i8_conversion,
90)
91from pandas.core.dtypes.generic import (
92 ABCDataFrame,
93 ABCIndexClass,
94 ABCMultiIndex,
95 ABCSeries,
96)
97from pandas.core.dtypes.missing import isna, notna
99from pandas.core import algorithms, common as com, nanops, ops
100from pandas.core.accessor import CachedAccessor
101from pandas.core.arrays import Categorical, ExtensionArray
102from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
103from pandas.core.arrays.sparse import SparseFrameAccessor
104from pandas.core.generic import NDFrame, _shared_docs
105from pandas.core.groupby import generic as groupby_generic
106from pandas.core.indexes import base as ibase
107from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
108from pandas.core.indexes.datetimes import DatetimeIndex
109from pandas.core.indexes.multi import maybe_droplevels
110from pandas.core.indexes.period import PeriodIndex
111from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
112from pandas.core.internals import BlockManager
113from pandas.core.internals.construction import (
114 arrays_to_mgr,
115 get_names_from_index,
116 init_dict,
117 init_ndarray,
118 masked_rec_array_to_mgr,
119 reorder_arrays,
120 sanitize_index,
121 to_arrays,
122)
123from pandas.core.ops.missing import dispatch_fill_zeros
124from pandas.core.series import Series
126from pandas.io.common import get_filepath_or_buffer
127from pandas.io.formats import console, format as fmt
128from pandas.io.formats.printing import pprint_thing
129import pandas.plotting
131if TYPE_CHECKING:
132 from pandas.io.formats.style import Styler
134# ---------------------------------------------------------------------
135# Docstring templates
137_shared_doc_kwargs = dict(
138 axes="index, columns",
139 klass="DataFrame",
140 axes_single_arg="{0 or 'index', 1 or 'columns'}",
141 axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
142 If 0 or 'index': apply function to each column.
143 If 1 or 'columns': apply function to each row.""",
144 optional_by="""
145 by : str or list of str
146 Name or list of names to sort by.
148 - if `axis` is 0 or `'index'` then `by` may contain index
149 levels and/or column labels.
150 - if `axis` is 1 or `'columns'` then `by` may contain column
151 levels and/or index labels.
153 .. versionchanged:: 0.23.0
155 Allow specifying index or column level names.""",
156 versionadded_to_excel="",
157 optional_labels="""labels : array-like, optional
158 New labels / index to conform the axis specified by 'axis' to.""",
159 optional_axis="""axis : int or str, optional
160 Axis to target. Can be either the axis name ('index', 'columns')
161 or number (0, 1).""",
162)
164_numeric_only_doc = """numeric_only : boolean, default None
165 Include only float, int, boolean data. If None, will attempt to use
166 everything, then use only numeric data
167"""
169_merge_doc = """
170Merge DataFrame or named Series objects with a database-style join.
172The join is done on columns or indexes. If joining columns on
173columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
174on indexes or indexes on a column or columns, the index will be passed on.
176Parameters
177----------%s
178right : DataFrame or named Series
179 Object to merge with.
180how : {'left', 'right', 'outer', 'inner'}, default 'inner'
181 Type of merge to be performed.
183 * left: use only keys from left frame, similar to a SQL left outer join;
184 preserve key order.
185 * right: use only keys from right frame, similar to a SQL right outer join;
186 preserve key order.
187 * outer: use union of keys from both frames, similar to a SQL full outer
188 join; sort keys lexicographically.
189 * inner: use intersection of keys from both frames, similar to a SQL inner
190 join; preserve the order of the left keys.
191on : label or list
192 Column or index level names to join on. These must be found in both
193 DataFrames. If `on` is None and not merging on indexes then this defaults
194 to the intersection of the columns in both DataFrames.
195left_on : label or list, or array-like
196 Column or index level names to join on in the left DataFrame. Can also
197 be an array or list of arrays of the length of the left DataFrame.
198 These arrays are treated as if they are columns.
199right_on : label or list, or array-like
200 Column or index level names to join on in the right DataFrame. Can also
201 be an array or list of arrays of the length of the right DataFrame.
202 These arrays are treated as if they are columns.
203left_index : bool, default False
204 Use the index from the left DataFrame as the join key(s). If it is a
205 MultiIndex, the number of keys in the other DataFrame (either the index
206 or a number of columns) must match the number of levels.
207right_index : bool, default False
208 Use the index from the right DataFrame as the join key. Same caveats as
209 left_index.
210sort : bool, default False
211 Sort the join keys lexicographically in the result DataFrame. If False,
212 the order of the join keys depends on the join type (how keyword).
213suffixes : tuple of (str, str), default ('_x', '_y')
214 Suffix to apply to overlapping column names in the left and right
215 side, respectively. To raise an exception on overlapping columns use
216 (False, False).
217copy : bool, default True
218 If False, avoid copy if possible.
219indicator : bool or str, default False
220 If True, adds a column to output DataFrame called "_merge" with
221 information on the source of each row.
222 If string, column with information on source of each row will be added to
223 output DataFrame, and column will be named value of string.
224 Information column is Categorical-type and takes on a value of "left_only"
225 for observations whose merge key only appears in 'left' DataFrame,
226 "right_only" for observations whose merge key only appears in 'right'
227 DataFrame, and "both" if the observation's merge key is found in both.
229validate : str, optional
230 If specified, checks if merge is of specified type.
232 * "one_to_one" or "1:1": check if merge keys are unique in both
233 left and right datasets.
234 * "one_to_many" or "1:m": check if merge keys are unique in left
235 dataset.
236 * "many_to_one" or "m:1": check if merge keys are unique in right
237 dataset.
238 * "many_to_many" or "m:m": allowed, but does not result in checks.
240 .. versionadded:: 0.21.0
242Returns
243-------
244DataFrame
245 A DataFrame of the two merged objects.
247See Also
248--------
249merge_ordered : Merge with optional filling/interpolation.
250merge_asof : Merge on nearest keys.
251DataFrame.join : Similar method using indices.
253Notes
254-----
255Support for specifying index levels as the `on`, `left_on`, and
256`right_on` parameters was added in version 0.23.0
257Support for merging named Series objects was added in version 0.24.0
259Examples
260--------
262>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
263... 'value': [1, 2, 3, 5]})
264>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
265... 'value': [5, 6, 7, 8]})
266>>> df1
267 lkey value
2680 foo 1
2691 bar 2
2702 baz 3
2713 foo 5
272>>> df2
273 rkey value
2740 foo 5
2751 bar 6
2762 baz 7
2773 foo 8
279Merge df1 and df2 on the lkey and rkey columns. The value columns have
280the default suffixes, _x and _y, appended.
282>>> df1.merge(df2, left_on='lkey', right_on='rkey')
283 lkey value_x rkey value_y
2840 foo 1 foo 5
2851 foo 1 foo 8
2862 foo 5 foo 5
2873 foo 5 foo 8
2884 bar 2 bar 6
2895 baz 3 baz 7
291Merge DataFrames df1 and df2 with specified left and right suffixes
292appended to any overlapping columns.
294>>> df1.merge(df2, left_on='lkey', right_on='rkey',
295... suffixes=('_left', '_right'))
296 lkey value_left rkey value_right
2970 foo 1 foo 5
2981 foo 1 foo 8
2992 foo 5 foo 5
3003 foo 5 foo 8
3014 bar 2 bar 6
3025 baz 3 baz 7
304Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
305any overlapping columns.
307>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
308Traceback (most recent call last):
309...
310ValueError: columns overlap but no suffix specified:
311 Index(['value'], dtype='object')
312"""
315# -----------------------------------------------------------------------
316# DataFrame class
319class DataFrame(NDFrame):
320 """
321 Two-dimensional, size-mutable, potentially heterogeneous tabular data.
323 Data structure also contains labeled axes (rows and columns).
324 Arithmetic operations align on both row and column labels. Can be
325 thought of as a dict-like container for Series objects. The primary
326 pandas data structure.
328 Parameters
329 ----------
330 data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
331 Dict can contain Series, arrays, constants, or list-like objects.
333 .. versionchanged:: 0.23.0
334 If data is a dict, column order follows insertion-order for
335 Python 3.6 and later.
337 .. versionchanged:: 0.25.0
338 If data is a list of dicts, column order follows insertion-order
339 for Python 3.6 and later.
341 index : Index or array-like
342 Index to use for resulting frame. Will default to RangeIndex if
343 no indexing information part of input data and no index provided.
344 columns : Index or array-like
345 Column labels to use for resulting frame. Will default to
346 RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
347 dtype : dtype, default None
348 Data type to force. Only a single dtype is allowed. If None, infer.
349 copy : bool, default False
350 Copy data from inputs. Only affects DataFrame / 2d ndarray input.
352 See Also
353 --------
354 DataFrame.from_records : Constructor from tuples, also record arrays.
355 DataFrame.from_dict : From dicts of Series, arrays, or dicts.
356 read_csv
357 read_table
358 read_clipboard
360 Examples
361 --------
362 Constructing DataFrame from a dictionary.
364 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
365 >>> df = pd.DataFrame(data=d)
366 >>> df
367 col1 col2
368 0 1 3
369 1 2 4
371 Notice that the inferred dtype is int64.
373 >>> df.dtypes
374 col1 int64
375 col2 int64
376 dtype: object
378 To enforce a single dtype:
380 >>> df = pd.DataFrame(data=d, dtype=np.int8)
381 >>> df.dtypes
382 col1 int8
383 col2 int8
384 dtype: object
386 Constructing DataFrame from numpy ndarray:
388 >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
389 ... columns=['a', 'b', 'c'])
390 >>> df2
391 a b c
392 0 1 2 3
393 1 4 5 6
394 2 7 8 9
395 """
397 _typ = "dataframe"
399 @property
400 def _constructor(self) -> Type["DataFrame"]:
401 return DataFrame
403 _constructor_sliced: Type[Series] = Series
404 _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([])
405 _accessors: Set[str] = {"sparse"}
407 @property
408 def _constructor_expanddim(self):
409 raise NotImplementedError("Not supported for DataFrames!")
411 # ----------------------------------------------------------------------
412 # Constructors
414 def __init__(
415 self,
416 data=None,
417 index: Optional[Axes] = None,
418 columns: Optional[Axes] = None,
419 dtype: Optional[Dtype] = None,
420 copy: bool = False,
421 ):
422 if data is None:
423 data = {}
424 if dtype is not None:
425 dtype = self._validate_dtype(dtype)
427 if isinstance(data, DataFrame):
428 data = data._data
430 if isinstance(data, BlockManager):
431 mgr = self._init_mgr(
432 data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
433 )
434 elif isinstance(data, dict):
435 mgr = init_dict(data, index, columns, dtype=dtype)
436 elif isinstance(data, ma.MaskedArray):
437 import numpy.ma.mrecords as mrecords
439 # masked recarray
440 if isinstance(data, mrecords.MaskedRecords):
441 mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
443 # a masked array
444 else:
445 mask = ma.getmaskarray(data)
446 if mask.any():
447 data, fill_value = maybe_upcast(data, copy=True)
448 data.soften_mask() # set hardmask False if it was True
449 data[mask] = fill_value
450 else:
451 data = data.copy()
452 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
454 elif isinstance(data, (np.ndarray, Series, Index)):
455 if data.dtype.names:
456 data_columns = list(data.dtype.names)
457 data = {k: data[k] for k in data_columns}
458 if columns is None:
459 columns = data_columns
460 mgr = init_dict(data, index, columns, dtype=dtype)
461 elif getattr(data, "name", None) is not None:
462 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
463 else:
464 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
466 # For data is list-like, or Iterable (will consume into list)
467 elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
468 if not isinstance(data, (abc.Sequence, ExtensionArray)):
469 data = list(data)
470 if len(data) > 0:
471 if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
472 if is_named_tuple(data[0]) and columns is None:
473 columns = data[0]._fields
474 arrays, columns = to_arrays(data, columns, dtype=dtype)
475 columns = ensure_index(columns)
477 # set the index
478 if index is None:
479 if isinstance(data[0], Series):
480 index = get_names_from_index(data)
481 elif isinstance(data[0], Categorical):
482 index = ibase.default_index(len(data[0]))
483 else:
484 index = ibase.default_index(len(data))
486 mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
487 else:
488 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
489 else:
490 mgr = init_dict({}, index, columns, dtype=dtype)
491 else:
492 try:
493 arr = np.array(data, dtype=dtype, copy=copy)
494 except (ValueError, TypeError) as e:
495 exc = TypeError(
496 "DataFrame constructor called with "
497 f"incompatible data and dtype: {e}"
498 )
499 raise exc from e
501 if arr.ndim == 0 and index is not None and columns is not None:
502 values = cast_scalar_to_array(
503 (len(index), len(columns)), data, dtype=dtype
504 )
505 mgr = init_ndarray(
506 values, index, columns, dtype=values.dtype, copy=False
507 )
508 else:
509 raise ValueError("DataFrame constructor not properly called!")
511 NDFrame.__init__(self, mgr, fastpath=True)
513 # ----------------------------------------------------------------------
515 @property
516 def axes(self) -> List[Index]:
517 """
518 Return a list representing the axes of the DataFrame.
520 It has the row axis labels and column axis labels as the only members.
521 They are returned in that order.
523 Examples
524 --------
525 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
526 >>> df.axes
527 [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
528 dtype='object')]
529 """
530 return [self.index, self.columns]
532 @property
533 def shape(self) -> Tuple[int, int]:
534 """
535 Return a tuple representing the dimensionality of the DataFrame.
537 See Also
538 --------
539 ndarray.shape
541 Examples
542 --------
543 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
544 >>> df.shape
545 (2, 2)
547 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
548 ... 'col3': [5, 6]})
549 >>> df.shape
550 (2, 3)
551 """
552 return len(self.index), len(self.columns)
554 @property
555 def _is_homogeneous_type(self) -> bool:
556 """
557 Whether all the columns in a DataFrame have the same type.
559 Returns
560 -------
561 bool
563 See Also
564 --------
565 Index._is_homogeneous_type : Whether the object has a single
566 dtype.
567 MultiIndex._is_homogeneous_type : Whether all the levels of a
568 MultiIndex have the same dtype.
570 Examples
571 --------
572 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
573 True
574 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
575 False
577 Items with the same type but different sizes are considered
578 different types.
580 >>> DataFrame({
581 ... "A": np.array([1, 2], dtype=np.int32),
582 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
583 False
584 """
585 if self._data.any_extension_types:
586 return len({block.dtype for block in self._data.blocks}) == 1
587 else:
588 return not self._data.is_mixed_type
590 # ----------------------------------------------------------------------
591 # Rendering Methods
593 def _repr_fits_vertical_(self) -> bool:
594 """
595 Check length against max_rows.
596 """
597 max_rows = get_option("display.max_rows")
598 return len(self) <= max_rows
600 def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
601 """
602 Check if full repr fits in horizontal boundaries imposed by the display
603 options width and max_columns.
605 In case off non-interactive session, no boundaries apply.
607 `ignore_width` is here so ipnb+HTML output can behave the way
608 users expect. display.max_columns remains in effect.
609 GH3541, GH3573
610 """
611 width, height = console.get_console_size()
612 max_columns = get_option("display.max_columns")
613 nb_columns = len(self.columns)
615 # exceed max columns
616 if (max_columns and nb_columns > max_columns) or (
617 (not ignore_width) and width and nb_columns > (width // 2)
618 ):
619 return False
621 # used by repr_html under IPython notebook or scripts ignore terminal
622 # dims
623 if ignore_width or not console.in_interactive_session():
624 return True
626 if get_option("display.width") is not None or console.in_ipython_frontend():
627 # check at least the column row for excessive width
628 max_rows = 1
629 else:
630 max_rows = get_option("display.max_rows")
632 # when auto-detecting, so width=None and not in ipython front end
633 # check whether repr fits horizontal by actually checking
634 # the width of the rendered repr
635 buf = StringIO()
637 # only care about the stuff we'll actually print out
638 # and to_string on entire frame may be expensive
639 d = self
641 if not (max_rows is None): # unlimited rows
642 # min of two, where one may be None
643 d = d.iloc[: min(max_rows, len(d))]
644 else:
645 return True
647 d.to_string(buf=buf)
648 value = buf.getvalue()
649 repr_width = max(len(l) for l in value.split("\n"))
651 return repr_width < width
653 def _info_repr(self) -> bool:
654 """
655 True if the repr should show the info view.
656 """
657 info_repr_option = get_option("display.large_repr") == "info"
658 return info_repr_option and not (
659 self._repr_fits_horizontal_() and self._repr_fits_vertical_()
660 )
662 def __repr__(self) -> str:
663 """
664 Return a string representation for a particular DataFrame.
665 """
666 buf = StringIO("")
667 if self._info_repr():
668 self.info(buf=buf)
669 return buf.getvalue()
671 max_rows = get_option("display.max_rows")
672 min_rows = get_option("display.min_rows")
673 max_cols = get_option("display.max_columns")
674 max_colwidth = get_option("display.max_colwidth")
675 show_dimensions = get_option("display.show_dimensions")
676 if get_option("display.expand_frame_repr"):
677 width, _ = console.get_console_size()
678 else:
679 width = None
680 self.to_string(
681 buf=buf,
682 max_rows=max_rows,
683 min_rows=min_rows,
684 max_cols=max_cols,
685 line_width=width,
686 max_colwidth=max_colwidth,
687 show_dimensions=show_dimensions,
688 )
690 return buf.getvalue()
692 def _repr_html_(self) -> Optional[str]:
693 """
694 Return a html representation for a particular DataFrame.
696 Mainly for IPython notebook.
697 """
698 if self._info_repr():
699 buf = StringIO("")
700 self.info(buf=buf)
701 # need to escape the <class>, should be the first line.
702 val = buf.getvalue().replace("<", r"<", 1)
703 val = val.replace(">", r">", 1)
704 return "<pre>" + val + "</pre>"
706 if get_option("display.notebook_repr_html"):
707 max_rows = get_option("display.max_rows")
708 min_rows = get_option("display.min_rows")
709 max_cols = get_option("display.max_columns")
710 show_dimensions = get_option("display.show_dimensions")
712 formatter = fmt.DataFrameFormatter(
713 self,
714 columns=None,
715 col_space=None,
716 na_rep="NaN",
717 formatters=None,
718 float_format=None,
719 sparsify=None,
720 justify=None,
721 index_names=True,
722 header=True,
723 index=True,
724 bold_rows=True,
725 escape=True,
726 max_rows=max_rows,
727 min_rows=min_rows,
728 max_cols=max_cols,
729 show_dimensions=show_dimensions,
730 decimal=".",
731 table_id=None,
732 render_links=False,
733 )
734 return formatter.to_html(notebook=True)
735 else:
736 return None
738 @Substitution(
739 header_type="bool or sequence",
740 header="Write out the column names. If a list of strings "
741 "is given, it is assumed to be aliases for the "
742 "column names",
743 col_space_type="int",
744 col_space="The minimum width of each column",
745 )
746 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
747 def to_string(
748 self,
749 buf: Optional[FilePathOrBuffer[str]] = None,
750 columns: Optional[Sequence[str]] = None,
751 col_space: Optional[int] = None,
752 header: Union[bool, Sequence[str]] = True,
753 index: bool = True,
754 na_rep: str = "NaN",
755 formatters: Optional[fmt.formatters_type] = None,
756 float_format: Optional[fmt.float_format_type] = None,
757 sparsify: Optional[bool] = None,
758 index_names: bool = True,
759 justify: Optional[str] = None,
760 max_rows: Optional[int] = None,
761 min_rows: Optional[int] = None,
762 max_cols: Optional[int] = None,
763 show_dimensions: bool = False,
764 decimal: str = ".",
765 line_width: Optional[int] = None,
766 max_colwidth: Optional[int] = None,
767 encoding: Optional[str] = None,
768 ) -> Optional[str]:
769 """
770 Render a DataFrame to a console-friendly tabular output.
771 %(shared_params)s
772 line_width : int, optional
773 Width to wrap a line in characters.
774 max_colwidth : int, optional
775 Max width to truncate each column in characters. By default, no limit.
777 .. versionadded:: 1.0.0
778 encoding : str, default "utf-8"
779 Set character encoding.
781 .. versionadded:: 1.0
782 %(returns)s
783 See Also
784 --------
785 to_html : Convert DataFrame to HTML.
787 Examples
788 --------
789 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
790 >>> df = pd.DataFrame(d)
791 >>> print(df.to_string())
792 col1 col2
793 0 1 4
794 1 2 5
795 2 3 6
796 """
798 from pandas import option_context
800 with option_context("display.max_colwidth", max_colwidth):
801 formatter = fmt.DataFrameFormatter(
802 self,
803 columns=columns,
804 col_space=col_space,
805 na_rep=na_rep,
806 formatters=formatters,
807 float_format=float_format,
808 sparsify=sparsify,
809 justify=justify,
810 index_names=index_names,
811 header=header,
812 index=index,
813 min_rows=min_rows,
814 max_rows=max_rows,
815 max_cols=max_cols,
816 show_dimensions=show_dimensions,
817 decimal=decimal,
818 line_width=line_width,
819 )
820 return formatter.to_string(buf=buf, encoding=encoding)
822 # ----------------------------------------------------------------------
824 @property
825 def style(self) -> "Styler":
826 """
827 Returns a Styler object.
829 Contains methods for building a styled HTML representation of the DataFrame.
830 a styled HTML representation fo the DataFrame.
832 See Also
833 --------
834 io.formats.style.Styler
835 """
836 from pandas.io.formats.style import Styler
838 return Styler(self)
840 _shared_docs[
841 "items"
842 ] = r"""
843 Iterate over (column name, Series) pairs.
845 Iterates over the DataFrame columns, returning a tuple with
846 the column name and the content as a Series.
848 Yields
849 ------
850 label : object
851 The column names for the DataFrame being iterated over.
852 content : Series
853 The column entries belonging to each label, as a Series.
855 See Also
856 --------
857 DataFrame.iterrows : Iterate over DataFrame rows as
858 (index, Series) pairs.
859 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
860 of the values.
862 Examples
863 --------
864 >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
865 ... 'population': [1864, 22000, 80000]},
866 ... index=['panda', 'polar', 'koala'])
867 >>> df
868 species population
869 panda bear 1864
870 polar bear 22000
871 koala marsupial 80000
872 >>> for label, content in df.items():
873 ... print('label:', label)
874 ... print('content:', content, sep='\n')
875 ...
876 label: species
877 content:
878 panda bear
879 polar bear
880 koala marsupial
881 Name: species, dtype: object
882 label: population
883 content:
884 panda 1864
885 polar 22000
886 koala 80000
887 Name: population, dtype: int64
888 """
890 @Appender(_shared_docs["items"])
891 def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
892 if self.columns.is_unique and hasattr(self, "_item_cache"):
893 for k in self.columns:
894 yield k, self._get_item_cache(k)
895 else:
896 for i, k in enumerate(self.columns):
897 yield k, self._ixs(i, axis=1)
899 @Appender(_shared_docs["items"])
900 def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
901 yield from self.items()
903 def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
904 """
905 Iterate over DataFrame rows as (index, Series) pairs.
907 Yields
908 ------
909 index : label or tuple of label
910 The index of the row. A tuple for a `MultiIndex`.
911 data : Series
912 The data of the row as a Series.
914 it : generator
915 A generator that iterates over the rows of the frame.
917 See Also
918 --------
919 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
920 DataFrame.items : Iterate over (column name, Series) pairs.
922 Notes
923 -----
925 1. Because ``iterrows`` returns a Series for each row,
926 it does **not** preserve dtypes across the rows (dtypes are
927 preserved across columns for DataFrames). For example,
929 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
930 >>> row = next(df.iterrows())[1]
931 >>> row
932 int 1.0
933 float 1.5
934 Name: 0, dtype: float64
935 >>> print(row['int'].dtype)
936 float64
937 >>> print(df['int'].dtype)
938 int64
940 To preserve dtypes while iterating over the rows, it is better
941 to use :meth:`itertuples` which returns namedtuples of the values
942 and which is generally faster than ``iterrows``.
944 2. You should **never modify** something you are iterating over.
945 This is not guaranteed to work in all cases. Depending on the
946 data types, the iterator returns a copy and not a view, and writing
947 to it will have no effect.
948 """
949 columns = self.columns
950 klass = self._constructor_sliced
951 for k, v in zip(self.index, self.values):
952 s = klass(v, index=columns, name=k)
953 yield k, s
955 def itertuples(self, index=True, name="Pandas"):
956 """
957 Iterate over DataFrame rows as namedtuples.
959 Parameters
960 ----------
961 index : bool, default True
962 If True, return the index as the first element of the tuple.
963 name : str or None, default "Pandas"
964 The name of the returned namedtuples or None to return regular
965 tuples.
967 Returns
968 -------
969 iterator
970 An object to iterate over namedtuples for each row in the
971 DataFrame with the first field possibly being the index and
972 following fields being the column values.
974 See Also
975 --------
976 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
977 pairs.
978 DataFrame.items : Iterate over (column name, Series) pairs.
980 Notes
981 -----
982 The column names will be renamed to positional names if they are
983 invalid Python identifiers, repeated, or start with an underscore.
984 On python versions < 3.7 regular tuples are returned for DataFrames
985 with a large number of columns (>254).
987 Examples
988 --------
989 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
990 ... index=['dog', 'hawk'])
991 >>> df
992 num_legs num_wings
993 dog 4 0
994 hawk 2 2
995 >>> for row in df.itertuples():
996 ... print(row)
997 ...
998 Pandas(Index='dog', num_legs=4, num_wings=0)
999 Pandas(Index='hawk', num_legs=2, num_wings=2)
1001 By setting the `index` parameter to False we can remove the index
1002 as the first element of the tuple:
1004 >>> for row in df.itertuples(index=False):
1005 ... print(row)
1006 ...
1007 Pandas(num_legs=4, num_wings=0)
1008 Pandas(num_legs=2, num_wings=2)
1010 With the `name` parameter set we set a custom name for the yielded
1011 namedtuples:
1013 >>> for row in df.itertuples(name='Animal'):
1014 ... print(row)
1015 ...
1016 Animal(Index='dog', num_legs=4, num_wings=0)
1017 Animal(Index='hawk', num_legs=2, num_wings=2)
1018 """
1019 arrays = []
1020 fields = list(self.columns)
1021 if index:
1022 arrays.append(self.index)
1023 fields.insert(0, "Index")
1025 # use integer indexing because of possible duplicate column names
1026 arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
1028 # Python versions before 3.7 support at most 255 arguments to constructors
1029 can_return_named_tuples = PY37 or len(self.columns) + index < 255
1030 if name is not None and can_return_named_tuples:
1031 itertuple = collections.namedtuple(name, fields, rename=True)
1032 return map(itertuple._make, zip(*arrays))
1034 # fallback to regular tuples
1035 return zip(*arrays)
1037 def __len__(self) -> int:
1038 """
1039 Returns length of info axis, but here we use the index.
1040 """
1041 return len(self.index)
1043 def dot(self, other):
1044 """
1045 Compute the matrix multiplication between the DataFrame and other.
1047 This method computes the matrix product between the DataFrame and the
1048 values of an other Series, DataFrame or a numpy array.
1050 It can also be called using ``self @ other`` in Python >= 3.5.
1052 Parameters
1053 ----------
1054 other : Series, DataFrame or array-like
1055 The other object to compute the matrix product with.
1057 Returns
1058 -------
1059 Series or DataFrame
1060 If other is a Series, return the matrix product between self and
1061 other as a Serie. If other is a DataFrame or a numpy.array, return
1062 the matrix product of self and other in a DataFrame of a np.array.
1064 See Also
1065 --------
1066 Series.dot: Similar method for Series.
1068 Notes
1069 -----
1070 The dimensions of DataFrame and other must be compatible in order to
1071 compute the matrix multiplication. In addition, the column names of
1072 DataFrame and the index of other must contain the same values, as they
1073 will be aligned prior to the multiplication.
1075 The dot method for Series computes the inner product, instead of the
1076 matrix product here.
1078 Examples
1079 --------
1080 Here we multiply a DataFrame with a Series.
1082 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
1083 >>> s = pd.Series([1, 1, 2, 1])
1084 >>> df.dot(s)
1085 0 -4
1086 1 5
1087 dtype: int64
1089 Here we multiply a DataFrame with another DataFrame.
1091 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
1092 >>> df.dot(other)
1093 0 1
1094 0 1 4
1095 1 2 2
1097 Note that the dot method give the same result as @
1099 >>> df @ other
1100 0 1
1101 0 1 4
1102 1 2 2
1104 The dot method works also if other is an np.array.
1106 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
1107 >>> df.dot(arr)
1108 0 1
1109 0 1 4
1110 1 2 2
1112 Note how shuffling of the objects does not change the result.
1114 >>> s2 = s.reindex([1, 0, 2, 3])
1115 >>> df.dot(s2)
1116 0 -4
1117 1 5
1118 dtype: int64
1119 """
1120 if isinstance(other, (Series, DataFrame)):
1121 common = self.columns.union(other.index)
1122 if len(common) > len(self.columns) or len(common) > len(other.index):
1123 raise ValueError("matrices are not aligned")
1125 left = self.reindex(columns=common, copy=False)
1126 right = other.reindex(index=common, copy=False)
1127 lvals = left.values
1128 rvals = right.values
1129 else:
1130 left = self
1131 lvals = self.values
1132 rvals = np.asarray(other)
1133 if lvals.shape[1] != rvals.shape[0]:
1134 raise ValueError(
1135 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
1136 )
1138 if isinstance(other, DataFrame):
1139 return self._constructor(
1140 np.dot(lvals, rvals), index=left.index, columns=other.columns
1141 )
1142 elif isinstance(other, Series):
1143 return Series(np.dot(lvals, rvals), index=left.index)
1144 elif isinstance(rvals, (np.ndarray, Index)):
1145 result = np.dot(lvals, rvals)
1146 if result.ndim == 2:
1147 return self._constructor(result, index=left.index)
1148 else:
1149 return Series(result, index=left.index)
1150 else: # pragma: no cover
1151 raise TypeError(f"unsupported type: {type(other)}")
1153 def __matmul__(self, other):
1154 """
1155 Matrix multiplication using binary `@` operator in Python>=3.5.
1156 """
1157 return self.dot(other)
1159 def __rmatmul__(self, other):
1160 """
1161 Matrix multiplication using binary `@` operator in Python>=3.5.
1162 """
1163 return self.T.dot(np.transpose(other)).T
1165 # ----------------------------------------------------------------------
1166 # IO methods (to / from other formats)
1168 @classmethod
1169 def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame":
1170 """
1171 Construct DataFrame from dict of array-like or dicts.
1173 Creates DataFrame object from dictionary by columns or by index
1174 allowing dtype specification.
1176 Parameters
1177 ----------
1178 data : dict
1179 Of the form {field : array-like} or {field : dict}.
1180 orient : {'columns', 'index'}, default 'columns'
1181 The "orientation" of the data. If the keys of the passed dict
1182 should be the columns of the resulting DataFrame, pass 'columns'
1183 (default). Otherwise if the keys should be rows, pass 'index'.
1184 dtype : dtype, default None
1185 Data type to force, otherwise infer.
1186 columns : list, default None
1187 Column labels to use when ``orient='index'``. Raises a ValueError
1188 if used with ``orient='columns'``.
1190 .. versionadded:: 0.23.0
1192 Returns
1193 -------
1194 DataFrame
1196 See Also
1197 --------
1198 DataFrame.from_records : DataFrame from ndarray (structured
1199 dtype), list of tuples, dict, or DataFrame.
1200 DataFrame : DataFrame object creation using constructor.
1202 Examples
1203 --------
1204 By default the keys of the dict become the DataFrame columns:
1206 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
1207 >>> pd.DataFrame.from_dict(data)
1208 col_1 col_2
1209 0 3 a
1210 1 2 b
1211 2 1 c
1212 3 0 d
1214 Specify ``orient='index'`` to create the DataFrame using dictionary
1215 keys as rows:
1217 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
1218 >>> pd.DataFrame.from_dict(data, orient='index')
1219 0 1 2 3
1220 row_1 3 2 1 0
1221 row_2 a b c d
1223 When using the 'index' orientation, the column names can be
1224 specified manually:
1226 >>> pd.DataFrame.from_dict(data, orient='index',
1227 ... columns=['A', 'B', 'C', 'D'])
1228 A B C D
1229 row_1 3 2 1 0
1230 row_2 a b c d
1231 """
1232 index = None
1233 orient = orient.lower()
1234 if orient == "index":
1235 if len(data) > 0:
1236 # TODO speed up Series case
1237 if isinstance(list(data.values())[0], (Series, dict)):
1238 data = _from_nested_dict(data)
1239 else:
1240 data, index = list(data.values()), list(data.keys())
1241 elif orient == "columns":
1242 if columns is not None:
1243 raise ValueError("cannot use columns parameter with orient='columns'")
1244 else: # pragma: no cover
1245 raise ValueError("only recognize index or columns for orient")
1247 return cls(data, index=index, columns=columns, dtype=dtype)
1249 def to_numpy(self, dtype=None, copy=False) -> np.ndarray:
1250 """
1251 Convert the DataFrame to a NumPy array.
1253 .. versionadded:: 0.24.0
1255 By default, the dtype of the returned array will be the common NumPy
1256 dtype of all types in the DataFrame. For example, if the dtypes are
1257 ``float16`` and ``float32``, the results dtype will be ``float32``.
1258 This may require copying data and coercing values, which may be
1259 expensive.
1261 Parameters
1262 ----------
1263 dtype : str or numpy.dtype, optional
1264 The dtype to pass to :meth:`numpy.asarray`.
1265 copy : bool, default False
1266 Whether to ensure that the returned value is a not a view on
1267 another array. Note that ``copy=False`` does not *ensure* that
1268 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
1269 a copy is made, even if not strictly necessary.
1271 Returns
1272 -------
1273 numpy.ndarray
1275 See Also
1276 --------
1277 Series.to_numpy : Similar method for Series.
1279 Examples
1280 --------
1281 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
1282 array([[1, 3],
1283 [2, 4]])
1285 With heterogeneous data, the lowest common type will have to
1286 be used.
1288 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
1289 >>> df.to_numpy()
1290 array([[1. , 3. ],
1291 [2. , 4.5]])
1293 For a mix of numeric and non-numeric types, the output array will
1294 have object dtype.
1296 >>> df['C'] = pd.date_range('2000', periods=2)
1297 >>> df.to_numpy()
1298 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
1299 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
1300 """
1301 result = np.array(self.values, dtype=dtype, copy=copy)
1302 return result
1304 def to_dict(self, orient="dict", into=dict):
1305 """
1306 Convert the DataFrame to a dictionary.
1308 The type of the key-value pairs can be customized with the parameters
1309 (see below).
1311 Parameters
1312 ----------
1313 orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
1314 Determines the type of the values of the dictionary.
1316 - 'dict' (default) : dict like {column -> {index -> value}}
1317 - 'list' : dict like {column -> [values]}
1318 - 'series' : dict like {column -> Series(values)}
1319 - 'split' : dict like
1320 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1321 - 'records' : list like
1322 [{column -> value}, ... , {column -> value}]
1323 - 'index' : dict like {index -> {column -> value}}
1325 Abbreviations are allowed. `s` indicates `series` and `sp`
1326 indicates `split`.
1328 into : class, default dict
1329 The collections.abc.Mapping subclass used for all Mappings
1330 in the return value. Can be the actual class or an empty
1331 instance of the mapping type you want. If you want a
1332 collections.defaultdict, you must pass it initialized.
1334 .. versionadded:: 0.21.0
1336 Returns
1337 -------
1338 dict, list or collections.abc.Mapping
1339 Return a collections.abc.Mapping object representing the DataFrame.
1340 The resulting transformation depends on the `orient` parameter.
1342 See Also
1343 --------
1344 DataFrame.from_dict: Create a DataFrame from a dictionary.
1345 DataFrame.to_json: Convert a DataFrame to JSON format.
1347 Examples
1348 --------
1349 >>> df = pd.DataFrame({'col1': [1, 2],
1350 ... 'col2': [0.5, 0.75]},
1351 ... index=['row1', 'row2'])
1352 >>> df
1353 col1 col2
1354 row1 1 0.50
1355 row2 2 0.75
1356 >>> df.to_dict()
1357 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
1359 You can specify the return orientation.
1361 >>> df.to_dict('series')
1362 {'col1': row1 1
1363 row2 2
1364 Name: col1, dtype: int64,
1365 'col2': row1 0.50
1366 row2 0.75
1367 Name: col2, dtype: float64}
1369 >>> df.to_dict('split')
1370 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1371 'data': [[1, 0.5], [2, 0.75]]}
1373 >>> df.to_dict('records')
1374 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
1376 >>> df.to_dict('index')
1377 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
1379 You can also specify the mapping type.
1381 >>> from collections import OrderedDict, defaultdict
1382 >>> df.to_dict(into=OrderedDict)
1383 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
1384 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
1386 If you want a `defaultdict`, you need to initialize it:
1388 >>> dd = defaultdict(list)
1389 >>> df.to_dict('records', into=dd)
1390 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
1391 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
1392 """
1393 if not self.columns.is_unique:
1394 warnings.warn(
1395 "DataFrame columns are not unique, some columns will be omitted.",
1396 UserWarning,
1397 stacklevel=2,
1398 )
1399 # GH16122
1400 into_c = com.standardize_mapping(into)
1401 if orient.lower().startswith("d"):
1402 return into_c((k, v.to_dict(into)) for k, v in self.items())
1403 elif orient.lower().startswith("l"):
1404 return into_c((k, v.tolist()) for k, v in self.items())
1405 elif orient.lower().startswith("sp"):
1406 return into_c(
1407 (
1408 ("index", self.index.tolist()),
1409 ("columns", self.columns.tolist()),
1410 (
1411 "data",
1412 [
1413 list(map(com.maybe_box_datetimelike, t))
1414 for t in self.itertuples(index=False, name=None)
1415 ],
1416 ),
1417 )
1418 )
1419 elif orient.lower().startswith("s"):
1420 return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
1421 elif orient.lower().startswith("r"):
1422 columns = self.columns.tolist()
1423 rows = (
1424 dict(zip(columns, row))
1425 for row in self.itertuples(index=False, name=None)
1426 )
1427 return [
1428 into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
1429 for row in rows
1430 ]
1431 elif orient.lower().startswith("i"):
1432 if not self.index.is_unique:
1433 raise ValueError("DataFrame index must be unique for orient='index'.")
1434 return into_c(
1435 (t[0], dict(zip(self.columns, t[1:])))
1436 for t in self.itertuples(name=None)
1437 )
1438 else:
1439 raise ValueError(f"orient '{orient}' not understood")
1441 def to_gbq(
1442 self,
1443 destination_table,
1444 project_id=None,
1445 chunksize=None,
1446 reauth=False,
1447 if_exists="fail",
1448 auth_local_webserver=False,
1449 table_schema=None,
1450 location=None,
1451 progress_bar=True,
1452 credentials=None,
1453 ) -> None:
1454 """
1455 Write a DataFrame to a Google BigQuery table.
1457 This function requires the `pandas-gbq package
1458 <https://pandas-gbq.readthedocs.io>`__.
1460 See the `How to authenticate with Google BigQuery
1461 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
1462 guide for authentication instructions.
1464 Parameters
1465 ----------
1466 destination_table : str
1467 Name of table to be written, in the form ``dataset.tablename``.
1468 project_id : str, optional
1469 Google BigQuery Account project ID. Optional when available from
1470 the environment.
1471 chunksize : int, optional
1472 Number of rows to be inserted in each chunk from the dataframe.
1473 Set to ``None`` to load the whole dataframe at once.
1474 reauth : bool, default False
1475 Force Google BigQuery to re-authenticate the user. This is useful
1476 if multiple accounts are used.
1477 if_exists : str, default 'fail'
1478 Behavior when the destination table exists. Value can be one of:
1480 ``'fail'``
1481 If table exists raise pandas_gbq.gbq.TableCreationError.
1482 ``'replace'``
1483 If table exists, drop it, recreate it, and insert data.
1484 ``'append'``
1485 If table exists, insert data. Create if does not exist.
1486 auth_local_webserver : bool, default False
1487 Use the `local webserver flow`_ instead of the `console flow`_
1488 when getting user credentials.
1490 .. _local webserver flow:
1491 http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
1492 .. _console flow:
1493 http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
1495 *New in version 0.2.0 of pandas-gbq*.
1496 table_schema : list of dicts, optional
1497 List of BigQuery table fields to which according DataFrame
1498 columns conform to, e.g. ``[{'name': 'col1', 'type':
1499 'STRING'},...]``. If schema is not provided, it will be
1500 generated according to dtypes of DataFrame columns. See
1501 BigQuery API documentation on available names of a field.
1503 *New in version 0.3.1 of pandas-gbq*.
1504 location : str, optional
1505 Location where the load job should run. See the `BigQuery locations
1506 documentation
1507 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
1508 list of available locations. The location must match that of the
1509 target dataset.
1511 *New in version 0.5.0 of pandas-gbq*.
1512 progress_bar : bool, default True
1513 Use the library `tqdm` to show the progress bar for the upload,
1514 chunk by chunk.
1516 *New in version 0.5.0 of pandas-gbq*.
1517 credentials : google.auth.credentials.Credentials, optional
1518 Credentials for accessing Google APIs. Use this parameter to
1519 override default credentials, such as to use Compute Engine
1520 :class:`google.auth.compute_engine.Credentials` or Service
1521 Account :class:`google.oauth2.service_account.Credentials`
1522 directly.
1524 *New in version 0.8.0 of pandas-gbq*.
1526 .. versionadded:: 0.24.0
1528 See Also
1529 --------
1530 pandas_gbq.to_gbq : This function in the pandas-gbq library.
1531 read_gbq : Read a DataFrame from Google BigQuery.
1532 """
1533 from pandas.io import gbq
1535 gbq.to_gbq(
1536 self,
1537 destination_table,
1538 project_id=project_id,
1539 chunksize=chunksize,
1540 reauth=reauth,
1541 if_exists=if_exists,
1542 auth_local_webserver=auth_local_webserver,
1543 table_schema=table_schema,
1544 location=location,
1545 progress_bar=progress_bar,
1546 credentials=credentials,
1547 )
1549 @classmethod
1550 def from_records(
1551 cls,
1552 data,
1553 index=None,
1554 exclude=None,
1555 columns=None,
1556 coerce_float=False,
1557 nrows=None,
1558 ) -> "DataFrame":
1559 """
1560 Convert structured or record ndarray to DataFrame.
1562 Parameters
1563 ----------
1564 data : ndarray (structured dtype), list of tuples, dict, or DataFrame
1565 index : str, list of fields, array-like
1566 Field of array to use as the index, alternately a specific set of
1567 input labels to use.
1568 exclude : sequence, default None
1569 Columns or fields to exclude.
1570 columns : sequence, default None
1571 Column names to use. If the passed data do not have names
1572 associated with them, this argument provides names for the
1573 columns. Otherwise this argument indicates the order of the columns
1574 in the result (any names not found in the data will become all-NA
1575 columns).
1576 coerce_float : bool, default False
1577 Attempt to convert values of non-string, non-numeric objects (like
1578 decimal.Decimal) to floating point, useful for SQL result sets.
1579 nrows : int, default None
1580 Number of rows to read if data is an iterator.
1582 Returns
1583 -------
1584 DataFrame
1585 """
1587 # Make a copy of the input columns so we can modify it
1588 if columns is not None:
1589 columns = ensure_index(columns)
1591 if is_iterator(data):
1592 if nrows == 0:
1593 return cls()
1595 try:
1596 first_row = next(data)
1597 except StopIteration:
1598 return cls(index=index, columns=columns)
1600 dtype = None
1601 if hasattr(first_row, "dtype") and first_row.dtype.names:
1602 dtype = first_row.dtype
1604 values = [first_row]
1606 if nrows is None:
1607 values += data
1608 else:
1609 values.extend(itertools.islice(data, nrows - 1))
1611 if dtype is not None:
1612 data = np.array(values, dtype=dtype)
1613 else:
1614 data = values
1616 if isinstance(data, dict):
1617 if columns is None:
1618 columns = arr_columns = ensure_index(sorted(data))
1619 arrays = [data[k] for k in columns]
1620 else:
1621 arrays = []
1622 arr_columns = []
1623 for k, v in data.items():
1624 if k in columns:
1625 arr_columns.append(k)
1626 arrays.append(v)
1628 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
1630 elif isinstance(data, (np.ndarray, DataFrame)):
1631 arrays, columns = to_arrays(data, columns)
1632 if columns is not None:
1633 columns = ensure_index(columns)
1634 arr_columns = columns
1635 else:
1636 arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)
1638 arr_columns = ensure_index(arr_columns)
1639 if columns is not None:
1640 columns = ensure_index(columns)
1641 else:
1642 columns = arr_columns
1644 if exclude is None:
1645 exclude = set()
1646 else:
1647 exclude = set(exclude)
1649 result_index = None
1650 if index is not None:
1651 if isinstance(index, str) or not hasattr(index, "__iter__"):
1652 i = columns.get_loc(index)
1653 exclude.add(index)
1654 if len(arrays) > 0:
1655 result_index = Index(arrays[i], name=index)
1656 else:
1657 result_index = Index([], name=index)
1658 else:
1659 try:
1660 index_data = [arrays[arr_columns.get_loc(field)] for field in index]
1661 except (KeyError, TypeError):
1662 # raised by get_loc, see GH#29258
1663 result_index = index
1664 else:
1665 result_index = ensure_index_from_sequences(index_data, names=index)
1666 exclude.update(index)
1668 if any(exclude):
1669 arr_exclude = [x for x in exclude if x in arr_columns]
1670 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
1671 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
1673 arr_columns = arr_columns.drop(arr_exclude)
1674 columns = columns.drop(exclude)
1676 mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
1678 return cls(mgr)
1680 def to_records(
1681 self, index=True, column_dtypes=None, index_dtypes=None
1682 ) -> np.recarray:
1683 """
1684 Convert DataFrame to a NumPy record array.
1686 Index will be included as the first field of the record array if
1687 requested.
1689 Parameters
1690 ----------
1691 index : bool, default True
1692 Include index in resulting record array, stored in 'index'
1693 field or using the index label, if set.
1694 column_dtypes : str, type, dict, default None
1695 .. versionadded:: 0.24.0
1697 If a string or type, the data type to store all columns. If
1698 a dictionary, a mapping of column names and indices (zero-indexed)
1699 to specific data types.
1700 index_dtypes : str, type, dict, default None
1701 .. versionadded:: 0.24.0
1703 If a string or type, the data type to store all index levels. If
1704 a dictionary, a mapping of index level names and indices
1705 (zero-indexed) to specific data types.
1707 This mapping is applied only if `index=True`.
1709 Returns
1710 -------
1711 numpy.recarray
1712 NumPy ndarray with the DataFrame labels as fields and each row
1713 of the DataFrame as entries.
1715 See Also
1716 --------
1717 DataFrame.from_records: Convert structured or record ndarray
1718 to DataFrame.
1719 numpy.recarray: An ndarray that allows field access using
1720 attributes, analogous to typed columns in a
1721 spreadsheet.
1723 Examples
1724 --------
1725 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
1726 ... index=['a', 'b'])
1727 >>> df
1728 A B
1729 a 1 0.50
1730 b 2 0.75
1731 >>> df.to_records()
1732 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1733 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
1735 If the DataFrame index has no label then the recarray field name
1736 is set to 'index'. If the index has a label then this is used as the
1737 field name:
1739 >>> df.index = df.index.rename("I")
1740 >>> df.to_records()
1741 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1742 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
1744 The index can be excluded from the record array:
1746 >>> df.to_records(index=False)
1747 rec.array([(1, 0.5 ), (2, 0.75)],
1748 dtype=[('A', '<i8'), ('B', '<f8')])
1750 Data types can be specified for the columns:
1752 >>> df.to_records(column_dtypes={"A": "int32"})
1753 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1754 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
1756 As well as for the index:
1758 >>> df.to_records(index_dtypes="<S2")
1759 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
1760 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
1762 >>> index_dtypes = f"<S{df.index.str.len().max()}"
1763 >>> df.to_records(index_dtypes=index_dtypes)
1764 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
1765 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
1766 """
1768 if index:
1769 if isinstance(self.index, ABCMultiIndex):
1770 # array of tuples to numpy cols. copy copy copy
1771 ix_vals = list(map(np.array, zip(*self.index.values)))
1772 else:
1773 ix_vals = [self.index.values]
1775 arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns]
1777 count = 0
1778 index_names = list(self.index.names)
1780 if isinstance(self.index, ABCMultiIndex):
1781 for i, n in enumerate(index_names):
1782 if n is None:
1783 index_names[i] = f"level_{count}"
1784 count += 1
1785 elif index_names[0] is None:
1786 index_names = ["index"]
1788 names = [str(name) for name in itertools.chain(index_names, self.columns)]
1789 else:
1790 arrays = [self[c]._internal_get_values() for c in self.columns]
1791 names = [str(c) for c in self.columns]
1792 index_names = []
1794 index_len = len(index_names)
1795 formats = []
1797 for i, v in enumerate(arrays):
1798 index = i
1800 # When the names and arrays are collected, we
1801 # first collect those in the DataFrame's index,
1802 # followed by those in its columns.
1803 #
1804 # Thus, the total length of the array is:
1805 # len(index_names) + len(DataFrame.columns).
1806 #
1807 # This check allows us to see whether we are
1808 # handling a name / array in the index or column.
1809 if index < index_len:
1810 dtype_mapping = index_dtypes
1811 name = index_names[index]
1812 else:
1813 index -= index_len
1814 dtype_mapping = column_dtypes
1815 name = self.columns[index]
1817 # We have a dictionary, so we get the data type
1818 # associated with the index or column (which can
1819 # be denoted by its name in the DataFrame or its
1820 # position in DataFrame's array of indices or
1821 # columns, whichever is applicable.
1822 if is_dict_like(dtype_mapping):
1823 if name in dtype_mapping:
1824 dtype_mapping = dtype_mapping[name]
1825 elif index in dtype_mapping:
1826 dtype_mapping = dtype_mapping[index]
1827 else:
1828 dtype_mapping = None
1830 # If no mapping can be found, use the array's
1831 # dtype attribute for formatting.
1832 #
1833 # A valid dtype must either be a type or
1834 # string naming a type.
1835 if dtype_mapping is None:
1836 formats.append(v.dtype)
1837 elif isinstance(dtype_mapping, (type, np.dtype, str)):
1838 formats.append(dtype_mapping)
1839 else:
1840 element = "row" if i < index_len else "column"
1841 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
1842 raise ValueError(msg)
1844 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
1846 @classmethod
1847 def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame":
1848 mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
1849 return cls(mgr)
1851 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
1852 def to_stata(
1853 self,
1854 path,
1855 convert_dates=None,
1856 write_index=True,
1857 byteorder=None,
1858 time_stamp=None,
1859 data_label=None,
1860 variable_labels=None,
1861 version=114,
1862 convert_strl=None,
1863 ):
1864 """
1865 Export DataFrame object to Stata dta format.
1867 Writes the DataFrame to a Stata dataset file.
1868 "dta" files contain a Stata dataset.
1870 Parameters
1871 ----------
1872 path : str, buffer or path object
1873 String, path object (pathlib.Path or py._path.local.LocalPath) or
1874 object implementing a binary write() function. If using a buffer
1875 then the buffer will not be automatically closed after the file
1876 data has been written.
1878 .. versionchanged:: 1.0.0
1880 Previously this was "fname"
1882 convert_dates : dict
1883 Dictionary mapping columns containing datetime types to stata
1884 internal format to use when writing the dates. Options are 'tc',
1885 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
1886 or a name. Datetime columns that do not have a conversion type
1887 specified will be converted to 'tc'. Raises NotImplementedError if
1888 a datetime column has timezone information.
1889 write_index : bool
1890 Write the index to Stata dataset.
1891 byteorder : str
1892 Can be ">", "<", "little", or "big". default is `sys.byteorder`.
1893 time_stamp : datetime
1894 A datetime to use as file creation date. Default is the current
1895 time.
1896 data_label : str, optional
1897 A label for the data set. Must be 80 characters or smaller.
1898 variable_labels : dict
1899 Dictionary containing columns as keys and variable labels as
1900 values. Each label must be 80 characters or smaller.
1901 version : {114, 117, 118, 119, None}, default 114
1902 Version to use in the output dta file. Set to None to let pandas
1903 decide between 118 or 119 formats depending on the number of
1904 columns in the frame. Version 114 can be read by Stata 10 and
1905 later. Version 117 can be read by Stata 13 or later. Version 118
1906 is supported in Stata 14 and later. Version 119 is supported in
1907 Stata 15 and later. Version 114 limits string variables to 244
1908 characters or fewer while versions 117 and later allow strings
1909 with lengths up to 2,000,000 characters. Versions 118 and 119
1910 support Unicode characters, and version 119 supports more than
1911 32,767 variables.
1913 .. versionadded:: 0.23.0
1914 .. versionchanged:: 1.0.0
1916 Added support for formats 118 and 119.
1918 convert_strl : list, optional
1919 List of column names to convert to string columns to Stata StrL
1920 format. Only available if version is 117. Storing strings in the
1921 StrL format can produce smaller dta files if strings have more than
1922 8 characters and values are repeated.
1924 .. versionadded:: 0.23.0
1926 Raises
1927 ------
1928 NotImplementedError
1929 * If datetimes contain timezone information
1930 * Column dtype is not representable in Stata
1931 ValueError
1932 * Columns listed in convert_dates are neither datetime64[ns]
1933 or datetime.datetime
1934 * Column listed in convert_dates is not in DataFrame
1935 * Categorical label contains more than 32,000 characters
1937 See Also
1938 --------
1939 read_stata : Import Stata data files.
1940 io.stata.StataWriter : Low-level writer for Stata data files.
1941 io.stata.StataWriter117 : Low-level writer for version 117 files.
1943 Examples
1944 --------
1945 >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
1946 ... 'parrot'],
1947 ... 'speed': [350, 18, 361, 15]})
1948 >>> df.to_stata('animals.dta') # doctest: +SKIP
1949 """
1950 if version not in (114, 117, 118, 119, None):
1951 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
1952 if version == 114:
1953 if convert_strl is not None:
1954 raise ValueError("strl is not supported in format 114")
1955 from pandas.io.stata import StataWriter as statawriter
1956 elif version == 117:
1957 from pandas.io.stata import StataWriter117 as statawriter
1958 else: # versions 118 and 119
1959 from pandas.io.stata import StataWriterUTF8 as statawriter
1961 kwargs = {}
1962 if version is None or version >= 117:
1963 # strl conversion is only supported >= 117
1964 kwargs["convert_strl"] = convert_strl
1965 if version is None or version >= 118:
1966 # Specifying the version is only supported for UTF8 (118 or 119)
1967 kwargs["version"] = version
1969 writer = statawriter(
1970 path,
1971 self,
1972 convert_dates=convert_dates,
1973 byteorder=byteorder,
1974 time_stamp=time_stamp,
1975 data_label=data_label,
1976 write_index=write_index,
1977 variable_labels=variable_labels,
1978 **kwargs,
1979 )
1980 writer.write_file()
1982 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
1983 def to_feather(self, path) -> None:
1984 """
1985 Write out the binary feather-format for DataFrames.
1987 Parameters
1988 ----------
1989 path : str
1990 String file path.
1991 """
1992 from pandas.io.feather_format import to_feather
1994 to_feather(self, path)
1996 @Appender(
1997 """
1998 Examples
1999 --------
2000 >>> df = pd.DataFrame(
2001 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2002 ... )
2003 >>> print(df.to_markdown())
2004 | | animal_1 | animal_2 |
2005 |---:|:-----------|:-----------|
2006 | 0 | elk | dog |
2007 | 1 | pig | quetzal |
2008 """
2009 )
2010 @Substitution(klass="DataFrame")
2011 @Appender(_shared_docs["to_markdown"])
2012 def to_markdown(
2013 self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
2014 ) -> Optional[str]:
2015 kwargs.setdefault("headers", "keys")
2016 kwargs.setdefault("tablefmt", "pipe")
2017 tabulate = import_optional_dependency("tabulate")
2018 result = tabulate.tabulate(self, **kwargs)
2019 if buf is None:
2020 return result
2021 buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode)
2022 assert buf is not None # Help mypy.
2023 buf.writelines(result)
2024 return None
2026 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2027 def to_parquet(
2028 self,
2029 path,
2030 engine="auto",
2031 compression="snappy",
2032 index=None,
2033 partition_cols=None,
2034 **kwargs,
2035 ) -> None:
2036 """
2037 Write a DataFrame to the binary parquet format.
2039 .. versionadded:: 0.21.0
2041 This function writes the dataframe as a `parquet file
2042 <https://parquet.apache.org/>`_. You can choose different parquet
2043 backends, and have the option of compression. See
2044 :ref:`the user guide <io.parquet>` for more details.
2046 Parameters
2047 ----------
2048 path : str
2049 File path or Root Directory path. Will be used as Root Directory
2050 path while writing a partitioned dataset.
2052 .. versionchanged:: 1.0.0
2054 Previously this was "fname"
2056 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
2057 Parquet library to use. If 'auto', then the option
2058 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
2059 behavior is to try 'pyarrow', falling back to 'fastparquet' if
2060 'pyarrow' is unavailable.
2061 compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
2062 Name of the compression to use. Use ``None`` for no compression.
2063 index : bool, default None
2064 If ``True``, include the dataframe's index(es) in the file output.
2065 If ``False``, they will not be written to the file.
2066 If ``None``, similar to ``True`` the dataframe's index(es)
2067 will be saved. However, instead of being saved as values,
2068 the RangeIndex will be stored as a range in the metadata so it
2069 doesn't require much space and is faster. Other indexes will
2070 be included as columns in the file output.
2072 .. versionadded:: 0.24.0
2074 partition_cols : list, optional, default None
2075 Column names by which to partition the dataset.
2076 Columns are partitioned in the order they are given.
2078 .. versionadded:: 0.24.0
2080 **kwargs
2081 Additional arguments passed to the parquet library. See
2082 :ref:`pandas io <io.parquet>` for more details.
2084 See Also
2085 --------
2086 read_parquet : Read a parquet file.
2087 DataFrame.to_csv : Write a csv file.
2088 DataFrame.to_sql : Write to a sql table.
2089 DataFrame.to_hdf : Write to hdf.
2091 Notes
2092 -----
2093 This function requires either the `fastparquet
2094 <https://pypi.org/project/fastparquet>`_ or `pyarrow
2095 <https://arrow.apache.org/docs/python/>`_ library.
2097 Examples
2098 --------
2099 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
2100 >>> df.to_parquet('df.parquet.gzip',
2101 ... compression='gzip') # doctest: +SKIP
2102 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
2103 col1 col2
2104 0 1 3
2105 1 2 4
2106 """
2107 from pandas.io.parquet import to_parquet
2109 to_parquet(
2110 self,
2111 path,
2112 engine,
2113 compression=compression,
2114 index=index,
2115 partition_cols=partition_cols,
2116 **kwargs,
2117 )
2119 @Substitution(
2120 header_type="bool",
2121 header="Whether to print column labels, default True",
2122 col_space_type="str or int",
2123 col_space="The minimum width of each column in CSS length "
2124 "units. An int is assumed to be px units.\n\n"
2125 " .. versionadded:: 0.25.0\n"
2126 " Ability to use str",
2127 )
2128 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
2129 def to_html(
2130 self,
2131 buf=None,
2132 columns=None,
2133 col_space=None,
2134 header=True,
2135 index=True,
2136 na_rep="NaN",
2137 formatters=None,
2138 float_format=None,
2139 sparsify=None,
2140 index_names=True,
2141 justify=None,
2142 max_rows=None,
2143 max_cols=None,
2144 show_dimensions=False,
2145 decimal=".",
2146 bold_rows=True,
2147 classes=None,
2148 escape=True,
2149 notebook=False,
2150 border=None,
2151 table_id=None,
2152 render_links=False,
2153 encoding=None,
2154 ):
2155 """
2156 Render a DataFrame as an HTML table.
2157 %(shared_params)s
2158 bold_rows : bool, default True
2159 Make the row labels bold in the output.
2160 classes : str or list or tuple, default None
2161 CSS class(es) to apply to the resulting html table.
2162 escape : bool, default True
2163 Convert the characters <, >, and & to HTML-safe sequences.
2164 notebook : {True, False}, default False
2165 Whether the generated HTML is for IPython Notebook.
2166 border : int
2167 A ``border=border`` attribute is included in the opening
2168 `<table>` tag. Default ``pd.options.display.html.border``.
2169 encoding : str, default "utf-8"
2170 Set character encoding.
2172 .. versionadded:: 1.0
2174 table_id : str, optional
2175 A css id is included in the opening `<table>` tag if specified.
2177 .. versionadded:: 0.23.0
2179 render_links : bool, default False
2180 Convert URLs to HTML links.
2182 .. versionadded:: 0.24.0
2183 %(returns)s
2184 See Also
2185 --------
2186 to_string : Convert DataFrame to a string.
2187 """
2189 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
2190 raise ValueError("Invalid value for justify parameter")
2192 formatter = fmt.DataFrameFormatter(
2193 self,
2194 columns=columns,
2195 col_space=col_space,
2196 na_rep=na_rep,
2197 formatters=formatters,
2198 float_format=float_format,
2199 sparsify=sparsify,
2200 justify=justify,
2201 index_names=index_names,
2202 header=header,
2203 index=index,
2204 bold_rows=bold_rows,
2205 escape=escape,
2206 max_rows=max_rows,
2207 max_cols=max_cols,
2208 show_dimensions=show_dimensions,
2209 decimal=decimal,
2210 table_id=table_id,
2211 render_links=render_links,
2212 )
2213 # TODO: a generic formatter wld b in DataFrameFormatter
2214 return formatter.to_html(
2215 buf=buf,
2216 classes=classes,
2217 notebook=notebook,
2218 border=border,
2219 encoding=encoding,
2220 )
2222 # ----------------------------------------------------------------------
2224 def info(
2225 self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
2226 ) -> None:
2227 """
2228 Print a concise summary of a DataFrame.
2230 This method prints information about a DataFrame including
2231 the index dtype and column dtypes, non-null values and memory usage.
2233 Parameters
2234 ----------
2235 verbose : bool, optional
2236 Whether to print the full summary. By default, the setting in
2237 ``pandas.options.display.max_info_columns`` is followed.
2238 buf : writable buffer, defaults to sys.stdout
2239 Where to send the output. By default, the output is printed to
2240 sys.stdout. Pass a writable buffer if you need to further process
2241 the output.
2242 max_cols : int, optional
2243 When to switch from the verbose to the truncated output. If the
2244 DataFrame has more than `max_cols` columns, the truncated output
2245 is used. By default, the setting in
2246 ``pandas.options.display.max_info_columns`` is used.
2247 memory_usage : bool, str, optional
2248 Specifies whether total memory usage of the DataFrame
2249 elements (including the index) should be displayed. By default,
2250 this follows the ``pandas.options.display.memory_usage`` setting.
2252 True always show memory usage. False never shows memory usage.
2253 A value of 'deep' is equivalent to "True with deep introspection".
2254 Memory usage is shown in human-readable units (base-2
2255 representation). Without deep introspection a memory estimation is
2256 made based in column dtype and number of rows assuming values
2257 consume the same memory amount for corresponding dtypes. With deep
2258 memory introspection, a real memory usage calculation is performed
2259 at the cost of computational resources.
2260 null_counts : bool, optional
2261 Whether to show the non-null counts. By default, this is shown
2262 only if the frame is smaller than
2263 ``pandas.options.display.max_info_rows`` and
2264 ``pandas.options.display.max_info_columns``. A value of True always
2265 shows the counts, and False never shows the counts.
2267 Returns
2268 -------
2269 None
2270 This method prints a summary of a DataFrame and returns None.
2272 See Also
2273 --------
2274 DataFrame.describe: Generate descriptive statistics of DataFrame
2275 columns.
2276 DataFrame.memory_usage: Memory usage of DataFrame columns.
2278 Examples
2279 --------
2280 >>> int_values = [1, 2, 3, 4, 5]
2281 >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
2282 >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
2283 >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
2284 ... "float_col": float_values})
2285 >>> df
2286 int_col text_col float_col
2287 0 1 alpha 0.00
2288 1 2 beta 0.25
2289 2 3 gamma 0.50
2290 3 4 delta 0.75
2291 4 5 epsilon 1.00
2293 Prints information of all columns:
2295 >>> df.info(verbose=True)
2296 <class 'pandas.core.frame.DataFrame'>
2297 RangeIndex: 5 entries, 0 to 4
2298 Data columns (total 3 columns):
2299 # Column Non-Null Count Dtype
2300 --- ------ -------------- -----
2301 0 int_col 5 non-null int64
2302 1 text_col 5 non-null object
2303 2 float_col 5 non-null float64
2304 dtypes: float64(1), int64(1), object(1)
2305 memory usage: 248.0+ bytes
2307 Prints a summary of columns count and its dtypes but not per column
2308 information:
2310 >>> df.info(verbose=False)
2311 <class 'pandas.core.frame.DataFrame'>
2312 RangeIndex: 5 entries, 0 to 4
2313 Columns: 3 entries, int_col to float_col
2314 dtypes: float64(1), int64(1), object(1)
2315 memory usage: 248.0+ bytes
2317 Pipe output of DataFrame.info to buffer instead of sys.stdout, get
2318 buffer content and writes to a text file:
2320 >>> import io
2321 >>> buffer = io.StringIO()
2322 >>> df.info(buf=buffer)
2323 >>> s = buffer.getvalue()
2324 >>> with open("df_info.txt", "w",
2325 ... encoding="utf-8") as f: # doctest: +SKIP
2326 ... f.write(s)
2327 260
2329 The `memory_usage` parameter allows deep introspection mode, specially
2330 useful for big DataFrames and fine-tune memory optimization:
2332 >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
2333 >>> df = pd.DataFrame({
2334 ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2335 ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2336 ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
2337 ... })
2338 >>> df.info()
2339 <class 'pandas.core.frame.DataFrame'>
2340 RangeIndex: 1000000 entries, 0 to 999999
2341 Data columns (total 3 columns):
2342 # Column Non-Null Count Dtype
2343 --- ------ -------------- -----
2344 0 column_1 1000000 non-null object
2345 1 column_2 1000000 non-null object
2346 2 column_3 1000000 non-null object
2347 dtypes: object(3)
2348 memory usage: 22.9+ MB
2350 >>> df.info(memory_usage='deep')
2351 <class 'pandas.core.frame.DataFrame'>
2352 RangeIndex: 1000000 entries, 0 to 999999
2353 Data columns (total 3 columns):
2354 # Column Non-Null Count Dtype
2355 --- ------ -------------- -----
2356 0 column_1 1000000 non-null object
2357 1 column_2 1000000 non-null object
2358 2 column_3 1000000 non-null object
2359 dtypes: object(3)
2360 memory usage: 188.8 MB
2361 """
2363 if buf is None: # pragma: no cover
2364 buf = sys.stdout
2366 lines = []
2368 lines.append(str(type(self)))
2369 lines.append(self.index._summary())
2371 if len(self.columns) == 0:
2372 lines.append(f"Empty {type(self).__name__}")
2373 fmt.buffer_put_lines(buf, lines)
2374 return
2376 cols = self.columns
2377 col_count = len(self.columns)
2379 # hack
2380 if max_cols is None:
2381 max_cols = get_option("display.max_info_columns", len(self.columns) + 1)
2383 max_rows = get_option("display.max_info_rows", len(self) + 1)
2385 if null_counts is None:
2386 show_counts = (col_count <= max_cols) and (len(self) < max_rows)
2387 else:
2388 show_counts = null_counts
2389 exceeds_info_cols = col_count > max_cols
2391 def _verbose_repr():
2392 lines.append(f"Data columns (total {len(self.columns)} columns):")
2394 id_head = " # "
2395 column_head = "Column"
2396 col_space = 2
2398 max_col = max(len(pprint_thing(k)) for k in cols)
2399 len_column = len(pprint_thing(column_head))
2400 space = max(max_col, len_column) + col_space
2402 max_id = len(pprint_thing(col_count))
2403 len_id = len(pprint_thing(id_head))
2404 space_num = max(max_id, len_id) + col_space
2405 counts = None
2407 header = _put_str(id_head, space_num) + _put_str(column_head, space)
2408 if show_counts:
2409 counts = self.count()
2410 if len(cols) != len(counts): # pragma: no cover
2411 raise AssertionError(
2412 f"Columns must equal counts ({len(cols)} != {len(counts)})"
2413 )
2414 count_header = "Non-Null Count"
2415 len_count = len(count_header)
2416 non_null = " non-null"
2417 max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
2418 space_count = max(len_count, max_count) + col_space
2419 count_temp = "{count}" + non_null
2420 else:
2421 count_header = ""
2422 space_count = len(count_header)
2423 len_count = space_count
2424 count_temp = "{count}"
2426 dtype_header = "Dtype"
2427 len_dtype = len(dtype_header)
2428 max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
2429 space_dtype = max(len_dtype, max_dtypes)
2430 header += _put_str(count_header, space_count) + _put_str(
2431 dtype_header, space_dtype
2432 )
2434 lines.append(header)
2435 lines.append(
2436 _put_str("-" * len_id, space_num)
2437 + _put_str("-" * len_column, space)
2438 + _put_str("-" * len_count, space_count)
2439 + _put_str("-" * len_dtype, space_dtype)
2440 )
2442 for i, col in enumerate(self.columns):
2443 dtype = self.dtypes.iloc[i]
2444 col = pprint_thing(col)
2446 line_no = _put_str(" {num}".format(num=i), space_num)
2447 count = ""
2448 if show_counts:
2449 count = counts.iloc[i]
2451 lines.append(
2452 line_no
2453 + _put_str(col, space)
2454 + _put_str(count_temp.format(count=count), space_count)
2455 + _put_str(dtype, space_dtype)
2456 )
2458 def _non_verbose_repr():
2459 lines.append(self.columns._summary(name="Columns"))
2461 def _sizeof_fmt(num, size_qualifier):
2462 # returns size in human readable format
2463 for x in ["bytes", "KB", "MB", "GB", "TB"]:
2464 if num < 1024.0:
2465 return f"{num:3.1f}{size_qualifier} {x}"
2466 num /= 1024.0
2467 return f"{num:3.1f}{size_qualifier} PB"
2469 if verbose:
2470 _verbose_repr()
2471 elif verbose is False: # specifically set to False, not nesc None
2472 _non_verbose_repr()
2473 else:
2474 if exceeds_info_cols:
2475 _non_verbose_repr()
2476 else:
2477 _verbose_repr()
2479 counts = self._data.get_dtype_counts()
2480 dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
2481 lines.append(f"dtypes: {', '.join(dtypes)}")
2483 if memory_usage is None:
2484 memory_usage = get_option("display.memory_usage")
2485 if memory_usage:
2486 # append memory usage of df to display
2487 size_qualifier = ""
2488 if memory_usage == "deep":
2489 deep = True
2490 else:
2491 # size_qualifier is just a best effort; not guaranteed to catch
2492 # all cases (e.g., it misses categorical data even with object
2493 # categories)
2494 deep = False
2495 if "object" in counts or self.index._is_memory_usage_qualified():
2496 size_qualifier = "+"
2497 mem_usage = self.memory_usage(index=True, deep=deep).sum()
2498 lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
2499 fmt.buffer_put_lines(buf, lines)
2501 def memory_usage(self, index=True, deep=False) -> Series:
2502 """
2503 Return the memory usage of each column in bytes.
2505 The memory usage can optionally include the contribution of
2506 the index and elements of `object` dtype.
2508 This value is displayed in `DataFrame.info` by default. This can be
2509 suppressed by setting ``pandas.options.display.memory_usage`` to False.
2511 Parameters
2512 ----------
2513 index : bool, default True
2514 Specifies whether to include the memory usage of the DataFrame's
2515 index in returned Series. If ``index=True``, the memory usage of
2516 the index is the first item in the output.
2517 deep : bool, default False
2518 If True, introspect the data deeply by interrogating
2519 `object` dtypes for system-level memory consumption, and include
2520 it in the returned values.
2522 Returns
2523 -------
2524 Series
2525 A Series whose index is the original column names and whose values
2526 is the memory usage of each column in bytes.
2528 See Also
2529 --------
2530 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
2531 ndarray.
2532 Series.memory_usage : Bytes consumed by a Series.
2533 Categorical : Memory-efficient array for string values with
2534 many repeated values.
2535 DataFrame.info : Concise summary of a DataFrame.
2537 Examples
2538 --------
2539 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
2540 >>> data = dict([(t, np.ones(shape=5000).astype(t))
2541 ... for t in dtypes])
2542 >>> df = pd.DataFrame(data)
2543 >>> df.head()
2544 int64 float64 complex128 object bool
2545 0 1 1.0 1.000000+0.000000j 1 True
2546 1 1 1.0 1.000000+0.000000j 1 True
2547 2 1 1.0 1.000000+0.000000j 1 True
2548 3 1 1.0 1.000000+0.000000j 1 True
2549 4 1 1.0 1.000000+0.000000j 1 True
2551 >>> df.memory_usage()
2552 Index 128
2553 int64 40000
2554 float64 40000
2555 complex128 80000
2556 object 40000
2557 bool 5000
2558 dtype: int64
2560 >>> df.memory_usage(index=False)
2561 int64 40000
2562 float64 40000
2563 complex128 80000
2564 object 40000
2565 bool 5000
2566 dtype: int64
2568 The memory footprint of `object` dtype columns is ignored by default:
2570 >>> df.memory_usage(deep=True)
2571 Index 128
2572 int64 40000
2573 float64 40000
2574 complex128 80000
2575 object 160000
2576 bool 5000
2577 dtype: int64
2579 Use a Categorical for efficient storage of an object-dtype column with
2580 many repeated values.
2582 >>> df['object'].astype('category').memory_usage(deep=True)
2583 5216
2584 """
2585 result = Series(
2586 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
2587 index=self.columns,
2588 )
2589 if index:
2590 result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append(
2591 result
2592 )
2593 return result
2595 def transpose(self, *args, copy: bool = False) -> "DataFrame":
2596 """
2597 Transpose index and columns.
2599 Reflect the DataFrame over its main diagonal by writing rows as columns
2600 and vice-versa. The property :attr:`.T` is an accessor to the method
2601 :meth:`transpose`.
2603 Parameters
2604 ----------
2605 *args : tuple, optional
2606 Accepted for compatibility with NumPy.
2607 copy : bool, default False
2608 Whether to copy the data after transposing, even for DataFrames
2609 with a single dtype.
2611 Note that a copy is always required for mixed dtype DataFrames,
2612 or for DataFrames with any extension types.
2614 Returns
2615 -------
2616 DataFrame
2617 The transposed DataFrame.
2619 See Also
2620 --------
2621 numpy.transpose : Permute the dimensions of a given array.
2623 Notes
2624 -----
2625 Transposing a DataFrame with mixed dtypes will result in a homogeneous
2626 DataFrame with the `object` dtype. In such a case, a copy of the data
2627 is always made.
2629 Examples
2630 --------
2631 **Square DataFrame with homogeneous dtype**
2633 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
2634 >>> df1 = pd.DataFrame(data=d1)
2635 >>> df1
2636 col1 col2
2637 0 1 3
2638 1 2 4
2640 >>> df1_transposed = df1.T # or df1.transpose()
2641 >>> df1_transposed
2642 0 1
2643 col1 1 2
2644 col2 3 4
2646 When the dtype is homogeneous in the original DataFrame, we get a
2647 transposed DataFrame with the same dtype:
2649 >>> df1.dtypes
2650 col1 int64
2651 col2 int64
2652 dtype: object
2653 >>> df1_transposed.dtypes
2654 0 int64
2655 1 int64
2656 dtype: object
2658 **Non-square DataFrame with mixed dtypes**
2660 >>> d2 = {'name': ['Alice', 'Bob'],
2661 ... 'score': [9.5, 8],
2662 ... 'employed': [False, True],
2663 ... 'kids': [0, 0]}
2664 >>> df2 = pd.DataFrame(data=d2)
2665 >>> df2
2666 name score employed kids
2667 0 Alice 9.5 False 0
2668 1 Bob 8.0 True 0
2670 >>> df2_transposed = df2.T # or df2.transpose()
2671 >>> df2_transposed
2672 0 1
2673 name Alice Bob
2674 score 9.5 8
2675 employed False True
2676 kids 0 0
2678 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
2679 the `object` dtype:
2681 >>> df2.dtypes
2682 name object
2683 score float64
2684 employed bool
2685 kids int64
2686 dtype: object
2687 >>> df2_transposed.dtypes
2688 0 object
2689 1 object
2690 dtype: object
2691 """
2692 nv.validate_transpose(args, dict())
2693 # construct the args
2695 dtypes = list(self.dtypes)
2696 if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]):
2697 # We have EAs with the same dtype. We can preserve that dtype in transpose.
2698 dtype = dtypes[0]
2699 arr_type = dtype.construct_array_type()
2700 values = self.values
2702 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
2703 result = self._constructor(
2704 dict(zip(self.index, new_values)), index=self.columns
2705 )
2707 else:
2708 new_values = self.values.T
2709 if copy:
2710 new_values = new_values.copy()
2711 result = self._constructor(
2712 new_values, index=self.columns, columns=self.index
2713 )
2715 return result.__finalize__(self)
2717 T = property(transpose)
2719 # ----------------------------------------------------------------------
2720 # Indexing Methods
2722 def _ixs(self, i: int, axis: int = 0):
2723 """
2724 Parameters
2725 ----------
2726 i : int
2727 axis : int
2729 Notes
2730 -----
2731 If slice passed, the resulting data will be a view.
2732 """
2733 # irow
2734 if axis == 0:
2735 new_values = self._data.fast_xs(i)
2737 # if we are a copy, mark as such
2738 copy = isinstance(new_values, np.ndarray) and new_values.base is None
2739 result = self._constructor_sliced(
2740 new_values,
2741 index=self.columns,
2742 name=self.index[i],
2743 dtype=new_values.dtype,
2744 )
2745 result._set_is_copy(self, copy=copy)
2746 return result
2748 # icol
2749 else:
2750 label = self.columns[i]
2752 # if the values returned are not the same length
2753 # as the index (iow a not found value), iget returns
2754 # a 0-len ndarray. This is effectively catching
2755 # a numpy error (as numpy should really raise)
2756 values = self._data.iget(i)
2758 if len(self.index) and not len(values):
2759 values = np.array([np.nan] * len(self.index), dtype=object)
2760 result = self._box_col_values(values, label)
2762 # this is a cached value, mark it so
2763 result._set_as_cached(label, self)
2765 return result
2767 def __getitem__(self, key):
2768 key = lib.item_from_zerodim(key)
2769 key = com.apply_if_callable(key, self)
2771 if is_hashable(key):
2772 # shortcut if the key is in columns
2773 if self.columns.is_unique and key in self.columns:
2774 if self.columns.nlevels > 1:
2775 return self._getitem_multilevel(key)
2776 return self._get_item_cache(key)
2778 # Do we have a slicer (on rows)?
2779 indexer = convert_to_index_sliceable(self, key)
2780 if indexer is not None:
2781 # either we have a slice or we have a string that can be converted
2782 # to a slice for partial-string date indexing
2783 return self._slice(indexer, axis=0)
2785 # Do we have a (boolean) DataFrame?
2786 if isinstance(key, DataFrame):
2787 return self.where(key)
2789 # Do we have a (boolean) 1d indexer?
2790 if com.is_bool_indexer(key):
2791 return self._getitem_bool_array(key)
2793 # We are left with two options: a single key, and a collection of keys,
2794 # We interpret tuples as collections only for non-MultiIndex
2795 is_single_key = isinstance(key, tuple) or not is_list_like(key)
2797 if is_single_key:
2798 if self.columns.nlevels > 1:
2799 return self._getitem_multilevel(key)
2800 indexer = self.columns.get_loc(key)
2801 if is_integer(indexer):
2802 indexer = [indexer]
2803 else:
2804 if is_iterator(key):
2805 key = list(key)
2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2808 # take() does not accept boolean indexers
2809 if getattr(indexer, "dtype", None) == bool:
2810 indexer = np.where(indexer)[0]
2812 data = self._take_with_is_copy(indexer, axis=1)
2814 if is_single_key:
2815 # What does looking for a single key in a non-unique index return?
2816 # The behavior is inconsistent. It returns a Series, except when
2817 # - the key itself is repeated (test on data.shape, #9519), or
2818 # - we have a MultiIndex on columns (test on self.columns, #21309)
2819 if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex):
2820 data = data[key]
2822 return data
2824 def _getitem_bool_array(self, key):
2825 # also raises Exception if object array with NA values
2826 # warning here just in case -- previously __setitem__ was
2827 # reindexing but __getitem__ was not; it seems more reasonable to
2828 # go with the __setitem__ behavior since that is more consistent
2829 # with all other indexing behavior
2830 if isinstance(key, Series) and not key.index.equals(self.index):
2831 warnings.warn(
2832 "Boolean Series key will be reindexed to match DataFrame index.",
2833 UserWarning,
2834 stacklevel=3,
2835 )
2836 elif len(key) != len(self.index):
2837 raise ValueError(
2838 f"Item wrong length {len(key)} instead of {len(self.index)}."
2839 )
2841 # check_bool_indexer will throw exception if Series key cannot
2842 # be reindexed to match DataFrame rows
2843 key = check_bool_indexer(self.index, key)
2844 indexer = key.nonzero()[0]
2845 return self._take_with_is_copy(indexer, axis=0)
2847 def _getitem_multilevel(self, key):
2848 # self.columns is a MultiIndex
2849 loc = self.columns.get_loc(key)
2850 if isinstance(loc, (slice, Series, np.ndarray, Index)):
2851 new_columns = self.columns[loc]
2852 result_columns = maybe_droplevels(new_columns, key)
2853 if self._is_mixed_type:
2854 result = self.reindex(columns=new_columns)
2855 result.columns = result_columns
2856 else:
2857 new_values = self.values[:, loc]
2858 result = self._constructor(
2859 new_values, index=self.index, columns=result_columns
2860 )
2861 result = result.__finalize__(self)
2863 # If there is only one column being returned, and its name is
2864 # either an empty string, or a tuple with an empty string as its
2865 # first element, then treat the empty string as a placeholder
2866 # and return the column as if the user had provided that empty
2867 # string in the key. If the result is a Series, exclude the
2868 # implied empty string from its name.
2869 if len(result.columns) == 1:
2870 top = result.columns[0]
2871 if isinstance(top, tuple):
2872 top = top[0]
2873 if top == "":
2874 result = result[""]
2875 if isinstance(result, Series):
2876 result = self._constructor_sliced(
2877 result, index=self.index, name=key
2878 )
2880 result._set_is_copy(self)
2881 return result
2882 else:
2883 return self._get_item_cache(key)
2885 def _get_value(self, index, col, takeable: bool = False):
2886 """
2887 Quickly retrieve single value at passed column and index.
2889 Parameters
2890 ----------
2891 index : row label
2892 col : column label
2893 takeable : interpret the index/col as indexers, default False
2895 Returns
2896 -------
2897 scalar
2898 """
2899 if takeable:
2900 series = self._iget_item_cache(col)
2901 return com.maybe_box_datetimelike(series._values[index])
2903 series = self._get_item_cache(col)
2904 engine = self.index._engine
2906 try:
2907 return engine.get_value(series._values, index)
2908 except KeyError:
2909 # GH 20629
2910 if self.index.nlevels > 1:
2911 # partial indexing forbidden
2912 raise
2913 except (TypeError, ValueError):
2914 pass
2916 # we cannot handle direct indexing
2917 # use positional
2918 col = self.columns.get_loc(col)
2919 index = self.index.get_loc(index)
2920 return self._get_value(index, col, takeable=True)
2922 def __setitem__(self, key, value):
2923 key = com.apply_if_callable(key, self)
2925 # see if we can slice the rows
2926 indexer = convert_to_index_sliceable(self, key)
2927 if indexer is not None:
2928 # either we have a slice or we have a string that can be converted
2929 # to a slice for partial-string date indexing
2930 return self._setitem_slice(indexer, value)
2932 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
2933 self._setitem_frame(key, value)
2934 elif isinstance(key, (Series, np.ndarray, list, Index)):
2935 self._setitem_array(key, value)
2936 else:
2937 # set column
2938 self._set_item(key, value)
2940 def _setitem_slice(self, key, value):
2941 # NB: we can't just use self.loc[key] = value because that
2942 # operates on labels and we need to operate positional for
2943 # backwards-compat, xref GH#31469
2944 self._check_setitem_copy()
2945 self.loc._setitem_with_indexer(key, value)
2947 def _setitem_array(self, key, value):
2948 # also raises Exception if object array with NA values
2949 if com.is_bool_indexer(key):
2950 if len(key) != len(self.index):
2951 raise ValueError(
2952 f"Item wrong length {len(key)} instead of {len(self.index)}!"
2953 )
2954 key = check_bool_indexer(self.index, key)
2955 indexer = key.nonzero()[0]
2956 self._check_setitem_copy()
2957 self.loc._setitem_with_indexer(indexer, value)
2958 else:
2959 if isinstance(value, DataFrame):
2960 if len(value.columns) != len(key):
2961 raise ValueError("Columns must be same length as key")
2962 for k1, k2 in zip(key, value.columns):
2963 self[k1] = value[k2]
2964 else:
2965 indexer = self.loc._get_listlike_indexer(
2966 key, axis=1, raise_missing=False
2967 )[1]
2968 self._check_setitem_copy()
2969 self.loc._setitem_with_indexer((slice(None), indexer), value)
2971 def _setitem_frame(self, key, value):
2972 # support boolean setting with DataFrame input, e.g.
2973 # df[df > df2] = 0
2974 if isinstance(key, np.ndarray):
2975 if key.shape != self.shape:
2976 raise ValueError("Array conditional must be same shape as self")
2977 key = self._constructor(key, **self._construct_axes_dict())
2979 if key.values.size and not is_bool_dtype(key.values):
2980 raise TypeError(
2981 "Must pass DataFrame or 2-d ndarray with boolean values only"
2982 )
2984 self._check_inplace_setting(value)
2985 self._check_setitem_copy()
2986 self._where(-key, value, inplace=True)
2988 def _set_item(self, key, value):
2989 """
2990 Add series to DataFrame in specified column.
2992 If series is a numpy-array (not a Series/TimeSeries), it must be the
2993 same length as the DataFrames index or an error will be thrown.
2995 Series/TimeSeries will be conformed to the DataFrames index to
2996 ensure homogeneity.
2997 """
2999 self._ensure_valid_index(value)
3000 value = self._sanitize_column(key, value)
3001 NDFrame._set_item(self, key, value)
3003 # check if we are modifying a copy
3004 # try to set first as we want an invalid
3005 # value exception to occur first
3006 if len(self):
3007 self._check_setitem_copy()
3009 def _set_value(self, index, col, value, takeable: bool = False):
3010 """
3011 Put single value at passed column and index.
3013 Parameters
3014 ----------
3015 index : row label
3016 col : column label
3017 value : scalar
3018 takeable : interpret the index/col as indexers, default False
3020 Returns
3021 -------
3022 DataFrame
3023 If label pair is contained, will be reference to calling DataFrame,
3024 otherwise a new object.
3025 """
3026 try:
3027 if takeable is True:
3028 series = self._iget_item_cache(col)
3029 return series._set_value(index, value, takeable=True)
3031 series = self._get_item_cache(col)
3032 engine = self.index._engine
3033 engine.set_value(series._values, index, value)
3034 return self
3035 except (KeyError, TypeError):
3037 # set using a non-recursive method & reset the cache
3038 if takeable:
3039 self.iloc[index, col] = value
3040 else:
3041 self.loc[index, col] = value
3042 self._item_cache.pop(col, None)
3044 return self
3046 def _ensure_valid_index(self, value):
3047 """
3048 Ensure that if we don't have an index, that we can create one from the
3049 passed value.
3050 """
3051 # GH5632, make sure that we are a Series convertible
3052 if not len(self.index) and is_list_like(value) and len(value):
3053 try:
3054 value = Series(value)
3055 except (ValueError, NotImplementedError, TypeError):
3056 raise ValueError(
3057 "Cannot set a frame with no defined index "
3058 "and a value that cannot be converted to a "
3059 "Series"
3060 )
3062 self._data = self._data.reindex_axis(
3063 value.index.copy(), axis=1, fill_value=np.nan
3064 )
3066 def _box_item_values(self, key, values):
3067 items = self.columns[self.columns.get_loc(key)]
3068 if values.ndim == 2:
3069 return self._constructor(values.T, columns=items, index=self.index)
3070 else:
3071 return self._box_col_values(values, items)
3073 def _box_col_values(self, values, items):
3074 """
3075 Provide boxed values for a column.
3076 """
3077 klass = self._constructor_sliced
3078 return klass(values, index=self.index, name=items, fastpath=True)
3080 # ----------------------------------------------------------------------
3081 # Unsorted
3083 def query(self, expr, inplace=False, **kwargs):
3084 """
3085 Query the columns of a DataFrame with a boolean expression.
3087 Parameters
3088 ----------
3089 expr : str
3090 The query string to evaluate.
3092 You can refer to variables
3093 in the environment by prefixing them with an '@' character like
3094 ``@a + b``.
3096 You can refer to column names that contain spaces or operators by
3097 surrounding them in backticks. This way you can also escape
3098 names that start with a digit, or those that are a Python keyword.
3099 Basically when it is not valid Python identifier. See notes down
3100 for more details.
3102 For example, if one of your columns is called ``a a`` and you want
3103 to sum it with ``b``, your query should be ```a a` + b``.
3105 .. versionadded:: 0.25.0
3106 Backtick quoting introduced.
3108 .. versionadded:: 1.0.0
3109 Expanding functionality of backtick quoting for more than only spaces.
3111 inplace : bool
3112 Whether the query should modify the data in place or return
3113 a modified copy.
3114 **kwargs
3115 See the documentation for :func:`eval` for complete details
3116 on the keyword arguments accepted by :meth:`DataFrame.query`.
3118 Returns
3119 -------
3120 DataFrame
3121 DataFrame resulting from the provided query expression.
3123 See Also
3124 --------
3125 eval : Evaluate a string describing operations on
3126 DataFrame columns.
3127 DataFrame.eval : Evaluate a string describing operations on
3128 DataFrame columns.
3130 Notes
3131 -----
3132 The result of the evaluation of this expression is first passed to
3133 :attr:`DataFrame.loc` and if that fails because of a
3134 multidimensional key (e.g., a DataFrame) then the result will be passed
3135 to :meth:`DataFrame.__getitem__`.
3137 This method uses the top-level :func:`eval` function to
3138 evaluate the passed query.
3140 The :meth:`~pandas.DataFrame.query` method uses a slightly
3141 modified Python syntax by default. For example, the ``&`` and ``|``
3142 (bitwise) operators have the precedence of their boolean cousins,
3143 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
3144 however the semantics are different.
3146 You can change the semantics of the expression by passing the keyword
3147 argument ``parser='python'``. This enforces the same semantics as
3148 evaluation in Python space. Likewise, you can pass ``engine='python'``
3149 to evaluate an expression using Python itself as a backend. This is not
3150 recommended as it is inefficient compared to using ``numexpr`` as the
3151 engine.
3153 The :attr:`DataFrame.index` and
3154 :attr:`DataFrame.columns` attributes of the
3155 :class:`~pandas.DataFrame` instance are placed in the query namespace
3156 by default, which allows you to treat both the index and columns of the
3157 frame as a column in the frame.
3158 The identifier ``index`` is used for the frame index; you can also
3159 use the name of the index to identify it in a query. Please note that
3160 Python keywords may not be used as identifiers.
3162 For further details and examples see the ``query`` documentation in
3163 :ref:`indexing <indexing.query>`.
3165 *Backtick quoted variables*
3167 Backtick quoted variables are parsed as literal Python code and
3168 are converted internally to a Python valid identifier.
3169 This can lead to the following problems.
3171 During parsing a number of disallowed characters inside the backtick
3172 quoted string are replaced by strings that are allowed as a Python identifier.
3173 These characters include all operators in Python, the space character, the
3174 question mark, the exclamation mark, the dollar sign, and the euro sign.
3175 For other characters that fall outside the ASCII range (U+0001..U+007F)
3176 and those that are not further specified in PEP 3131,
3177 the query parser will raise an error.
3178 This excludes whitespace different than the space character,
3179 but also the hashtag (as it is used for comments) and the backtick
3180 itself (backtick can also not be escaped).
3182 In a special case, quotes that make a pair around a backtick can
3183 confuse the parser.
3184 For example, ```it's` > `that's``` will raise an error,
3185 as it forms a quoted string (``'s > `that'``) with a backtick inside.
3187 See also the Python documentation about lexical analysis
3188 (https://docs.python.org/3/reference/lexical_analysis.html)
3189 in combination with the source code in :mod:`pandas.core.computation.parsing`.
3191 Examples
3192 --------
3193 >>> df = pd.DataFrame({'A': range(1, 6),
3194 ... 'B': range(10, 0, -2),
3195 ... 'C C': range(10, 5, -1)})
3196 >>> df
3197 A B C C
3198 0 1 10 10
3199 1 2 8 9
3200 2 3 6 8
3201 3 4 4 7
3202 4 5 2 6
3203 >>> df.query('A > B')
3204 A B C C
3205 4 5 2 6
3207 The previous expression is equivalent to
3209 >>> df[df.A > df.B]
3210 A B C C
3211 4 5 2 6
3213 For columns with spaces in their name, you can use backtick quoting.
3215 >>> df.query('B == `C C`')
3216 A B C C
3217 0 1 10 10
3219 The previous expression is equivalent to
3221 >>> df[df.B == df['C C']]
3222 A B C C
3223 0 1 10 10
3224 """
3225 inplace = validate_bool_kwarg(inplace, "inplace")
3226 if not isinstance(expr, str):
3227 msg = f"expr must be a string to be evaluated, {type(expr)} given"
3228 raise ValueError(msg)
3229 kwargs["level"] = kwargs.pop("level", 0) + 1
3230 kwargs["target"] = None
3231 res = self.eval(expr, **kwargs)
3233 try:
3234 new_data = self.loc[res]
3235 except ValueError:
3236 # when res is multi-dimensional loc raises, but this is sometimes a
3237 # valid query
3238 new_data = self[res]
3240 if inplace:
3241 self._update_inplace(new_data)
3242 else:
3243 return new_data
3245 def eval(self, expr, inplace=False, **kwargs):
3246 """
3247 Evaluate a string describing operations on DataFrame columns.
3249 Operates on columns only, not specific rows or elements. This allows
3250 `eval` to run arbitrary code, which can make you vulnerable to code
3251 injection if you pass user input to this function.
3253 Parameters
3254 ----------
3255 expr : str
3256 The expression string to evaluate.
3257 inplace : bool, default False
3258 If the expression contains an assignment, whether to perform the
3259 operation inplace and mutate the existing DataFrame. Otherwise,
3260 a new DataFrame is returned.
3261 **kwargs
3262 See the documentation for :func:`eval` for complete details
3263 on the keyword arguments accepted by
3264 :meth:`~pandas.DataFrame.query`.
3266 Returns
3267 -------
3268 ndarray, scalar, or pandas object
3269 The result of the evaluation.
3271 See Also
3272 --------
3273 DataFrame.query : Evaluates a boolean expression to query the columns
3274 of a frame.
3275 DataFrame.assign : Can evaluate an expression or function to create new
3276 values for a column.
3277 eval : Evaluate a Python expression as a string using various
3278 backends.
3280 Notes
3281 -----
3282 For more details see the API documentation for :func:`~eval`.
3283 For detailed examples see :ref:`enhancing performance with eval
3284 <enhancingperf.eval>`.
3286 Examples
3287 --------
3288 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
3289 >>> df
3290 A B
3291 0 1 10
3292 1 2 8
3293 2 3 6
3294 3 4 4
3295 4 5 2
3296 >>> df.eval('A + B')
3297 0 11
3298 1 10
3299 2 9
3300 3 8
3301 4 7
3302 dtype: int64
3304 Assignment is allowed though by default the original DataFrame is not
3305 modified.
3307 >>> df.eval('C = A + B')
3308 A B C
3309 0 1 10 11
3310 1 2 8 10
3311 2 3 6 9
3312 3 4 4 8
3313 4 5 2 7
3314 >>> df
3315 A B
3316 0 1 10
3317 1 2 8
3318 2 3 6
3319 3 4 4
3320 4 5 2
3322 Use ``inplace=True`` to modify the original DataFrame.
3324 >>> df.eval('C = A + B', inplace=True)
3325 >>> df
3326 A B C
3327 0 1 10 11
3328 1 2 8 10
3329 2 3 6 9
3330 3 4 4 8
3331 4 5 2 7
3332 """
3333 from pandas.core.computation.eval import eval as _eval
3335 inplace = validate_bool_kwarg(inplace, "inplace")
3336 resolvers = kwargs.pop("resolvers", None)
3337 kwargs["level"] = kwargs.pop("level", 0) + 1
3338 if resolvers is None:
3339 index_resolvers = self._get_index_resolvers()
3340 column_resolvers = self._get_cleaned_column_resolvers()
3341 resolvers = column_resolvers, index_resolvers
3342 if "target" not in kwargs:
3343 kwargs["target"] = self
3344 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
3346 return _eval(expr, inplace=inplace, **kwargs)
3348 def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
3349 """
3350 Return a subset of the DataFrame's columns based on the column dtypes.
3352 Parameters
3353 ----------
3354 include, exclude : scalar or list-like
3355 A selection of dtypes or strings to be included/excluded. At least
3356 one of these parameters must be supplied.
3358 Returns
3359 -------
3360 DataFrame
3361 The subset of the frame including the dtypes in ``include`` and
3362 excluding the dtypes in ``exclude``.
3364 Raises
3365 ------
3366 ValueError
3367 * If both of ``include`` and ``exclude`` are empty
3368 * If ``include`` and ``exclude`` have overlapping elements
3369 * If any kind of string dtype is passed in.
3371 Notes
3372 -----
3373 * To select all *numeric* types, use ``np.number`` or ``'number'``
3374 * To select strings you must use the ``object`` dtype, but note that
3375 this will return *all* object dtype columns
3376 * See the `numpy dtype hierarchy
3377 <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
3378 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
3379 ``'datetime64'``
3380 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
3381 ``'timedelta64'``
3382 * To select Pandas categorical dtypes, use ``'category'``
3383 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
3384 0.20.0) or ``'datetime64[ns, tz]'``
3386 Examples
3387 --------
3388 >>> df = pd.DataFrame({'a': [1, 2] * 3,
3389 ... 'b': [True, False] * 3,
3390 ... 'c': [1.0, 2.0] * 3})
3391 >>> df
3392 a b c
3393 0 1 True 1.0
3394 1 2 False 2.0
3395 2 1 True 1.0
3396 3 2 False 2.0
3397 4 1 True 1.0
3398 5 2 False 2.0
3400 >>> df.select_dtypes(include='bool')
3401 b
3402 0 True
3403 1 False
3404 2 True
3405 3 False
3406 4 True
3407 5 False
3409 >>> df.select_dtypes(include=['float64'])
3410 c
3411 0 1.0
3412 1 2.0
3413 2 1.0
3414 3 2.0
3415 4 1.0
3416 5 2.0
3418 >>> df.select_dtypes(exclude=['int'])
3419 b c
3420 0 True 1.0
3421 1 False 2.0
3422 2 True 1.0
3423 3 False 2.0
3424 4 True 1.0
3425 5 False 2.0
3426 """
3428 if not is_list_like(include):
3429 include = (include,) if include is not None else ()
3430 if not is_list_like(exclude):
3431 exclude = (exclude,) if exclude is not None else ()
3433 selection = (frozenset(include), frozenset(exclude))
3435 if not any(selection):
3436 raise ValueError("at least one of include or exclude must be nonempty")
3438 # convert the myriad valid dtypes object to a single representation
3439 include = frozenset(infer_dtype_from_object(x) for x in include)
3440 exclude = frozenset(infer_dtype_from_object(x) for x in exclude)
3441 for dtypes in (include, exclude):
3442 invalidate_string_dtypes(dtypes)
3444 # can't both include AND exclude!
3445 if not include.isdisjoint(exclude):
3446 raise ValueError(f"include and exclude overlap on {(include & exclude)}")
3448 # We raise when both include and exclude are empty
3449 # Hence, we can just shrink the columns we want to keep
3450 keep_these = np.full(self.shape[1], True)
3452 def extract_unique_dtypes_from_dtypes_set(
3453 dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray
3454 ) -> List[Dtype]:
3455 extracted_dtypes = [
3456 unique_dtype
3457 for unique_dtype in unique_dtypes
3458 if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore
3459 ]
3460 return extracted_dtypes
3462 unique_dtypes = self.dtypes.unique()
3464 if include:
3465 included_dtypes = extract_unique_dtypes_from_dtypes_set(
3466 include, unique_dtypes
3467 )
3468 keep_these &= self.dtypes.isin(included_dtypes)
3470 if exclude:
3471 excluded_dtypes = extract_unique_dtypes_from_dtypes_set(
3472 exclude, unique_dtypes
3473 )
3474 keep_these &= ~self.dtypes.isin(excluded_dtypes)
3476 return self.iloc[:, keep_these.values]
3478 def insert(self, loc, column, value, allow_duplicates=False) -> None:
3479 """
3480 Insert column into DataFrame at specified location.
3482 Raises a ValueError if `column` is already contained in the DataFrame,
3483 unless `allow_duplicates` is set to True.
3485 Parameters
3486 ----------
3487 loc : int
3488 Insertion index. Must verify 0 <= loc <= len(columns).
3489 column : str, number, or hashable object
3490 Label of the inserted column.
3491 value : int, Series, or array-like
3492 allow_duplicates : bool, optional
3493 """
3494 self._ensure_valid_index(value)
3495 value = self._sanitize_column(column, value, broadcast=False)
3496 self._data.insert(loc, column, value, allow_duplicates=allow_duplicates)
3498 def assign(self, **kwargs) -> "DataFrame":
3499 r"""
3500 Assign new columns to a DataFrame.
3502 Returns a new object with all original columns in addition to new ones.
3503 Existing columns that are re-assigned will be overwritten.
3505 Parameters
3506 ----------
3507 **kwargs : dict of {str: callable or Series}
3508 The column names are keywords. If the values are
3509 callable, they are computed on the DataFrame and
3510 assigned to the new columns. The callable must not
3511 change input DataFrame (though pandas doesn't check it).
3512 If the values are not callable, (e.g. a Series, scalar, or array),
3513 they are simply assigned.
3515 Returns
3516 -------
3517 DataFrame
3518 A new DataFrame with the new columns in addition to
3519 all the existing columns.
3521 Notes
3522 -----
3523 Assigning multiple columns within the same ``assign`` is possible.
3524 Later items in '\*\*kwargs' may refer to newly created or modified
3525 columns in 'df'; items are computed and assigned into 'df' in order.
3527 .. versionchanged:: 0.23.0
3529 Keyword argument order is maintained.
3531 Examples
3532 --------
3533 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
3534 ... index=['Portland', 'Berkeley'])
3535 >>> df
3536 temp_c
3537 Portland 17.0
3538 Berkeley 25.0
3540 Where the value is a callable, evaluated on `df`:
3542 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
3543 temp_c temp_f
3544 Portland 17.0 62.6
3545 Berkeley 25.0 77.0
3547 Alternatively, the same behavior can be achieved by directly
3548 referencing an existing Series or sequence:
3550 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
3551 temp_c temp_f
3552 Portland 17.0 62.6
3553 Berkeley 25.0 77.0
3555 You can create multiple columns within the same assign where one
3556 of the columns depends on another one defined within the same assign:
3558 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
3559 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
3560 temp_c temp_f temp_k
3561 Portland 17.0 62.6 290.15
3562 Berkeley 25.0 77.0 298.15
3563 """
3564 data = self.copy()
3566 for k, v in kwargs.items():
3567 data[k] = com.apply_if_callable(v, data)
3568 return data
3570 def _sanitize_column(self, key, value, broadcast=True):
3571 """
3572 Ensures new columns (which go into the BlockManager as new blocks) are
3573 always copied and converted into an array.
3575 Parameters
3576 ----------
3577 key : object
3578 value : scalar, Series, or array-like
3579 broadcast : bool, default True
3580 If ``key`` matches multiple duplicate column names in the
3581 DataFrame, this parameter indicates whether ``value`` should be
3582 tiled so that the returned array contains a (duplicated) column for
3583 each occurrence of the key. If False, ``value`` will not be tiled.
3585 Returns
3586 -------
3587 numpy.ndarray
3588 """
3590 def reindexer(value):
3591 # reindex if necessary
3593 if value.index.equals(self.index) or not len(self.index):
3594 value = value._values.copy()
3595 else:
3597 # GH 4107
3598 try:
3599 value = value.reindex(self.index)._values
3600 except ValueError as err:
3601 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
3602 if not value.index.is_unique:
3603 # duplicate axis
3604 raise err
3606 # other
3607 raise TypeError(
3608 "incompatible index of inserted column with frame index"
3609 )
3610 return value
3612 if isinstance(value, Series):
3613 value = reindexer(value)
3615 elif isinstance(value, DataFrame):
3616 # align right-hand-side columns if self.columns
3617 # is multi-index and self[key] is a sub-frame
3618 if isinstance(self.columns, ABCMultiIndex) and key in self.columns:
3619 loc = self.columns.get_loc(key)
3620 if isinstance(loc, (slice, Series, np.ndarray, Index)):
3621 cols = maybe_droplevels(self.columns[loc], key)
3622 if len(cols) and not cols.equals(value.columns):
3623 value = value.reindex(cols, axis=1)
3624 # now align rows
3625 value = reindexer(value).T
3627 elif isinstance(value, ExtensionArray):
3628 # Explicitly copy here, instead of in sanitize_index,
3629 # as sanitize_index won't copy an EA, even with copy=True
3630 value = value.copy()
3631 value = sanitize_index(value, self.index, copy=False)
3633 elif isinstance(value, Index) or is_sequence(value):
3635 # turn me into an ndarray
3636 value = sanitize_index(value, self.index, copy=False)
3637 if not isinstance(value, (np.ndarray, Index)):
3638 if isinstance(value, list) and len(value) > 0:
3639 value = maybe_convert_platform(value)
3640 else:
3641 value = com.asarray_tuplesafe(value)
3642 elif value.ndim == 2:
3643 value = value.copy().T
3644 elif isinstance(value, Index):
3645 value = value.copy(deep=True)
3646 else:
3647 value = value.copy()
3649 # possibly infer to datetimelike
3650 if is_object_dtype(value.dtype):
3651 value = maybe_infer_to_datetimelike(value)
3653 else:
3654 # cast ignores pandas dtypes. so save the dtype first
3655 infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
3657 # upcast
3658 value = cast_scalar_to_array(len(self.index), value)
3659 value = maybe_cast_to_datetime(value, infer_dtype)
3661 # return internal types directly
3662 if is_extension_array_dtype(value):
3663 return value
3665 # broadcast across multiple columns if necessary
3666 if broadcast and key in self.columns and value.ndim == 1:
3667 if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex):
3668 existing_piece = self[key]
3669 if isinstance(existing_piece, DataFrame):
3670 value = np.tile(value, (len(existing_piece.columns), 1))
3672 return np.atleast_2d(np.asarray(value))
3674 @property
3675 def _series(self):
3676 return {
3677 item: Series(self._data.iget(idx), index=self.index, name=item)
3678 for idx, item in enumerate(self.columns)
3679 }
3681 def lookup(self, row_labels, col_labels) -> np.ndarray:
3682 """
3683 Label-based "fancy indexing" function for DataFrame.
3685 Given equal-length arrays of row and column labels, return an
3686 array of the values corresponding to each (row, col) pair.
3688 Parameters
3689 ----------
3690 row_labels : sequence
3691 The row labels to use for lookup.
3692 col_labels : sequence
3693 The column labels to use for lookup.
3695 Returns
3696 -------
3697 numpy.ndarray
3699 Examples
3700 --------
3701 values : ndarray
3702 The found values
3703 """
3704 n = len(row_labels)
3705 if n != len(col_labels):
3706 raise ValueError("Row labels must have same size as column labels")
3708 thresh = 1000
3709 if not self._is_mixed_type or n > thresh:
3710 values = self.values
3711 ridx = self.index.get_indexer(row_labels)
3712 cidx = self.columns.get_indexer(col_labels)
3713 if (ridx == -1).any():
3714 raise KeyError("One or more row labels was not found")
3715 if (cidx == -1).any():
3716 raise KeyError("One or more column labels was not found")
3717 flat_index = ridx * len(self.columns) + cidx
3718 result = values.flat[flat_index]
3719 else:
3720 result = np.empty(n, dtype="O")
3721 for i, (r, c) in enumerate(zip(row_labels, col_labels)):
3722 result[i] = self._get_value(r, c)
3724 if is_object_dtype(result):
3725 result = lib.maybe_convert_objects(result)
3727 return result
3729 # ----------------------------------------------------------------------
3730 # Reindexing and alignment
3732 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
3733 frame = self
3735 columns = axes["columns"]
3736 if columns is not None:
3737 frame = frame._reindex_columns(
3738 columns, method, copy, level, fill_value, limit, tolerance
3739 )
3741 index = axes["index"]
3742 if index is not None:
3743 frame = frame._reindex_index(
3744 index, method, copy, level, fill_value, limit, tolerance
3745 )
3747 return frame
3749 def _reindex_index(
3750 self,
3751 new_index,
3752 method,
3753 copy,
3754 level,
3755 fill_value=np.nan,
3756 limit=None,
3757 tolerance=None,
3758 ):
3759 new_index, indexer = self.index.reindex(
3760 new_index, method=method, level=level, limit=limit, tolerance=tolerance
3761 )
3762 return self._reindex_with_indexers(
3763 {0: [new_index, indexer]},
3764 copy=copy,
3765 fill_value=fill_value,
3766 allow_dups=False,
3767 )
3769 def _reindex_columns(
3770 self,
3771 new_columns,
3772 method,
3773 copy,
3774 level,
3775 fill_value=None,
3776 limit=None,
3777 tolerance=None,
3778 ):
3779 new_columns, indexer = self.columns.reindex(
3780 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
3781 )
3782 return self._reindex_with_indexers(
3783 {1: [new_columns, indexer]},
3784 copy=copy,
3785 fill_value=fill_value,
3786 allow_dups=False,
3787 )
3789 def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame":
3790 """
3791 We are guaranteed non-Nones in the axes.
3792 """
3794 new_index, row_indexer = self.index.reindex(axes["index"])
3795 new_columns, col_indexer = self.columns.reindex(axes["columns"])
3797 if row_indexer is not None and col_indexer is not None:
3798 indexer = row_indexer, col_indexer
3799 new_values = algorithms.take_2d_multi(
3800 self.values, indexer, fill_value=fill_value
3801 )
3802 return self._constructor(new_values, index=new_index, columns=new_columns)
3803 else:
3804 return self._reindex_with_indexers(
3805 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
3806 copy=copy,
3807 fill_value=fill_value,
3808 )
3810 @Appender(_shared_docs["align"] % _shared_doc_kwargs)
3811 def align(
3812 self,
3813 other,
3814 join="outer",
3815 axis=None,
3816 level=None,
3817 copy=True,
3818 fill_value=None,
3819 method=None,
3820 limit=None,
3821 fill_axis=0,
3822 broadcast_axis=None,
3823 ) -> "DataFrame":
3824 return super().align(
3825 other,
3826 join=join,
3827 axis=axis,
3828 level=level,
3829 copy=copy,
3830 fill_value=fill_value,
3831 method=method,
3832 limit=limit,
3833 fill_axis=fill_axis,
3834 broadcast_axis=broadcast_axis,
3835 )
3837 @Substitution(**_shared_doc_kwargs)
3838 @Appender(NDFrame.reindex.__doc__)
3839 @rewrite_axis_style_signature(
3840 "labels",
3841 [
3842 ("method", None),
3843 ("copy", True),
3844 ("level", None),
3845 ("fill_value", np.nan),
3846 ("limit", None),
3847 ("tolerance", None),
3848 ],
3849 )
3850 def reindex(self, *args, **kwargs) -> "DataFrame":
3851 axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
3852 kwargs.update(axes)
3853 # Pop these, since the values are in `kwargs` under different names
3854 kwargs.pop("axis", None)
3855 kwargs.pop("labels", None)
3856 return super().reindex(**kwargs)
3858 def drop(
3859 self,
3860 labels=None,
3861 axis=0,
3862 index=None,
3863 columns=None,
3864 level=None,
3865 inplace=False,
3866 errors="raise",
3867 ):
3868 """
3869 Drop specified labels from rows or columns.
3871 Remove rows or columns by specifying label names and corresponding
3872 axis, or by specifying directly index or column names. When using a
3873 multi-index, labels on different levels can be removed by specifying
3874 the level.
3876 Parameters
3877 ----------
3878 labels : single label or list-like
3879 Index or column labels to drop.
3880 axis : {0 or 'index', 1 or 'columns'}, default 0
3881 Whether to drop labels from the index (0 or 'index') or
3882 columns (1 or 'columns').
3883 index : single label or list-like
3884 Alternative to specifying axis (``labels, axis=0``
3885 is equivalent to ``index=labels``).
3887 .. versionadded:: 0.21.0
3888 columns : single label or list-like
3889 Alternative to specifying axis (``labels, axis=1``
3890 is equivalent to ``columns=labels``).
3892 .. versionadded:: 0.21.0
3893 level : int or level name, optional
3894 For MultiIndex, level from which the labels will be removed.
3895 inplace : bool, default False
3896 If True, do operation inplace and return None.
3897 errors : {'ignore', 'raise'}, default 'raise'
3898 If 'ignore', suppress error and only existing labels are
3899 dropped.
3901 Returns
3902 -------
3903 DataFrame
3904 DataFrame without the removed index or column labels.
3906 Raises
3907 ------
3908 KeyError
3909 If any of the labels is not found in the selected axis.
3911 See Also
3912 --------
3913 DataFrame.loc : Label-location based indexer for selection by label.
3914 DataFrame.dropna : Return DataFrame with labels on given axis omitted
3915 where (all or any) data are missing.
3916 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
3917 removed, optionally only considering certain columns.
3918 Series.drop : Return Series with specified index labels removed.
3920 Examples
3921 --------
3922 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
3923 ... columns=['A', 'B', 'C', 'D'])
3924 >>> df
3925 A B C D
3926 0 0 1 2 3
3927 1 4 5 6 7
3928 2 8 9 10 11
3930 Drop columns
3932 >>> df.drop(['B', 'C'], axis=1)
3933 A D
3934 0 0 3
3935 1 4 7
3936 2 8 11
3938 >>> df.drop(columns=['B', 'C'])
3939 A D
3940 0 0 3
3941 1 4 7
3942 2 8 11
3944 Drop a row by index
3946 >>> df.drop([0, 1])
3947 A B C D
3948 2 8 9 10 11
3950 Drop columns and/or rows of MultiIndex DataFrame
3952 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
3953 ... ['speed', 'weight', 'length']],
3954 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
3955 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
3956 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
3957 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
3958 ... [250, 150], [1.5, 0.8], [320, 250],
3959 ... [1, 0.8], [0.3, 0.2]])
3960 >>> df
3961 big small
3962 lama speed 45.0 30.0
3963 weight 200.0 100.0
3964 length 1.5 1.0
3965 cow speed 30.0 20.0
3966 weight 250.0 150.0
3967 length 1.5 0.8
3968 falcon speed 320.0 250.0
3969 weight 1.0 0.8
3970 length 0.3 0.2
3972 >>> df.drop(index='cow', columns='small')
3973 big
3974 lama speed 45.0
3975 weight 200.0
3976 length 1.5
3977 falcon speed 320.0
3978 weight 1.0
3979 length 0.3
3981 >>> df.drop(index='length', level=1)
3982 big small
3983 lama speed 45.0 30.0
3984 weight 200.0 100.0
3985 cow speed 30.0 20.0
3986 weight 250.0 150.0
3987 falcon speed 320.0 250.0
3988 weight 1.0 0.8
3989 """
3990 return super().drop(
3991 labels=labels,
3992 axis=axis,
3993 index=index,
3994 columns=columns,
3995 level=level,
3996 inplace=inplace,
3997 errors=errors,
3998 )
4000 @rewrite_axis_style_signature(
4001 "mapper",
4002 [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
4003 )
4004 def rename(
4005 self,
4006 mapper: Optional[Renamer] = None,
4007 *,
4008 index: Optional[Renamer] = None,
4009 columns: Optional[Renamer] = None,
4010 axis: Optional[Axis] = None,
4011 copy: bool = True,
4012 inplace: bool = False,
4013 level: Optional[Level] = None,
4014 errors: str = "ignore",
4015 ) -> Optional["DataFrame"]:
4017 """
4018 Alter axes labels.
4020 Function / dict values must be unique (1-to-1). Labels not contained in
4021 a dict / Series will be left as-is. Extra labels listed don't throw an
4022 error.
4024 See the :ref:`user guide <basics.rename>` for more.
4026 Parameters
4027 ----------
4028 mapper : dict-like or function
4029 Dict-like or functions transformations to apply to
4030 that axis' values. Use either ``mapper`` and ``axis`` to
4031 specify the axis to target with ``mapper``, or ``index`` and
4032 ``columns``.
4033 index : dict-like or function
4034 Alternative to specifying axis (``mapper, axis=0``
4035 is equivalent to ``index=mapper``).
4036 columns : dict-like or function
4037 Alternative to specifying axis (``mapper, axis=1``
4038 is equivalent to ``columns=mapper``).
4039 axis : int or str
4040 Axis to target with ``mapper``. Can be either the axis name
4041 ('index', 'columns') or number (0, 1). The default is 'index'.
4042 copy : bool, default True
4043 Also copy underlying data.
4044 inplace : bool, default False
4045 Whether to return a new DataFrame. If True then value of copy is
4046 ignored.
4047 level : int or level name, default None
4048 In case of a MultiIndex, only rename labels in the specified
4049 level.
4050 errors : {'ignore', 'raise'}, default 'ignore'
4051 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
4052 or `columns` contains labels that are not present in the Index
4053 being transformed.
4054 If 'ignore', existing keys will be renamed and extra keys will be
4055 ignored.
4057 Returns
4058 -------
4059 DataFrame
4060 DataFrame with the renamed axis labels.
4062 Raises
4063 ------
4064 KeyError
4065 If any of the labels is not found in the selected axis and
4066 "errors='raise'".
4068 See Also
4069 --------
4070 DataFrame.rename_axis : Set the name of the axis.
4072 Examples
4073 --------
4075 ``DataFrame.rename`` supports two calling conventions
4077 * ``(index=index_mapper, columns=columns_mapper, ...)``
4078 * ``(mapper, axis={'index', 'columns'}, ...)``
4080 We *highly* recommend using keyword arguments to clarify your
4081 intent.
4083 Rename columns using a mapping:
4085 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
4086 >>> df.rename(columns={"A": "a", "B": "c"})
4087 a c
4088 0 1 4
4089 1 2 5
4090 2 3 6
4092 Rename index using a mapping:
4094 >>> df.rename(index={0: "x", 1: "y", 2: "z"})
4095 A B
4096 x 1 4
4097 y 2 5
4098 z 3 6
4100 Cast index labels to a different type:
4102 >>> df.index
4103 RangeIndex(start=0, stop=3, step=1)
4104 >>> df.rename(index=str).index
4105 Index(['0', '1', '2'], dtype='object')
4107 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
4108 Traceback (most recent call last):
4109 KeyError: ['C'] not found in axis
4111 Using axis-style parameters
4113 >>> df.rename(str.lower, axis='columns')
4114 a b
4115 0 1 4
4116 1 2 5
4117 2 3 6
4119 >>> df.rename({1: 2, 2: 4}, axis='index')
4120 A B
4121 0 1 4
4122 2 2 5
4123 4 3 6
4124 """
4125 return super().rename(
4126 mapper=mapper,
4127 index=index,
4128 columns=columns,
4129 axis=axis,
4130 copy=copy,
4131 inplace=inplace,
4132 level=level,
4133 errors=errors,
4134 )
4136 @Substitution(**_shared_doc_kwargs)
4137 @Appender(NDFrame.fillna.__doc__)
4138 def fillna(
4139 self,
4140 value=None,
4141 method=None,
4142 axis=None,
4143 inplace=False,
4144 limit=None,
4145 downcast=None,
4146 ) -> Optional["DataFrame"]:
4147 return super().fillna(
4148 value=value,
4149 method=method,
4150 axis=axis,
4151 inplace=inplace,
4152 limit=limit,
4153 downcast=downcast,
4154 )
4156 @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
4157 def replace(
4158 self,
4159 to_replace=None,
4160 value=None,
4161 inplace=False,
4162 limit=None,
4163 regex=False,
4164 method="pad",
4165 ):
4166 return super().replace(
4167 to_replace=to_replace,
4168 value=value,
4169 inplace=inplace,
4170 limit=limit,
4171 regex=regex,
4172 method=method,
4173 )
4175 @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
4176 def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame":
4177 return super().shift(
4178 periods=periods, freq=freq, axis=axis, fill_value=fill_value
4179 )
4181 def set_index(
4182 self, keys, drop=True, append=False, inplace=False, verify_integrity=False
4183 ):
4184 """
4185 Set the DataFrame index using existing columns.
4187 Set the DataFrame index (row labels) using one or more existing
4188 columns or arrays (of the correct length). The index can replace the
4189 existing index or expand on it.
4191 Parameters
4192 ----------
4193 keys : label or array-like or list of labels/arrays
4194 This parameter can be either a single column key, a single array of
4195 the same length as the calling DataFrame, or a list containing an
4196 arbitrary combination of column keys and arrays. Here, "array"
4197 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
4198 instances of :class:`~collections.abc.Iterator`.
4199 drop : bool, default True
4200 Delete columns to be used as the new index.
4201 append : bool, default False
4202 Whether to append columns to existing index.
4203 inplace : bool, default False
4204 Modify the DataFrame in place (do not create a new object).
4205 verify_integrity : bool, default False
4206 Check the new index for duplicates. Otherwise defer the check until
4207 necessary. Setting to False will improve the performance of this
4208 method.
4210 Returns
4211 -------
4212 DataFrame
4213 Changed row labels.
4215 See Also
4216 --------
4217 DataFrame.reset_index : Opposite of set_index.
4218 DataFrame.reindex : Change to new indices or expand indices.
4219 DataFrame.reindex_like : Change to same indices as other DataFrame.
4221 Examples
4222 --------
4223 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
4224 ... 'year': [2012, 2014, 2013, 2014],
4225 ... 'sale': [55, 40, 84, 31]})
4226 >>> df
4227 month year sale
4228 0 1 2012 55
4229 1 4 2014 40
4230 2 7 2013 84
4231 3 10 2014 31
4233 Set the index to become the 'month' column:
4235 >>> df.set_index('month')
4236 year sale
4237 month
4238 1 2012 55
4239 4 2014 40
4240 7 2013 84
4241 10 2014 31
4243 Create a MultiIndex using columns 'year' and 'month':
4245 >>> df.set_index(['year', 'month'])
4246 sale
4247 year month
4248 2012 1 55
4249 2014 4 40
4250 2013 7 84
4251 2014 10 31
4253 Create a MultiIndex using an Index and a column:
4255 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
4256 month sale
4257 year
4258 1 2012 1 55
4259 2 2014 4 40
4260 3 2013 7 84
4261 4 2014 10 31
4263 Create a MultiIndex using two Series:
4265 >>> s = pd.Series([1, 2, 3, 4])
4266 >>> df.set_index([s, s**2])
4267 month year sale
4268 1 1 1 2012 55
4269 2 4 4 2014 40
4270 3 9 7 2013 84
4271 4 16 10 2014 31
4272 """
4273 inplace = validate_bool_kwarg(inplace, "inplace")
4274 if not isinstance(keys, list):
4275 keys = [keys]
4277 err_msg = (
4278 'The parameter "keys" may be a column key, one-dimensional '
4279 "array, or a list containing only valid column keys and "
4280 "one-dimensional arrays."
4281 )
4283 missing: List[Optional[Hashable]] = []
4284 for col in keys:
4285 if isinstance(
4286 col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
4287 ):
4288 # arrays are fine as long as they are one-dimensional
4289 # iterators get converted to list below
4290 if getattr(col, "ndim", 1) != 1:
4291 raise ValueError(err_msg)
4292 else:
4293 # everything else gets tried as a key; see GH 24969
4294 try:
4295 found = col in self.columns
4296 except TypeError:
4297 raise TypeError(f"{err_msg}. Received column of type {type(col)}")
4298 else:
4299 if not found:
4300 missing.append(col)
4302 if missing:
4303 raise KeyError(f"None of {missing} are in the columns")
4305 if inplace:
4306 frame = self
4307 else:
4308 frame = self.copy()
4310 arrays = []
4311 names = []
4312 if append:
4313 names = list(self.index.names)
4314 if isinstance(self.index, ABCMultiIndex):
4315 for i in range(self.index.nlevels):
4316 arrays.append(self.index._get_level_values(i))
4317 else:
4318 arrays.append(self.index)
4320 to_remove: List[Optional[Hashable]] = []
4321 for col in keys:
4322 if isinstance(col, ABCMultiIndex):
4323 for n in range(col.nlevels):
4324 arrays.append(col._get_level_values(n))
4325 names.extend(col.names)
4326 elif isinstance(col, (ABCIndexClass, ABCSeries)):
4327 # if Index then not MultiIndex (treated above)
4328 arrays.append(col)
4329 names.append(col.name)
4330 elif isinstance(col, (list, np.ndarray)):
4331 arrays.append(col)
4332 names.append(None)
4333 elif isinstance(col, abc.Iterator):
4334 arrays.append(list(col))
4335 names.append(None)
4336 # from here, col can only be a column label
4337 else:
4338 arrays.append(frame[col]._values)
4339 names.append(col)
4340 if drop:
4341 to_remove.append(col)
4343 if len(arrays[-1]) != len(self):
4344 # check newest element against length of calling frame, since
4345 # ensure_index_from_sequences would not raise for append=False.
4346 raise ValueError(
4347 f"Length mismatch: Expected {len(self)} rows, "
4348 f"received array of length {len(arrays[-1])}"
4349 )
4351 index = ensure_index_from_sequences(arrays, names)
4353 if verify_integrity and not index.is_unique:
4354 duplicates = index[index.duplicated()].unique()
4355 raise ValueError(f"Index has duplicate keys: {duplicates}")
4357 # use set to handle duplicate column names gracefully in case of drop
4358 for c in set(to_remove):
4359 del frame[c]
4361 # clear up memory usage
4362 index._cleanup()
4364 frame.index = index
4366 if not inplace:
4367 return frame
4369 def reset_index(
4370 self,
4371 level: Optional[Union[Hashable, Sequence[Hashable]]] = None,
4372 drop: bool = False,
4373 inplace: bool = False,
4374 col_level: Hashable = 0,
4375 col_fill: Optional[Hashable] = "",
4376 ) -> Optional["DataFrame"]:
4377 """
4378 Reset the index, or a level of it.
4380 Reset the index of the DataFrame, and use the default one instead.
4381 If the DataFrame has a MultiIndex, this method can remove one or more
4382 levels.
4384 Parameters
4385 ----------
4386 level : int, str, tuple, or list, default None
4387 Only remove the given levels from the index. Removes all levels by
4388 default.
4389 drop : bool, default False
4390 Do not try to insert index into dataframe columns. This resets
4391 the index to the default integer index.
4392 inplace : bool, default False
4393 Modify the DataFrame in place (do not create a new object).
4394 col_level : int or str, default 0
4395 If the columns have multiple levels, determines which level the
4396 labels are inserted into. By default it is inserted into the first
4397 level.
4398 col_fill : object, default ''
4399 If the columns have multiple levels, determines how the other
4400 levels are named. If None then the index name is repeated.
4402 Returns
4403 -------
4404 DataFrame or None
4405 DataFrame with the new index or None if ``inplace=True``.
4407 See Also
4408 --------
4409 DataFrame.set_index : Opposite of reset_index.
4410 DataFrame.reindex : Change to new indices or expand indices.
4411 DataFrame.reindex_like : Change to same indices as other DataFrame.
4413 Examples
4414 --------
4415 >>> df = pd.DataFrame([('bird', 389.0),
4416 ... ('bird', 24.0),
4417 ... ('mammal', 80.5),
4418 ... ('mammal', np.nan)],
4419 ... index=['falcon', 'parrot', 'lion', 'monkey'],
4420 ... columns=('class', 'max_speed'))
4421 >>> df
4422 class max_speed
4423 falcon bird 389.0
4424 parrot bird 24.0
4425 lion mammal 80.5
4426 monkey mammal NaN
4428 When we reset the index, the old index is added as a column, and a
4429 new sequential index is used:
4431 >>> df.reset_index()
4432 index class max_speed
4433 0 falcon bird 389.0
4434 1 parrot bird 24.0
4435 2 lion mammal 80.5
4436 3 monkey mammal NaN
4438 We can use the `drop` parameter to avoid the old index being added as
4439 a column:
4441 >>> df.reset_index(drop=True)
4442 class max_speed
4443 0 bird 389.0
4444 1 bird 24.0
4445 2 mammal 80.5
4446 3 mammal NaN
4448 You can also use `reset_index` with `MultiIndex`.
4450 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
4451 ... ('bird', 'parrot'),
4452 ... ('mammal', 'lion'),
4453 ... ('mammal', 'monkey')],
4454 ... names=['class', 'name'])
4455 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
4456 ... ('species', 'type')])
4457 >>> df = pd.DataFrame([(389.0, 'fly'),
4458 ... ( 24.0, 'fly'),
4459 ... ( 80.5, 'run'),
4460 ... (np.nan, 'jump')],
4461 ... index=index,
4462 ... columns=columns)
4463 >>> df
4464 speed species
4465 max type
4466 class name
4467 bird falcon 389.0 fly
4468 parrot 24.0 fly
4469 mammal lion 80.5 run
4470 monkey NaN jump
4472 If the index has multiple levels, we can reset a subset of them:
4474 >>> df.reset_index(level='class')
4475 class speed species
4476 max type
4477 name
4478 falcon bird 389.0 fly
4479 parrot bird 24.0 fly
4480 lion mammal 80.5 run
4481 monkey mammal NaN jump
4483 If we are not dropping the index, by default, it is placed in the top
4484 level. We can place it in another level:
4486 >>> df.reset_index(level='class', col_level=1)
4487 speed species
4488 class max type
4489 name
4490 falcon bird 389.0 fly
4491 parrot bird 24.0 fly
4492 lion mammal 80.5 run
4493 monkey mammal NaN jump
4495 When the index is inserted under another level, we can specify under
4496 which one with the parameter `col_fill`:
4498 >>> df.reset_index(level='class', col_level=1, col_fill='species')
4499 species speed species
4500 class max type
4501 name
4502 falcon bird 389.0 fly
4503 parrot bird 24.0 fly
4504 lion mammal 80.5 run
4505 monkey mammal NaN jump
4507 If we specify a nonexistent level for `col_fill`, it is created:
4509 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
4510 genus speed species
4511 class max type
4512 name
4513 falcon bird 389.0 fly
4514 parrot bird 24.0 fly
4515 lion mammal 80.5 run
4516 monkey mammal NaN jump
4517 """
4518 inplace = validate_bool_kwarg(inplace, "inplace")
4519 if inplace:
4520 new_obj = self
4521 else:
4522 new_obj = self.copy()
4524 def _maybe_casted_values(index, labels=None):
4525 values = index._values
4526 if not isinstance(index, (PeriodIndex, DatetimeIndex)):
4527 if values.dtype == np.object_:
4528 values = lib.maybe_convert_objects(values)
4530 # if we have the labels, extract the values with a mask
4531 if labels is not None:
4532 mask = labels == -1
4534 # we can have situations where the whole mask is -1,
4535 # meaning there is nothing found in labels, so make all nan's
4536 if mask.all():
4537 values = np.empty(len(mask))
4538 values.fill(np.nan)
4539 else:
4540 values = values.take(labels)
4542 # TODO(https://github.com/pandas-dev/pandas/issues/24206)
4543 # Push this into maybe_upcast_putmask?
4544 # We can't pass EAs there right now. Looks a bit
4545 # complicated.
4546 # So we unbox the ndarray_values, op, re-box.
4547 values_type = type(values)
4548 values_dtype = values.dtype
4550 if issubclass(values_type, DatetimeLikeArray):
4551 values = values._data
4553 if mask.any():
4554 values, _ = maybe_upcast_putmask(values, mask, np.nan)
4556 if issubclass(values_type, DatetimeLikeArray):
4557 values = values_type(values, dtype=values_dtype)
4559 return values
4561 new_index = ibase.default_index(len(new_obj))
4562 if level is not None:
4563 if not isinstance(level, (tuple, list)):
4564 level = [level]
4565 level = [self.index._get_level_number(lev) for lev in level]
4566 if len(level) < self.index.nlevels:
4567 new_index = self.index.droplevel(level)
4569 if not drop:
4570 to_insert: Iterable[Tuple[Any, Optional[Any]]]
4571 if isinstance(self.index, ABCMultiIndex):
4572 names = [
4573 (n if n is not None else f"level_{i}")
4574 for i, n in enumerate(self.index.names)
4575 ]
4576 to_insert = zip(self.index.levels, self.index.codes)
4577 else:
4578 default = "index" if "index" not in self else "level_0"
4579 names = [default] if self.index.name is None else [self.index.name]
4580 to_insert = ((self.index, None),)
4582 multi_col = isinstance(self.columns, ABCMultiIndex)
4583 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
4584 if not (level is None or i in level):
4585 continue
4586 name = names[i]
4587 if multi_col:
4588 col_name = list(name) if isinstance(name, tuple) else [name]
4589 if col_fill is None:
4590 if len(col_name) not in (1, self.columns.nlevels):
4591 raise ValueError(
4592 "col_fill=None is incompatible "
4593 f"with incomplete column name {name}"
4594 )
4595 col_fill = col_name[0]
4597 lev_num = self.columns._get_level_number(col_level)
4598 name_lst = [col_fill] * lev_num + col_name
4599 missing = self.columns.nlevels - len(name_lst)
4600 name_lst += [col_fill] * missing
4601 name = tuple(name_lst)
4602 # to ndarray and maybe infer different dtype
4603 level_values = _maybe_casted_values(lev, lab)
4604 new_obj.insert(0, name, level_values)
4606 new_obj.index = new_index
4607 if not inplace:
4608 return new_obj
4610 return None
4612 # ----------------------------------------------------------------------
4613 # Reindex-based selection methods
4615 @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
4616 def isna(self) -> "DataFrame":
4617 return super().isna()
4619 @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
4620 def isnull(self) -> "DataFrame":
4621 return super().isnull()
4623 @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
4624 def notna(self) -> "DataFrame":
4625 return super().notna()
4627 @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
4628 def notnull(self) -> "DataFrame":
4629 return super().notnull()
4631 def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
4632 """
4633 Remove missing values.
4635 See the :ref:`User Guide <missing_data>` for more on which values are
4636 considered missing, and how to work with missing data.
4638 Parameters
4639 ----------
4640 axis : {0 or 'index', 1 or 'columns'}, default 0
4641 Determine if rows or columns which contain missing values are
4642 removed.
4644 * 0, or 'index' : Drop rows which contain missing values.
4645 * 1, or 'columns' : Drop columns which contain missing value.
4647 .. versionchanged:: 1.0.0
4649 Pass tuple or list to drop on multiple axes.
4650 Only a single axis is allowed.
4652 how : {'any', 'all'}, default 'any'
4653 Determine if row or column is removed from DataFrame, when we have
4654 at least one NA or all NA.
4656 * 'any' : If any NA values are present, drop that row or column.
4657 * 'all' : If all values are NA, drop that row or column.
4659 thresh : int, optional
4660 Require that many non-NA values.
4661 subset : array-like, optional
4662 Labels along other axis to consider, e.g. if you are dropping rows
4663 these would be a list of columns to include.
4664 inplace : bool, default False
4665 If True, do operation inplace and return None.
4667 Returns
4668 -------
4669 DataFrame
4670 DataFrame with NA entries dropped from it.
4672 See Also
4673 --------
4674 DataFrame.isna: Indicate missing values.
4675 DataFrame.notna : Indicate existing (non-missing) values.
4676 DataFrame.fillna : Replace missing values.
4677 Series.dropna : Drop missing values.
4678 Index.dropna : Drop missing indices.
4680 Examples
4681 --------
4682 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
4683 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
4684 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
4685 ... pd.NaT]})
4686 >>> df
4687 name toy born
4688 0 Alfred NaN NaT
4689 1 Batman Batmobile 1940-04-25
4690 2 Catwoman Bullwhip NaT
4692 Drop the rows where at least one element is missing.
4694 >>> df.dropna()
4695 name toy born
4696 1 Batman Batmobile 1940-04-25
4698 Drop the columns where at least one element is missing.
4700 >>> df.dropna(axis='columns')
4701 name
4702 0 Alfred
4703 1 Batman
4704 2 Catwoman
4706 Drop the rows where all elements are missing.
4708 >>> df.dropna(how='all')
4709 name toy born
4710 0 Alfred NaN NaT
4711 1 Batman Batmobile 1940-04-25
4712 2 Catwoman Bullwhip NaT
4714 Keep only the rows with at least 2 non-NA values.
4716 >>> df.dropna(thresh=2)
4717 name toy born
4718 1 Batman Batmobile 1940-04-25
4719 2 Catwoman Bullwhip NaT
4721 Define in which columns to look for missing values.
4723 >>> df.dropna(subset=['name', 'born'])
4724 name toy born
4725 1 Batman Batmobile 1940-04-25
4727 Keep the DataFrame with valid entries in the same variable.
4729 >>> df.dropna(inplace=True)
4730 >>> df
4731 name toy born
4732 1 Batman Batmobile 1940-04-25
4733 """
4734 inplace = validate_bool_kwarg(inplace, "inplace")
4735 if isinstance(axis, (tuple, list)):
4736 # GH20987
4737 raise TypeError("supplying multiple axes to axis is no longer supported.")
4739 axis = self._get_axis_number(axis)
4740 agg_axis = 1 - axis
4742 agg_obj = self
4743 if subset is not None:
4744 ax = self._get_axis(agg_axis)
4745 indices = ax.get_indexer_for(subset)
4746 check = indices == -1
4747 if check.any():
4748 raise KeyError(list(np.compress(check, subset)))
4749 agg_obj = self.take(indices, axis=agg_axis)
4751 count = agg_obj.count(axis=agg_axis)
4753 if thresh is not None:
4754 mask = count >= thresh
4755 elif how == "any":
4756 mask = count == len(agg_obj._get_axis(agg_axis))
4757 elif how == "all":
4758 mask = count > 0
4759 else:
4760 if how is not None:
4761 raise ValueError(f"invalid how option: {how}")
4762 else:
4763 raise TypeError("must specify how or thresh")
4765 result = self.loc(axis=axis)[mask]
4767 if inplace:
4768 self._update_inplace(result)
4769 else:
4770 return result
4772 def drop_duplicates(
4773 self,
4774 subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
4775 keep: Union[str, bool] = "first",
4776 inplace: bool = False,
4777 ignore_index: bool = False,
4778 ) -> Optional["DataFrame"]:
4779 """
4780 Return DataFrame with duplicate rows removed.
4782 Considering certain columns is optional. Indexes, including time indexes
4783 are ignored.
4785 Parameters
4786 ----------
4787 subset : column label or sequence of labels, optional
4788 Only consider certain columns for identifying duplicates, by
4789 default use all of the columns.
4790 keep : {'first', 'last', False}, default 'first'
4791 Determines which duplicates (if any) to keep.
4792 - ``first`` : Drop duplicates except for the first occurrence.
4793 - ``last`` : Drop duplicates except for the last occurrence.
4794 - False : Drop all duplicates.
4795 inplace : bool, default False
4796 Whether to drop duplicates in place or to return a copy.
4797 ignore_index : bool, default False
4798 If True, the resulting axis will be labeled 0, 1, …, n - 1.
4800 .. versionadded:: 1.0.0
4802 Returns
4803 -------
4804 DataFrame
4805 DataFrame with duplicates removed or None if ``inplace=True``.
4806 """
4807 if self.empty:
4808 return self.copy()
4810 inplace = validate_bool_kwarg(inplace, "inplace")
4811 duplicated = self.duplicated(subset, keep=keep)
4813 if inplace:
4814 (inds,) = (-duplicated)._ndarray_values.nonzero()
4815 new_data = self._data.take(inds)
4817 if ignore_index:
4818 new_data.axes[1] = ibase.default_index(len(inds))
4819 self._update_inplace(new_data)
4820 else:
4821 result = self[-duplicated]
4823 if ignore_index:
4824 result.index = ibase.default_index(len(result))
4825 return result
4827 return None
4829 def duplicated(
4830 self,
4831 subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
4832 keep: Union[str, bool] = "first",
4833 ) -> "Series":
4834 """
4835 Return boolean Series denoting duplicate rows.
4837 Considering certain columns is optional.
4839 Parameters
4840 ----------
4841 subset : column label or sequence of labels, optional
4842 Only consider certain columns for identifying duplicates, by
4843 default use all of the columns.
4844 keep : {'first', 'last', False}, default 'first'
4845 Determines which duplicates (if any) to mark.
4847 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
4848 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
4849 - False : Mark all duplicates as ``True``.
4851 Returns
4852 -------
4853 Series
4854 """
4855 from pandas.core.sorting import get_group_index
4856 from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
4858 if self.empty:
4859 return Series(dtype=bool)
4861 def f(vals):
4862 labels, shape = algorithms.factorize(
4863 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
4864 )
4865 return labels.astype("i8", copy=False), len(shape)
4867 if subset is None:
4868 subset = self.columns
4869 elif (
4870 not np.iterable(subset)
4871 or isinstance(subset, str)
4872 or isinstance(subset, tuple)
4873 and subset in self.columns
4874 ):
4875 subset = (subset,)
4877 # needed for mypy since can't narrow types using np.iterable
4878 subset = cast(Iterable, subset)
4880 # Verify all columns in subset exist in the queried dataframe
4881 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
4882 # key that doesn't exist.
4883 diff = Index(subset).difference(self.columns)
4884 if not diff.empty:
4885 raise KeyError(diff)
4887 vals = (col.values for name, col in self.items() if name in subset)
4888 labels, shape = map(list, zip(*map(f, vals)))
4890 ids = get_group_index(labels, shape, sort=False, xnull=False)
4891 return Series(duplicated_int64(ids, keep), index=self.index)
4893 # ----------------------------------------------------------------------
4894 # Sorting
4896 @Substitution(**_shared_doc_kwargs)
4897 @Appender(NDFrame.sort_values.__doc__)
4898 def sort_values(
4899 self,
4900 by,
4901 axis=0,
4902 ascending=True,
4903 inplace=False,
4904 kind="quicksort",
4905 na_position="last",
4906 ignore_index=False,
4907 ):
4908 inplace = validate_bool_kwarg(inplace, "inplace")
4909 axis = self._get_axis_number(axis)
4911 if not isinstance(by, list):
4912 by = [by]
4913 if is_sequence(ascending) and len(by) != len(ascending):
4914 raise ValueError(
4915 f"Length of ascending ({len(ascending)}) != length of by ({len(by)})"
4916 )
4917 if len(by) > 1:
4918 from pandas.core.sorting import lexsort_indexer
4920 keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
4921 indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
4922 indexer = ensure_platform_int(indexer)
4923 else:
4924 from pandas.core.sorting import nargsort
4926 by = by[0]
4927 k = self._get_label_or_level_values(by, axis=axis)
4929 if isinstance(ascending, (tuple, list)):
4930 ascending = ascending[0]
4932 indexer = nargsort(
4933 k, kind=kind, ascending=ascending, na_position=na_position
4934 )
4936 new_data = self._data.take(
4937 indexer, axis=self._get_block_manager_axis(axis), verify=False
4938 )
4940 if ignore_index:
4941 new_data.axes[1] = ibase.default_index(len(indexer))
4943 if inplace:
4944 return self._update_inplace(new_data)
4945 else:
4946 return self._constructor(new_data).__finalize__(self)
4948 @Substitution(**_shared_doc_kwargs)
4949 @Appender(NDFrame.sort_index.__doc__)
4950 def sort_index(
4951 self,
4952 axis=0,
4953 level=None,
4954 ascending=True,
4955 inplace=False,
4956 kind="quicksort",
4957 na_position="last",
4958 sort_remaining=True,
4959 ignore_index: bool = False,
4960 ):
4962 # TODO: this can be combined with Series.sort_index impl as
4963 # almost identical
4965 inplace = validate_bool_kwarg(inplace, "inplace")
4967 axis = self._get_axis_number(axis)
4968 labels = self._get_axis(axis)
4970 # make sure that the axis is lexsorted to start
4971 # if not we need to reconstruct to get the correct indexer
4972 labels = labels._sort_levels_monotonic()
4973 if level is not None:
4975 new_axis, indexer = labels.sortlevel(
4976 level, ascending=ascending, sort_remaining=sort_remaining
4977 )
4979 elif isinstance(labels, ABCMultiIndex):
4980 from pandas.core.sorting import lexsort_indexer
4982 indexer = lexsort_indexer(
4983 labels._get_codes_for_sorting(),
4984 orders=ascending,
4985 na_position=na_position,
4986 )
4987 else:
4988 from pandas.core.sorting import nargsort
4990 # Check monotonic-ness before sort an index
4991 # GH11080
4992 if (ascending and labels.is_monotonic_increasing) or (
4993 not ascending and labels.is_monotonic_decreasing
4994 ):
4995 if inplace:
4996 return
4997 else:
4998 return self.copy()
5000 indexer = nargsort(
5001 labels, kind=kind, ascending=ascending, na_position=na_position
5002 )
5004 baxis = self._get_block_manager_axis(axis)
5005 new_data = self._data.take(indexer, axis=baxis, verify=False)
5007 # reconstruct axis if needed
5008 new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
5010 if ignore_index:
5011 new_data.axes[1] = ibase.default_index(len(indexer))
5013 if inplace:
5014 return self._update_inplace(new_data)
5015 else:
5016 return self._constructor(new_data).__finalize__(self)
5018 def nlargest(self, n, columns, keep="first") -> "DataFrame":
5019 """
5020 Return the first `n` rows ordered by `columns` in descending order.
5022 Return the first `n` rows with the largest values in `columns`, in
5023 descending order. The columns that are not specified are returned as
5024 well, but not used for ordering.
5026 This method is equivalent to
5027 ``df.sort_values(columns, ascending=False).head(n)``, but more
5028 performant.
5030 Parameters
5031 ----------
5032 n : int
5033 Number of rows to return.
5034 columns : label or list of labels
5035 Column label(s) to order by.
5036 keep : {'first', 'last', 'all'}, default 'first'
5037 Where there are duplicate values:
5039 - `first` : prioritize the first occurrence(s)
5040 - `last` : prioritize the last occurrence(s)
5041 - ``all`` : do not drop any duplicates, even it means
5042 selecting more than `n` items.
5044 .. versionadded:: 0.24.0
5046 Returns
5047 -------
5048 DataFrame
5049 The first `n` rows ordered by the given columns in descending
5050 order.
5052 See Also
5053 --------
5054 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
5055 ascending order.
5056 DataFrame.sort_values : Sort DataFrame by the values.
5057 DataFrame.head : Return the first `n` rows without re-ordering.
5059 Notes
5060 -----
5061 This function cannot be used with all column types. For example, when
5062 specifying columns with `object` or `category` dtypes, ``TypeError`` is
5063 raised.
5065 Examples
5066 --------
5067 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
5068 ... 434000, 434000, 337000, 11300,
5069 ... 11300, 11300],
5070 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
5071 ... 17036, 182, 38, 311],
5072 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
5073 ... "IS", "NR", "TV", "AI"]},
5074 ... index=["Italy", "France", "Malta",
5075 ... "Maldives", "Brunei", "Iceland",
5076 ... "Nauru", "Tuvalu", "Anguilla"])
5077 >>> df
5078 population GDP alpha-2
5079 Italy 59000000 1937894 IT
5080 France 65000000 2583560 FR
5081 Malta 434000 12011 MT
5082 Maldives 434000 4520 MV
5083 Brunei 434000 12128 BN
5084 Iceland 337000 17036 IS
5085 Nauru 11300 182 NR
5086 Tuvalu 11300 38 TV
5087 Anguilla 11300 311 AI
5089 In the following example, we will use ``nlargest`` to select the three
5090 rows having the largest values in column "population".
5092 >>> df.nlargest(3, 'population')
5093 population GDP alpha-2
5094 France 65000000 2583560 FR
5095 Italy 59000000 1937894 IT
5096 Malta 434000 12011 MT
5098 When using ``keep='last'``, ties are resolved in reverse order:
5100 >>> df.nlargest(3, 'population', keep='last')
5101 population GDP alpha-2
5102 France 65000000 2583560 FR
5103 Italy 59000000 1937894 IT
5104 Brunei 434000 12128 BN
5106 When using ``keep='all'``, all duplicate items are maintained:
5108 >>> df.nlargest(3, 'population', keep='all')
5109 population GDP alpha-2
5110 France 65000000 2583560 FR
5111 Italy 59000000 1937894 IT
5112 Malta 434000 12011 MT
5113 Maldives 434000 4520 MV
5114 Brunei 434000 12128 BN
5116 To order by the largest values in column "population" and then "GDP",
5117 we can specify multiple columns like in the next example.
5119 >>> df.nlargest(3, ['population', 'GDP'])
5120 population GDP alpha-2
5121 France 65000000 2583560 FR
5122 Italy 59000000 1937894 IT
5123 Brunei 434000 12128 BN
5124 """
5125 return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
5127 def nsmallest(self, n, columns, keep="first") -> "DataFrame":
5128 """
5129 Return the first `n` rows ordered by `columns` in ascending order.
5131 Return the first `n` rows with the smallest values in `columns`, in
5132 ascending order. The columns that are not specified are returned as
5133 well, but not used for ordering.
5135 This method is equivalent to
5136 ``df.sort_values(columns, ascending=True).head(n)``, but more
5137 performant.
5139 Parameters
5140 ----------
5141 n : int
5142 Number of items to retrieve.
5143 columns : list or str
5144 Column name or names to order by.
5145 keep : {'first', 'last', 'all'}, default 'first'
5146 Where there are duplicate values:
5148 - ``first`` : take the first occurrence.
5149 - ``last`` : take the last occurrence.
5150 - ``all`` : do not drop any duplicates, even it means
5151 selecting more than `n` items.
5153 .. versionadded:: 0.24.0
5155 Returns
5156 -------
5157 DataFrame
5159 See Also
5160 --------
5161 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
5162 descending order.
5163 DataFrame.sort_values : Sort DataFrame by the values.
5164 DataFrame.head : Return the first `n` rows without re-ordering.
5166 Examples
5167 --------
5168 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
5169 ... 434000, 434000, 337000, 11300,
5170 ... 11300, 11300],
5171 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
5172 ... 17036, 182, 38, 311],
5173 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
5174 ... "IS", "NR", "TV", "AI"]},
5175 ... index=["Italy", "France", "Malta",
5176 ... "Maldives", "Brunei", "Iceland",
5177 ... "Nauru", "Tuvalu", "Anguilla"])
5178 >>> df
5179 population GDP alpha-2
5180 Italy 59000000 1937894 IT
5181 France 65000000 2583560 FR
5182 Malta 434000 12011 MT
5183 Maldives 434000 4520 MV
5184 Brunei 434000 12128 BN
5185 Iceland 337000 17036 IS
5186 Nauru 11300 182 NR
5187 Tuvalu 11300 38 TV
5188 Anguilla 11300 311 AI
5190 In the following example, we will use ``nsmallest`` to select the
5191 three rows having the smallest values in column "a".
5193 >>> df.nsmallest(3, 'population')
5194 population GDP alpha-2
5195 Nauru 11300 182 NR
5196 Tuvalu 11300 38 TV
5197 Anguilla 11300 311 AI
5199 When using ``keep='last'``, ties are resolved in reverse order:
5201 >>> df.nsmallest(3, 'population', keep='last')
5202 population GDP alpha-2
5203 Anguilla 11300 311 AI
5204 Tuvalu 11300 38 TV
5205 Nauru 11300 182 NR
5207 When using ``keep='all'``, all duplicate items are maintained:
5209 >>> df.nsmallest(3, 'population', keep='all')
5210 population GDP alpha-2
5211 Nauru 11300 182 NR
5212 Tuvalu 11300 38 TV
5213 Anguilla 11300 311 AI
5215 To order by the largest values in column "a" and then "c", we can
5216 specify multiple columns like in the next example.
5218 >>> df.nsmallest(3, ['population', 'GDP'])
5219 population GDP alpha-2
5220 Tuvalu 11300 38 TV
5221 Nauru 11300 182 NR
5222 Anguilla 11300 311 AI
5223 """
5224 return algorithms.SelectNFrame(
5225 self, n=n, keep=keep, columns=columns
5226 ).nsmallest()
5228 def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame":
5229 """
5230 Swap levels i and j in a MultiIndex on a particular axis.
5232 Parameters
5233 ----------
5234 i, j : int or str
5235 Levels of the indices to be swapped. Can pass level name as string.
5237 Returns
5238 -------
5239 DataFrame
5240 """
5241 result = self.copy()
5243 axis = self._get_axis_number(axis)
5244 if axis == 0:
5245 result.index = result.index.swaplevel(i, j)
5246 else:
5247 result.columns = result.columns.swaplevel(i, j)
5248 return result
5250 def reorder_levels(self, order, axis=0) -> "DataFrame":
5251 """
5252 Rearrange index levels using input order. May not drop or duplicate levels.
5254 Parameters
5255 ----------
5256 order : list of int or list of str
5257 List representing new level order. Reference level by number
5258 (position) or by key (label).
5259 axis : int
5260 Where to reorder levels.
5262 Returns
5263 -------
5264 DataFrame
5265 """
5266 axis = self._get_axis_number(axis)
5267 if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover
5268 raise TypeError("Can only reorder levels on a hierarchical axis.")
5270 result = self.copy()
5272 if axis == 0:
5273 result.index = result.index.reorder_levels(order)
5274 else:
5275 result.columns = result.columns.reorder_levels(order)
5276 return result
5278 # ----------------------------------------------------------------------
5279 # Arithmetic / combination related
5281 def _combine_frame(self, other, func, fill_value=None, level=None):
5282 # at this point we have `self._indexed_same(other)`
5284 if fill_value is None:
5285 # since _arith_op may be called in a loop, avoid function call
5286 # overhead if possible by doing this check once
5287 _arith_op = func
5289 else:
5291 def _arith_op(left, right):
5292 # for the mixed_type case where we iterate over columns,
5293 # _arith_op(left, right) is equivalent to
5294 # left._binop(right, func, fill_value=fill_value)
5295 left, right = ops.fill_binop(left, right, fill_value)
5296 return func(left, right)
5298 if ops.should_series_dispatch(self, other, func):
5299 # iterate over columns
5300 new_data = ops.dispatch_to_series(self, other, _arith_op)
5301 else:
5302 with np.errstate(all="ignore"):
5303 res_values = _arith_op(self.values, other.values)
5304 new_data = dispatch_fill_zeros(func, self.values, other.values, res_values)
5306 return new_data
5308 def _combine_match_index(self, other, func):
5309 # at this point we have `self.index.equals(other.index)`
5311 if ops.should_series_dispatch(self, other, func):
5312 # operate column-wise; avoid costly object-casting in `.values`
5313 new_data = ops.dispatch_to_series(self, other, func)
5314 else:
5315 # fastpath --> operate directly on values
5316 with np.errstate(all="ignore"):
5317 new_data = func(self.values.T, other.values).T
5318 return new_data
5320 def _construct_result(self, result) -> "DataFrame":
5321 """
5322 Wrap the result of an arithmetic, comparison, or logical operation.
5324 Parameters
5325 ----------
5326 result : DataFrame
5328 Returns
5329 -------
5330 DataFrame
5331 """
5332 out = self._constructor(result, index=self.index, copy=False)
5333 # Pin columns instead of passing to constructor for compat with
5334 # non-unique columns case
5335 out.columns = self.columns
5336 return out
5338 def combine(
5339 self, other: "DataFrame", func, fill_value=None, overwrite=True
5340 ) -> "DataFrame":
5341 """
5342 Perform column-wise combine with another DataFrame.
5344 Combines a DataFrame with `other` DataFrame using `func`
5345 to element-wise combine columns. The row and column indexes of the
5346 resulting DataFrame will be the union of the two.
5348 Parameters
5349 ----------
5350 other : DataFrame
5351 The DataFrame to merge column-wise.
5352 func : function
5353 Function that takes two series as inputs and return a Series or a
5354 scalar. Used to merge the two dataframes column by columns.
5355 fill_value : scalar value, default None
5356 The value to fill NaNs with prior to passing any column to the
5357 merge func.
5358 overwrite : bool, default True
5359 If True, columns in `self` that do not exist in `other` will be
5360 overwritten with NaNs.
5362 Returns
5363 -------
5364 DataFrame
5365 Combination of the provided DataFrames.
5367 See Also
5368 --------
5369 DataFrame.combine_first : Combine two DataFrame objects and default to
5370 non-null values in frame calling the method.
5372 Examples
5373 --------
5374 Combine using a simple function that chooses the smaller column.
5376 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
5377 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
5378 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
5379 >>> df1.combine(df2, take_smaller)
5380 A B
5381 0 0 3
5382 1 0 3
5384 Example using a true element-wise combine function.
5386 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
5387 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
5388 >>> df1.combine(df2, np.minimum)
5389 A B
5390 0 1 2
5391 1 0 3
5393 Using `fill_value` fills Nones prior to passing the column to the
5394 merge function.
5396 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
5397 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
5398 >>> df1.combine(df2, take_smaller, fill_value=-5)
5399 A B
5400 0 0 -5.0
5401 1 0 4.0
5403 However, if the same element in both dataframes is None, that None
5404 is preserved
5406 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
5407 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
5408 >>> df1.combine(df2, take_smaller, fill_value=-5)
5409 A B
5410 0 0 -5.0
5411 1 0 3.0
5413 Example that demonstrates the use of `overwrite` and behavior when
5414 the axis differ between the dataframes.
5416 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
5417 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
5418 >>> df1.combine(df2, take_smaller)
5419 A B C
5420 0 NaN NaN NaN
5421 1 NaN 3.0 -10.0
5422 2 NaN 3.0 1.0
5424 >>> df1.combine(df2, take_smaller, overwrite=False)
5425 A B C
5426 0 0.0 NaN NaN
5427 1 0.0 3.0 -10.0
5428 2 NaN 3.0 1.0
5430 Demonstrating the preference of the passed in dataframe.
5432 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
5433 >>> df2.combine(df1, take_smaller)
5434 A B C
5435 0 0.0 NaN NaN
5436 1 0.0 3.0 NaN
5437 2 NaN 3.0 NaN
5439 >>> df2.combine(df1, take_smaller, overwrite=False)
5440 A B C
5441 0 0.0 NaN NaN
5442 1 0.0 3.0 1.0
5443 2 NaN 3.0 1.0
5444 """
5445 other_idxlen = len(other.index) # save for compare
5447 this, other = self.align(other, copy=False)
5448 new_index = this.index
5450 if other.empty and len(new_index) == len(self.index):
5451 return self.copy()
5453 if self.empty and len(other) == other_idxlen:
5454 return other.copy()
5456 # sorts if possible
5457 new_columns = this.columns.union(other.columns)
5458 do_fill = fill_value is not None
5459 result = {}
5460 for col in new_columns:
5461 series = this[col]
5462 otherSeries = other[col]
5464 this_dtype = series.dtype
5465 other_dtype = otherSeries.dtype
5467 this_mask = isna(series)
5468 other_mask = isna(otherSeries)
5470 # don't overwrite columns unnecessarily
5471 # DO propagate if this column is not in the intersection
5472 if not overwrite and other_mask.all():
5473 result[col] = this[col].copy()
5474 continue
5476 if do_fill:
5477 series = series.copy()
5478 otherSeries = otherSeries.copy()
5479 series[this_mask] = fill_value
5480 otherSeries[other_mask] = fill_value
5482 if col not in self.columns:
5483 # If self DataFrame does not have col in other DataFrame,
5484 # try to promote series, which is all NaN, as other_dtype.
5485 new_dtype = other_dtype
5486 try:
5487 series = series.astype(new_dtype, copy=False)
5488 except ValueError:
5489 # e.g. new_dtype is integer types
5490 pass
5491 else:
5492 # if we have different dtypes, possibly promote
5493 new_dtype = find_common_type([this_dtype, other_dtype])
5494 if not is_dtype_equal(this_dtype, new_dtype):
5495 series = series.astype(new_dtype)
5496 if not is_dtype_equal(other_dtype, new_dtype):
5497 otherSeries = otherSeries.astype(new_dtype)
5499 arr = func(series, otherSeries)
5500 arr = maybe_downcast_to_dtype(arr, this_dtype)
5502 result[col] = arr
5504 # convert_objects just in case
5505 return self._constructor(result, index=new_index, columns=new_columns)
5507 def combine_first(self, other: "DataFrame") -> "DataFrame":
5508 """
5509 Update null elements with value in the same location in `other`.
5511 Combine two DataFrame objects by filling null values in one DataFrame
5512 with non-null values from other DataFrame. The row and column indexes
5513 of the resulting DataFrame will be the union of the two.
5515 Parameters
5516 ----------
5517 other : DataFrame
5518 Provided DataFrame to use to fill null values.
5520 Returns
5521 -------
5522 DataFrame
5524 See Also
5525 --------
5526 DataFrame.combine : Perform series-wise operation on two DataFrames
5527 using a given function.
5529 Examples
5530 --------
5532 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
5533 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
5534 >>> df1.combine_first(df2)
5535 A B
5536 0 1.0 3.0
5537 1 0.0 4.0
5539 Null values still persist if the location of that null value
5540 does not exist in `other`
5542 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
5543 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
5544 >>> df1.combine_first(df2)
5545 A B C
5546 0 NaN 4.0 NaN
5547 1 0.0 3.0 1.0
5548 2 NaN 3.0 1.0
5549 """
5550 import pandas.core.computation.expressions as expressions
5552 def extract_values(arr):
5553 # Does two things:
5554 # 1. maybe gets the values from the Series / Index
5555 # 2. convert datelike to i8
5556 if isinstance(arr, (ABCIndexClass, ABCSeries)):
5557 arr = arr._values
5559 if needs_i8_conversion(arr):
5560 if is_extension_array_dtype(arr.dtype):
5561 arr = arr.asi8
5562 else:
5563 arr = arr.view("i8")
5564 return arr
5566 def combiner(x, y):
5567 mask = isna(x)
5568 if isinstance(mask, (ABCIndexClass, ABCSeries)):
5569 mask = mask._values
5571 x_values = extract_values(x)
5572 y_values = extract_values(y)
5574 # If the column y in other DataFrame is not in first DataFrame,
5575 # just return y_values.
5576 if y.name not in self.columns:
5577 return y_values
5579 return expressions.where(mask, y_values, x_values)
5581 return self.combine(other, combiner, overwrite=False)
5583 def update(
5584 self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
5585 ) -> None:
5586 """
5587 Modify in place using non-NA values from another DataFrame.
5589 Aligns on indices. There is no return value.
5591 Parameters
5592 ----------
5593 other : DataFrame, or object coercible into a DataFrame
5594 Should have at least one matching index/column label
5595 with the original DataFrame. If a Series is passed,
5596 its name attribute must be set, and that will be
5597 used as the column name to align with the original DataFrame.
5598 join : {'left'}, default 'left'
5599 Only left join is implemented, keeping the index and columns of the
5600 original object.
5601 overwrite : bool, default True
5602 How to handle non-NA values for overlapping keys:
5604 * True: overwrite original DataFrame's values
5605 with values from `other`.
5606 * False: only update values that are NA in
5607 the original DataFrame.
5609 filter_func : callable(1d-array) -> bool 1d-array, optional
5610 Can choose to replace values other than NA. Return True for values
5611 that should be updated.
5612 errors : {'raise', 'ignore'}, default 'ignore'
5613 If 'raise', will raise a ValueError if the DataFrame and `other`
5614 both contain non-NA data in the same place.
5616 .. versionchanged:: 0.24.0
5617 Changed from `raise_conflict=False|True`
5618 to `errors='ignore'|'raise'`.
5620 Returns
5621 -------
5622 None : method directly changes calling object
5624 Raises
5625 ------
5626 ValueError
5627 * When `errors='raise'` and there's overlapping non-NA data.
5628 * When `errors` is not either `'ignore'` or `'raise'`
5629 NotImplementedError
5630 * If `join != 'left'`
5632 See Also
5633 --------
5634 dict.update : Similar method for dictionaries.
5635 DataFrame.merge : For column(s)-on-columns(s) operations.
5637 Examples
5638 --------
5639 >>> df = pd.DataFrame({'A': [1, 2, 3],
5640 ... 'B': [400, 500, 600]})
5641 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
5642 ... 'C': [7, 8, 9]})
5643 >>> df.update(new_df)
5644 >>> df
5645 A B
5646 0 1 4
5647 1 2 5
5648 2 3 6
5650 The DataFrame's length does not increase as a result of the update,
5651 only values at matching index/column labels are updated.
5653 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5654 ... 'B': ['x', 'y', 'z']})
5655 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
5656 >>> df.update(new_df)
5657 >>> df
5658 A B
5659 0 a d
5660 1 b e
5661 2 c f
5663 For Series, it's name attribute must be set.
5665 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5666 ... 'B': ['x', 'y', 'z']})
5667 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
5668 >>> df.update(new_column)
5669 >>> df
5670 A B
5671 0 a d
5672 1 b y
5673 2 c e
5674 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5675 ... 'B': ['x', 'y', 'z']})
5676 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
5677 >>> df.update(new_df)
5678 >>> df
5679 A B
5680 0 a x
5681 1 b d
5682 2 c e
5684 If `other` contains NaNs the corresponding values are not updated
5685 in the original dataframe.
5687 >>> df = pd.DataFrame({'A': [1, 2, 3],
5688 ... 'B': [400, 500, 600]})
5689 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
5690 >>> df.update(new_df)
5691 >>> df
5692 A B
5693 0 1 4.0
5694 1 2 500.0
5695 2 3 6.0
5696 """
5697 import pandas.core.computation.expressions as expressions
5699 # TODO: Support other joins
5700 if join != "left": # pragma: no cover
5701 raise NotImplementedError("Only left join is supported")
5702 if errors not in ["ignore", "raise"]:
5703 raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
5705 if not isinstance(other, DataFrame):
5706 other = DataFrame(other)
5708 other = other.reindex_like(self)
5710 for col in self.columns:
5711 this = self[col]._values
5712 that = other[col]._values
5713 if filter_func is not None:
5714 with np.errstate(all="ignore"):
5715 mask = ~filter_func(this) | isna(that)
5716 else:
5717 if errors == "raise":
5718 mask_this = notna(that)
5719 mask_that = notna(this)
5720 if any(mask_this & mask_that):
5721 raise ValueError("Data overlaps.")
5723 if overwrite:
5724 mask = isna(that)
5725 else:
5726 mask = notna(this)
5728 # don't overwrite columns unnecessarily
5729 if mask.all():
5730 continue
5732 self[col] = expressions.where(mask, this, that)
5734 # ----------------------------------------------------------------------
5735 # Data reshaping
5736 @Appender(
5737 """
5738Examples
5739--------
5740>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
5741... 'Parrot', 'Parrot'],
5742... 'Max Speed': [380., 370., 24., 26.]})
5743>>> df
5744 Animal Max Speed
57450 Falcon 380.0
57461 Falcon 370.0
57472 Parrot 24.0
57483 Parrot 26.0
5749>>> df.groupby(['Animal']).mean()
5750 Max Speed
5751Animal
5752Falcon 375.0
5753Parrot 25.0
5755**Hierarchical Indexes**
5757We can groupby different levels of a hierarchical index
5758using the `level` parameter:
5760>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
5761... ['Captive', 'Wild', 'Captive', 'Wild']]
5762>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
5763>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
5764... index=index)
5765>>> df
5766 Max Speed
5767Animal Type
5768Falcon Captive 390.0
5769 Wild 350.0
5770Parrot Captive 30.0
5771 Wild 20.0
5772>>> df.groupby(level=0).mean()
5773 Max Speed
5774Animal
5775Falcon 370.0
5776Parrot 25.0
5777>>> df.groupby(level="Type").mean()
5778 Max Speed
5779Type
5780Captive 210.0
5781Wild 185.0
5782"""
5783 )
5784 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
5785 def groupby(
5786 self,
5787 by=None,
5788 axis=0,
5789 level=None,
5790 as_index: bool = True,
5791 sort: bool = True,
5792 group_keys: bool = True,
5793 squeeze: bool = False,
5794 observed: bool = False,
5795 ) -> "groupby_generic.DataFrameGroupBy":
5797 if level is None and by is None:
5798 raise TypeError("You have to supply one of 'by' and 'level'")
5799 axis = self._get_axis_number(axis)
5801 return groupby_generic.DataFrameGroupBy(
5802 obj=self,
5803 keys=by,
5804 axis=axis,
5805 level=level,
5806 as_index=as_index,
5807 sort=sort,
5808 group_keys=group_keys,
5809 squeeze=squeeze,
5810 observed=observed,
5811 )
5813 _shared_docs[
5814 "pivot"
5815 ] = """
5816 Return reshaped DataFrame organized by given index / column values.
5818 Reshape data (produce a "pivot" table) based on column values. Uses
5819 unique values from specified `index` / `columns` to form axes of the
5820 resulting DataFrame. This function does not support data
5821 aggregation, multiple values will result in a MultiIndex in the
5822 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
5824 Parameters
5825 ----------%s
5826 index : str or object, optional
5827 Column to use to make new frame's index. If None, uses
5828 existing index.
5829 columns : str or object
5830 Column to use to make new frame's columns.
5831 values : str, object or a list of the previous, optional
5832 Column(s) to use for populating new frame's values. If not
5833 specified, all remaining columns will be used and the result will
5834 have hierarchically indexed columns.
5836 .. versionchanged:: 0.23.0
5837 Also accept list of column names.
5839 Returns
5840 -------
5841 DataFrame
5842 Returns reshaped DataFrame.
5844 Raises
5845 ------
5846 ValueError:
5847 When there are any `index`, `columns` combinations with multiple
5848 values. `DataFrame.pivot_table` when you need to aggregate.
5850 See Also
5851 --------
5852 DataFrame.pivot_table : Generalization of pivot that can handle
5853 duplicate values for one index/column pair.
5854 DataFrame.unstack : Pivot based on the index values instead of a
5855 column.
5857 Notes
5858 -----
5859 For finer-tuned control, see hierarchical indexing documentation along
5860 with the related stack/unstack methods.
5862 Examples
5863 --------
5864 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
5865 ... 'two'],
5866 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
5867 ... 'baz': [1, 2, 3, 4, 5, 6],
5868 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
5869 >>> df
5870 foo bar baz zoo
5871 0 one A 1 x
5872 1 one B 2 y
5873 2 one C 3 z
5874 3 two A 4 q
5875 4 two B 5 w
5876 5 two C 6 t
5878 >>> df.pivot(index='foo', columns='bar', values='baz')
5879 bar A B C
5880 foo
5881 one 1 2 3
5882 two 4 5 6
5884 >>> df.pivot(index='foo', columns='bar')['baz']
5885 bar A B C
5886 foo
5887 one 1 2 3
5888 two 4 5 6
5890 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
5891 baz zoo
5892 bar A B C A B C
5893 foo
5894 one 1 2 3 x y z
5895 two 4 5 6 q w t
5897 A ValueError is raised if there are any duplicates.
5899 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
5900 ... "bar": ['A', 'A', 'B', 'C'],
5901 ... "baz": [1, 2, 3, 4]})
5902 >>> df
5903 foo bar baz
5904 0 one A 1
5905 1 one A 2
5906 2 two B 3
5907 3 two C 4
5909 Notice that the first two rows are the same for our `index`
5910 and `columns` arguments.
5912 >>> df.pivot(index='foo', columns='bar', values='baz')
5913 Traceback (most recent call last):
5914 ...
5915 ValueError: Index contains duplicate entries, cannot reshape
5916 """
5918 @Substitution("")
5919 @Appender(_shared_docs["pivot"])
5920 def pivot(self, index=None, columns=None, values=None) -> "DataFrame":
5921 from pandas.core.reshape.pivot import pivot
5923 return pivot(self, index=index, columns=columns, values=values)
5925 _shared_docs[
5926 "pivot_table"
5927 ] = """
5928 Create a spreadsheet-style pivot table as a DataFrame.
5930 The levels in the pivot table will be stored in MultiIndex objects
5931 (hierarchical indexes) on the index and columns of the result DataFrame.
5933 Parameters
5934 ----------%s
5935 values : column to aggregate, optional
5936 index : column, Grouper, array, or list of the previous
5937 If an array is passed, it must be the same length as the data. The
5938 list can contain any of the other types (except list).
5939 Keys to group by on the pivot table index. If an array is passed,
5940 it is being used as the same manner as column values.
5941 columns : column, Grouper, array, or list of the previous
5942 If an array is passed, it must be the same length as the data. The
5943 list can contain any of the other types (except list).
5944 Keys to group by on the pivot table column. If an array is passed,
5945 it is being used as the same manner as column values.
5946 aggfunc : function, list of functions, dict, default numpy.mean
5947 If list of functions passed, the resulting pivot table will have
5948 hierarchical columns whose top level are the function names
5949 (inferred from the function objects themselves)
5950 If dict is passed, the key is column to aggregate and value
5951 is function or list of functions.
5952 fill_value : scalar, default None
5953 Value to replace missing values with.
5954 margins : bool, default False
5955 Add all row / columns (e.g. for subtotal / grand totals).
5956 dropna : bool, default True
5957 Do not include columns whose entries are all NaN.
5958 margins_name : str, default 'All'
5959 Name of the row / column that will contain the totals
5960 when margins is True.
5961 observed : bool, default False
5962 This only applies if any of the groupers are Categoricals.
5963 If True: only show observed values for categorical groupers.
5964 If False: show all values for categorical groupers.
5966 .. versionchanged:: 0.25.0
5968 Returns
5969 -------
5970 DataFrame
5971 An Excel style pivot table.
5973 See Also
5974 --------
5975 DataFrame.pivot : Pivot without aggregation that can handle
5976 non-numeric data.
5978 Examples
5979 --------
5980 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
5981 ... "bar", "bar", "bar", "bar"],
5982 ... "B": ["one", "one", "one", "two", "two",
5983 ... "one", "one", "two", "two"],
5984 ... "C": ["small", "large", "large", "small",
5985 ... "small", "large", "small", "small",
5986 ... "large"],
5987 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
5988 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
5989 >>> df
5990 A B C D E
5991 0 foo one small 1 2
5992 1 foo one large 2 4
5993 2 foo one large 2 5
5994 3 foo two small 3 5
5995 4 foo two small 3 6
5996 5 bar one large 4 6
5997 6 bar one small 5 8
5998 7 bar two small 6 9
5999 8 bar two large 7 9
6001 This first example aggregates values by taking the sum.
6003 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
6004 ... columns=['C'], aggfunc=np.sum)
6005 >>> table
6006 C large small
6007 A B
6008 bar one 4.0 5.0
6009 two 7.0 6.0
6010 foo one 4.0 1.0
6011 two NaN 6.0
6013 We can also fill missing values using the `fill_value` parameter.
6015 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
6016 ... columns=['C'], aggfunc=np.sum, fill_value=0)
6017 >>> table
6018 C large small
6019 A B
6020 bar one 4 5
6021 two 7 6
6022 foo one 4 1
6023 two 0 6
6025 The next example aggregates by taking the mean across multiple columns.
6027 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
6028 ... aggfunc={'D': np.mean,
6029 ... 'E': np.mean})
6030 >>> table
6031 D E
6032 A C
6033 bar large 5.500000 7.500000
6034 small 5.500000 8.500000
6035 foo large 2.000000 4.500000
6036 small 2.333333 4.333333
6038 We can also calculate multiple types of aggregations for any given
6039 value column.
6041 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
6042 ... aggfunc={'D': np.mean,
6043 ... 'E': [min, max, np.mean]})
6044 >>> table
6045 D E
6046 mean max mean min
6047 A C
6048 bar large 5.500000 9.0 7.500000 6.0
6049 small 5.500000 9.0 8.500000 8.0
6050 foo large 2.000000 5.0 4.500000 4.0
6051 small 2.333333 6.0 4.333333 2.0
6052 """
6054 @Substitution("")
6055 @Appender(_shared_docs["pivot_table"])
6056 def pivot_table(
6057 self,
6058 values=None,
6059 index=None,
6060 columns=None,
6061 aggfunc="mean",
6062 fill_value=None,
6063 margins=False,
6064 dropna=True,
6065 margins_name="All",
6066 observed=False,
6067 ) -> "DataFrame":
6068 from pandas.core.reshape.pivot import pivot_table
6070 return pivot_table(
6071 self,
6072 values=values,
6073 index=index,
6074 columns=columns,
6075 aggfunc=aggfunc,
6076 fill_value=fill_value,
6077 margins=margins,
6078 dropna=dropna,
6079 margins_name=margins_name,
6080 observed=observed,
6081 )
6083 def stack(self, level=-1, dropna=True):
6084 """
6085 Stack the prescribed level(s) from columns to index.
6087 Return a reshaped DataFrame or Series having a multi-level
6088 index with one or more new inner-most levels compared to the current
6089 DataFrame. The new inner-most levels are created by pivoting the
6090 columns of the current dataframe:
6092 - if the columns have a single level, the output is a Series;
6093 - if the columns have multiple levels, the new index
6094 level(s) is (are) taken from the prescribed level(s) and
6095 the output is a DataFrame.
6097 The new index levels are sorted.
6099 Parameters
6100 ----------
6101 level : int, str, list, default -1
6102 Level(s) to stack from the column axis onto the index
6103 axis, defined as one index or label, or a list of indices
6104 or labels.
6105 dropna : bool, default True
6106 Whether to drop rows in the resulting Frame/Series with
6107 missing values. Stacking a column level onto the index
6108 axis can create combinations of index and column values
6109 that are missing from the original dataframe. See Examples
6110 section.
6112 Returns
6113 -------
6114 DataFrame or Series
6115 Stacked dataframe or series.
6117 See Also
6118 --------
6119 DataFrame.unstack : Unstack prescribed level(s) from index axis
6120 onto column axis.
6121 DataFrame.pivot : Reshape dataframe from long format to wide
6122 format.
6123 DataFrame.pivot_table : Create a spreadsheet-style pivot table
6124 as a DataFrame.
6126 Notes
6127 -----
6128 The function is named by analogy with a collection of books
6129 being reorganized from being side by side on a horizontal
6130 position (the columns of the dataframe) to being stacked
6131 vertically on top of each other (in the index of the
6132 dataframe).
6134 Examples
6135 --------
6136 **Single level columns**
6138 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
6139 ... index=['cat', 'dog'],
6140 ... columns=['weight', 'height'])
6142 Stacking a dataframe with a single level column axis returns a Series:
6144 >>> df_single_level_cols
6145 weight height
6146 cat 0 1
6147 dog 2 3
6148 >>> df_single_level_cols.stack()
6149 cat weight 0
6150 height 1
6151 dog weight 2
6152 height 3
6153 dtype: int64
6155 **Multi level columns: simple case**
6157 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
6158 ... ('weight', 'pounds')])
6159 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
6160 ... index=['cat', 'dog'],
6161 ... columns=multicol1)
6163 Stacking a dataframe with a multi-level column axis:
6165 >>> df_multi_level_cols1
6166 weight
6167 kg pounds
6168 cat 1 2
6169 dog 2 4
6170 >>> df_multi_level_cols1.stack()
6171 weight
6172 cat kg 1
6173 pounds 2
6174 dog kg 2
6175 pounds 4
6177 **Missing values**
6179 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
6180 ... ('height', 'm')])
6181 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
6182 ... index=['cat', 'dog'],
6183 ... columns=multicol2)
6185 It is common to have missing values when stacking a dataframe
6186 with multi-level columns, as the stacked dataframe typically
6187 has more values than the original dataframe. Missing values
6188 are filled with NaNs:
6190 >>> df_multi_level_cols2
6191 weight height
6192 kg m
6193 cat 1.0 2.0
6194 dog 3.0 4.0
6195 >>> df_multi_level_cols2.stack()
6196 height weight
6197 cat kg NaN 1.0
6198 m 2.0 NaN
6199 dog kg NaN 3.0
6200 m 4.0 NaN
6202 **Prescribing the level(s) to be stacked**
6204 The first parameter controls which level or levels are stacked:
6206 >>> df_multi_level_cols2.stack(0)
6207 kg m
6208 cat height NaN 2.0
6209 weight 1.0 NaN
6210 dog height NaN 4.0
6211 weight 3.0 NaN
6212 >>> df_multi_level_cols2.stack([0, 1])
6213 cat height m 2.0
6214 weight kg 1.0
6215 dog height m 4.0
6216 weight kg 3.0
6217 dtype: float64
6219 **Dropping missing values**
6221 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
6222 ... index=['cat', 'dog'],
6223 ... columns=multicol2)
6225 Note that rows where all values are missing are dropped by
6226 default but this behaviour can be controlled via the dropna
6227 keyword parameter:
6229 >>> df_multi_level_cols3
6230 weight height
6231 kg m
6232 cat NaN 1.0
6233 dog 2.0 3.0
6234 >>> df_multi_level_cols3.stack(dropna=False)
6235 height weight
6236 cat kg NaN NaN
6237 m 1.0 NaN
6238 dog kg NaN 2.0
6239 m 3.0 NaN
6240 >>> df_multi_level_cols3.stack(dropna=True)
6241 height weight
6242 cat m 1.0 NaN
6243 dog kg NaN 2.0
6244 m 3.0 NaN
6245 """
6246 from pandas.core.reshape.reshape import stack, stack_multiple
6248 if isinstance(level, (tuple, list)):
6249 return stack_multiple(self, level, dropna=dropna)
6250 else:
6251 return stack(self, level, dropna=dropna)
6253 def explode(self, column: Union[str, Tuple]) -> "DataFrame":
6254 """
6255 Transform each element of a list-like to a row, replicating index values.
6257 .. versionadded:: 0.25.0
6259 Parameters
6260 ----------
6261 column : str or tuple
6262 Column to explode.
6264 Returns
6265 -------
6266 DataFrame
6267 Exploded lists to rows of the subset columns;
6268 index will be duplicated for these rows.
6270 Raises
6271 ------
6272 ValueError :
6273 if columns of the frame are not unique.
6275 See Also
6276 --------
6277 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
6278 index labels.
6279 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
6280 Series.explode : Explode a DataFrame from list-like columns to long format.
6282 Notes
6283 -----
6284 This routine will explode list-likes including lists, tuples,
6285 Series, and np.ndarray. The result dtype of the subset rows will
6286 be object. Scalars will be returned unchanged. Empty list-likes will
6287 result in a np.nan for that row.
6289 Examples
6290 --------
6291 >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
6292 >>> df
6293 A B
6294 0 [1, 2, 3] 1
6295 1 foo 1
6296 2 [] 1
6297 3 [3, 4] 1
6299 >>> df.explode('A')
6300 A B
6301 0 1 1
6302 0 2 1
6303 0 3 1
6304 1 foo 1
6305 2 NaN 1
6306 3 3 1
6307 3 4 1
6308 """
6310 if not (is_scalar(column) or isinstance(column, tuple)):
6311 raise ValueError("column must be a scalar")
6312 if not self.columns.is_unique:
6313 raise ValueError("columns must be unique")
6315 df = self.reset_index(drop=True)
6316 # TODO: use overload to refine return type of reset_index
6317 assert df is not None # needed for mypy
6318 result = df[column].explode()
6319 result = df.drop([column], axis=1).join(result)
6320 result.index = self.index.take(result.index)
6321 result = result.reindex(columns=self.columns, copy=False)
6323 return result
6325 def unstack(self, level=-1, fill_value=None):
6326 """
6327 Pivot a level of the (necessarily hierarchical) index labels.
6329 Returns a DataFrame having a new level of column labels whose inner-most level
6330 consists of the pivoted index labels.
6332 If the index is not a MultiIndex, the output will be a Series
6333 (the analogue of stack when the columns are not a MultiIndex).
6335 The level involved will automatically get sorted.
6337 Parameters
6338 ----------
6339 level : int, str, or list of these, default -1 (last level)
6340 Level(s) of index to unstack, can pass level name.
6341 fill_value : int, str or dict
6342 Replace NaN with this value if the unstack produces missing values.
6344 Returns
6345 -------
6346 Series or DataFrame
6348 See Also
6349 --------
6350 DataFrame.pivot : Pivot a table based on column values.
6351 DataFrame.stack : Pivot a level of the column labels (inverse operation
6352 from `unstack`).
6354 Examples
6355 --------
6356 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
6357 ... ('two', 'a'), ('two', 'b')])
6358 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
6359 >>> s
6360 one a 1.0
6361 b 2.0
6362 two a 3.0
6363 b 4.0
6364 dtype: float64
6366 >>> s.unstack(level=-1)
6367 a b
6368 one 1.0 2.0
6369 two 3.0 4.0
6371 >>> s.unstack(level=0)
6372 one two
6373 a 1.0 3.0
6374 b 2.0 4.0
6376 >>> df = s.unstack(level=0)
6377 >>> df.unstack()
6378 one a 1.0
6379 b 2.0
6380 two a 3.0
6381 b 4.0
6382 dtype: float64
6383 """
6384 from pandas.core.reshape.reshape import unstack
6386 return unstack(self, level, fill_value)
6388 _shared_docs[
6389 "melt"
6390 ] = """
6391 Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
6393 This function is useful to massage a DataFrame into a format where one
6394 or more columns are identifier variables (`id_vars`), while all other
6395 columns, considered measured variables (`value_vars`), are "unpivoted" to
6396 the row axis, leaving just two non-identifier columns, 'variable' and
6397 'value'.
6398 %(versionadded)s
6399 Parameters
6400 ----------
6401 id_vars : tuple, list, or ndarray, optional
6402 Column(s) to use as identifier variables.
6403 value_vars : tuple, list, or ndarray, optional
6404 Column(s) to unpivot. If not specified, uses all columns that
6405 are not set as `id_vars`.
6406 var_name : scalar
6407 Name to use for the 'variable' column. If None it uses
6408 ``frame.columns.name`` or 'variable'.
6409 value_name : scalar, default 'value'
6410 Name to use for the 'value' column.
6411 col_level : int or str, optional
6412 If columns are a MultiIndex then use this level to melt.
6414 Returns
6415 -------
6416 DataFrame
6417 Unpivoted DataFrame.
6419 See Also
6420 --------
6421 %(other)s
6422 pivot_table
6423 DataFrame.pivot
6424 Series.explode
6426 Examples
6427 --------
6428 >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
6429 ... 'B': {0: 1, 1: 3, 2: 5},
6430 ... 'C': {0: 2, 1: 4, 2: 6}})
6431 >>> df
6432 A B C
6433 0 a 1 2
6434 1 b 3 4
6435 2 c 5 6
6437 >>> %(caller)sid_vars=['A'], value_vars=['B'])
6438 A variable value
6439 0 a B 1
6440 1 b B 3
6441 2 c B 5
6443 >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
6444 A variable value
6445 0 a B 1
6446 1 b B 3
6447 2 c B 5
6448 3 a C 2
6449 4 b C 4
6450 5 c C 6
6452 The names of 'variable' and 'value' columns can be customized:
6454 >>> %(caller)sid_vars=['A'], value_vars=['B'],
6455 ... var_name='myVarname', value_name='myValname')
6456 A myVarname myValname
6457 0 a B 1
6458 1 b B 3
6459 2 c B 5
6461 If you have multi-index columns:
6463 >>> df.columns = [list('ABC'), list('DEF')]
6464 >>> df
6465 A B C
6466 D E F
6467 0 a 1 2
6468 1 b 3 4
6469 2 c 5 6
6471 >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
6472 A variable value
6473 0 a B 1
6474 1 b B 3
6475 2 c B 5
6477 >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
6478 (A, D) variable_0 variable_1 value
6479 0 a B E 1
6480 1 b B E 3
6481 2 c B E 5
6482 """
6484 @Appender(
6485 _shared_docs["melt"]
6486 % dict(
6487 caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
6488 )
6489 )
6490 def melt(
6491 self,
6492 id_vars=None,
6493 value_vars=None,
6494 var_name=None,
6495 value_name="value",
6496 col_level=None,
6497 ) -> "DataFrame":
6498 from pandas.core.reshape.melt import melt
6500 return melt(
6501 self,
6502 id_vars=id_vars,
6503 value_vars=value_vars,
6504 var_name=var_name,
6505 value_name=value_name,
6506 col_level=col_level,
6507 )
6509 # ----------------------------------------------------------------------
6510 # Time series-related
6512 def diff(self, periods=1, axis=0) -> "DataFrame":
6513 """
6514 First discrete difference of element.
6516 Calculates the difference of a DataFrame element compared with another
6517 element in the DataFrame (default is the element in the same column
6518 of the previous row).
6520 Parameters
6521 ----------
6522 periods : int, default 1
6523 Periods to shift for calculating difference, accepts negative
6524 values.
6525 axis : {0 or 'index', 1 or 'columns'}, default 0
6526 Take difference over rows (0) or columns (1).
6528 Returns
6529 -------
6530 DataFrame
6532 See Also
6533 --------
6534 Series.diff: First discrete difference for a Series.
6535 DataFrame.pct_change: Percent change over given number of periods.
6536 DataFrame.shift: Shift index by desired number of periods with an
6537 optional time freq.
6539 Notes
6540 -----
6541 For boolean dtypes, this uses :meth:`operator.xor` rather than
6542 :meth:`operator.sub`.
6544 Examples
6545 --------
6546 Difference with previous row
6548 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
6549 ... 'b': [1, 1, 2, 3, 5, 8],
6550 ... 'c': [1, 4, 9, 16, 25, 36]})
6551 >>> df
6552 a b c
6553 0 1 1 1
6554 1 2 1 4
6555 2 3 2 9
6556 3 4 3 16
6557 4 5 5 25
6558 5 6 8 36
6560 >>> df.diff()
6561 a b c
6562 0 NaN NaN NaN
6563 1 1.0 0.0 3.0
6564 2 1.0 1.0 5.0
6565 3 1.0 1.0 7.0
6566 4 1.0 2.0 9.0
6567 5 1.0 3.0 11.0
6569 Difference with previous column
6571 >>> df.diff(axis=1)
6572 a b c
6573 0 NaN 0.0 0.0
6574 1 NaN -1.0 3.0
6575 2 NaN -1.0 7.0
6576 3 NaN -1.0 13.0
6577 4 NaN 0.0 20.0
6578 5 NaN 2.0 28.0
6580 Difference with 3rd previous row
6582 >>> df.diff(periods=3)
6583 a b c
6584 0 NaN NaN NaN
6585 1 NaN NaN NaN
6586 2 NaN NaN NaN
6587 3 3.0 2.0 15.0
6588 4 3.0 4.0 21.0
6589 5 3.0 6.0 27.0
6591 Difference with following row
6593 >>> df.diff(periods=-1)
6594 a b c
6595 0 -1.0 0.0 -3.0
6596 1 -1.0 -1.0 -5.0
6597 2 -1.0 -1.0 -7.0
6598 3 -1.0 -2.0 -9.0
6599 4 -1.0 -3.0 -11.0
6600 5 NaN NaN NaN
6601 """
6602 bm_axis = self._get_block_manager_axis(axis)
6603 new_data = self._data.diff(n=periods, axis=bm_axis)
6604 return self._constructor(new_data)
6606 # ----------------------------------------------------------------------
6607 # Function application
6609 def _gotitem(
6610 self,
6611 key: Union[str, List[str]],
6612 ndim: int,
6613 subset: Optional[Union[Series, ABCDataFrame]] = None,
6614 ) -> Union[Series, ABCDataFrame]:
6615 """
6616 Sub-classes to define. Return a sliced object.
6618 Parameters
6619 ----------
6620 key : string / list of selections
6621 ndim : 1,2
6622 requested ndim of result
6623 subset : object, default None
6624 subset to act on
6625 """
6626 if subset is None:
6627 subset = self
6628 elif subset.ndim == 1: # is Series
6629 return subset
6631 # TODO: _shallow_copy(subset)?
6632 return subset[key]
6634 _agg_summary_and_see_also_doc = dedent(
6635 """
6636 The aggregation operations are always performed over an axis, either the
6637 index (default) or the column axis. This behavior is different from
6638 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
6639 `var`), where the default is to compute the aggregation of the flattened
6640 array, e.g., ``numpy.mean(arr_2d)`` as opposed to
6641 ``numpy.mean(arr_2d, axis=0)``.
6643 `agg` is an alias for `aggregate`. Use the alias.
6645 See Also
6646 --------
6647 DataFrame.apply : Perform any type of operations.
6648 DataFrame.transform : Perform transformation type operations.
6649 core.groupby.GroupBy : Perform operations over groups.
6650 core.resample.Resampler : Perform operations over resampled bins.
6651 core.window.Rolling : Perform operations over rolling window.
6652 core.window.Expanding : Perform operations over expanding window.
6653 core.window.EWM : Perform operation over exponential weighted
6654 window.
6655 """
6656 )
6658 _agg_examples_doc = dedent(
6659 """
6660 Examples
6661 --------
6662 >>> df = pd.DataFrame([[1, 2, 3],
6663 ... [4, 5, 6],
6664 ... [7, 8, 9],
6665 ... [np.nan, np.nan, np.nan]],
6666 ... columns=['A', 'B', 'C'])
6668 Aggregate these functions over the rows.
6670 >>> df.agg(['sum', 'min'])
6671 A B C
6672 sum 12.0 15.0 18.0
6673 min 1.0 2.0 3.0
6675 Different aggregations per column.
6677 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
6678 A B
6679 max NaN 8.0
6680 min 1.0 2.0
6681 sum 12.0 NaN
6683 Aggregate over the columns.
6685 >>> df.agg("mean", axis="columns")
6686 0 2.0
6687 1 5.0
6688 2 8.0
6689 3 NaN
6690 dtype: float64
6691 """
6692 )
6694 @Substitution(
6695 see_also=_agg_summary_and_see_also_doc,
6696 examples=_agg_examples_doc,
6697 versionadded="\n.. versionadded:: 0.20.0\n",
6698 **_shared_doc_kwargs,
6699 )
6700 @Appender(_shared_docs["aggregate"])
6701 def aggregate(self, func, axis=0, *args, **kwargs):
6702 axis = self._get_axis_number(axis)
6704 result = None
6705 try:
6706 result, how = self._aggregate(func, axis=axis, *args, **kwargs)
6707 except TypeError:
6708 pass
6709 if result is None:
6710 return self.apply(func, axis=axis, args=args, **kwargs)
6711 return result
6713 def _aggregate(self, arg, axis=0, *args, **kwargs):
6714 if axis == 1:
6715 # NDFrame.aggregate returns a tuple, and we need to transpose
6716 # only result
6717 result, how = self.T._aggregate(arg, *args, **kwargs)
6718 result = result.T if result is not None else result
6719 return result, how
6720 return super()._aggregate(arg, *args, **kwargs)
6722 agg = aggregate
6724 @Appender(_shared_docs["transform"] % _shared_doc_kwargs)
6725 def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame":
6726 axis = self._get_axis_number(axis)
6727 if axis == 1:
6728 return self.T.transform(func, *args, **kwargs).T
6729 return super().transform(func, *args, **kwargs)
6731 def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
6732 """
6733 Apply a function along an axis of the DataFrame.
6735 Objects passed to the function are Series objects whose index is
6736 either the DataFrame's index (``axis=0``) or the DataFrame's columns
6737 (``axis=1``). By default (``result_type=None``), the final return type
6738 is inferred from the return type of the applied function. Otherwise,
6739 it depends on the `result_type` argument.
6741 Parameters
6742 ----------
6743 func : function
6744 Function to apply to each column or row.
6745 axis : {0 or 'index', 1 or 'columns'}, default 0
6746 Axis along which the function is applied:
6748 * 0 or 'index': apply function to each column.
6749 * 1 or 'columns': apply function to each row.
6751 raw : bool, default False
6752 Determines if row or column is passed as a Series or ndarray object:
6754 * ``False`` : passes each row or column as a Series to the
6755 function.
6756 * ``True`` : the passed function will receive ndarray objects
6757 instead.
6758 If you are just applying a NumPy reduction function this will
6759 achieve much better performance.
6761 result_type : {'expand', 'reduce', 'broadcast', None}, default None
6762 These only act when ``axis=1`` (columns):
6764 * 'expand' : list-like results will be turned into columns.
6765 * 'reduce' : returns a Series if possible rather than expanding
6766 list-like results. This is the opposite of 'expand'.
6767 * 'broadcast' : results will be broadcast to the original shape
6768 of the DataFrame, the original index and columns will be
6769 retained.
6771 The default behaviour (None) depends on the return value of the
6772 applied function: list-like results will be returned as a Series
6773 of those. However if the apply function returns a Series these
6774 are expanded to columns.
6776 .. versionadded:: 0.23.0
6778 args : tuple
6779 Positional arguments to pass to `func` in addition to the
6780 array/series.
6781 **kwds
6782 Additional keyword arguments to pass as keywords arguments to
6783 `func`.
6785 Returns
6786 -------
6787 Series or DataFrame
6788 Result of applying ``func`` along the given axis of the
6789 DataFrame.
6791 See Also
6792 --------
6793 DataFrame.applymap: For elementwise operations.
6794 DataFrame.aggregate: Only perform aggregating type operations.
6795 DataFrame.transform: Only perform transforming type operations.
6797 Examples
6798 --------
6800 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
6801 >>> df
6802 A B
6803 0 4 9
6804 1 4 9
6805 2 4 9
6807 Using a numpy universal function (in this case the same as
6808 ``np.sqrt(df)``):
6810 >>> df.apply(np.sqrt)
6811 A B
6812 0 2.0 3.0
6813 1 2.0 3.0
6814 2 2.0 3.0
6816 Using a reducing function on either axis
6818 >>> df.apply(np.sum, axis=0)
6819 A 12
6820 B 27
6821 dtype: int64
6823 >>> df.apply(np.sum, axis=1)
6824 0 13
6825 1 13
6826 2 13
6827 dtype: int64
6829 Returning a list-like will result in a Series
6831 >>> df.apply(lambda x: [1, 2], axis=1)
6832 0 [1, 2]
6833 1 [1, 2]
6834 2 [1, 2]
6835 dtype: object
6837 Passing result_type='expand' will expand list-like results
6838 to columns of a Dataframe
6840 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
6841 0 1
6842 0 1 2
6843 1 1 2
6844 2 1 2
6846 Returning a Series inside the function is similar to passing
6847 ``result_type='expand'``. The resulting column names
6848 will be the Series index.
6850 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
6851 foo bar
6852 0 1 2
6853 1 1 2
6854 2 1 2
6856 Passing ``result_type='broadcast'`` will ensure the same shape
6857 result, whether list-like or scalar is returned by the function,
6858 and broadcast it along the axis. The resulting column names will
6859 be the originals.
6861 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
6862 A B
6863 0 1 2
6864 1 1 2
6865 2 1 2
6866 """
6867 from pandas.core.apply import frame_apply
6869 op = frame_apply(
6870 self,
6871 func=func,
6872 axis=axis,
6873 raw=raw,
6874 result_type=result_type,
6875 args=args,
6876 kwds=kwds,
6877 )
6878 return op.get_result()
6880 def applymap(self, func) -> "DataFrame":
6881 """
6882 Apply a function to a Dataframe elementwise.
6884 This method applies a function that accepts and returns a scalar
6885 to every element of a DataFrame.
6887 Parameters
6888 ----------
6889 func : callable
6890 Python function, returns a single value from a single value.
6892 Returns
6893 -------
6894 DataFrame
6895 Transformed DataFrame.
6897 See Also
6898 --------
6899 DataFrame.apply : Apply a function along input axis of DataFrame.
6901 Notes
6902 -----
6903 In the current implementation applymap calls `func` twice on the
6904 first column/row to decide whether it can take a fast or slow
6905 code path. This can lead to unexpected behavior if `func` has
6906 side-effects, as they will take effect twice for the first
6907 column/row.
6909 Examples
6910 --------
6911 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
6912 >>> df
6913 0 1
6914 0 1.000 2.120
6915 1 3.356 4.567
6917 >>> df.applymap(lambda x: len(str(x)))
6918 0 1
6919 0 3 4
6920 1 5 5
6922 Note that a vectorized version of `func` often exists, which will
6923 be much faster. You could square each number elementwise.
6925 >>> df.applymap(lambda x: x**2)
6926 0 1
6927 0 1.000000 4.494400
6928 1 11.262736 20.857489
6930 But it's better to avoid applymap in that case.
6932 >>> df ** 2
6933 0 1
6934 0 1.000000 4.494400
6935 1 11.262736 20.857489
6936 """
6938 # if we have a dtype == 'M8[ns]', provide boxed values
6939 def infer(x):
6940 if x.empty:
6941 return lib.map_infer(x, func)
6942 return lib.map_infer(x.astype(object).values, func)
6944 return self.apply(infer)
6946 # ----------------------------------------------------------------------
6947 # Merging / joining methods
6949 def append(
6950 self, other, ignore_index=False, verify_integrity=False, sort=False
6951 ) -> "DataFrame":
6952 """
6953 Append rows of `other` to the end of caller, returning a new object.
6955 Columns in `other` that are not in the caller are added as new columns.
6957 Parameters
6958 ----------
6959 other : DataFrame or Series/dict-like object, or list of these
6960 The data to append.
6961 ignore_index : bool, default False
6962 If True, do not use the index labels.
6963 verify_integrity : bool, default False
6964 If True, raise ValueError on creating index with duplicates.
6965 sort : bool, default False
6966 Sort columns if the columns of `self` and `other` are not aligned.
6968 .. versionadded:: 0.23.0
6969 .. versionchanged:: 1.0.0
6971 Changed to not sort by default.
6973 Returns
6974 -------
6975 DataFrame
6977 See Also
6978 --------
6979 concat : General function to concatenate DataFrame or Series objects.
6981 Notes
6982 -----
6983 If a list of dict/series is passed and the keys are all contained in
6984 the DataFrame's index, the order of the columns in the resulting
6985 DataFrame will be unchanged.
6987 Iteratively appending rows to a DataFrame can be more computationally
6988 intensive than a single concatenate. A better solution is to append
6989 those rows to a list and then concatenate the list with the original
6990 DataFrame all at once.
6992 Examples
6993 --------
6995 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
6996 >>> df
6997 A B
6998 0 1 2
6999 1 3 4
7000 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
7001 >>> df.append(df2)
7002 A B
7003 0 1 2
7004 1 3 4
7005 0 5 6
7006 1 7 8
7008 With `ignore_index` set to True:
7010 >>> df.append(df2, ignore_index=True)
7011 A B
7012 0 1 2
7013 1 3 4
7014 2 5 6
7015 3 7 8
7017 The following, while not recommended methods for generating DataFrames,
7018 show two ways to generate a DataFrame from multiple data sources.
7020 Less efficient:
7022 >>> df = pd.DataFrame(columns=['A'])
7023 >>> for i in range(5):
7024 ... df = df.append({'A': i}, ignore_index=True)
7025 >>> df
7026 A
7027 0 0
7028 1 1
7029 2 2
7030 3 3
7031 4 4
7033 More efficient:
7035 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
7036 ... ignore_index=True)
7037 A
7038 0 0
7039 1 1
7040 2 2
7041 3 3
7042 4 4
7043 """
7044 if isinstance(other, (Series, dict)):
7045 if isinstance(other, dict):
7046 other = Series(other)
7047 if other.name is None and not ignore_index:
7048 raise TypeError(
7049 "Can only append a Series if ignore_index=True "
7050 "or if the Series has a name"
7051 )
7053 index = Index([other.name], name=self.index.name)
7054 idx_diff = other.index.difference(self.columns)
7055 try:
7056 combined_columns = self.columns.append(idx_diff)
7057 except TypeError:
7058 combined_columns = self.columns.astype(object).append(idx_diff)
7059 other = (
7060 other.reindex(combined_columns, copy=False)
7061 .to_frame()
7062 .T.infer_objects()
7063 .rename_axis(index.names, copy=False)
7064 )
7065 if not self.columns.equals(combined_columns):
7066 self = self.reindex(columns=combined_columns)
7067 elif isinstance(other, list):
7068 if not other:
7069 pass
7070 elif not isinstance(other[0], DataFrame):
7071 other = DataFrame(other)
7072 if (self.columns.get_indexer(other.columns) >= 0).all():
7073 other = other.reindex(columns=self.columns)
7075 from pandas.core.reshape.concat import concat
7077 if isinstance(other, (list, tuple)):
7078 to_concat = [self, *other]
7079 else:
7080 to_concat = [self, other]
7081 return concat(
7082 to_concat,
7083 ignore_index=ignore_index,
7084 verify_integrity=verify_integrity,
7085 sort=sort,
7086 )
7088 def join(
7089 self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
7090 ) -> "DataFrame":
7091 """
7092 Join columns of another DataFrame.
7094 Join columns with `other` DataFrame either on index or on a key
7095 column. Efficiently join multiple DataFrame objects by index at once by
7096 passing a list.
7098 Parameters
7099 ----------
7100 other : DataFrame, Series, or list of DataFrame
7101 Index should be similar to one of the columns in this one. If a
7102 Series is passed, its name attribute must be set, and that will be
7103 used as the column name in the resulting joined DataFrame.
7104 on : str, list of str, or array-like, optional
7105 Column or index level name(s) in the caller to join on the index
7106 in `other`, otherwise joins index-on-index. If multiple
7107 values given, the `other` DataFrame must have a MultiIndex. Can
7108 pass an array as the join key if it is not already contained in
7109 the calling DataFrame. Like an Excel VLOOKUP operation.
7110 how : {'left', 'right', 'outer', 'inner'}, default 'left'
7111 How to handle the operation of the two objects.
7113 * left: use calling frame's index (or column if on is specified)
7114 * right: use `other`'s index.
7115 * outer: form union of calling frame's index (or column if on is
7116 specified) with `other`'s index, and sort it.
7117 lexicographically.
7118 * inner: form intersection of calling frame's index (or column if
7119 on is specified) with `other`'s index, preserving the order
7120 of the calling's one.
7121 lsuffix : str, default ''
7122 Suffix to use from left frame's overlapping columns.
7123 rsuffix : str, default ''
7124 Suffix to use from right frame's overlapping columns.
7125 sort : bool, default False
7126 Order result DataFrame lexicographically by the join key. If False,
7127 the order of the join key depends on the join type (how keyword).
7129 Returns
7130 -------
7131 DataFrame
7132 A dataframe containing columns from both the caller and `other`.
7134 See Also
7135 --------
7136 DataFrame.merge : For column(s)-on-columns(s) operations.
7138 Notes
7139 -----
7140 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
7141 passing a list of `DataFrame` objects.
7143 Support for specifying index levels as the `on` parameter was added
7144 in version 0.23.0.
7146 Examples
7147 --------
7148 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
7149 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
7151 >>> df
7152 key A
7153 0 K0 A0
7154 1 K1 A1
7155 2 K2 A2
7156 3 K3 A3
7157 4 K4 A4
7158 5 K5 A5
7160 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
7161 ... 'B': ['B0', 'B1', 'B2']})
7163 >>> other
7164 key B
7165 0 K0 B0
7166 1 K1 B1
7167 2 K2 B2
7169 Join DataFrames using their indexes.
7171 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
7172 key_caller A key_other B
7173 0 K0 A0 K0 B0
7174 1 K1 A1 K1 B1
7175 2 K2 A2 K2 B2
7176 3 K3 A3 NaN NaN
7177 4 K4 A4 NaN NaN
7178 5 K5 A5 NaN NaN
7180 If we want to join using the key columns, we need to set key to be
7181 the index in both `df` and `other`. The joined DataFrame will have
7182 key as its index.
7184 >>> df.set_index('key').join(other.set_index('key'))
7185 A B
7186 key
7187 K0 A0 B0
7188 K1 A1 B1
7189 K2 A2 B2
7190 K3 A3 NaN
7191 K4 A4 NaN
7192 K5 A5 NaN
7194 Another option to join using the key columns is to use the `on`
7195 parameter. DataFrame.join always uses `other`'s index but we can use
7196 any column in `df`. This method preserves the original DataFrame's
7197 index in the result.
7199 >>> df.join(other.set_index('key'), on='key')
7200 key A B
7201 0 K0 A0 B0
7202 1 K1 A1 B1
7203 2 K2 A2 B2
7204 3 K3 A3 NaN
7205 4 K4 A4 NaN
7206 5 K5 A5 NaN
7207 """
7208 return self._join_compat(
7209 other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
7210 )
7212 def _join_compat(
7213 self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
7214 ):
7215 from pandas.core.reshape.merge import merge
7216 from pandas.core.reshape.concat import concat
7218 if isinstance(other, Series):
7219 if other.name is None:
7220 raise ValueError("Other Series must have a name")
7221 other = DataFrame({other.name: other})
7223 if isinstance(other, DataFrame):
7224 return merge(
7225 self,
7226 other,
7227 left_on=on,
7228 how=how,
7229 left_index=on is None,
7230 right_index=True,
7231 suffixes=(lsuffix, rsuffix),
7232 sort=sort,
7233 )
7234 else:
7235 if on is not None:
7236 raise ValueError(
7237 "Joining multiple DataFrames only supported for joining on index"
7238 )
7240 frames = [self] + list(other)
7242 can_concat = all(df.index.is_unique for df in frames)
7244 # join indexes only using concat
7245 if can_concat:
7246 if how == "left":
7247 res = concat(
7248 frames, axis=1, join="outer", verify_integrity=True, sort=sort
7249 )
7250 return res.reindex(self.index, copy=False)
7251 else:
7252 return concat(
7253 frames, axis=1, join=how, verify_integrity=True, sort=sort
7254 )
7256 joined = frames[0]
7258 for frame in frames[1:]:
7259 joined = merge(
7260 joined, frame, how=how, left_index=True, right_index=True
7261 )
7263 return joined
7265 @Substitution("")
7266 @Appender(_merge_doc, indents=2)
7267 def merge(
7268 self,
7269 right,
7270 how="inner",
7271 on=None,
7272 left_on=None,
7273 right_on=None,
7274 left_index=False,
7275 right_index=False,
7276 sort=False,
7277 suffixes=("_x", "_y"),
7278 copy=True,
7279 indicator=False,
7280 validate=None,
7281 ) -> "DataFrame":
7282 from pandas.core.reshape.merge import merge
7284 return merge(
7285 self,
7286 right,
7287 how=how,
7288 on=on,
7289 left_on=left_on,
7290 right_on=right_on,
7291 left_index=left_index,
7292 right_index=right_index,
7293 sort=sort,
7294 suffixes=suffixes,
7295 copy=copy,
7296 indicator=indicator,
7297 validate=validate,
7298 )
7300 def round(self, decimals=0, *args, **kwargs) -> "DataFrame":
7301 """
7302 Round a DataFrame to a variable number of decimal places.
7304 Parameters
7305 ----------
7306 decimals : int, dict, Series
7307 Number of decimal places to round each column to. If an int is
7308 given, round each column to the same number of places.
7309 Otherwise dict and Series round to variable numbers of places.
7310 Column names should be in the keys if `decimals` is a
7311 dict-like, or in the index if `decimals` is a Series. Any
7312 columns not included in `decimals` will be left as is. Elements
7313 of `decimals` which are not columns of the input will be
7314 ignored.
7315 *args
7316 Additional keywords have no effect but might be accepted for
7317 compatibility with numpy.
7318 **kwargs
7319 Additional keywords have no effect but might be accepted for
7320 compatibility with numpy.
7322 Returns
7323 -------
7324 DataFrame
7325 A DataFrame with the affected columns rounded to the specified
7326 number of decimal places.
7328 See Also
7329 --------
7330 numpy.around : Round a numpy array to the given number of decimals.
7331 Series.round : Round a Series to the given number of decimals.
7333 Examples
7334 --------
7335 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
7336 ... columns=['dogs', 'cats'])
7337 >>> df
7338 dogs cats
7339 0 0.21 0.32
7340 1 0.01 0.67
7341 2 0.66 0.03
7342 3 0.21 0.18
7344 By providing an integer each column is rounded to the same number
7345 of decimal places
7347 >>> df.round(1)
7348 dogs cats
7349 0 0.2 0.3
7350 1 0.0 0.7
7351 2 0.7 0.0
7352 3 0.2 0.2
7354 With a dict, the number of places for specific columns can be
7355 specified with the column names as key and the number of decimal
7356 places as value
7358 >>> df.round({'dogs': 1, 'cats': 0})
7359 dogs cats
7360 0 0.2 0.0
7361 1 0.0 1.0
7362 2 0.7 0.0
7363 3 0.2 0.0
7365 Using a Series, the number of places for specific columns can be
7366 specified with the column names as index and the number of
7367 decimal places as value
7369 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
7370 >>> df.round(decimals)
7371 dogs cats
7372 0 0.2 0.0
7373 1 0.0 1.0
7374 2 0.7 0.0
7375 3 0.2 0.0
7376 """
7377 from pandas.core.reshape.concat import concat
7379 def _dict_round(df, decimals):
7380 for col, vals in df.items():
7381 try:
7382 yield _series_round(vals, decimals[col])
7383 except KeyError:
7384 yield vals
7386 def _series_round(s, decimals):
7387 if is_integer_dtype(s) or is_float_dtype(s):
7388 return s.round(decimals)
7389 return s
7391 nv.validate_round(args, kwargs)
7393 if isinstance(decimals, (dict, Series)):
7394 if isinstance(decimals, Series):
7395 if not decimals.index.is_unique:
7396 raise ValueError("Index of decimals must be unique")
7397 new_cols = list(_dict_round(self, decimals))
7398 elif is_integer(decimals):
7399 # Dispatch to Series.round
7400 new_cols = [_series_round(v, decimals) for _, v in self.items()]
7401 else:
7402 raise TypeError("decimals must be an integer, a dict-like or a Series")
7404 if len(new_cols) > 0:
7405 return self._constructor(
7406 concat(new_cols, axis=1), index=self.index, columns=self.columns
7407 )
7408 else:
7409 return self
7411 # ----------------------------------------------------------------------
7412 # Statistical methods, etc.
7414 def corr(self, method="pearson", min_periods=1) -> "DataFrame":
7415 """
7416 Compute pairwise correlation of columns, excluding NA/null values.
7418 Parameters
7419 ----------
7420 method : {'pearson', 'kendall', 'spearman'} or callable
7421 Method of correlation:
7423 * pearson : standard correlation coefficient
7424 * kendall : Kendall Tau correlation coefficient
7425 * spearman : Spearman rank correlation
7426 * callable: callable with input two 1d ndarrays
7427 and returning a float. Note that the returned matrix from corr
7428 will have 1 along the diagonals and will be symmetric
7429 regardless of the callable's behavior.
7431 .. versionadded:: 0.24.0
7433 min_periods : int, optional
7434 Minimum number of observations required per pair of columns
7435 to have a valid result. Currently only available for Pearson
7436 and Spearman correlation.
7438 Returns
7439 -------
7440 DataFrame
7441 Correlation matrix.
7443 See Also
7444 --------
7445 DataFrame.corrwith
7446 Series.corr
7448 Examples
7449 --------
7450 >>> def histogram_intersection(a, b):
7451 ... v = np.minimum(a, b).sum().round(decimals=1)
7452 ... return v
7453 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
7454 ... columns=['dogs', 'cats'])
7455 >>> df.corr(method=histogram_intersection)
7456 dogs cats
7457 dogs 1.0 0.3
7458 cats 0.3 1.0
7459 """
7460 numeric_df = self._get_numeric_data()
7461 cols = numeric_df.columns
7462 idx = cols.copy()
7463 mat = numeric_df.values
7465 if method == "pearson":
7466 correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
7467 elif method == "spearman":
7468 correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
7469 elif method == "kendall" or callable(method):
7470 if min_periods is None:
7471 min_periods = 1
7472 mat = ensure_float64(mat).T
7473 corrf = nanops.get_corr_func(method)
7474 K = len(cols)
7475 correl = np.empty((K, K), dtype=float)
7476 mask = np.isfinite(mat)
7477 for i, ac in enumerate(mat):
7478 for j, bc in enumerate(mat):
7479 if i > j:
7480 continue
7482 valid = mask[i] & mask[j]
7483 if valid.sum() < min_periods:
7484 c = np.nan
7485 elif i == j:
7486 c = 1.0
7487 elif not valid.all():
7488 c = corrf(ac[valid], bc[valid])
7489 else:
7490 c = corrf(ac, bc)
7491 correl[i, j] = c
7492 correl[j, i] = c
7493 else:
7494 raise ValueError(
7495 "method must be either 'pearson', "
7496 "'spearman', 'kendall', or a callable, "
7497 f"'{method}' was supplied"
7498 )
7500 return self._constructor(correl, index=idx, columns=cols)
7502 def cov(self, min_periods=None) -> "DataFrame":
7503 """
7504 Compute pairwise covariance of columns, excluding NA/null values.
7506 Compute the pairwise covariance among the series of a DataFrame.
7507 The returned data frame is the `covariance matrix
7508 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
7509 of the DataFrame.
7511 Both NA and null values are automatically excluded from the
7512 calculation. (See the note below about bias from missing values.)
7513 A threshold can be set for the minimum number of
7514 observations for each value created. Comparisons with observations
7515 below this threshold will be returned as ``NaN``.
7517 This method is generally used for the analysis of time series data to
7518 understand the relationship between different measures
7519 across time.
7521 Parameters
7522 ----------
7523 min_periods : int, optional
7524 Minimum number of observations required per pair of columns
7525 to have a valid result.
7527 Returns
7528 -------
7529 DataFrame
7530 The covariance matrix of the series of the DataFrame.
7532 See Also
7533 --------
7534 Series.cov : Compute covariance with another Series.
7535 core.window.EWM.cov: Exponential weighted sample covariance.
7536 core.window.Expanding.cov : Expanding sample covariance.
7537 core.window.Rolling.cov : Rolling sample covariance.
7539 Notes
7540 -----
7541 Returns the covariance matrix of the DataFrame's time series.
7542 The covariance is normalized by N-1.
7544 For DataFrames that have Series that are missing data (assuming that
7545 data is `missing at random
7546 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
7547 the returned covariance matrix will be an unbiased estimate
7548 of the variance and covariance between the member Series.
7550 However, for many applications this estimate may not be acceptable
7551 because the estimate covariance matrix is not guaranteed to be positive
7552 semi-definite. This could lead to estimate correlations having
7553 absolute values which are greater than one, and/or a non-invertible
7554 covariance matrix. See `Estimation of covariance matrices
7555 <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
7556 matrices>`__ for more details.
7558 Examples
7559 --------
7560 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
7561 ... columns=['dogs', 'cats'])
7562 >>> df.cov()
7563 dogs cats
7564 dogs 0.666667 -1.000000
7565 cats -1.000000 1.666667
7567 >>> np.random.seed(42)
7568 >>> df = pd.DataFrame(np.random.randn(1000, 5),
7569 ... columns=['a', 'b', 'c', 'd', 'e'])
7570 >>> df.cov()
7571 a b c d e
7572 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
7573 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
7574 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
7575 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
7576 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
7578 **Minimum number of periods**
7580 This method also supports an optional ``min_periods`` keyword
7581 that specifies the required minimum number of non-NA observations for
7582 each column pair in order to have a valid result:
7584 >>> np.random.seed(42)
7585 >>> df = pd.DataFrame(np.random.randn(20, 3),
7586 ... columns=['a', 'b', 'c'])
7587 >>> df.loc[df.index[:5], 'a'] = np.nan
7588 >>> df.loc[df.index[5:10], 'b'] = np.nan
7589 >>> df.cov(min_periods=12)
7590 a b c
7591 a 0.316741 NaN -0.150812
7592 b NaN 1.248003 0.191417
7593 c -0.150812 0.191417 0.895202
7594 """
7595 numeric_df = self._get_numeric_data()
7596 cols = numeric_df.columns
7597 idx = cols.copy()
7598 mat = numeric_df.values
7600 if notna(mat).all():
7601 if min_periods is not None and min_periods > len(mat):
7602 baseCov = np.empty((mat.shape[1], mat.shape[1]))
7603 baseCov.fill(np.nan)
7604 else:
7605 baseCov = np.cov(mat.T)
7606 baseCov = baseCov.reshape((len(cols), len(cols)))
7607 else:
7608 baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
7610 return self._constructor(baseCov, index=idx, columns=cols)
7612 def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
7613 """
7614 Compute pairwise correlation.
7616 Pairwise correlation is computed between rows or columns of
7617 DataFrame with rows or columns of Series or DataFrame. DataFrames
7618 are first aligned along both axes before computing the
7619 correlations.
7621 Parameters
7622 ----------
7623 other : DataFrame, Series
7624 Object with which to compute correlations.
7625 axis : {0 or 'index', 1 or 'columns'}, default 0
7626 The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for
7627 row-wise.
7628 drop : bool, default False
7629 Drop missing indices from result.
7630 method : {'pearson', 'kendall', 'spearman'} or callable
7631 Method of correlation:
7633 * pearson : standard correlation coefficient
7634 * kendall : Kendall Tau correlation coefficient
7635 * spearman : Spearman rank correlation
7636 * callable: callable with input two 1d ndarrays
7637 and returning a float.
7639 .. versionadded:: 0.24.0
7641 Returns
7642 -------
7643 Series
7644 Pairwise correlations.
7646 See Also
7647 --------
7648 DataFrame.corr
7649 """
7650 axis = self._get_axis_number(axis)
7651 this = self._get_numeric_data()
7653 if isinstance(other, Series):
7654 return this.apply(lambda x: other.corr(x, method=method), axis=axis)
7656 other = other._get_numeric_data()
7657 left, right = this.align(other, join="inner", copy=False)
7659 if axis == 1:
7660 left = left.T
7661 right = right.T
7663 if method == "pearson":
7664 # mask missing values
7665 left = left + right * 0
7666 right = right + left * 0
7668 # demeaned data
7669 ldem = left - left.mean()
7670 rdem = right - right.mean()
7672 num = (ldem * rdem).sum()
7673 dom = (left.count() - 1) * left.std() * right.std()
7675 correl = num / dom
7677 elif method in ["kendall", "spearman"] or callable(method):
7679 def c(x):
7680 return nanops.nancorr(x[0], x[1], method=method)
7682 correl = Series(
7683 map(c, zip(left.values.T, right.values.T)), index=left.columns
7684 )
7686 else:
7687 raise ValueError(
7688 f"Invalid method {method} was passed, "
7689 "valid methods are: 'pearson', 'kendall', "
7690 "'spearman', or callable"
7691 )
7693 if not drop:
7694 # Find non-matching labels along the given axis
7695 # and append missing correlations (GH 22375)
7696 raxis = 1 if axis == 0 else 0
7697 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
7698 idx_diff = result_index.difference(correl.index)
7700 if len(idx_diff) > 0:
7701 correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))
7703 return correl
7705 # ----------------------------------------------------------------------
7706 # ndarray-like stats methods
7708 def count(self, axis=0, level=None, numeric_only=False):
7709 """
7710 Count non-NA cells for each column or row.
7712 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
7713 on `pandas.options.mode.use_inf_as_na`) are considered NA.
7715 Parameters
7716 ----------
7717 axis : {0 or 'index', 1 or 'columns'}, default 0
7718 If 0 or 'index' counts are generated for each column.
7719 If 1 or 'columns' counts are generated for each **row**.
7720 level : int or str, optional
7721 If the axis is a `MultiIndex` (hierarchical), count along a
7722 particular `level`, collapsing into a `DataFrame`.
7723 A `str` specifies the level name.
7724 numeric_only : bool, default False
7725 Include only `float`, `int` or `boolean` data.
7727 Returns
7728 -------
7729 Series or DataFrame
7730 For each column/row the number of non-NA/null entries.
7731 If `level` is specified returns a `DataFrame`.
7733 See Also
7734 --------
7735 Series.count: Number of non-NA elements in a Series.
7736 DataFrame.shape: Number of DataFrame rows and columns (including NA
7737 elements).
7738 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
7739 elements.
7741 Examples
7742 --------
7743 Constructing DataFrame from a dictionary:
7745 >>> df = pd.DataFrame({"Person":
7746 ... ["John", "Myla", "Lewis", "John", "Myla"],
7747 ... "Age": [24., np.nan, 21., 33, 26],
7748 ... "Single": [False, True, True, True, False]})
7749 >>> df
7750 Person Age Single
7751 0 John 24.0 False
7752 1 Myla NaN True
7753 2 Lewis 21.0 True
7754 3 John 33.0 True
7755 4 Myla 26.0 False
7757 Notice the uncounted NA values:
7759 >>> df.count()
7760 Person 5
7761 Age 4
7762 Single 5
7763 dtype: int64
7765 Counts for each **row**:
7767 >>> df.count(axis='columns')
7768 0 3
7769 1 2
7770 2 3
7771 3 3
7772 4 3
7773 dtype: int64
7775 Counts for one level of a `MultiIndex`:
7777 >>> df.set_index(["Person", "Single"]).count(level="Person")
7778 Age
7779 Person
7780 John 2
7781 Lewis 1
7782 Myla 1
7783 """
7784 axis = self._get_axis_number(axis)
7785 if level is not None:
7786 return self._count_level(level, axis=axis, numeric_only=numeric_only)
7788 if numeric_only:
7789 frame = self._get_numeric_data()
7790 else:
7791 frame = self
7793 # GH #423
7794 if len(frame._get_axis(axis)) == 0:
7795 result = Series(0, index=frame._get_agg_axis(axis))
7796 else:
7797 if frame._is_mixed_type or frame._data.any_extension_types:
7798 # the or any_extension_types is really only hit for single-
7799 # column frames with an extension array
7800 result = notna(frame).sum(axis=axis)
7801 else:
7802 # GH13407
7803 series_counts = notna(frame).sum(axis=axis)
7804 counts = series_counts.values
7805 result = Series(counts, index=frame._get_agg_axis(axis))
7807 return result.astype("int64")
7809 def _count_level(self, level, axis=0, numeric_only=False):
7810 if numeric_only:
7811 frame = self._get_numeric_data()
7812 else:
7813 frame = self
7815 count_axis = frame._get_axis(axis)
7816 agg_axis = frame._get_agg_axis(axis)
7818 if not isinstance(count_axis, ABCMultiIndex):
7819 raise TypeError(
7820 f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
7821 )
7823 if frame._is_mixed_type:
7824 # Since we have mixed types, calling notna(frame.values) might
7825 # upcast everything to object
7826 mask = notna(frame).values
7827 else:
7828 # But use the speedup when we have homogeneous dtypes
7829 mask = notna(frame.values)
7831 if axis == 1:
7832 # We're transposing the mask rather than frame to avoid potential
7833 # upcasts to object, which induces a ~20x slowdown
7834 mask = mask.T
7836 if isinstance(level, str):
7837 level = count_axis._get_level_number(level)
7839 level_name = count_axis._names[level]
7840 level_index = count_axis.levels[level]._shallow_copy(name=level_name)
7841 level_codes = ensure_int64(count_axis.codes[level])
7842 counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
7844 result = DataFrame(counts, index=level_index, columns=agg_axis)
7846 if axis == 1:
7847 # Undo our earlier transpose
7848 return result.T
7849 else:
7850 return result
7852 def _reduce(
7853 self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
7854 ):
7855 if axis is None and filter_type == "bool":
7856 labels = None
7857 constructor = None
7858 else:
7859 # TODO: Make other agg func handle axis=None properly
7860 axis = self._get_axis_number(axis)
7861 labels = self._get_agg_axis(axis)
7862 constructor = self._constructor
7864 def f(x):
7865 return op(x, axis=axis, skipna=skipna, **kwds)
7867 def _get_data(axis_matters):
7868 if filter_type is None or filter_type == "numeric":
7869 data = self._get_numeric_data()
7870 elif filter_type == "bool":
7871 if axis_matters:
7872 # GH#25101, GH#24434
7873 data = self._get_bool_data() if axis == 0 else self
7874 else:
7875 data = self._get_bool_data()
7876 else: # pragma: no cover
7877 msg = (
7878 f"Generating numeric_only data with filter_type {filter_type} "
7879 "not supported."
7880 )
7881 raise NotImplementedError(msg)
7882 return data
7884 if numeric_only is not None and axis in [0, 1]:
7885 df = self
7886 if numeric_only is True:
7887 df = _get_data(axis_matters=True)
7888 if axis == 1:
7889 df = df.T
7890 axis = 0
7892 out_dtype = "bool" if filter_type == "bool" else None
7894 def blk_func(values):
7895 if isinstance(values, ExtensionArray):
7896 return values._reduce(name, skipna=skipna, **kwds)
7897 else:
7898 return op(values, axis=1, skipna=skipna, **kwds)
7900 # After possibly _get_data and transposing, we are now in the
7901 # simple case where we can use BlockManager._reduce
7902 res = df._data.reduce(blk_func)
7903 assert isinstance(res, dict)
7904 if len(res):
7905 assert len(res) == max(list(res.keys())) + 1, res.keys()
7906 out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
7907 out.index = df.columns
7908 return out
7910 if numeric_only is None:
7911 values = self.values
7912 try:
7913 result = f(values)
7915 if filter_type == "bool" and is_object_dtype(values) and axis is None:
7916 # work around https://github.com/numpy/numpy/issues/10489
7917 # TODO: combine with hasattr(result, 'dtype') further down
7918 # hard since we don't have `values` down there.
7919 result = np.bool_(result)
7920 except TypeError:
7921 # e.g. in nanops trying to convert strs to float
7923 # try by-column first
7924 if filter_type is None and axis == 0:
7925 # this can end up with a non-reduction
7926 # but not always. if the types are mixed
7927 # with datelike then need to make sure a series
7929 # we only end up here if we have not specified
7930 # numeric_only and yet we have tried a
7931 # column-by-column reduction, where we have mixed type.
7932 # So let's just do what we can
7933 from pandas.core.apply import frame_apply
7935 opa = frame_apply(
7936 self, func=f, result_type="expand", ignore_failures=True
7937 )
7938 result = opa.get_result()
7939 if result.ndim == self.ndim:
7940 result = result.iloc[0]
7941 return result
7943 # TODO: why doesnt axis matter here?
7944 data = _get_data(axis_matters=False)
7945 with np.errstate(all="ignore"):
7946 result = f(data.values)
7947 labels = data._get_agg_axis(axis)
7948 else:
7949 if numeric_only:
7950 data = _get_data(axis_matters=True)
7952 values = data.values
7953 labels = data._get_agg_axis(axis)
7954 else:
7955 values = self.values
7956 result = f(values)
7958 if hasattr(result, "dtype") and is_object_dtype(result.dtype):
7959 try:
7960 if filter_type is None or filter_type == "numeric":
7961 result = result.astype(np.float64)
7962 elif filter_type == "bool" and notna(result).all():
7963 result = result.astype(np.bool_)
7964 except (ValueError, TypeError):
7966 # try to coerce to the original dtypes item by item if we can
7967 if axis == 0:
7968 result = coerce_to_dtypes(result, self.dtypes)
7970 if constructor is not None:
7971 result = Series(result, index=labels)
7972 return result
7974 def nunique(self, axis=0, dropna=True) -> Series:
7975 """
7976 Count distinct observations over requested axis.
7978 Return Series with number of distinct observations. Can ignore NaN
7979 values.
7981 Parameters
7982 ----------
7983 axis : {0 or 'index', 1 or 'columns'}, default 0
7984 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
7985 column-wise.
7986 dropna : bool, default True
7987 Don't include NaN in the counts.
7989 Returns
7990 -------
7991 Series
7993 See Also
7994 --------
7995 Series.nunique: Method nunique for Series.
7996 DataFrame.count: Count non-NA cells for each column or row.
7998 Examples
7999 --------
8000 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
8001 >>> df.nunique()
8002 A 3
8003 B 1
8004 dtype: int64
8006 >>> df.nunique(axis=1)
8007 0 1
8008 1 2
8009 2 2
8010 dtype: int64
8011 """
8012 return self.apply(Series.nunique, axis=axis, dropna=dropna)
8014 def idxmin(self, axis=0, skipna=True) -> Series:
8015 """
8016 Return index of first occurrence of minimum over requested axis.
8018 NA/null values are excluded.
8020 Parameters
8021 ----------
8022 axis : {0 or 'index', 1 or 'columns'}, default 0
8023 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
8024 skipna : bool, default True
8025 Exclude NA/null values. If an entire row/column is NA, the result
8026 will be NA.
8028 Returns
8029 -------
8030 Series
8031 Indexes of minima along the specified axis.
8033 Raises
8034 ------
8035 ValueError
8036 * If the row/column is empty
8038 See Also
8039 --------
8040 Series.idxmin
8042 Notes
8043 -----
8044 This method is the DataFrame version of ``ndarray.argmin``.
8045 """
8046 axis = self._get_axis_number(axis)
8047 indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
8048 index = self._get_axis(axis)
8049 result = [index[i] if i >= 0 else np.nan for i in indices]
8050 return Series(result, index=self._get_agg_axis(axis))
8052 def idxmax(self, axis=0, skipna=True) -> Series:
8053 """
8054 Return index of first occurrence of maximum over requested axis.
8056 NA/null values are excluded.
8058 Parameters
8059 ----------
8060 axis : {0 or 'index', 1 or 'columns'}, default 0
8061 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
8062 skipna : bool, default True
8063 Exclude NA/null values. If an entire row/column is NA, the result
8064 will be NA.
8066 Returns
8067 -------
8068 Series
8069 Indexes of maxima along the specified axis.
8071 Raises
8072 ------
8073 ValueError
8074 * If the row/column is empty
8076 See Also
8077 --------
8078 Series.idxmax
8080 Notes
8081 -----
8082 This method is the DataFrame version of ``ndarray.argmax``.
8083 """
8084 axis = self._get_axis_number(axis)
8085 indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
8086 index = self._get_axis(axis)
8087 result = [index[i] if i >= 0 else np.nan for i in indices]
8088 return Series(result, index=self._get_agg_axis(axis))
8090 def _get_agg_axis(self, axis_num):
8091 """
8092 Let's be explicit about this.
8093 """
8094 if axis_num == 0:
8095 return self.columns
8096 elif axis_num == 1:
8097 return self.index
8098 else:
8099 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
8101 def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame":
8102 """
8103 Get the mode(s) of each element along the selected axis.
8105 The mode of a set of values is the value that appears most often.
8106 It can be multiple values.
8108 Parameters
8109 ----------
8110 axis : {0 or 'index', 1 or 'columns'}, default 0
8111 The axis to iterate over while searching for the mode:
8113 * 0 or 'index' : get mode of each column
8114 * 1 or 'columns' : get mode of each row.
8116 numeric_only : bool, default False
8117 If True, only apply to numeric columns.
8118 dropna : bool, default True
8119 Don't consider counts of NaN/NaT.
8121 .. versionadded:: 0.24.0
8123 Returns
8124 -------
8125 DataFrame
8126 The modes of each column or row.
8128 See Also
8129 --------
8130 Series.mode : Return the highest frequency value in a Series.
8131 Series.value_counts : Return the counts of values in a Series.
8133 Examples
8134 --------
8135 >>> df = pd.DataFrame([('bird', 2, 2),
8136 ... ('mammal', 4, np.nan),
8137 ... ('arthropod', 8, 0),
8138 ... ('bird', 2, np.nan)],
8139 ... index=('falcon', 'horse', 'spider', 'ostrich'),
8140 ... columns=('species', 'legs', 'wings'))
8141 >>> df
8142 species legs wings
8143 falcon bird 2 2.0
8144 horse mammal 4 NaN
8145 spider arthropod 8 0.0
8146 ostrich bird 2 NaN
8148 By default, missing values are not considered, and the mode of wings
8149 are both 0 and 2. The second row of species and legs contains ``NaN``,
8150 because they have only one mode, but the DataFrame has two rows.
8152 >>> df.mode()
8153 species legs wings
8154 0 bird 2.0 0.0
8155 1 NaN NaN 2.0
8157 Setting ``dropna=False`` ``NaN`` values are considered and they can be
8158 the mode (like for wings).
8160 >>> df.mode(dropna=False)
8161 species legs wings
8162 0 bird 2 NaN
8164 Setting ``numeric_only=True``, only the mode of numeric columns is
8165 computed, and columns of other types are ignored.
8167 >>> df.mode(numeric_only=True)
8168 legs wings
8169 0 2.0 0.0
8170 1 NaN 2.0
8172 To compute the mode over columns and not rows, use the axis parameter:
8174 >>> df.mode(axis='columns', numeric_only=True)
8175 0 1
8176 falcon 2.0 NaN
8177 horse 4.0 NaN
8178 spider 0.0 8.0
8179 ostrich 2.0 NaN
8180 """
8181 data = self if not numeric_only else self._get_numeric_data()
8183 def f(s):
8184 return s.mode(dropna=dropna)
8186 return data.apply(f, axis=axis)
8188 def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
8189 """
8190 Return values at the given quantile over requested axis.
8192 Parameters
8193 ----------
8194 q : float or array-like, default 0.5 (50% quantile)
8195 Value between 0 <= q <= 1, the quantile(s) to compute.
8196 axis : {0, 1, 'index', 'columns'} (default 0)
8197 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
8198 numeric_only : bool, default True
8199 If False, the quantile of datetime and timedelta data will be
8200 computed as well.
8201 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
8202 This optional parameter specifies the interpolation method to use,
8203 when the desired quantile lies between two data points `i` and `j`:
8205 * linear: `i + (j - i) * fraction`, where `fraction` is the
8206 fractional part of the index surrounded by `i` and `j`.
8207 * lower: `i`.
8208 * higher: `j`.
8209 * nearest: `i` or `j` whichever is nearest.
8210 * midpoint: (`i` + `j`) / 2.
8212 Returns
8213 -------
8214 Series or DataFrame
8216 If ``q`` is an array, a DataFrame will be returned where the
8217 index is ``q``, the columns are the columns of self, and the
8218 values are the quantiles.
8219 If ``q`` is a float, a Series will be returned where the
8220 index is the columns of self and the values are the quantiles.
8222 See Also
8223 --------
8224 core.window.Rolling.quantile: Rolling quantile.
8225 numpy.percentile: Numpy function to compute the percentile.
8227 Examples
8228 --------
8229 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
8230 ... columns=['a', 'b'])
8231 >>> df.quantile(.1)
8232 a 1.3
8233 b 3.7
8234 Name: 0.1, dtype: float64
8235 >>> df.quantile([.1, .5])
8236 a b
8237 0.1 1.3 3.7
8238 0.5 2.5 55.0
8240 Specifying `numeric_only=False` will also compute the quantile of
8241 datetime and timedelta data.
8243 >>> df = pd.DataFrame({'A': [1, 2],
8244 ... 'B': [pd.Timestamp('2010'),
8245 ... pd.Timestamp('2011')],
8246 ... 'C': [pd.Timedelta('1 days'),
8247 ... pd.Timedelta('2 days')]})
8248 >>> df.quantile(0.5, numeric_only=False)
8249 A 1.5
8250 B 2010-07-02 12:00:00
8251 C 1 days 12:00:00
8252 Name: 0.5, dtype: object
8253 """
8254 validate_percentile(q)
8256 data = self._get_numeric_data() if numeric_only else self
8257 axis = self._get_axis_number(axis)
8258 is_transposed = axis == 1
8260 if is_transposed:
8261 data = data.T
8263 if len(data.columns) == 0:
8264 # GH#23925 _get_numeric_data may have dropped all columns
8265 cols = Index([], name=self.columns.name)
8266 if is_list_like(q):
8267 return self._constructor([], index=q, columns=cols)
8268 return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)
8270 result = data._data.quantile(
8271 qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
8272 )
8274 if result.ndim == 2:
8275 result = self._constructor(result)
8276 else:
8277 result = self._constructor_sliced(result, name=q)
8279 if is_transposed:
8280 result = result.T
8282 return result
8284 def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame":
8285 """
8286 Cast to DatetimeIndex of timestamps, at *beginning* of period.
8288 Parameters
8289 ----------
8290 freq : str, default frequency of PeriodIndex
8291 Desired frequency.
8292 how : {'s', 'e', 'start', 'end'}
8293 Convention for converting period to timestamp; start of period
8294 vs. end.
8295 axis : {0 or 'index', 1 or 'columns'}, default 0
8296 The axis to convert (the index by default).
8297 copy : bool, default True
8298 If False then underlying input data is not copied.
8300 Returns
8301 -------
8302 DataFrame with DatetimeIndex
8303 """
8304 new_data = self._data
8305 if copy:
8306 new_data = new_data.copy()
8308 axis = self._get_axis_number(axis)
8309 if axis == 0:
8310 new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
8311 elif axis == 1:
8312 new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
8313 else: # pragma: no cover
8314 raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
8316 return self._constructor(new_data)
8318 def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame":
8319 """
8320 Convert DataFrame from DatetimeIndex to PeriodIndex.
8322 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
8323 frequency (inferred from index if not passed).
8325 Parameters
8326 ----------
8327 freq : str, default
8328 Frequency of the PeriodIndex.
8329 axis : {0 or 'index', 1 or 'columns'}, default 0
8330 The axis to convert (the index by default).
8331 copy : bool, default True
8332 If False then underlying input data is not copied.
8334 Returns
8335 -------
8336 TimeSeries with PeriodIndex
8337 """
8338 new_data = self._data
8339 if copy:
8340 new_data = new_data.copy()
8342 axis = self._get_axis_number(axis)
8343 if axis == 0:
8344 new_data.set_axis(1, self.index.to_period(freq=freq))
8345 elif axis == 1:
8346 new_data.set_axis(0, self.columns.to_period(freq=freq))
8347 else: # pragma: no cover
8348 raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
8350 return self._constructor(new_data)
8352 def isin(self, values) -> "DataFrame":
8353 """
8354 Whether each element in the DataFrame is contained in values.
8356 Parameters
8357 ----------
8358 values : iterable, Series, DataFrame or dict
8359 The result will only be true at a location if all the
8360 labels match. If `values` is a Series, that's the index. If
8361 `values` is a dict, the keys must be the column names,
8362 which must match. If `values` is a DataFrame,
8363 then both the index and column labels must match.
8365 Returns
8366 -------
8367 DataFrame
8368 DataFrame of booleans showing whether each element in the DataFrame
8369 is contained in values.
8371 See Also
8372 --------
8373 DataFrame.eq: Equality test for DataFrame.
8374 Series.isin: Equivalent method on Series.
8375 Series.str.contains: Test if pattern or regex is contained within a
8376 string of a Series or Index.
8378 Examples
8379 --------
8381 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
8382 ... index=['falcon', 'dog'])
8383 >>> df
8384 num_legs num_wings
8385 falcon 2 2
8386 dog 4 0
8388 When ``values`` is a list check whether every value in the DataFrame
8389 is present in the list (which animals have 0 or 2 legs or wings)
8391 >>> df.isin([0, 2])
8392 num_legs num_wings
8393 falcon True True
8394 dog False True
8396 When ``values`` is a dict, we can pass values to check for each
8397 column separately:
8399 >>> df.isin({'num_wings': [0, 3]})
8400 num_legs num_wings
8401 falcon False False
8402 dog False True
8404 When ``values`` is a Series or DataFrame the index and column must
8405 match. Note that 'falcon' does not match based on the number of legs
8406 in df2.
8408 >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
8409 ... index=['spider', 'falcon'])
8410 >>> df.isin(other)
8411 num_legs num_wings
8412 falcon True True
8413 dog False False
8414 """
8415 if isinstance(values, dict):
8416 from pandas.core.reshape.concat import concat
8418 values = collections.defaultdict(list, values)
8419 return concat(
8420 (
8421 self.iloc[:, [i]].isin(values[col])
8422 for i, col in enumerate(self.columns)
8423 ),
8424 axis=1,
8425 )
8426 elif isinstance(values, Series):
8427 if not values.index.is_unique:
8428 raise ValueError("cannot compute isin with a duplicate axis.")
8429 return self.eq(values.reindex_like(self), axis="index")
8430 elif isinstance(values, DataFrame):
8431 if not (values.columns.is_unique and values.index.is_unique):
8432 raise ValueError("cannot compute isin with a duplicate axis.")
8433 return self.eq(values.reindex_like(self))
8434 else:
8435 if not is_list_like(values):
8436 raise TypeError(
8437 "only list-like or dict-like objects are allowed "
8438 "to be passed to DataFrame.isin(), "
8439 f"you passed a {repr(type(values).__name__)}"
8440 )
8441 return DataFrame(
8442 algorithms.isin(self.values.ravel(), values).reshape(self.shape),
8443 self.index,
8444 self.columns,
8445 )
8447 # ----------------------------------------------------------------------
8448 # Add plotting methods to DataFrame
8449 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
8450 hist = pandas.plotting.hist_frame
8451 boxplot = pandas.plotting.boxplot_frame
8452 sparse = CachedAccessor("sparse", SparseFrameAccessor)
8455DataFrame._setup_axes(
8456 ["index", "columns"],
8457 docs={
8458 "index": "The index (row labels) of the DataFrame.",
8459 "columns": "The column labels of the DataFrame.",
8460 },
8461)
8462DataFrame._add_numeric_operations()
8463DataFrame._add_series_or_dataframe_operations()
8465ops.add_flex_arithmetic_methods(DataFrame)
8466ops.add_special_arithmetic_methods(DataFrame)
8469def _from_nested_dict(data):
8470 # TODO: this should be seriously cythonized
8471 new_data = {}
8472 for index, s in data.items():
8473 for col, v in s.items():
8474 new_data[col] = new_data.get(col, {})
8475 new_data[col][index] = v
8476 return new_data
8479def _put_str(s, space):
8480 return str(s)[:space].ljust(space)