Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/arrays/categorical.py : 21%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import operator
2from shutil import get_terminal_size
3from typing import Dict, Hashable, List, Type, Union, cast
4from warnings import warn
6import numpy as np
8from pandas._config import get_option
10from pandas._libs import algos as libalgos, hashtable as htable
11from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
12from pandas.compat.numpy import function as nv
13from pandas.util._decorators import (
14 Appender,
15 Substitution,
16 cache_readonly,
17 deprecate_kwarg,
18)
19from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
21from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike
22from pandas.core.dtypes.common import (
23 ensure_int64,
24 ensure_object,
25 ensure_platform_int,
26 is_categorical_dtype,
27 is_datetime64_dtype,
28 is_dict_like,
29 is_dtype_equal,
30 is_extension_array_dtype,
31 is_integer_dtype,
32 is_iterator,
33 is_list_like,
34 is_object_dtype,
35 is_scalar,
36 is_sequence,
37 is_timedelta64_dtype,
38 needs_i8_conversion,
39)
40from pandas.core.dtypes.dtypes import CategoricalDtype
41from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
42from pandas.core.dtypes.inference import is_hashable
43from pandas.core.dtypes.missing import isna, notna
45from pandas.core import ops
46from pandas.core.accessor import PandasDelegate, delegate_names
47import pandas.core.algorithms as algorithms
48from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
49from pandas.core.arrays.base import (
50 ExtensionArray,
51 _extension_array_shared_docs,
52 try_cast_to_ea,
53)
54from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
55import pandas.core.common as com
56from pandas.core.construction import array, extract_array, sanitize_array
57from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
58from pandas.core.missing import interpolate_2d
59from pandas.core.ops.common import unpack_zerodim_and_defer
60from pandas.core.sorting import nargsort
62from pandas.io.formats import console
def _cat_compare_op(op):
    """
    Build the comparison dunder method of ``Categorical`` for ``op``.

    Parameters
    ----------
    op : callable
        Comparison function whose ``__name__`` (e.g. ``"eq"``, ``"lt"``)
        determines the generated method name ``__<name>__``.

    Returns
    -------
    callable
        ``func(self, other)``, wrapped with ``unpack_zerodim_and_defer`` and
        renamed to the dunder name. Comparing works on the integer codes;
        positions holding a missing value (code ``-1``) are masked to False
        where the code below does so.
    """
    opname = f"__{op.__name__}__"
    # Both flags depend only on the operator, so compute them once.
    is_ordering_op = opname in ["__lt__", "__gt__", "__le__", "__ge__"]
    is_equality_op = opname in ["__eq__", "__ne__"]

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        if is_list_like(other) and len(other) != len(self):
            # TODO: Could this fail if the categories are listlike objects?
            raise ValueError("Lengths must match.")

        if is_ordering_op and not self.ordered:
            raise TypeError("Unordered Categoricals can only compare equality or not")

        if isinstance(other, Categorical):
            # Two Categoricals are comparable only when they share the same
            # categories (up to ordering when both are unordered).
            msg = "Categoricals can only be compared if 'categories' are the same."
            if len(self.categories) != len(other.categories):
                raise TypeError(msg + " Categories are different lengths")
            if self.ordered and not (self.categories == other.categories).all():
                raise TypeError(msg)
            if set(self.categories) != set(other.categories):
                raise TypeError(msg)

            if self.ordered != other.ordered:
                raise TypeError(
                    "Categoricals can only be compared if 'ordered' is the same"
                )

            if self.ordered or self.categories.equals(other.categories):
                other_codes = other._codes
            else:
                # Both unordered with categories in a different order:
                # express ``other`` in terms of our own categories.
                other_codes = _get_codes_for_values(other, self.categories)

            result = getattr(self._codes, opname)(other_codes)
            missing = (self._codes == -1) | (other_codes == -1)
            if missing.any():
                # A missing value on either side compares as False.
                result[missing] = False
            return result

        if not is_scalar(other):
            # Categorical vs. array-like: only positional (in)equality is
            # supported, evaluated on the materialised values.
            if is_equality_op:
                return getattr(np.array(self), opname)(np.array(other))

            raise TypeError(
                f"Cannot compare a Categorical for op {opname} with "
                f"type {type(other)}.\nIf you want to compare values, "
                "use 'np.asarray(cat) <op> other'."
            )

        if other not in self.categories:
            # A scalar that is not a category can equal no element.
            if opname == "__eq__":
                return np.zeros(len(self), dtype=bool)
            if opname == "__ne__":
                return np.ones(len(self), dtype=bool)
            raise TypeError(
                f"Cannot compare a Categorical for op {opname} with a "
                "scalar, which is not a category."
            )

        position = self.categories.get_loc(other)
        result = getattr(self._codes, opname)(position)
        if opname not in {"__eq__", "__ge__", "__gt__"}:
            # For these ops the -1 code could compare True against a valid
            # position, so missing entries are forced to False.
            result[self._codes == -1] = False
        return result

    func.__name__ = opname

    return func
def contains(cat, key, container):
    """
    Check whether ``key`` is a category of ``cat`` that occurs in ``container``.

    Shared helper for ``Categorical.__contains__`` and
    ``CategoricalIndex.__contains__``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container that must hold the location of ``key`` in
        ``cat.categories``.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and its location in the
        categories is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This function does not check for NaN values. Do that separately
    before calling it.
    """
    # Unhashable keys are rejected up front with the TypeError from hash().
    hash(key)

    # A key that cannot be located in the categories cannot be present in
    # the container either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``position`` is where the key sits in the categories, which is also the
    # *value* stored for it in the container. A key can therefore be a
    # category and still be absent from the container, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(pos in container for pos in position)
    return position in container
# Documentation text attached to the ``Categorical.codes`` property.
_codes_doc = """
The category codes of this categorical.

Level codes are an array of integers which are the positions of the real
values in the categories array.

There is no setter, use the other categorical methods and the normal item
setter to change values in the categorical.
"""
209class Categorical(ExtensionArray, PandasObject):
210 """
211 Represent a categorical variable in classic R / S-plus fashion.
213 `Categoricals` can take on only a limited, and usually fixed, number
214 of possible values (`categories`). In contrast to statistical categorical
215 variables, a `Categorical` might have an order, but numerical operations
216 (additions, divisions, ...) are not possible.
218 All values of the `Categorical` are either in `categories` or `np.nan`.
219 Assigning values outside of `categories` will raise a `ValueError`. Order
220 is defined by the order of the `categories`, not lexical order of the
221 values.
223 Parameters
224 ----------
225 values : list-like
226 The values of the categorical. If categories are given, values not in
227 categories will be replaced with NaN.
228 categories : Index-like (unique), optional
229 The unique categories for this categorical. If not given, the
230 categories are assumed to be the unique values of `values` (sorted, if
231 possible, otherwise in the order in which they appear).
232 ordered : bool, default False
233 Whether or not this categorical is treated as an ordered categorical.
234 If True, the resulting categorical will be ordered.
235 An ordered categorical respects, when sorted, the order of its
236 `categories` attribute (which in turn is the `categories` argument, if
237 provided).
238 dtype : CategoricalDtype
239 An instance of ``CategoricalDtype`` to use for this categorical.
241 .. versionadded:: 0.21.0
243 Attributes
244 ----------
245 categories : Index
246 The categories of this categorical
247 codes : ndarray
248 The codes (integer positions, which point to the categories) of this
249 categorical, read only.
250 ordered : bool
251 Whether or not this Categorical is ordered.
252 dtype : CategoricalDtype
253 The instance of ``CategoricalDtype`` storing the ``categories``
254 and ``ordered``.
256 .. versionadded:: 0.21.0
258 Methods
259 -------
260 from_codes
261 __array__
263 Raises
264 ------
265 ValueError
266 If the categories do not validate.
267 TypeError
268 If an explicit ``ordered=True`` is given but no `categories` and the
269 `values` are not sortable.
271 See Also
272 --------
273 CategoricalDtype : Type for categorical data.
274 CategoricalIndex : An Index with an underlying ``Categorical``.
276 Notes
277 -----
278 See the `user guide
279 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_
280 for more.
282 Examples
283 --------
284 >>> pd.Categorical([1, 2, 3, 1, 2, 3])
285 [1, 2, 3, 1, 2, 3]
286 Categories (3, int64): [1, 2, 3]
288 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
289 [a, b, c, a, b, c]
290 Categories (3, object): [a, b, c]
292 Ordered `Categoricals` can be sorted according to the custom order
293 of the categories and can have a min and max value.
295 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
296 ... categories=['c', 'b', 'a'])
297 >>> c
298 [a, b, c, a, b, c]
299 Categories (3, object): [c < b < a]
300 >>> c.min()
301 'c'
302 """
304 # For comparisons, so that numpy uses our implementation of the compare
305 # ops, which raise
306 __array_priority__ = 1000
307 _dtype = CategoricalDtype(ordered=False)
308 # tolist is not actually deprecated, just suppressed in the __dir__
309 _deprecations = PandasObject._deprecations | frozenset(["tolist"])
310 _typ = "categorical"
def __init__(
    self, values, categories=None, ordered=None, dtype=None, fastpath=False
):
    """
    Build the codes and dtype of the categorical from ``values``.

    Parameters
    ----------
    values : list-like
        The values to encode. With ``fastpath=True`` these are taken to be
        the codes already.
    categories : Index-like, optional
        Explicit categories; inferred by factorizing ``values`` when neither
        ``categories`` nor ``dtype`` supplies them.
    ordered : bool, optional
        Whether the categorical is ordered.
    dtype : CategoricalDtype, optional
        Dtype to use for this categorical.
    fastpath : bool, default False
        Skip sanitizing and factorization and store ``values`` as codes.

    Raises
    ------
    TypeError
        If ``ordered`` is requested, no categories are given and the values
        cannot be sorted.
    NotImplementedError
        If factorizing the values raises ``ValueError`` (reported as
        more-than-one-dimensional input).
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )
    # From here on dtype is always a CategoricalDtype. Its categories may
    # still be None, in which case they are inferred by factorizing below.

    if fastpath:
        self._codes = coerce_indexer_dtype(values, dtype.categories)
        self._dtype = self._dtype.update_dtype(dtype)
        return

    # null_mask flags missing values that must be kept out of the inference.
    # Only list-likes (not arrays/ndframes) are masked this way.
    null_mask = np.array(False)

    # sanitize input
    if is_categorical_dtype(values):
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndexClass, ABCSeries)):
        # sanitize_array coerces np.nan to a string under certain versions
        # of numpy
        values = maybe_infer_to_datetimelike(values, convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)

            # By convention, empty lists result in object dtype:
            sanitize_dtype = "object" if len(values) == 0 else None
            null_mask = isna(values)
            if null_mask.any():
                present = np.where(~null_mask)[0]
                values = [values[pos] for pos in present]
            values = sanitize_array(values, None, dtype=sanitize_dtype)

    if dtype.categories is None:
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError:
            codes, categories = factorize(values, sort=False)
            if dtype.ordered:
                # Unsortable values give no natural order, so the caller has
                # to provide one through an explicit categories argument.
                raise TypeError(
                    "'values' is not ordered, please "
                    "explicitly specify the categories order "
                    "by passing in a categories argument."
                )
        except ValueError:

            # FIXME
            raise NotImplementedError(
                "> 1 ndim Categorical are not supported at this time"
            )

        # the categories were inferred from the values
        dtype = CategoricalDtype(categories, dtype.ordered)

    elif is_categorical_dtype(values):
        if isinstance(values, ABCSeries):
            old_codes = values._values.codes
        else:
            old_codes = values.codes
        codes = _recode_for_categories(
            old_codes, values.dtype.categories, dtype.categories
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Put the -1 placeholders back where missing values were removed.
        restored = -np.ones(null_mask.shape, dtype=codes.dtype)
        restored[~null_mask] = codes
        codes = restored

    self._dtype = self._dtype.update_dtype(dtype)
    self._codes = coerce_indexer_dtype(codes, dtype.categories)
@property
def categories(self):
    """
    The categories of this categorical.

    Assigning to this attribute renames every category in place: the
    assigned value must be list-like, its items unique, and it must contain
    exactly as many items as there are existing categories.

    Raises
    ------
    ValueError
        If the new categories do not validate as categories or if their
        number differs from the number of old categories.

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories
    """
    return self.dtype.categories

@categories.setter
def categories(self, categories):
    # Constructing the dtype validates the new categories.
    new_dtype = CategoricalDtype(categories, ordered=self.ordered)
    current = self.dtype.categories
    # A rename must keep the number of categories unchanged so the stored
    # codes remain valid.
    if current is not None and len(current) != len(new_dtype.categories):
        raise ValueError(
            "new categories need to have the same number of "
            "items as the old categories!"
        )
    self._dtype = new_dtype
@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.

    Read from the ``ordered`` attribute of this instance's dtype.
    """
    own_dtype = self.dtype
    return own_dtype.ordered
@property
def dtype(self) -> CategoricalDtype:
    """
    The :class:`~pandas.api.types.CategoricalDtype` for this instance.

    Returns the object stored in ``self._dtype``.
    """
    return self._dtype
@property
def _ndarray_values(self) -> np.ndarray:
    """
    The ndarray form of this categorical, which is its ``codes``.
    """
    return self.codes
@property
def _constructor(self) -> Type["Categorical"]:
    """
    The class used to build new instances; always ``Categorical``.
    """
    return Categorical
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """
    Construct a new instance of ``cls`` from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence
        The values to encode; passed to the constructor as ``values``.
    dtype : CategoricalDtype or "category", optional
        Dtype forwarded to the constructor.
    copy : bool, default False
        Accepted for signature compatibility; not used by this
        implementation.

    Returns
    -------
    cls
        Built through ``cls`` (rather than a hard-coded ``Categorical``) so
        that subclasses get instances of their own type, in line with the
        other alternate constructors ``from_codes`` and
        ``_from_inferred_categories``.
    """
    return cls(scalars, dtype=dtype)
def _formatter(self, boxed=False):
    """
    Return the formatting function for the elements, here always None.

    Parameters
    ----------
    boxed : bool, default False
        Not used by this implementation.

    Returns
    -------
    None
        Formatting is left to CategoricalFormatter's own formatter.
    """
    return None
def copy(self) -> "Categorical":
    """
    Copy constructor.

    Returns
    -------
    Categorical
        A new object built via ``self._constructor`` on the fastpath from a
        copy of the codes and this instance's dtype.
    """
    duplicated_codes = self._codes.copy()
    build = self._constructor
    return build(values=duplicated_codes, dtype=self.dtype, fastpath=True)
def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
    """
    Coerce this type to another dtype.

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        By default, astype always returns a newly allocated object.
        If copy is set to False and dtype is categorical, the original
        object is returned.

    Returns
    -------
    ArrayLike
        A Categorical for categorical targets, the result of ``array`` for
        other extension dtypes, otherwise a numpy array.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present.
    """
    if not is_categorical_dtype(dtype):
        if is_extension_array_dtype(dtype):
            return array(self, dtype=dtype, copy=copy)  # type: ignore # GH 28770
        if is_integer_dtype(dtype) and self.isna().any():
            raise ValueError("Cannot convert float NaN to integer")
        return np.array(self, dtype=dtype, copy=copy)

    dtype = cast(Union[str, CategoricalDtype], dtype)

    # GH 10696/18593
    dtype = self.dtype.update_dtype(dtype)
    result = self.copy() if copy else self
    if dtype == result.dtype:
        return result
    return result._set_dtype(dtype)
@cache_readonly
def size(self) -> int:
    """
    Return the number of elements, i.e. the size of the codes array.
    """
    codes = self._codes
    return codes.size
@cache_readonly
def itemsize(self) -> int:
    """
    Return the size of a single category, as reported by
    ``self.categories.itemsize``.
    """
    own_categories = self.categories
    return own_categories.itemsize
def tolist(self) -> List[Scalar]:
    """
    Return the values as a list, obtained by iterating over this object.

    The elements are each a scalar type, which is a Python scalar
    (for str, int, float) or a pandas scalar
    (for Timestamp/Timedelta/Interval/Period).
    """
    return [element for element in self]

# Alias offering the same behaviour under the ``to_list`` name.
to_list = tolist
@classmethod
def _from_inferred_categories(
    cls, inferred_categories, inferred_codes, dtype, true_values=None
):
    """
    Construct a Categorical from inferred values.

    When ``dtype`` does not carry categories, the inferred categories are
    sorted. When it does, ``inferred_categories`` are cast to the type of
    ``dtype.categories`` and the codes are recoded to that order.

    Parameters
    ----------
    inferred_categories : Index
    inferred_codes : Index
    dtype : CategoricalDtype or 'category'
    true_values : list, optional
        Values counted as True when the dtype's categories are boolean.
        If none are provided, the default ones are
        "True", "TRUE", and "true."

    Returns
    -------
    Categorical
    """
    from pandas import Index, to_numeric, to_datetime, to_timedelta

    cats = Index(inferred_categories)
    known_categories = (
        isinstance(dtype, CategoricalDtype) and dtype.categories is not None
    )

    if known_categories:
        target = dtype.categories

        # Cast the inferred categories to the kind of the requested ones.
        if target.is_numeric():
            cats = to_numeric(inferred_categories, errors="coerce")
        elif is_datetime64_dtype(target):
            cats = to_datetime(inferred_categories, errors="coerce")
        elif is_timedelta64_dtype(target):
            cats = to_timedelta(inferred_categories, errors="coerce")
        elif target.is_boolean():
            if true_values is None:
                true_values = ["True", "TRUE", "true"]

            cats = cats.isin(true_values)

        # Recode from observation order to dtype.categories order.
        codes = _recode_for_categories(inferred_codes, cats, target)
    elif cats.is_monotonic_increasing:
        # Already sorted: the inferred codes can be used unchanged.
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes
    else:
        # Sort categories and recode for unknown categories.
        sorted_cats = cats.sort_values()
        codes = _recode_for_categories(inferred_codes, cats.copy(), sorted_cats)
        dtype = CategoricalDtype(sorted_cats, ordered=False)

    return cls(codes, dtype=dtype, fastpath=True)
@classmethod
def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
    """
    Make a Categorical type from codes and categories or dtype.

    Useful when codes and categories/dtype are already available, so the
    (computation intensive) factorization step normally performed by the
    constructor can be skipped.

    If your data does not follow this convention, please use the normal
    constructor.

    Parameters
    ----------
    codes : array-like of int
        An integer array, where each integer points to a category in
        categories or dtype.categories, or else is -1 for NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here, then they must be provided
        in `dtype`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.

        .. versionadded:: 0.24.0

           When `dtype` is provided, neither `categories` nor `ordered`
           should be provided.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If no categories are available, if the codes contain NA values or
        are not integers, or if a code lies outside
        ``-1 .. len(categories) - 1``.

    Examples
    --------
    >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
    [a, b, a, b]
    Categories (2, object): [a < b]
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        categories=categories, ordered=ordered, dtype=dtype
    )
    if dtype.categories is None:
        raise ValueError(
            "The categories must be provided in 'categories' or "
            "'dtype'. Both were None."
        )

    if is_extension_array_dtype(codes) and is_integer_dtype(codes):
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    if len(codes):
        if not is_integer_dtype(codes):
            raise ValueError("codes need to be array-like integers")

        # -1 marks a missing value; every other code must index a category.
        if codes.max() >= len(dtype.categories) or codes.min() < -1:
            raise ValueError("codes need to be between -1 and len(categories)-1")

    return cls(codes, dtype=dtype, fastpath=True)
def _get_codes(self):
    """
    Get the codes.

    Returns
    -------
    codes : integer array view
        A non writable view of the `codes` array.
    """
    readonly_view = self._codes.view()
    # Only the view is locked; the underlying ``_codes`` array is untouched.
    readonly_view.setflags(write=False)
    return readonly_view
def _set_codes(self, codes):
    """
    Reject any attempt to assign the codes; they are not settable by the
    user directly.

    Raises
    ------
    ValueError
        Always.
    """
    raise ValueError("cannot set Categorical codes directly")
# Expose the codes through the getter above; assignment is routed to the
# setter, which always raises.
codes = property(_get_codes, _set_codes, doc=_codes_doc)
def _set_categories(self, categories, fastpath=False):
    """
    Sets new categories inplace.

    Parameters
    ----------
    categories : Index-like
        The categories to install on this instance's dtype.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls,
        and skip the check that the number of categories is unchanged.

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of existing ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    [a, b]
    Categories (2, object): [a, b]

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    [a, c]
    Categories (2, object): [a, c]
    """
    if fastpath:
        self._dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
        return

    new_dtype = CategoricalDtype(categories, ordered=self.ordered)
    current = self.dtype.categories
    if current is not None and len(new_dtype.categories) != len(current):
        raise ValueError(
            "new categories need to have the same number of "
            "items than the old categories!"
        )

    self._dtype = new_dtype
def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical":
    """
    Internal method for directly updating the CategoricalDtype.

    Parameters
    ----------
    dtype : CategoricalDtype

    Returns
    -------
    Categorical
        A new object of ``type(self)`` whose codes have been recoded from
        the current categories to ``dtype.categories``.

    Notes
    -----
    We don't do any validation here. It's assumed that the dtype is
    a (valid) instance of `CategoricalDtype`.
    """
    recoded = _recode_for_categories(self.codes, self.categories, dtype.categories)
    klass = type(self)
    return klass(recoded, dtype=dtype, fastpath=True)
def set_ordered(self, value, inplace=False):
    """
    Set the ordered attribute to the boolean value.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to the value.

    Returns
    -------
    Categorical or None
        The modified copy when ``inplace`` is False, otherwise None.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    new_dtype = CategoricalDtype(self.categories, ordered=value)
    if inplace:
        self._dtype = new_dtype
        return None

    result = self.copy()
    result._dtype = new_dtype
    return result
def as_ordered(self, inplace=False):
    """
    Set the Categorical to be ordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to True.

    Returns
    -------
    Categorical
        Ordered Categorical; whatever ``set_ordered`` returns for the given
        ``inplace``.
    """
    checked = validate_bool_kwarg(inplace, "inplace")
    return self.set_ordered(True, inplace=checked)
def as_unordered(self, inplace=False):
    """
    Set the Categorical to be unordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to False.

    Returns
    -------
    Categorical
        Unordered Categorical; whatever ``set_ordered`` returns for the
        given ``inplace``.
    """
    checked = validate_bool_kwarg(inplace, "inplace")
    return self.set_ordered(False, inplace=checked)
def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If `rename==True`, the categories will simply be renamed
    (less or more items than in old categories will result in values set to
    NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider a S1 string equal to a single char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a copy
        of this categorical with reordered categories.

    Returns
    -------
    Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    remove_unused_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    effective_ordered = self.dtype.ordered if ordered is None else ordered
    new_dtype = CategoricalDtype(new_categories, ordered=effective_ordered)

    target = self if inplace else self.copy()
    if not rename:
        # Reordering/adding/removing: map every code onto the new categories.
        target._codes = _recode_for_categories(
            target.codes, target.categories, new_dtype.categories
        )
    else:
        previous = target.dtype.categories
        if previous is not None and len(new_dtype.categories) < len(previous):
            # Fewer names than before: codes beyond the new range no longer
            # point at a category, so they become -1/NaN.
            limit = len(new_dtype.categories)
            target._codes[target._codes >= limit] = -1
    target._dtype = new_dtype

    if not inplace:
        return target
def rename_categories(self, new_categories, inplace=False):
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

          .. versionadded:: 0.21.0.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

          .. versionadded:: 0.23.0.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy of
        this categorical with renamed categories.

    Returns
    -------
    cat : Categorical or None
        With ``inplace=False``, the new categorical is returned.
        With ``inplace=True``, there is no return value.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    [A, A, b]
    Categories (2, object): [A, b]

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    [A, A, B]
    Categories (2, object): [A, B]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()

    if is_dict_like(new_categories):
        # Unmapped categories keep their current name.
        renamed = [new_categories.get(old, old) for old in target.categories]
    elif callable(new_categories):
        renamed = [new_categories(old) for old in target.categories]
    else:
        renamed = new_categories

    # Assignment goes through the ``categories`` setter of the target.
    target.categories = renamed
    if not inplace:
        return target
def reorder_categories(self, new_categories, ordered=None, inplace=False):
    """
    Reorder categories as specified in new_categories.

    `new_categories` need to include all old categories and no new category
    items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    inplace : bool, default False
        Whether or not to reorder the categories inplace or return a copy of
        this categorical with reordered categories.

    Returns
    -------
    cat : Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    existing = set(self.dtype.categories)
    requested = set(new_categories)
    # A reorder may neither drop nor introduce categories.
    if existing != requested:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
def add_categories(self, new_categories, inplace=False):
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.
    inplace : bool, default False
        Whether or not to add the categories inplace or return a copy of
        this categorical with added categories.

    Returns
    -------
    cat : Categorical with new categories added or None if inplace.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories
    reorder_categories
    remove_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    existing = self.dtype.categories
    clashes = set(new_categories) & set(existing)
    if clashes:
        raise ValueError(
            f"new categories must not include old categories: {clashes}"
        )

    # The additions go after the existing categories, so current codes stay
    # valid as they are.
    combined = list(existing) + list(new_categories)
    new_dtype = CategoricalDtype(combined, self.ordered)

    target = self if inplace else self.copy()
    target._dtype = new_dtype
    # Let coerce_indexer_dtype re-pick the codes dtype for the enlarged
    # category list.
    target._codes = coerce_indexer_dtype(target._codes, new_dtype.categories)
    if not inplace:
        return target
def remove_categories(self, removals, inplace=False):
    """
    Drop the given categories.

    Every entry of `removals` has to be an existing category. Values that
    belonged to a removed category become NaN.

    Parameters
    ----------
    removals : category or list of categories
        The categories to drop.
    inplace : bool, default False
        Modify this categorical directly instead of returning a modified
        copy.

    Returns
    -------
    Categorical or None
        Copy without the removed categories, or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If some removals are not among the current categories.

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(removals):
        removals = [removals]

    to_drop = set(removals)
    current = self.dtype.categories
    not_included = to_drop.difference(current)
    kept = [category for category in current if category not in to_drop]

    # GH 10156: a missing value among the removals is not reported as an
    # unknown category, and missing values are filtered from the result.
    if any(isna(removals)):
        not_included = {item for item in not_included if notna(item)}
        kept = [category for category in kept if notna(category)]

    if not_included:
        raise ValueError(f"removals must all be in old categories: {not_included}")

    return self.set_categories(
        kept, ordered=self.ordered, rename=False, inplace=inplace
    )
def remove_unused_categories(self, inplace=False):
    """
    Drop every category that no value refers to.

    Parameters
    ----------
    inplace : bool, default False
        Modify this categorical directly instead of returning a modified
        copy.

    Returns
    -------
    Categorical or None
        Copy restricted to the used categories, or None if
        ``inplace=True``.

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()
    used_codes, remapped = np.unique(target._codes, return_inverse=True)

    # np.unique returns sorted values, so the -1 NA sentinel (if present)
    # comes first; drop it and shift the inverse so NA maps back to -1.
    has_na = used_codes.size != 0 and used_codes[0] == -1
    if has_na:
        used_codes = used_codes[1:]
        remapped = remapped - 1

    kept_categories = target.dtype.categories.take(used_codes)
    trimmed_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    target._dtype = trimmed_dtype
    target._codes = coerce_indexer_dtype(remapped, trimmed_dtype.categories)

    return None if inplace else target
def map(self, mapper):
    """
    Apply a correspondence (dict, Series, or function) to the categories.

    The categories are mapped to new categories. When the mapping is
    one-to-one the result is a :class:`~pandas.Categorical` with the same
    order property as the original; otherwise a :class:`~pandas.Index` is
    returned. NaN values are unaffected.

    With a `dict` or :class:`~pandas.Series`, any category missing from
    the mapping becomes `NaN`, in which case an :class:`~pandas.Index` is
    returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat.map(lambda x: x.upper())
    [A, B, C]
    Categories (3, object): [A, B, C]
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    [first, second, third]
    Categories (3, object): [first, second, third]

    A one-to-one mapping preserves the ordering of the categories:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    A mapping that is not one-to-one yields an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    Unmapped categories become `NaN` and the result is an
    :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    try:
        return self.from_codes(
            self._codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped values could not serve as categories, so fall back to
        # materialising them.
        # NA values are stored as -1 in self._codes, and np.take would make
        # -1 pick the final element, so append NaN as that final element.
        has_missing = np.any(self._codes == -1)
        if has_missing:
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, self._codes)
# Rich comparison dunders: each one is whatever `_cat_compare_op` (defined
# outside this excerpt) returns for the matching `operator` function.
1211 __eq__ = _cat_compare_op(operator.eq)
1212 __ne__ = _cat_compare_op(operator.ne)
1213 __lt__ = _cat_compare_op(operator.lt)
1214 __gt__ = _cat_compare_op(operator.gt)
1215 __le__ = _cat_compare_op(operator.le)
1216 __ge__ = _cat_compare_op(operator.ge)
1218 # for Series/ndarray like compat
@property
def shape(self):
    """
    Shape of the Categorical, as a one-element tuple.

    Provided for internal compatibility with numpy arrays.

    Returns
    -------
    shape : tuple
        ``(number of codes,)``.
    """
    return (len(self._codes),)
def shift(self, periods, fill_value=None):
    """
    Shift the values by `periods` positions.

    Parameters
    ----------
    periods : int
        Number of positions to move; may be positive or negative.
    fill_value : object, optional
        Scalar used for the positions vacated by the shift. Must be
        missing (NA) or one of the categories.

        .. versionadded:: 0.24.0

    Returns
    -------
    shifted : Categorical

    Raises
    ------
    NotImplementedError
        If the codes have more than one dimension.
    ValueError
        If `fill_value` is neither missing nor a category.
    """
    # A Categorical is 1-dimensional, so there is no axis parameter.
    codes = self.codes
    if codes.ndim > 1:
        raise NotImplementedError("Categorical with ndim > 1.")

    nonempty = np.prod(codes.shape)
    if nonempty and periods != 0:
        codes = np.roll(codes, ensure_platform_int(periods), axis=0)

        if isna(fill_value):
            fill_code = -1
        elif fill_value in self.categories:
            fill_code = self.categories.get_loc(fill_value)
        else:
            raise ValueError(
                f"'fill_value={fill_value}' is not present "
                "in this Categorical's categories"
            )

        # np.roll wraps values around; overwrite the wrapped positions.
        vacated = slice(None, periods) if periods > 0 else slice(periods, None)
        codes[vacated] = fill_code

    return self.from_codes(codes, dtype=self.dtype)
def __array__(self, dtype=None) -> np.ndarray:
    """
    The numpy array interface.

    Parameters
    ----------
    dtype : dtype-like, optional
        Requested dtype of the result. ``None`` (default) keeps the dtype
        of ``categorical.categories``.

    Returns
    -------
    numpy.array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.categories.dtype.
    """
    ret = take_1d(self.categories.values, self._codes)
    # Compare against None explicitly: an explicitly requested dtype must
    # not be skipped just because the dtype object evaluates as falsy.
    if dtype is not None and not is_dtype_equal(dtype, self.categories.dtype):
        return np.asarray(ret, dtype)
    if is_extension_array_dtype(ret):
        # When we're a Categorical[ExtensionArray], like Interval,
        # we need to ensure __array__ gets all the way to an
        # ndarray.
        ret = np.asarray(ret)
    return ret
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """
    Handle numpy ufuncs applied to this Categorical.

    The call is first offered to ``ops.maybe_dispatch_ufunc_to_dunder_op``
    (binary ops go through our own dunder methods). If that returns
    ``NotImplemented``, the ufunc is rejected.

    Raises
    ------
    TypeError
        If the ufunc is not dispatched to a dunder method.
    """
    dispatched = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if dispatched is NotImplemented:
        # for all other cases, raise for now (similarly as what happens in
        # Series.__array_prepare__)
        raise TypeError(
            f"Object with dtype {self.dtype} cannot perform "
            f"the numpy op {ufunc.__name__}"
        )
    return dispatched
def __setstate__(self, state):
    """
    Restore the instance from a pickled state dict.

    Every key of `state` is set as an attribute. States without a
    ``"_dtype"`` entry get one built from their ``"_categories"`` and
    ``"_ordered"`` entries.

    Raises
    ------
    Exception
        If `state` is not a dict.
    """
    if not isinstance(state, dict):
        raise Exception("invalid pickle state")

    # compat with pre 0.21.0 CategoricalDtype change
    if "_dtype" not in state:
        state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])

    for attr_name in state:
        setattr(self, attr_name, state[attr_name])
@property
def T(self):
    """
    Return the transpose, which for this object is the object itself.
    """
    transposed = self
    return transposed
@property
def nbytes(self):
    """
    Bytes used by the codes plus bytes used by the categories' values.
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes
def memory_usage(self, deep=False):
    """
    Memory usage of the values.

    Parameters
    ----------
    deep : bool
        Forwarded to the categories' ``memory_usage``: introspect the data
        deeply, interrogating `object` dtypes for system-level memory
        consumption.

    Returns
    -------
    bytes used
        Bytes of the codes plus the memory usage of the categories.

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes
@Substitution(klass="Categorical")
@Appender(_shared_docs["searchsorted"])
def searchsorted(self, value, side="left", sorter=None):
    # searchsorted is very performance sensitive. Translating the values
    # to codes of the same dtype as self.codes gives much faster
    # performance.
    locate = self.categories.get_loc
    if is_scalar(value):
        position = locate(value)
        codes = self.codes.dtype.type(position)
    else:
        positions = [locate(item) for item in value]
        codes = np.array(positions, dtype=self.codes.dtype)
    return self.codes.searchsorted(codes, side=side, sorter=sorter)
def isna(self):
    """
    Detect missing values.

    A value is missing when its code is -1.

    Returns
    -------
    a boolean array of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.
    """
    return self._codes == -1


isnull = isna
def notna(self):
    """
    Inverse of isna.

    Returns the element-wise negation of :meth:`isna`.

    Returns
    -------
    a boolean array of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.
    """
    missing = self.isna()
    return ~missing


notnull = notna
def put(self, *args, **kwargs):
    """
    Replace specific elements in the Categorical with given values.

    Raises
    ------
    NotImplementedError
        Always; the operation is not implemented.
    """
    message = "'put' is not yet implemented for Categorical"
    raise NotImplementedError(message)
def dropna(self):
    """
    Return the Categorical without null values.

    The result is ``self`` indexed with the mask from :meth:`notna`.

    Returns
    -------
    valid : Categorical
    """
    keep = self.notna()
    return self[keep]
def value_counts(self, dropna=True):
    """
    Return a Series containing counts of each category.

    Every category gets an entry, including those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import Series, CategoricalIndex

    codes, categories = self._codes, self.categories
    n_categories = len(categories)
    valid = codes >= 0
    no_missing = valid.all()
    index_codes = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[valid]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Count missing values in one extra bin past the last category and
        # label that bin with the -1 code.
        counts = np.bincount(np.where(valid, codes, n_categories))
        index_codes = np.append(index_codes, -1)

    index_values = self._constructor(index_codes, dtype=self.dtype, fastpath=True)
    return Series(counts, index=CategoricalIndex(index_values), dtype="int64")
def _internal_get_values(self):
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or Index
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods.
    """
    categories = self.categories
    # if we are a datetime and period index, return Index to keep metadata
    if needs_i8_conversion(categories):
        return categories.take(self._codes, fill_value=np.nan)
    # Integer categories with a missing value go through object dtype so
    # the -1 code can be filled with NaN.
    if is_integer_dtype(categories) and -1 in self._codes:
        return categories.astype("object").take(self._codes, fill_value=np.nan)
    return np.array(self)
def check_for_ordered(self, op):
    """
    Assert that this Categorical is ordered.

    Parameters
    ----------
    op : object
        Name of the requesting operation; only used in the error message.

    Raises
    ------
    TypeError
        If ``self.ordered`` is falsy.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )
def _values_for_argsort(self):
    """Return a copy of the codes, the values used for arg-sorting."""
    codes = self._codes
    return codes.copy()
def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
    """
    Return the indices that would sort the Categorical.

    .. versionchanged:: 0.25.0

       Changed to sort missing values at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort'}, optional
        Sorting algorithm.
    *args, **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    numpy.array

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    Although the category values carry an ordering, arg-sorting here is
    mostly about grouping equal category values together. It can
    therefore be called on an unordered Categorical, unlike
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # Pure delegation to the parent implementation.
    return super().argsort(*args, ascending=ascending, kind=kind, **kwargs)
def sort_values(self, inplace=False, ascending=True, na_position="last"):
    """
    Sort the Categorical by category value, returning a new Categorical
    by default.

    Although the category values carry an ordering, sorting here is mostly
    about grouping equal category values together. It can therefore be
    called on an unordered Categorical, unlike 'Categorical.min' and
    'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted copy, or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If `na_position` is neither ``'first'`` nor ``'last'``.

    See Also
    --------
    Categorical.sort
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c.sort_values()
    [2.0, 2.0, 5.0, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5.0, 2.0, 2.0]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {repr(na_position)}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered = self._codes[order]

    if inplace:
        self._codes = reordered
        return None
    return self._constructor(values=reordered, dtype=self.dtype, fastpath=True)
def _values_for_rank(self):
    """
    Values to use for ranking categorical data. See GH#15420

    Ordered data is ranked on its codes, with -1 translated to NaN.
    Unordered data with numeric categories is ranked on its values, and
    other unordered data on the ranks of its categories.

    Returns
    -------
    numpy.array
    """
    from pandas import Series

    if self.ordered:
        values = self.codes
        missing = values == -1
        if missing.any():
            # A float array is needed to hold NaN.
            values = values.astype("float64")
            values[missing] = np.nan
        return values

    if self.categories.is_numeric():
        return np.array(self)

    # reorder the categories (so rank can use the float codes)
    # instead of passing an object array to rank
    ranked_categories = Series(self.categories).rank().values
    return np.array(self.rename_categories(ranked_categories))
def view(self, dtype=None):
    """
    Return a new object built from the same codes and dtype.

    Raises
    ------
    NotImplementedError
        If `dtype` is given.
    """
    if dtype is None:
        return self._constructor(
            values=self._codes, dtype=self.dtype, fastpath=True
        )
    raise NotImplementedError(dtype)
def to_dense(self):
    """
    Return the 'dense' representation, i.e. ``np.asarray(self)``.

    For internal compatibility with numpy arrays.

    Returns
    -------
    dense : array
    """
    dense = np.asarray(self)
    return dense
def fillna(self, value=None, method=None, limit=None):
    """
    Fill NA/NaN values using the specified method.

    The original Categorical is never modified; a new one is returned.

    Parameters
    ----------
    value : scalar, dict, Series
        If a scalar value is passed it is used to fill all missing values.
        Alternatively, a Series or dict can be used to fill in different
        values for each index. The value should not be a list. The
        value(s) passed should either be in the categories or should be
        NaN.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        (Not implemented yet for Categorical!)
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled.

    Returns
    -------
    filled : Categorical with NA/NaN filled

    Raises
    ------
    NotImplementedError
        If `limit` is given.
    ValueError
        If a fill value is neither NaN nor one of the categories.
    TypeError
        If `value` is not a scalar, dict or Series.
    """
    value, method = validate_fillna_kwargs(
        value, method, validate_scalar_dict_value=False
    )

    if value is None:
        value = np.nan
    if limit is not None:
        raise NotImplementedError(
            "specifying a limit for fillna has not been implemented yet"
        )

    codes = self._codes

    # pad / bfill
    if method is not None:

        values = self.to_dense().reshape(-1, len(self))
        values = interpolate_2d(values, method, 0, None, value).astype(
            self.categories.dtype
        )[0]
        codes = _get_codes_for_values(values, self.categories)

    else:

        # If value is a dict or a Series (a dict value has already
        # been converted to a Series)
        if isinstance(value, ABCSeries):
            if not value[~value.isin(self.categories)].isna().all():
                raise ValueError("fill value must be in categories")

            values_codes = _get_codes_for_values(value, self.categories)
            indexer = np.where(codes == -1)
            # `codes` still aliases self._codes here; copy before writing
            # so that filling does not modify this Categorical in place.
            codes = codes.copy()
            codes[indexer] = values_codes[indexer]

        # If value is not a dict or Series it should be a scalar
        elif is_hashable(value):
            if not isna(value) and value not in self.categories:
                raise ValueError("fill value must be in categories")

            mask = codes == -1
            if mask.any():
                codes = codes.copy()
                if isna(value):
                    codes[mask] = -1
                else:
                    codes[mask] = self.categories.get_loc(value)

        else:
            raise TypeError(
                f"'value' parameter must be a scalar, dict "
                f"or Series, but you passed a {type(value).__name__}"
            )

    return self._constructor(codes, dtype=self.dtype, fastpath=True)
def take(self, indexer, allow_fill: bool = False, fill_value=None):
    """
    Take elements from the Categorical.

    Parameters
    ----------
    indexer : sequence of int
        The indices in `self` to take. The meaning of negative values in
        `indexer` depends on the value of `allow_fill`.
    allow_fill : bool, default False
        How to handle negative values in `indexer`.

        * False (the default): negative values in `indexer` indicate
          positional indices from the right. This is similar to
          :func:`numpy.take`.

        * True: negative values in `indexer` indicate missing values.
          These values are set to `fill_value`. Any other negative
          values raise a ``ValueError``.

        .. versionchanged:: 1.0.0

           Default value changed from ``True`` to ``False``.

    fill_value : object
        The value to use for entries of `indexer` that are missing (-1),
        when ``allow_fill=True``. This should be the category, i.e. a
        value in ``self.categories``, not a code.

    Returns
    -------
    Categorical
        This Categorical will have the same categories and ordered as
        `self`.

    Raises
    ------
    TypeError
        If ``allow_fill=True`` and `fill_value` is neither missing nor
        one of the categories.

    See Also
    --------
    Series.take : Similar method for Series.
    numpy.ndarray.take : Similar method for NumPy arrays.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'a', 'b'])
    >>> cat
    [a, a, b]
    Categories (2, object): [a, b]

    Specify ``allow_fill==False`` to have negative indices mean indexing
    from the right.

    >>> cat.take([0, -1, -2], allow_fill=False)
    [a, b, a]
    Categories (2, object): [a, b]

    With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
    values that should be filled with the `fill_value`, which is
    ``np.nan`` by default.

    >>> cat.take([0, -1, -1], allow_fill=True)
    [a, NaN, NaN]
    Categories (2, object): [a, b]

    The fill value can be specified.

    >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
    [a, a, a]
    Categories (2, object): [a, b]

    Specifying a fill value that's not in ``self.categories``
    will raise a ``TypeError``.
    """
    indexer = np.asarray(indexer, dtype=np.intp)
    dtype = self.dtype

    if isna(fill_value):
        fill_code = -1
    elif not allow_fill:
        # Without filling, the value is passed through untouched.
        fill_code = fill_value
    elif fill_value in self.categories:
        # convert user-provided `fill_value` to codes
        fill_code = self.categories.get_loc(fill_value)
    else:
        msg = (
            f"'fill_value' ('{fill_value}') is not in this "
            "Categorical's categories."
        )
        raise TypeError(msg)

    # `take` here is the name imported at module level, not this method.
    codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_code)
    return type(self).from_codes(codes, dtype=dtype)
def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
    """
    Deprecated alias of :meth:`take`.

    Emits a ``FutureWarning`` and forwards all arguments to :meth:`take`.
    """
    # GH#27745 deprecate alias that other EAs dont have
    message = "Categorical.take_nd is deprecated, use Categorical.take instead"
    warn(message, FutureWarning, stacklevel=2)
    return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value)
def __len__(self) -> int:
    """
    Number of values, i.e. the length of the codes.
    """
    codes = self._codes
    return len(codes)
def __iter__(self):
    """
    Return an iterator over the values of this Categorical.

    The values come from ``_internal_get_values()`` converted to a list.
    """
    values = self._internal_get_values()
    return iter(values.tolist())
def __contains__(self, key) -> bool:
    """
    Return True if `key` is in this Categorical.

    A missing scalar key is contained when any value is missing; every
    other key is delegated to the ``contains`` helper with the codes as
    container.
    """
    key_is_missing = is_scalar(key) and isna(key)
    if not key_is_missing:
        return contains(self, key, container=self._codes)
    # if key is a NaN, check if any NaN is in self.
    return self.isna().any()
def _tidy_repr(self, max_vals=10, footer=True) -> str:
    """
    Short repr showing at most `max_vals` values, joined by ``...``, with
    an optional footer (shown by default).
    """
    n_head = max_vals // 2
    n_tail = max_vals - n_head
    head = self[:n_head]._get_repr(length=False, footer=False)
    tail = self[-n_tail:]._get_repr(length=False, footer=False)

    # Drop the last character of the head and the first of the tail so the
    # two pieces join around the ellipsis.
    body = f"{head[:-1]}, ..., {tail[1:]}"
    if not footer:
        return str(body)
    return str(f"{body}\n{self._repr_footer()}")
def _repr_categories(self):
    """
    Return the formatted category strings used in the repr.

    When there are more categories than the ``display.max_categories``
    option allows (an option value of 0 is treated as 10), only the
    leading and trailing ones are kept, separated by ``"..."``.
    """
    configured = get_option("display.max_categories")
    max_categories = 10 if configured == 0 else configured
    from pandas.io.formats import format as fmt

    categories = self.categories
    if len(categories) > max_categories:
        half = max_categories // 2
        leading = fmt.format_array(categories[:half], None)
        trailing = fmt.format_array(categories[-half:], None)
        formatted = leading + ["..."] + trailing
    else:
        formatted = fmt.format_array(categories, None)

    # Strip all leading spaces, which format_array adds for columns...
    return [text.strip() for text in formatted]
def _repr_categories_info(self) -> str:
    """
    Return the ``Categories (n, dtype): [...]`` part of the repr footer.

    The category strings are joined with ``" < "`` when ordered and
    ``", "`` otherwise, and wrapped onto new lines when they would exceed
    the display width. Inside an IPython frontend no line breaks are made.
    """
    category_strs = self._repr_categories()
    dtype = str(self.categories.dtype)
    levheader = f"Categories ({len(self.categories)}, {dtype}): "
    terminal_width = get_terminal_size().columns
    max_width = get_option("display.width") or terminal_width
    if console.in_ipython_frontend():
        # 0 = no breaks
        max_width = 0

    sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
    linesep = sep.rstrip() + "\n"  # remove whitespace
    continuation = " " * (len(levheader) + 1)  # header + a whitespace

    pieces = []
    cur_col_len = len(levheader)  # header
    for position, text in enumerate(category_strs):
        overflows = (
            max_width != 0 and cur_col_len + sep_len + len(text) > max_width
        )
        if overflows:
            pieces.append(linesep + continuation)
            cur_col_len = len(continuation)
        elif position > 0:
            pieces.append(sep)
            cur_col_len += len(text)
        pieces.append(text)
    levstring = "".join(pieces)

    # Around the "..." placeholder, a plain space replaces the " < " separator.
    return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"
def _repr_footer(self) -> str:
    """Return the repr footer: a length line followed by the categories info."""
    categories_info = self._repr_categories_info()
    n_values = len(self)
    return f"Length: {n_values}\n{categories_info}"
def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str:
    """
    Render this object with ``CategoricalFormatter`` and return the text.

    `length`, `na_rep` and `footer` are forwarded to the formatter.
    """
    from pandas.io.formats import format as fmt

    rendered = fmt.CategoricalFormatter(
        self, length=length, na_rep=na_rep, footer=footer
    ).to_string()
    return str(rendered)
def __repr__(self) -> str:
    """
    String representation.

    More than 10 values use the abbreviated repr, 1 to 10 values the full
    repr, and an empty object a one-line form prefixed with ``[]``.
    """
    _maxlen = 10
    n_codes = len(self._codes)
    if n_codes > _maxlen:
        return self._tidy_repr(_maxlen)
    if n_codes > 0:
        return self._get_repr(length=len(self) > _maxlen)
    msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
    return f"[], {msg}"
def _maybe_coerce_indexer(self, indexer):
    """
    Return `indexer` cast to the codes dtype when it is a signed-integer
    ndarray; anything else is returned unchanged.
    """
    is_int_array = isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i"
    return indexer.astype(self._codes.dtype) if is_int_array else indexer
def __getitem__(self, key):
    """
    Return an item.

    An integer key returns a single category value, or NaN when the code
    at that position is -1. Any other key is validated and returns a new
    object built from the selected codes.
    """
    if isinstance(key, (int, np.integer)):
        code = self._codes[key]
        return np.nan if code == -1 else self.categories[code]

    key = check_array_indexer(self, key)

    selected = self._codes[key]
    if selected.ndim > 1:
        # Multi-dimensional result: hand back the raw codes after calling
        # the deprecation helper.
        deprecate_ndim_indexing(selected)
        return selected
    return self._constructor(selected, dtype=self.dtype, fastpath=True)
def __setitem__(self, key, value):
    """
    Item assignment.

    Raises
    ------
    ValueError
        If (one or more) value is not in the categories, or if an assigned
        `Categorical` does not have the same categories.
    AssertionError
        If `key` is a tuple that is not valid slicing for a 1-dimensional
        categorical.
    """
    value = extract_array(value, extract_numpy=True)

    # require identical categories set
    if isinstance(value, Categorical):
        if not is_dtype_equal(self, value):
            raise ValueError(
                "Cannot set a Categorical with another, "
                "without identical categories"
            )
        if not self.categories.equals(value.categories):
            # The dtypes compare equal but the categories objects differ,
            # so re-express the codes against our own categories.
            new_codes = _recode_for_categories(
                value.codes, value.categories, self.categories
            )
            value = Categorical.from_codes(new_codes, dtype=self.dtype)

    rvalue = value if is_list_like(value) else [value]

    from pandas import Index

    to_add = Index(rvalue).difference(self.categories)

    # no assignments of values not in categories, but it's always ok to set
    # something to np.nan
    if len(to_add) and not isna(to_add).all():
        raise ValueError(
            "Cannot setitem on a Categorical with a new "
            "category, set the categories first"
        )

    # Integer positions, slices and boolean arrays are used as they are.
    # Only a tuple of indexers (dataframe) needs unpacking: just
    # 1-dimensional slicing is allowed, but in a 2-d case the key can be
    # passed as (slice(None), ...).
    if isinstance(key, tuple):
        n_indexers = len(key)
        if n_indexers == 1:
            key = key[0]
        elif n_indexers == 2 and com.is_null_slice(key[0]):
            key = key[1]
        else:
            raise AssertionError("invalid slicing for a 1-ndim categorical")

    lindexer = self.categories.get_indexer(rvalue)
    lindexer = self._maybe_coerce_indexer(lindexer)

    key = check_array_indexer(self, key)
    self._codes[key] = lindexer
def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
    """
    Compute the inverse of a categorical, returning
    a dict of categories -> indexers.

    *This is an internal function*

    Returns
    -------
    dict of categories -> indexers

    Examples
    --------
    >>> c = pd.Categorical(list('aabca'))
    >>> c
    [a, a, b, c, a]
    Categories (3, object): [a, b, c]
    >>> c.categories
    Index(['a', 'b', 'c'], dtype='object')
    >>> c.codes
    array([0, 0, 1, 2, 0], dtype=int8)
    >>> c._reverse_indexer()
    {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

    """
    categories = self.categories
    sorter, counts = libalgos.groupsort_indexer(
        self.codes.astype("int64"), categories.size
    )
    # Consecutive cumulative counts are the start/stop offsets of each
    # category's slice of `sorter`.
    boundaries = counts.cumsum()
    return {
        category: sorter[start:stop]
        for category, start, stop in zip(categories, boundaries, boundaries[1:])
    }
2120 # reduction ops #
def _reduce(self, name, axis=0, **kwargs):
    """
    Run the reduction called `name` by calling the attribute of that name
    with `kwargs`. `axis` is accepted but not used.

    Raises
    ------
    TypeError
        If there is no such attribute (or it is None).
    """
    reducer = getattr(self, name, None)
    if reducer is not None:
        return reducer(**kwargs)
    raise TypeError(f"Categorical cannot perform the operation {name}")
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, skipna=True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        Ignore missing values. When False, any missing value makes the
        result NaN.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if not len(codes):
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        lowest = codes.min()
    elif skipna and valid.any():
        lowest = codes[valid].min()
    else:
        # Missing values are not skipped, or every value is missing.
        return np.nan
    return self.categories[lowest]
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, skipna=True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        If True, missing entries (code ``-1``) are ignored. If False and
        any entry is missing, NaN is returned.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        pointer = codes.max()
    elif skipna and valid.any():
        # Some entries are missing: reduce over the present ones only.
        pointer = codes[valid].max()
    else:
        # Either NA must propagate (skipna=False) or everything is NA.
        return np.nan
    return self.categories[pointer]
def mode(self, dropna=True):
    """
    Returns the mode(s) of the Categorical.

    Always returns `Categorical` even if only one value.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    candidates = self._codes
    if dropna:
        # Code -1 marks a missing entry; leave those out of the count.
        candidates = candidates[candidates != -1]
    modal_codes = sorted(htable.mode_int64(ensure_int64(candidates), dropna))
    return self._constructor(values=modal_codes, dtype=self.dtype, fastpath=True)
def unique(self):
    """
    Return the ``Categorical`` whose ``categories`` and ``codes`` are
    unique. Unused categories are NOT returned.

    - unordered category: values and categories are sorted by appearance
      order.
    - ordered category: values are sorted by appearance order, categories
      keeps existing order.

    Returns
    -------
    unique values : ``Categorical``

    Examples
    --------
    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.Categorical(list('baabc')).unique()
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.Categorical(list('baabc'), categories=list('abc')).unique()
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.Categorical(list('baabc'),
    ...                categories=list('abc'),
    ...                ordered=True).unique()
    [b, a, c]
    Categories (3, object): [a < b < c]

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique
    """
    # unlike np.unique, unique1d does not sort
    unique_codes = unique1d(self.codes)
    cat = self.copy()

    # keep nan (code -1) in the codes of the result
    cat._codes = unique_codes

    # exclude nan from the indexer used to pick the surviving categories
    take_codes = unique_codes[unique_codes != -1]
    if self.ordered:
        # an ordered categorical keeps the existing category order
        take_codes = np.sort(take_codes)
    return cat.set_categories(cat.categories.take(take_codes))
def _values_for_factorize(self):
    """
    Return the codes cast to int64 together with ``-1``, the code that
    marks a missing entry.
    """
    return self.codes.astype("int64"), -1
@classmethod
def _from_factorized(cls, uniques, original):
    """
    Rebuild an array from factorized ``uniques``.

    ``uniques`` are used as positions into ``original.categories``; the
    result is constructed with ``original``'s constructor and dtype.
    """
    picked = original.categories.take(uniques)
    return original._constructor(picked, dtype=original.dtype)
def equals(self, other):
    """
    Returns True if categorical arrays are equal.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    if not self.is_dtype_equal(other):
        return False

    if self.categories.equals(other.categories):
        # fastpath: identical category order, codes are directly comparable
        theirs = other._codes
    else:
        # express the other array's codes in terms of our categories
        theirs = _recode_for_categories(
            other.codes, other.categories, self.categories
        )
    return np.array_equal(self._codes, theirs)
def is_dtype_equal(self, other):
    """
    Returns True if categoricals are the same dtype
      same categories, and same ordered

    The check compares the hashes of the two ``dtype`` attributes.

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    bool
    """
    try:
        own_hash = hash(self.dtype)
        other_hash = hash(other.dtype)
    except (AttributeError, TypeError):
        # ``other`` has no dtype, or a dtype is unhashable
        return False
    return own_hash == other_hash
def describe(self):
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.
    """
    tally = self.value_counts(dropna=False)
    share = tally / float(tally.sum())

    from pandas.core.reshape.concat import concat

    summary = concat([tally, share], axis=1)
    summary.columns = ["counts", "freqs"]
    summary.index.name = "categories"
    return summary
@Substitution(klass="Categorical")
@Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
    # The docstring is supplied by the decorators above.
    # ``axis`` is only validated, never used for the computation.
    nv.validate_repeat((), {"axis": axis})
    repeated = self._codes.repeat(repeats)
    return self._constructor(values=repeated, dtype=self.dtype, fastpath=True)
2358 # Implement the ExtensionArray interface
@property
def _can_hold_na(self):
    """Flag for whether the array can hold missing values; always True."""
    return True
@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate several categorical arrays.

    The work is delegated to ``concat_categorical``; the class itself is
    not used.

    Parameters
    ----------
    to_concat : sequence of arrays to concatenate
    """
    # Imported locally, as in the original code.
    from pandas.core.dtypes.concat import concat_categorical

    return concat_categorical(to_concat)
def isin(self, values):
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    isin : numpy.ndarray (bool dtype)

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------

    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed"
            f" to isin(), you passed a [{values_type}]"
        )

    candidates = sanitize_array(values, None, None)
    is_missing = np.asarray(isna(candidates))
    positions = self.categories.get_indexer(candidates)
    # Keep the codes of values that are actual categories, plus the code
    # looked up for missing values so that NA entries can match as well.
    wanted_codes = positions[is_missing | (positions >= 0)]
    return algorithms.isin(self.codes, wanted_codes)
def replace(self, to_replace, value, inplace: bool = False):
    """
    Replaces all instances of one value with another

    Parameters
    ----------
    to_replace: object
        The value to be replaced; a list-like replaces each of its
        elements with `value`.

    value: object
        The value to replace it with

    inplace: bool
        Whether the operation is done in-place

    Returns
    -------
    None if inplace is True, otherwise the new Categorical after replacement

    Notes
    -----
    Values that are not among the categories are left untouched. If the
    replacement is NA, the replaced category is removed. If the
    replacement already is a category, the two are merged; otherwise the
    replaced category is renamed.

    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # build a dict of (to replace -> value) pairs
    if is_list_like(to_replace):
        # if to_replace is list-like and value is scalar
        replace_dict = {replace_value: value for replace_value in to_replace}
    else:
        # if both to_replace and value are scalar
        replace_dict = {to_replace: value}

    # other cases, like if both to_replace and value are list-like or if
    # to_replace is a dict, are handled separately in NDFrame
    for replace_value, new_value in replace_dict.items():
        if new_value == replace_value:
            continue
        if replace_value not in cat.categories:
            continue

        if isna(new_value):
            # replacing with NA: dropping the category does the job
            cat.remove_categories(replace_value, inplace=True)
            continue

        categories = cat.categories.tolist()
        index = categories.index(replace_value)
        if new_value in cat.categories:
            # merge: point the old codes at the existing category, then
            # drop the now unused one
            value_index = categories.index(new_value)
            cat._codes[cat._codes == index] = value_index
            cat.remove_categories(replace_value, inplace=True)
        else:
            # the new value is unknown so far: rename the category in place
            categories[index] = new_value
            cat.rename_categories(categories, inplace=True)

    if not inplace:
        return cat
2483# The Series.cat accessor
@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while
    all methods return new categorical data per default (but can be called
    with `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s.cat.categories
    >>> s.cat.categories = list('abc')
    >>> s.cat.rename_categories(list('cab'))
    >>> s.cat.reorder_categories(list('cab'))
    >>> s.cat.add_categories(['d','e'])
    >>> s.cat.remove_categories(['d'])
    >>> s.cat.remove_unused_categories()
    >>> s.cat.set_categories(list('abcde'))
    >>> s.cat.as_ordered()
    >>> s.cat.as_unordered()
    """

    _deprecations = PandasObject._deprecations | frozenset(
        ["categorical", "index", "name"]
    )

    def __init__(self, data):
        self._validate(data)
        # Keep the pieces needed to re-wrap delegated results later on.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read the delegated property ``name`` from the wrapped values."""
        return getattr(self._parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign the delegated property ``name`` on the wrapped values."""
        return setattr(self._parent, name, new_values)

    @property
    def codes(self):
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call the delegated method ``name`` on the wrapped values.

        A non-None result is wrapped in a Series carrying the original
        index and name; a None result yields None.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)
2569# utility routines
def _get_codes_for_values(values, categories):
    """
    utility routine to turn values into codes given the specified categories
    """
    fall_back_to_object = False

    if is_dtype_equal(values.dtype, categories.dtype):
        # To prevent erroneous dtype coercion in _get_data_algo, retrieve
        # the underlying numpy array. gh-22702
        values = getattr(values, "_ndarray_values", values)
        categories = getattr(categories, "_ndarray_values", categories)
    elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
        # Support inferring the correct extension dtype from an array of
        # scalar objects. e.g.
        # Categorical(array[Period, Period], categories=PeriodIndex(...))
        array_type = categories.dtype.construct_array_type()
        values = try_cast_to_ea(array_type, values)
        # If the cast did not produce the extension type (exception raised
        # in _from_sequence), compare as plain objects instead.
        fall_back_to_object = not isinstance(values, array_type)
    else:
        fall_back_to_object = True

    if fall_back_to_object:
        values = ensure_object(values)
        categories = ensure_object(categories)

    hash_klass, vals = _get_data_algo(values)
    _, cats = _get_data_algo(categories)
    # Hash the categories, then look every value up to obtain its code.
    table = hash_klass(len(cats))
    table.map_locations(cats)
    return coerce_indexer_dtype(table.lookup(vals), cats)
def _recode_for_categories(codes: np.ndarray, old_categories, new_categories):
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index

    Returns
    -------
    new_codes : np.ndarray
        Always a new array; codes whose old category is absent from
        `new_categories` become -1.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> _recode_for_categories(codes, old_cat, new_cat).tolist()
    [1, 0, 0, -1]
    """
    # Nothing to translate when there were no categories (all null anyway)
    # or when both category sets are equal: hand back a copy as is.
    if len(old_categories) == 0 or new_categories.equals(old_categories):
        return codes.copy()

    # Position of every old category within the new ones (-1 if absent).
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_1d(old_to_new, codes.copy(), fill_value=-1)
def _convert_to_list_like(list_like):
    """
    Coerce the input to something list-like.

    Objects with a ``dtype`` attribute and plain lists are returned
    unchanged, other sequences, tuples and iterators are materialised
    into a list, and anything else is wrapped in a one-element list.
    """
    if hasattr(list_like, "dtype") or isinstance(list_like, list):
        return list_like
    if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like):
        return list(list_like)
    # Scalars and any remaining object are wrapped the same way.
    return [list_like]
def factorize_from_iterable(values):
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    *This is an internal function*

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is built
        with ``Categorical.from_codes`` and keeps the categories and
        order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        cat = Categorical(values, ordered=False)
        return cat.codes, cat.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories
    # as values but its codes are by def [0, ..., len(n_categories) - 1]
    positions = np.arange(len(values.categories), dtype=values.codes.dtype)
    categories = Categorical.from_codes(positions, dtype=values.dtype)
    return values.codes, categories
def factorize_from_iterables(iterables):
    """
    A higher-level wrapper over `factorize_from_iterable`.

    *This is an internal function*

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes_list : list of ndarrays
    categories_list : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info. For non-empty input the
    two lists are produced lazily by a ``map`` object.
    """
    if not len(iterables):
        # For consistency, it should return a list of 2 lists.
        return [[], []]

    factorized = (factorize_from_iterable(it) for it in iterables)
    # Transpose the (codes, categories) pairs into two parallel lists.
    return map(list, zip(*factorized))