Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/groupby/generic.py : 14%

1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
8from collections import abc, defaultdict, namedtuple
9import copy
10from functools import partial
11from textwrap import dedent
12import typing
13from typing import (
14 TYPE_CHECKING,
15 Any,
16 Callable,
17 Dict,
18 FrozenSet,
19 Iterable,
20 List,
21 Mapping,
22 Sequence,
23 Tuple,
24 Type,
25 Union,
26 cast,
27)
28import warnings
30import numpy as np
32from pandas._libs import Timestamp, lib
33from pandas._typing import FrameOrSeries
34from pandas.util._decorators import Appender, Substitution
36from pandas.core.dtypes.cast import (
37 maybe_convert_objects,
38 maybe_downcast_numeric,
39 maybe_downcast_to_dtype,
40)
41from pandas.core.dtypes.common import (
42 ensure_int64,
43 ensure_platform_int,
44 is_bool,
45 is_dict_like,
46 is_integer_dtype,
47 is_interval_dtype,
48 is_list_like,
49 is_numeric_dtype,
50 is_object_dtype,
51 is_scalar,
52 needs_i8_conversion,
53)
54from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna
56import pandas.core.algorithms as algorithms
57from pandas.core.base import DataError, SpecificationError
58import pandas.core.common as com
59from pandas.core.construction import create_series_with_explicit_dtype
60from pandas.core.frame import DataFrame
61from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs
62from pandas.core.groupby import base
63from pandas.core.groupby.groupby import (
64 GroupBy,
65 _apply_docs,
66 _transform_template,
67 get_groupby,
68)
69from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same
70import pandas.core.indexes.base as ibase
71from pandas.core.internals import BlockManager, make_block
72from pandas.core.series import Series
74from pandas.plotting import boxplot_frame_groupby
76if TYPE_CHECKING:
77 from pandas.core.internals import Block
80NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
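# A minimal usage sketch for NamedAgg (illustrative only), assuming a frame
# with grouping column "A" and value column "B":
# >>> df.groupby("A").agg(b_min=NamedAgg(column="B", aggfunc="min"))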
81# TODO(typing) the return value on this callable should be any *scalar*.
82AggScalar = Union[str, Callable[..., Any]]
83# TODO: validate types on ScalarResult and move to _typing
84# Blocked from using by https://github.com/python/mypy/issues/1484
85# See note at _mangle_lambda_list
86ScalarResult = typing.TypeVar("ScalarResult")
89def generate_property(name: str, klass: Type[FrameOrSeries]):
90 """
91 Create a property for a GroupBy subclass to dispatch to DataFrame/Series.
93 Parameters
94 ----------
95 name : str
96 klass : {DataFrame, Series}
98 Returns
99 -------
100 property
101 """
103 def prop(self):
104 return self._make_wrapper(name)
106 parent_method = getattr(klass, name)
107 prop.__doc__ = parent_method.__doc__ or ""
108 prop.__name__ = name
109 return property(prop)
112def pin_whitelisted_properties(klass: Type[FrameOrSeries], whitelist: FrozenSet[str]):
113 """
114 Create GroupBy member defs for DataFrame/Series names in a whitelist.
116 Parameters
117 ----------
118 klass : DataFrame or Series class
119 class where members are defined.
120 whitelist : frozenset[str]
121 Set of names of klass methods to be constructed
123 Returns
124 -------
125 class decorator
127 Notes
128 -----
129 Since we don't want to override methods explicitly defined in the
130 base class, any such name is skipped.
131 """
133 def pinner(cls):
134 for name in whitelist:
135 if hasattr(cls, name):
136 # don't override anything that was explicitly defined
137 # in the base class
138 continue
140 prop = generate_property(name, klass)
141 setattr(cls, name, prop)
143 return cls
145 return pinner
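# A minimal sketch of what the pinned properties provide, assuming "skew" is in
# the whitelist: the generated property dispatches through _make_wrapper, so
# >>> df.groupby("A").skew()
# behaves like applying DataFrame.skew within each group.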
148@pin_whitelisted_properties(Series, base.series_apply_whitelist)
149class SeriesGroupBy(GroupBy):
150 _apply_whitelist = base.series_apply_whitelist
152 def _iterate_slices(self) -> Iterable[Series]:
153 yield self._selected_obj
155 @property
156 def _selection_name(self):
157 """
158 Since we are a Series, we by definition only have
159 a single name; it may be the result of a selection or
160 the name of our object.
161 """
162 if self._selection is None:
163 return self.obj.name
164 else:
165 return self._selection
167 _agg_see_also_doc = dedent(
168 """
169 See Also
170 --------
171 pandas.Series.groupby.apply
172 pandas.Series.groupby.transform
173 pandas.Series.aggregate
174 """
175 )
177 _agg_examples_doc = dedent(
178 """
179 Examples
180 --------
181 >>> s = pd.Series([1, 2, 3, 4])
183 >>> s
184 0 1
185 1 2
186 2 3
187 3 4
188 dtype: int64
190 >>> s.groupby([1, 1, 2, 2]).min()
191 1 1
192 2 3
193 dtype: int64
195 >>> s.groupby([1, 1, 2, 2]).agg('min')
196 1 1
197 2 3
198 dtype: int64
200 >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
201 min max
202 1 1 2
203 2 3 4
205 The output column names can be controlled by passing
206 the desired column names and aggregations as keyword arguments.
208 >>> s.groupby([1, 1, 2, 2]).agg(
209 ... minimum='min',
210 ... maximum='max',
211 ... )
212 minimum maximum
213 1 1 2
214 2 3 4
215 """
216 )
218 @Appender(
219 _apply_docs["template"].format(
220 input="series", examples=_apply_docs["series_examples"]
221 )
222 )
223 def apply(self, func, *args, **kwargs):
224 return super().apply(func, *args, **kwargs)
226 @Substitution(
227 see_also=_agg_see_also_doc,
228 examples=_agg_examples_doc,
229 versionadded="",
230 klass="Series",
231 axis="",
232 )
233 @Appender(_shared_docs["aggregate"])
234 def aggregate(self, func=None, *args, **kwargs):
236 relabeling = func is None
237 columns = None
238 no_arg_message = "Must provide 'func' or named aggregation **kwargs."
239 if relabeling:
240 columns = list(kwargs)
241 func = [kwargs[col] for col in columns]
242 kwargs = {}
243 if not columns:
244 raise TypeError(no_arg_message)
246 if isinstance(func, str):
247 return getattr(self, func)(*args, **kwargs)
249 elif isinstance(func, abc.Iterable):
250 # Catch instances of lists / tuples
251 # but not the class list / tuple itself.
252 func = _maybe_mangle_lambdas(func)
253 ret = self._aggregate_multiple_funcs(func)
254 if relabeling:
255 ret.columns = columns
256 else:
257 cyfunc = self._get_cython_func(func)
258 if cyfunc and not args and not kwargs:
259 return getattr(self, cyfunc)()
261 if self.grouper.nkeys > 1:
262 return self._python_agg_general(func, *args, **kwargs)
264 try:
265 return self._python_agg_general(func, *args, **kwargs)
266 except (ValueError, KeyError):
267 # TODO: KeyError is raised in _python_agg_general,
268 # see test_groupby.test_basic
269 result = self._aggregate_named(func, *args, **kwargs)
271 index = Index(sorted(result), name=self.grouper.names[0])
272 ret = create_series_with_explicit_dtype(
273 result, index=index, dtype_if_empty=object
274 )
276 if not self.as_index: # pragma: no cover
277 print("Warning, ignoring as_index=True")
279 if isinstance(ret, dict):
280 from pandas import concat
282 ret = concat(ret, axis=1)
283 return ret
285 agg = aggregate
287 def _aggregate_multiple_funcs(self, arg):
288 if isinstance(arg, dict):
290 # show the deprecation, but only if we
291 # have not shown a higher level one
292 # GH 15931
293 if isinstance(self._selected_obj, Series):
294 raise SpecificationError("nested renamer is not supported")
296 columns = list(arg.keys())
297 arg = arg.items()
298 elif any(isinstance(x, (tuple, list)) for x in arg):
299 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
301 # indicated column order
302 columns = next(zip(*arg))
303 else:
304 # list of functions / function names
305 columns = []
306 for f in arg:
307 columns.append(com.get_callable_name(f) or f)
309 arg = zip(columns, arg)
311 results = {}
312 for name, func in arg:
313 obj = self
315 # reset the cache so that we
316 # only include the named selection
317 if name in self._selected_obj:
318 obj = copy.copy(obj)
319 obj._reset_cache()
320 obj._selection = name
321 results[name] = obj.aggregate(func)
323 if any(isinstance(x, DataFrame) for x in results.values()):
324 # let higher level handle
325 return results
327 return DataFrame(results, columns=columns)
329 def _wrap_series_output(
330 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index
331 ) -> Union[Series, DataFrame]:
332 """
333 Wraps the output of a SeriesGroupBy operation into the expected result.
335 Parameters
336 ----------
337 output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
338 Data to wrap.
339 index : pd.Index
340 Index to apply to the output.
342 Returns
343 -------
344 Series or DataFrame
346 Notes
347 -----
348 In the vast majority of cases output and columns will only contain one
349 element. The exception is operations that expand dimensions, like ohlc.
350 """
351 indexed_output = {key.position: val for key, val in output.items()}
352 columns = Index(key.label for key in output)
354 result: Union[Series, DataFrame]
355 if len(output) > 1:
356 result = DataFrame(indexed_output, index=index)
357 result.columns = columns
358 else:
359 result = Series(indexed_output[0], index=index, name=columns[0])
361 return result
363 def _wrap_aggregated_output(
364 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
365 ) -> Union[Series, DataFrame]:
366 """
367 Wraps the output of a SeriesGroupBy aggregation into the expected result.
369 Parameters
370 ----------
371 output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
372 Data to wrap.
374 Returns
375 -------
376 Series or DataFrame
378 Notes
379 -----
380 In the vast majority of cases output will only contain one element.
381 The exception is operations that expand dimensions, like ohlc.
382 """
383 result = self._wrap_series_output(
384 output=output, index=self.grouper.result_index
385 )
386 return self._reindex_output(result)._convert(datetime=True)
388 def _wrap_transformed_output(
389 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
390 ) -> Series:
391 """
392 Wraps the output of a SeriesGroupBy transformation into the expected result.
394 Parameters
395 ----------
396 output : dict[base.OutputKey, Union[Series, np.ndarray]]
397 Dict with a sole key of 0 and a value of the result values.
399 Returns
400 -------
401 Series
403 Notes
404 -----
405 output should always contain one element. It is specified as a dict
406 for consistency with DataFrame methods and _wrap_aggregated_output.
407 """
408 assert len(output) == 1
409 result = self._wrap_series_output(output=output, index=self.obj.index)
411 # No transformations increase the ndim of the result
412 assert isinstance(result, Series)
413 return result
415 def _wrap_applied_output(self, keys, values, not_indexed_same=False):
416 if len(keys) == 0:
417 # GH #6265
418 return Series([], name=self._selection_name, index=keys, dtype=np.float64)
420 def _get_index() -> Index:
421 if self.grouper.nkeys > 1:
422 index = MultiIndex.from_tuples(keys, names=self.grouper.names)
423 else:
424 index = Index(keys, name=self.grouper.names[0])
425 return index
427 if isinstance(values[0], dict):
428 # GH #823 #24880
429 index = _get_index()
430 result = self._reindex_output(DataFrame(values, index=index))
431 # if self.observed is False,
432 # keep all-NaN rows created while re-indexing
433 result = result.stack(dropna=self.observed)
434 result.name = self._selection_name
435 return result
437 if isinstance(values[0], Series):
438 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
439 elif isinstance(values[0], DataFrame):
440 # possible that Series -> DataFrame by applied function
441 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
442 else:
443 # GH #6265 #24880
444 result = Series(data=values, index=_get_index(), name=self._selection_name)
445 return self._reindex_output(result)
447 def _aggregate_named(self, func, *args, **kwargs):
448 result = {}
450 for name, group in self:
451 group.name = name
452 output = func(group, *args, **kwargs)
453 if isinstance(output, (Series, Index, np.ndarray)):
454 raise ValueError("Must produce aggregated value")
455 result[name] = output
457 return result
459 @Substitution(klass="Series", selected="A.")
460 @Appender(_transform_template)
461 def transform(self, func, *args, **kwargs):
462 func = self._get_cython_func(func) or func
464 if not isinstance(func, str):
465 return self._transform_general(func, *args, **kwargs)
467 elif func not in base.transform_kernel_whitelist:
468 msg = f"'{func}' is not a valid function name for transform(name)"
469 raise ValueError(msg)
470 elif func in base.cythonized_kernels:
471 # cythonized transform or canned "agg+broadcast"
472 return getattr(self, func)(*args, **kwargs)
474 # If func is a reduction, we need to broadcast the
475 # result to the whole group. Compute func result
476 # and deal with possible broadcasting below.
477 result = getattr(self, func)(*args, **kwargs)
478 return self._transform_fast(result, func)
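# An illustrative sketch of the reduction-plus-broadcast path described in the
# comments above:
# >>> s = pd.Series([1, 2, 3, 4])
# >>> s.groupby([1, 1, 2, 2]).transform('min')
# 0    1
# 1    1
# 2    3
# 3    3
# dtype: int64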
480 def _transform_general(self, func, *args, **kwargs):
481 """
482 Transform with a non-str `func`.
483 """
484 klass = type(self._selected_obj)
486 results = []
487 for name, group in self:
488 object.__setattr__(group, "name", name)
489 res = func(group, *args, **kwargs)
491 if isinstance(res, (ABCDataFrame, ABCSeries)):
492 res = res._values
494 indexer = self._get_index(name)
495 ser = klass(res, indexer)
496 results.append(ser)
498 # check for empty "results" to avoid concat ValueError
499 if results:
500 from pandas.core.reshape.concat import concat
502 result = concat(results).sort_index()
503 else:
504 result = Series(dtype=np.float64)
506 # we will only try to coerce the result type if
507 # we have a numeric dtype, as these are *always* user-defined funcs
508 # the cython implementations take a different path (and casting)
509 dtype = self._selected_obj.dtype
510 if is_numeric_dtype(dtype):
511 result = maybe_downcast_to_dtype(result, dtype)
513 result.name = self._selected_obj.name
514 result.index = self._selected_obj.index
515 return result
517 def _transform_fast(self, result, func_nm: str) -> Series:
518 """
519 fast version of transform, only applicable to
520 builtin/cythonizable functions
521 """
522 ids, _, ngroup = self.grouper.group_info
523 cast = self._transform_should_cast(func_nm)
524 out = algorithms.take_1d(result._values, ids)
525 if cast:
526 out = self._try_cast(out, self.obj)
527 return Series(out, index=self.obj.index, name=self.obj.name)
529 def filter(self, func, dropna=True, *args, **kwargs):
530 """
531 Return a copy of a Series excluding elements from groups that
532 do not satisfy the boolean criterion specified by func.
534 Parameters
535 ----------
536 func : function
537 To apply to each group. Should return True or False.
538 dropna : bool, default True
539 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs.
541 Examples
542 --------
543 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
544 ... 'foo', 'bar'],
545 ... 'B' : [1, 2, 3, 4, 5, 6],
546 ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
547 >>> grouped = df.groupby('A')
548 >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
549 1 2
550 3 4
551 5 6
552 Name: B, dtype: int64
554 Returns
555 -------
556 filtered : Series
557 """
558 if isinstance(func, str):
559 wrapper = lambda x: getattr(x, func)(*args, **kwargs)
560 else:
561 wrapper = lambda x: func(x, *args, **kwargs)
563 # Interpret np.nan as False.
564 def true_and_notna(x, *args, **kwargs) -> bool:
565 b = wrapper(x, *args, **kwargs)
566 return b and notna(b)
568 try:
569 indices = [
570 self._get_index(name) for name, group in self if true_and_notna(group)
571 ]
572 except (ValueError, TypeError):
573 raise TypeError("the filter must return a boolean result")
575 filtered = self._apply_filter(indices, dropna)
576 return filtered
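# A minimal sketch of the dropna=False path, reusing the frame from the
# docstring example above; non-passing groups are kept but masked with NaN:
# >>> df.groupby('A').B.filter(lambda x: x.mean() > 3., dropna=False)
# 0    NaN
# 1    2.0
# 2    NaN
# 3    4.0
# 4    NaN
# 5    6.0
# Name: B, dtype: float64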
578 def nunique(self, dropna: bool = True) -> Series:
579 """
580 Return number of unique elements in the group.
582 Returns
583 -------
584 Series
585 Number of unique values within each group.
586 """
587 ids, _, _ = self.grouper.group_info
589 val = self.obj._internal_get_values()
591 codes, _ = algorithms.factorize(val, sort=False)
592 sorter = np.lexsort((codes, ids))
593 codes = codes[sorter]
594 ids = ids[sorter]
596 # group boundaries are where group ids change
597 # unique observations are where sorted values change
598 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
599 inc = np.r_[1, codes[1:] != codes[:-1]]
601 # 1st item of each group is a new unique observation
602 mask = codes == -1
603 if dropna:
604 inc[idx] = 1
605 inc[mask] = 0
606 else:
607 inc[mask & np.r_[False, mask[:-1]]] = 0
608 inc[idx] = 1
610 out = np.add.reduceat(inc, idx).astype("int64", copy=False)
611 if len(ids):
612 # NaN/NaT group exists if the head of ids is -1,
613 # so remove it from res and exclude its index from idx
614 if ids[0] == -1:
615 res = out[1:]
616 idx = idx[np.flatnonzero(idx)]
617 else:
618 res = out
619 else:
620 res = out[1:]
621 ri = self.grouper.result_index
623 # we might have duplications among the bins
624 if len(res) != len(ri):
625 res, out = np.zeros(len(ri), dtype=out.dtype), res
626 res[ids[idx]] = out
628 result = Series(res, index=ri, name=self._selection_name)
629 return self._reindex_output(result, fill_value=0)
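# A small worked sketch of the bookkeeping above, assuming two groups:
# values [1, 1, 2, 3] grouped by ['a', 'a', 'b', 'b'] factorize to
# codes [0, 0, 1, 2] with ids [0, 0, 1, 1]; np.add.reduceat over the
# "new unique observation" indicator then yields 1 for 'a' and 2 for 'b':
# >>> pd.Series([1, 1, 2, 3]).groupby(['a', 'a', 'b', 'b']).nunique()
# a    1
# b    2
# dtype: int64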
631 @Appender(Series.describe.__doc__)
632 def describe(self, **kwargs):
633 result = self.apply(lambda x: x.describe(**kwargs))
634 if self.axis == 1:
635 return result.T
636 return result.unstack()
638 def value_counts(
639 self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
640 ):
642 from pandas.core.reshape.tile import cut
643 from pandas.core.reshape.merge import _get_join_indexers
645 if bins is not None and not np.iterable(bins):
646 # scalar bins cannot be done at top level
647 # in a backward compatible way
648 return self.apply(
649 Series.value_counts,
650 normalize=normalize,
651 sort=sort,
652 ascending=ascending,
653 bins=bins,
654 )
656 ids, _, _ = self.grouper.group_info
657 val = self.obj._internal_get_values()
659 # groupby removes null keys from groupings
660 mask = ids != -1
661 ids, val = ids[mask], val[mask]
663 if bins is None:
664 lab, lev = algorithms.factorize(val, sort=True)
665 llab = lambda lab, inc: lab[inc]
666 else:
668 # lab is a Categorical with categories an IntervalIndex
669 lab = cut(Series(val), bins, include_lowest=True)
670 lev = lab.cat.categories
671 lab = lev.take(lab.cat.codes)
672 llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
674 if is_interval_dtype(lab):
675 # TODO: should we do this inside II?
676 sorter = np.lexsort((lab.left, lab.right, ids))
677 else:
678 sorter = np.lexsort((lab, ids))
680 ids, lab = ids[sorter], lab[sorter]
682 # group boundaries are where group ids change
683 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
685 # new values are where sorted labels change
686 lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
687 inc = np.r_[True, lchanges]
688 inc[idx] = True # group boundaries are also new values
689 out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
691 # num. of times each group should be repeated
692 rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
694 # multi-index components
695 codes = self.grouper.reconstructed_codes
696 codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
697 levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
698 names = self.grouper.names + [self._selection_name]
700 if dropna:
701 mask = codes[-1] != -1
702 if mask.all():
703 dropna = False
704 else:
705 out, codes = out[mask], [level_codes[mask] for level_codes in codes]
707 if normalize:
708 out = out.astype("float")
709 d = np.diff(np.r_[idx, len(ids)])
710 if dropna:
711 m = ids[lab == -1]
712 np.add.at(d, m, -1)
713 acc = rep(d)[mask]
714 else:
715 acc = rep(d)
716 out /= acc
718 if sort and bins is None:
719 cat = ids[inc][mask] if dropna else ids[inc]
720 sorter = np.lexsort((out if ascending else -out, cat))
721 out, codes[-1] = out[sorter], codes[-1][sorter]
723 if bins is None:
724 mi = MultiIndex(
725 levels=levels, codes=codes, names=names, verify_integrity=False
726 )
728 if is_integer_dtype(out):
729 out = ensure_int64(out)
730 return Series(out, index=mi, name=self._selection_name)
732 # for compat. with libgroupby.value_counts need to ensure every
733 # bin is present at every index level, null filled with zeros
734 diff = np.zeros(len(out), dtype="bool")
735 for level_codes in codes[:-1]:
736 diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
738 ncat, nbin = diff.sum(), len(levels[-1])
740 left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
742 right = [diff.cumsum() - 1, codes[-1]]
744 _, idx = _get_join_indexers(left, right, sort=False, how="left")
745 out = np.where(idx != -1, out[idx], 0)
747 if sort:
748 sorter = np.lexsort((out if ascending else -out, left[0]))
749 out, left[-1] = out[sorter], left[-1][sorter]
751 # build the multi-index w/ full levels
752 def build_codes(lev_codes: np.ndarray) -> np.ndarray:
753 return np.repeat(lev_codes[diff], nbin)
755 codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
756 codes.append(left[-1])
758 mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
760 if is_integer_dtype(out):
761 out = ensure_int64(out)
762 return Series(out, index=mi, name=self._selection_name)
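# A minimal sketch of the MultiIndex result shape, assuming no bins are passed:
# >>> pd.Series(['x', 'x', 'y']).groupby([1, 1, 2]).value_counts()
# 1  x    2
# 2  y    1
# dtype: int64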
764 def count(self) -> Series:
765 """
766 Compute count of group, excluding missing values.
768 Returns
769 -------
770 Series
771 Count of values within each group.
772 """
773 ids, _, ngroups = self.grouper.group_info
774 val = self.obj._internal_get_values()
776 mask = (ids != -1) & ~isna(val)
777 ids = ensure_platform_int(ids)
778 minlength = ngroups or 0
779 out = np.bincount(ids[mask], minlength=minlength)
781 result = Series(
782 out,
783 index=self.grouper.result_index,
784 name=self._selection_name,
785 dtype="int64",
786 )
787 return self._reindex_output(result, fill_value=0)
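# A small worked example of the bincount-based count, assuming one missing value:
# >>> pd.Series([1.0, np.nan, 3.0, 4.0]).groupby(['a', 'a', 'b', 'b']).count()
# a    1
# b    2
# dtype: int64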
789 def _apply_to_column_groupbys(self, func):
790 """ return a pass thru """
791 return func(self)
793 def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
794 """Calculate pct_change of each value to previous entry in group"""
795 # TODO: Remove this conditional when #23918 is fixed
796 if freq:
797 return self.apply(
798 lambda x: x.pct_change(
799 periods=periods, fill_method=fill_method, limit=limit, freq=freq
800 )
801 )
802 if fill_method is None: # GH30463
803 fill_method = "pad"
804 limit = 0
805 filled = getattr(self, fill_method)(limit=limit)
806 fill_grp = filled.groupby(self.grouper.codes)
807 shifted = fill_grp.shift(periods=periods, freq=freq)
809 return (filled / shifted) - 1
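# The division above reduces to (value / previous value in group) - 1; a small
# sketch assuming the defaults periods=1 and fill_method='pad':
# >>> pd.Series([1, 2, 4, 8]).groupby([1, 1, 2, 2]).pct_change()
# 0    NaN
# 1    1.0
# 2    NaN
# 3    1.0
# dtype: float64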
812@pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist)
813class DataFrameGroupBy(GroupBy):
815 _apply_whitelist = base.dataframe_apply_whitelist
817 _agg_see_also_doc = dedent(
818 """
819 See Also
820 --------
821 pandas.DataFrame.groupby.apply
822 pandas.DataFrame.groupby.transform
823 pandas.DataFrame.aggregate
824 """
825 )
827 _agg_examples_doc = dedent(
828 """
829 Examples
830 --------
832 >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
833 ... 'B': [1, 2, 3, 4],
834 ... 'C': np.random.randn(4)})
836 >>> df
837 A B C
838 0 1 1 0.362838
839 1 1 2 0.227877
840 2 2 3 1.267767
841 3 2 4 -0.562860
843 The aggregation is for each column.
845 >>> df.groupby('A').agg('min')
846 B C
847 A
848 1 1 0.227877
849 2 3 -0.562860
851 Multiple aggregations
853 >>> df.groupby('A').agg(['min', 'max'])
854 B C
855 min max min max
856 A
857 1 1 2 0.227877 0.362838
858 2 3 4 -0.562860 1.267767
860 Select a column for aggregation
862 >>> df.groupby('A').B.agg(['min', 'max'])
863 min max
864 A
865 1 1 2
866 2 3 4
868 Different aggregations per column
870 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
871 B C
872 min max sum
873 A
874 1 1 2 0.590716
875 2 3 4 0.704907
877 To control the output names with different aggregations per column,
878 pandas supports "named aggregation"
880 >>> df.groupby("A").agg(
881 ... b_min=pd.NamedAgg(column="B", aggfunc="min"),
882 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
883 b_min c_sum
884 A
885 1 1 -1.956929
886 2 3 -0.322183
888 - The keywords are the *output* column names
889 - The values are tuples whose first element is the column to select
890 and the second element is the aggregation to apply to that column.
891 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
892 ``['column', 'aggfunc']`` to make it clearer what the arguments are.
893 As usual, the aggregation can be a callable or a string alias.
895 See :ref:`groupby.aggregate.named` for more.
896 """
897 )
899 @Substitution(
900 see_also=_agg_see_also_doc,
901 examples=_agg_examples_doc,
902 versionadded="",
903 klass="DataFrame",
904 axis="",
905 )
906 @Appender(_shared_docs["aggregate"])
907 def aggregate(self, func=None, *args, **kwargs):
909 relabeling = func is None and _is_multi_agg_with_relabel(**kwargs)
910 if relabeling:
911 func, columns, order = _normalize_keyword_aggregation(kwargs)
913 kwargs = {}
914 elif isinstance(func, list) and len(func) > len(set(func)):
916 # GH 28426 will raise error if duplicated function names are used and
917 # there is no reassigned name
918 raise SpecificationError(
919 "Function names must be unique if there is no new column "
920 "names assigned"
921 )
922 elif func is None:
923 # nicer error message
924 raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")
926 func = _maybe_mangle_lambdas(func)
928 result, how = self._aggregate(func, *args, **kwargs)
929 if how is None:
930 return result
932 if result is None:
934 # grouper specific aggregations
935 if self.grouper.nkeys > 1:
936 return self._python_agg_general(func, *args, **kwargs)
937 elif args or kwargs:
938 result = self._aggregate_frame(func, *args, **kwargs)
940 elif self.axis == 1:
941 # _aggregate_multiple_funcs does not allow self.axis == 1
942 result = self._aggregate_frame(func)
944 else:
946 # try to treat as if we are passing a list
947 try:
948 result = self._aggregate_multiple_funcs([func], _axis=self.axis)
949 except ValueError as err:
950 if "no results" not in str(err):
951 # raised directly by _aggregate_multiple_funcs
952 raise
953 result = self._aggregate_frame(func)
954 else:
955 # select everything except for the last level, which is the one
956 # containing the name of the function(s), see GH 32040
957 result.columns = result.columns.rename(
958 [self._selected_obj.columns.name] * result.columns.nlevels
959 ).droplevel(-1)
961 if not self.as_index:
962 self._insert_inaxis_grouper_inplace(result)
963 result.index = np.arange(len(result))
965 if relabeling:
967 # use the reordered index of columns
968 result = result.iloc[:, order]
969 result.columns = columns
971 return result._convert(datetime=True)
973 agg = aggregate
975 def _iterate_slices(self) -> Iterable[Series]:
976 obj = self._selected_obj
977 if self.axis == 1:
978 obj = obj.T
980 if isinstance(obj, Series) and obj.name not in self.exclusions:
981 # Occurs when doing DataFrameGroupBy(...)["X"]
982 yield obj
983 else:
984 for label, values in obj.items():
985 if label in self.exclusions:
986 continue
988 yield values
990 def _cython_agg_general(
991 self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
992 ) -> DataFrame:
993 agg_blocks, agg_items = self._cython_agg_blocks(
994 how, alt=alt, numeric_only=numeric_only, min_count=min_count
995 )
996 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
998 def _cython_agg_blocks(
999 self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1000 ) -> "Tuple[List[Block], Index]":
1001 # TODO: the actual managing of mgr_locs is a PITA
1002 # here, it should happen via BlockManager.combine
1004 data: BlockManager = self._get_data_to_aggregate()
1006 if numeric_only:
1007 data = data.get_numeric_data(copy=False)
1009 agg_blocks: List[Block] = []
1010 new_items: List[np.ndarray] = []
1011 deleted_items: List[np.ndarray] = []
1012 # Some object-dtype blocks might be split into List[Block[T], Block[U]]
1013 split_items: List[np.ndarray] = []
1014 split_frames: List[DataFrame] = []
1016 no_result = object()
1017 for block in data.blocks:
1018 # Avoid inheriting result from earlier in the loop
1019 result = no_result
1020 locs = block.mgr_locs.as_array
1021 try:
1022 result, _ = self.grouper.aggregate(
1023 block.values, how, axis=1, min_count=min_count
1024 )
1025 except NotImplementedError:
1026 # generally if we have numeric_only=False
1027 # and non-applicable functions
1028 # try to python agg
1030 if alt is None:
1031 # we cannot perform the operation
1032 # in an alternate way, exclude the block
1033 assert how == "ohlc"
1034 deleted_items.append(locs)
1035 continue
1037 # call our grouper again with only this block
1038 obj = self.obj[data.items[locs]]
1039 if obj.shape[1] == 1:
1040 # Avoid call to self.values that can occur in DataFrame
1041 # reductions; see GH#28949
1042 obj = obj.iloc[:, 0]
1044 s = get_groupby(obj, self.grouper)
1045 try:
1046 result = s.aggregate(lambda x: alt(x, axis=self.axis))
1047 except TypeError:
1048 # we may have an exception in trying to aggregate
1049 # continue and exclude the block
1050 deleted_items.append(locs)
1051 continue
1052 else:
1053 result = cast(DataFrame, result)
1054 # unwrap DataFrame to get array
1055 if len(result._data.blocks) != 1:
1056 # We've split an object block! Everything we've assumed
1057 # about a single block input returning a single block output
1058 # is a lie. To keep the code-path for the typical non-split case
1059 # clean, we choose to clean up this mess later on.
1060 split_items.append(locs)
1061 split_frames.append(result)
1062 continue
1064 assert len(result._data.blocks) == 1
1065 result = result._data.blocks[0].values
1066 if isinstance(result, np.ndarray) and result.ndim == 1:
1067 result = result.reshape(1, -1)
1069 assert not isinstance(result, DataFrame)
1071 if result is not no_result:
1072 # see if we can cast the block back to the original dtype
1073 result = maybe_downcast_numeric(result, block.dtype)
1075 if block.is_extension and isinstance(result, np.ndarray):
1076 # e.g. block.values was an IntegerArray
1077 # (1, N) case can occur if block.values was Categorical
1078 # and result is ndarray[object]
1079 assert result.ndim == 1 or result.shape[0] == 1
1080 try:
1081 # Cast back if feasible
1082 result = type(block.values)._from_sequence(
1083 result.ravel(), dtype=block.values.dtype
1084 )
1085 except (ValueError, TypeError):
1086 # reshape to be valid for non-Extension Block
1087 result = result.reshape(1, -1)
1089 agg_block: Block = block.make_block(result)
1091 new_items.append(locs)
1092 agg_blocks.append(agg_block)
1094 if not (agg_blocks or split_frames):
1095 raise DataError("No numeric types to aggregate")
1097 if split_items:
1098 # Clean up the mess left over from split blocks.
1099 for locs, result in zip(split_items, split_frames):
1100 assert len(locs) == result.shape[1]
1101 for i, loc in enumerate(locs):
1102 new_items.append(np.array([loc], dtype=locs.dtype))
1103 agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
1105 # reset the locs in the blocks to correspond to our
1106 # current ordering
1107 indexer = np.concatenate(new_items)
1108 agg_items = data.items.take(np.sort(indexer))
1110 if deleted_items:
1112 # we need to adjust the indexer to account for the
1113 # items we have removed
1114 # really should be done in internals :<
1116 deleted = np.concatenate(deleted_items)
1117 ai = np.arange(len(data))
1118 mask = np.zeros(len(data))
1119 mask[deleted] = 1
1120 indexer = (ai - mask.cumsum())[indexer]
1122 offset = 0
1123 for blk in agg_blocks:
1124 loc = len(blk.mgr_locs)
1125 blk.mgr_locs = indexer[offset : (offset + loc)]
1126 offset += loc
1128 return agg_blocks, agg_items
1130 def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
1131 if self.grouper.nkeys != 1:
1132 raise AssertionError("Number of keys must be 1")
1134 axis = self.axis
1135 obj = self._obj_with_exclusions
1137 result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {}
1138 if axis != obj._info_axis_number:
1139 for name, data in self:
1140 fres = func(data, *args, **kwargs)
1141 result[name] = fres
1142 else:
1143 for name in self.indices:
1144 data = self.get_group(name, obj=obj)
1145 fres = func(data, *args, **kwargs)
1146 result[name] = fres
1148 return self._wrap_frame_output(result, obj)
1150 def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
1151 # only for axis==0
1153 obj = self._obj_with_exclusions
1154 result: Dict[Union[int, str], NDFrame] = {}
1155 cannot_agg = []
1156 for item in obj:
1157 data = obj[item]
1158 colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
1160 cast = self._transform_should_cast(func)
1161 try:
1162 result[item] = colg.aggregate(func, *args, **kwargs)
1164 except ValueError as err:
1165 if "Must produce aggregated value" in str(err):
1166 # raised in _aggregate_named, handle at higher level
1167 # see test_apply_with_mutated_index
1168 raise
1169 # otherwise we get here from an AttributeError in _make_wrapper
1170 cannot_agg.append(item)
1171 continue
1173 else:
1174 if cast:
1175 result[item] = self._try_cast(result[item], data)
1177 result_columns = obj.columns
1178 if cannot_agg:
1179 result_columns = result_columns.drop(cannot_agg)
1181 return DataFrame(result, columns=result_columns)
1183 def _wrap_applied_output(self, keys, values, not_indexed_same=False):
1184 if len(keys) == 0:
1185 return DataFrame(index=keys)
1187 key_names = self.grouper.names
1189 # GH12824.
1190 def first_not_none(values):
1191 try:
1192 return next(com.not_none(*values))
1193 except StopIteration:
1194 return None
1196 v = first_not_none(values)
1198 if v is None:
1199 # GH9684. If all values are None, then this will throw an error.
1200 # We'd prefer it return an empty dataframe.
1201 return DataFrame()
1202 elif isinstance(v, DataFrame):
1203 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
1204 elif self.grouper.groupings is not None:
1205 if len(self.grouper.groupings) > 1:
1206 key_index = self.grouper.result_index
1208 else:
1209 ping = self.grouper.groupings[0]
1210 if len(keys) == ping.ngroups:
1211 key_index = ping.group_index
1212 key_index.name = key_names[0]
1214 key_lookup = Index(keys)
1215 indexer = key_lookup.get_indexer(key_index)
1217 # reorder the values
1218 values = [values[i] for i in indexer]
1219 else:
1221 key_index = Index(keys, name=key_names[0])
1223 # don't use the key indexer
1224 if not self.as_index:
1225 key_index = None
1227 # make Nones an empty object
1228 v = first_not_none(values)
1229 if v is None:
1230 return DataFrame()
1231 elif isinstance(v, NDFrame):
1233 # this is to silence a DeprecationWarning
1234 # TODO: Remove when default dtype of empty Series is object
1235 kwargs = v._construct_axes_dict()
1236 if v._constructor is Series:
1237 backup = create_series_with_explicit_dtype(
1238 **kwargs, dtype_if_empty=object
1239 )
1240 else:
1241 backup = v._constructor(**kwargs)
1243 values = [x if (x is not None) else backup for x in values]
1245 v = values[0]
1247 if isinstance(v, (np.ndarray, Index, Series)):
1248 if isinstance(v, Series):
1249 applied_index = self._selected_obj._get_axis(self.axis)
1250 all_indexed_same = all_indexes_same([x.index for x in values])
1251 singular_series = len(values) == 1 and applied_index.nlevels == 1
1253 # GH3596
1254 # provide a reduction (Frame -> Series) if groups are
1255 # unique
1256 if self.squeeze:
1257 # assign the name to this series
1258 if singular_series:
1259 values[0].name = keys[0]
1261 # GH2893
1262 # we have series in the values array, we want to
1263 # produce a series:
1264 # if any of the sub-series are not indexed the same
1265 # OR we don't have a multi-index and we have only a
1266 # single value
1267 return self._concat_objects(
1268 keys, values, not_indexed_same=not_indexed_same
1269 )
1271 # still a series
1272 # path added as of GH 5545
1273 elif all_indexed_same:
1274 from pandas.core.reshape.concat import concat
1276 return concat(values)
1278 if not all_indexed_same:
1279 # GH 8467
1280 return self._concat_objects(keys, values, not_indexed_same=True)
1282 if self.axis == 0 and isinstance(v, ABCSeries):
1283 # GH6124 if the list of Series have a consistent name,
1284 # then propagate that name to the result.
1285 index = v.index.copy()
1286 if index.name is None:
1287 # Only propagate the series name to the result
1288 # if all series have a consistent name. If the
1289 # series do not have a consistent name, do
1290 # nothing.
1291 names = {v.name for v in values}
1292 if len(names) == 1:
1293 index.name = list(names)[0]
1295 # normally use vstack as it's faster than concat
1296 # and if we have mi-columns
1297 if (
1298 isinstance(v.index, MultiIndex)
1299 or key_index is None
1300 or isinstance(key_index, MultiIndex)
1301 ):
1302 stacked_values = np.vstack([np.asarray(v) for v in values])
1303 result = DataFrame(
1304 stacked_values, index=key_index, columns=index
1305 )
1306 else:
1307 # GH5788 instead of stacking; concat gets the
1308 # dtypes correct
1309 from pandas.core.reshape.concat import concat
1311 result = concat(
1312 values,
1313 keys=key_index,
1314 names=key_index.names,
1315 axis=self.axis,
1316 ).unstack()
1317 result.columns = index
1318 elif isinstance(v, ABCSeries):
1319 stacked_values = np.vstack([np.asarray(v) for v in values])
1320 result = DataFrame(
1321 stacked_values.T, index=v.index, columns=key_index
1322 )
1323 else:
1324 # GH#1738: values is list of arrays of unequal lengths
1325 # fall through to the outer else clause
1326 # TODO: sure this is right? we used to do this
1327 # after raising AttributeError above
1328 return Series(values, index=key_index, name=self._selection_name)
1330 # if we have datetime-like values in the original, then coerce dates,
1331 # since stacking can easily produce object dtypes here
1332 so = self._selected_obj
1333 if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any():
1334 result = _recast_datetimelike_result(result)
1335 else:
1336 result = result._convert(datetime=True)
1338 return self._reindex_output(result)
1340 # values are not series or array-like but scalars
1341 else:
1342 # only coerce dates if we find at least 1 datetime
1343 should_coerce = any(isinstance(x, Timestamp) for x in values)
1344 # self._selection_name not passed through to Series as the
1345 # result should not take the name of original selection
1346 # of columns
1347 return Series(values, index=key_index)._convert(
1348 datetime=True, coerce=should_coerce
1349 )
1351 else:
1352 # Handle cases like BinGrouper
1353 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
1355 def _transform_general(self, func, *args, **kwargs):
1356 from pandas.core.reshape.concat import concat
1358 applied = []
1359 obj = self._obj_with_exclusions
1360 gen = self.grouper.get_iterator(obj, axis=self.axis)
1361 fast_path, slow_path = self._define_paths(func, *args, **kwargs)
1363 path = None
1364 for name, group in gen:
1365 object.__setattr__(group, "name", name)
1367 if path is None:
1368 # Try slow path and fast path.
1369 try:
1370 path, res = self._choose_path(fast_path, slow_path, group)
1371 except TypeError:
1372 return self._transform_item_by_item(obj, fast_path)
1373 except ValueError:
1374 msg = "transform must return a scalar value for each group"
1375 raise ValueError(msg)
1376 else:
1377 res = path(group)
1379 if isinstance(res, Series):
1381 # we need to broadcast across the
1382 # other dimension; this will preserve dtypes
1383 # GH14457
1384 if not np.prod(group.shape):
1385 continue
1386 elif res.index.is_(obj.index):
1387 r = concat([res] * len(group.columns), axis=1)
1388 r.columns = group.columns
1389 r.index = group.index
1390 else:
1391 r = DataFrame(
1392 np.concatenate([res.values] * len(group.index)).reshape(
1393 group.shape
1394 ),
1395 columns=group.columns,
1396 index=group.index,
1397 )
1399 applied.append(r)
1400 else:
1401 applied.append(res)
1403 concat_index = obj.columns if self.axis == 0 else obj.index
1404 other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1
1405 concatenated = concat(applied, axis=self.axis, verify_integrity=False)
1406 concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
1407 return self._set_result_index_ordered(concatenated)
1409 @Substitution(klass="DataFrame", selected="")
1410 @Appender(_transform_template)
1411 def transform(self, func, *args, **kwargs):
1413 # optimized transforms
1414 func = self._get_cython_func(func) or func
1416 if not isinstance(func, str):
1417 return self._transform_general(func, *args, **kwargs)
1419 elif func not in base.transform_kernel_whitelist:
1420 msg = f"'{func}' is not a valid function name for transform(name)"
1421 raise ValueError(msg)
1422 elif func in base.cythonized_kernels:
1423 # cythonized transformation or canned "reduction+broadcast"
1424 return getattr(self, func)(*args, **kwargs)
1426 # If func is a reduction, we need to broadcast the
1427 # result to the whole group. Compute func result
1428 # and deal with possible broadcasting below.
1429 result = getattr(self, func)(*args, **kwargs)
1431 # a reduction transform
1432 if not isinstance(result, DataFrame):
1433 return self._transform_general(func, *args, **kwargs)
1435 obj = self._obj_with_exclusions
1437 # nuisance columns
1438 if not result.columns.equals(obj.columns):
1439 return self._transform_general(func, *args, **kwargs)
1441 return self._transform_fast(result, func)
1443 def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
1444 """
1445 Fast transform path for aggregations
1446 """
1447 # if there were groups with no observations (Categorical only?)
1448 # try casting data to original dtype
1449 cast = self._transform_should_cast(func_nm)
1451 obj = self._obj_with_exclusions
1453 # for each col, reshape to the size of the original frame
1454 # by take operation
1455 ids, _, ngroup = self.grouper.group_info
1456 output = []
1457 for i, _ in enumerate(result.columns):
1458 res = algorithms.take_1d(result.iloc[:, i].values, ids)
1459 # TODO: we have no test cases that get here with EA dtypes;
1460 # try_cast may not be needed if EAs never get here
1461 if cast:
1462 res = self._try_cast(res, obj.iloc[:, i])
1463 output.append(res)
1465 return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)
1467 def _define_paths(self, func, *args, **kwargs):
1468 if isinstance(func, str):
1469 fast_path = lambda group: getattr(group, func)(*args, **kwargs)
1470 slow_path = lambda group: group.apply(
1471 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
1472 )
1473 else:
1474 fast_path = lambda group: func(group, *args, **kwargs)
1475 slow_path = lambda group: group.apply(
1476 lambda x: func(x, *args, **kwargs), axis=self.axis
1477 )
1478 return fast_path, slow_path
1480 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
1481 path = slow_path
1482 res = slow_path(group)
1484 # if we make it here, test if we can use the fast path
1485 try:
1486 res_fast = fast_path(group)
1487 except AssertionError:
1488 raise
1489 except Exception:
1490 # GH#29631 For a user-defined function, we can't predict what may be
1491 # raised; see test_transform.test_transform_fastpath_raises
1492 return path, res
1494 # verify fast path does not change columns (and names), otherwise
1495 # its results cannot be joined with those of the slow path
1496 if not isinstance(res_fast, DataFrame):
1497 return path, res
1499 if not res_fast.columns.equals(group.columns):
1500 return path, res
1502 if res_fast.equals(res):
1503 path = fast_path
1505 return path, res
1507 def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
1508 # iterate through columns
1509 output = {}
1510 inds = []
1511 for i, col in enumerate(obj):
1512 try:
1513 output[col] = self[col].transform(wrapper)
1514 except TypeError:
1515 # e.g. trying to call nanmean with string values
1516 pass
1517 else:
1518 inds.append(i)
1520 if len(output) == 0:
1521 raise TypeError("Transform function invalid for data types")
1523 columns = obj.columns
1524 if len(output) < len(obj.columns):
1525 columns = columns.take(inds)
1527 return DataFrame(output, index=obj.index, columns=columns)
1529 def filter(self, func, dropna=True, *args, **kwargs):
1530 """
1531 Return a copy of a DataFrame excluding elements from groups that
1532 do not satisfy the boolean criterion specified by func.
1534 Parameters
1535 ----------
1536 func : function
1537 Function to apply to each subframe. Should return True or False.
1538 dropna : bool, default True
1539 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs.
1541 Returns
1542 -------
1543 filtered : DataFrame
1545 Notes
1546 -----
1547 Each subframe is endowed with the attribute 'name' in case you need to know
1548 which group you are working on.
1550 Examples
1551 --------
1552 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
1553 ... 'foo', 'bar'],
1554 ... 'B' : [1, 2, 3, 4, 5, 6],
1555 ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
1556 >>> grouped = df.groupby('A')
1557 >>> grouped.filter(lambda x: x['B'].mean() > 3.)
1558 A B C
1559 1 bar 2 5.0
1560 3 bar 4 1.0
1561 5 bar 6 9.0
1562 """
1564 indices = []
1566 obj = self._selected_obj
1567 gen = self.grouper.get_iterator(obj, axis=self.axis)
1569 for name, group in gen:
1570 object.__setattr__(group, "name", name)
1572 res = func(group, *args, **kwargs)
1574 try:
1575 res = res.squeeze()
1576 except AttributeError: # allow e.g., scalars and frames to pass
1577 pass
1579 # interpret the result of the filter
1580 if is_bool(res) or (is_scalar(res) and isna(res)):
1581 if res and notna(res):
1582 indices.append(self._get_index(name))
1583 else:
1584 # non scalars aren't allowed
1585 raise TypeError(
1586 f"filter function returned a {type(res).__name__}, "
1587 "but expected a scalar bool"
1588 )
1590 return self._apply_filter(indices, dropna)
1592 def __getitem__(self, key):
1593 # per GH 23566
1594 if isinstance(key, tuple) and len(key) > 1:
1595 # if len == 1, then it becomes a SeriesGroupBy and this is actually
1596 # valid syntax, so don't raise warning
1597 warnings.warn(
1598 "Indexing with multiple keys (implicitly converted to a tuple "
1599 "of keys) will be deprecated, use a list instead.",
1600 FutureWarning,
1601 stacklevel=2,
1602 )
1603 return super().__getitem__(key)
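# A minimal sketch of the deprecation warned about above, assuming columns
# 'B' and 'C' exist:
# >>> df.groupby('A')['B', 'C']    # bare tuple key: warns
# >>> df.groupby('A')[['B', 'C']]  # list of keys: preferred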
1605 def _gotitem(self, key, ndim: int, subset=None):
1606 """
1607 sub-classes to define
1608 return a sliced object
1610 Parameters
1611 ----------
1612 key : string / list of selections
1613 ndim : 1,2
1614 requested ndim of result
1615 subset : object, default None
1616 subset to act on
1617 """
1619 if ndim == 2:
1620 if subset is None:
1621 subset = self.obj
1622 return DataFrameGroupBy(
1623 subset,
1624 self.grouper,
1625 selection=key,
1626 grouper=self.grouper,
1627 exclusions=self.exclusions,
1628 as_index=self.as_index,
1629 observed=self.observed,
1630 )
1631 elif ndim == 1:
1632 if subset is None:
1633 subset = self.obj[key]
1634 return SeriesGroupBy(
1635 subset, selection=key, grouper=self.grouper, observed=self.observed
1636 )
1638 raise AssertionError("invalid ndim for _gotitem")
1640 def _wrap_frame_output(self, result, obj) -> DataFrame:
1641 result_index = self.grouper.levels[0]
1643 if self.axis == 0:
1644 return DataFrame(result, index=obj.columns, columns=result_index).T
1645 else:
1646 return DataFrame(result, index=obj.index, columns=result_index)
1648 def _get_data_to_aggregate(self) -> BlockManager:
1649 obj = self._obj_with_exclusions
1650 if self.axis == 1:
1651 return obj.T._data
1652 else:
1653 return obj._data
1655 def _insert_inaxis_grouper_inplace(self, result):
1656 # zip in reverse so we can always insert at loc 0
1657 izip = zip(
1658 *map(
1659 reversed,
1660 (
1661 self.grouper.names,
1662 self.grouper.get_group_levels(),
1663 [grp.in_axis for grp in self.grouper.groupings],
1664 ),
1665 )
1666 )
1668 for name, lev, in_axis in izip:
1669 if in_axis:
1670 result.insert(0, name, lev)
1672 def _wrap_aggregated_output(
1673 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
1674 ) -> DataFrame:
1675 """
1676 Wraps the output of DataFrameGroupBy aggregations into the expected result.
1678 Parameters
1679 ----------
1680 output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
1681 Data to wrap.
1683 Returns
1684 -------
1685 DataFrame
1686 """
1687 indexed_output = {key.position: val for key, val in output.items()}
1688 columns = Index(key.label for key in output)
1690 result = DataFrame(indexed_output)
1691 result.columns = columns
1693 if not self.as_index:
1694 self._insert_inaxis_grouper_inplace(result)
1695 result = result._consolidate()
1696 else:
1697 index = self.grouper.result_index
1698 result.index = index
1700 if self.axis == 1:
1701 result = result.T
1703 return self._reindex_output(result)._convert(datetime=True)
1705 def _wrap_transformed_output(
1706 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
1707 ) -> DataFrame:
1708 """
1709 Wraps the output of DataFrameGroupBy transformations into the expected result.
1711 Parameters
1712 ----------
1713 output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
1714 Data to wrap.
1716 Returns
1717 -------
1718 DataFrame
1719 """
1720 indexed_output = {key.position: val for key, val in output.items()}
1721 columns = Index(key.label for key in output)
1723 result = DataFrame(indexed_output)
1724 result.columns = columns
1725 result.index = self.obj.index
1727 return result
1729 def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame:
1730 if not self.as_index:
1731 index = np.arange(blocks[0].values.shape[-1])
1732 mgr = BlockManager(blocks, axes=[items, index])
1733 result = DataFrame(mgr)
1735 self._insert_inaxis_grouper_inplace(result)
1736 result = result._consolidate()
1737 else:
1738 index = self.grouper.result_index
1739 mgr = BlockManager(blocks, axes=[items, index])
1740 result = DataFrame(mgr)
1742 if self.axis == 1:
1743 result = result.T
1745 return self._reindex_output(result)._convert(datetime=True)
1747 def _iterate_column_groupbys(self):
1748 for i, colname in enumerate(self._selected_obj.columns):
1749 yield colname, SeriesGroupBy(
1750 self._selected_obj.iloc[:, i],
1751 selection=colname,
1752 grouper=self.grouper,
1753 exclusions=self.exclusions,
1754 )
1756 def _apply_to_column_groupbys(self, func):
1757 from pandas.core.reshape.concat import concat
1759 return concat(
1760 (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()),
1761 keys=self._selected_obj.columns,
1762 axis=1,
1763 )
1765 def count(self):
1766 """
1767 Compute count of group, excluding missing values.
1769 Returns
1770 -------
1771 DataFrame
1772 Count of values within each group.
1773 """
1774 data = self._get_data_to_aggregate()
1775 ids, _, ngroups = self.grouper.group_info
1776 mask = ids != -1
1778 vals = (
1779 (mask & ~_isna_ndarraylike(np.atleast_2d(blk.get_values())))
1780 for blk in data.blocks
1781 )
1782 locs = (blk.mgr_locs for blk in data.blocks)
1784 counted = (
1785 lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals
1786 )
1787 blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]
1789 return self._wrap_agged_blocks(blocks, items=data.items)
1791 def nunique(self, dropna: bool = True):
1792 """
1793 Return DataFrame with number of distinct observations per group for
1794 each column.
1796 Parameters
1797 ----------
1798 dropna : bool, default True
1799 Don't include NaN in the counts.
1801 Returns
1802 -------
1803 nunique: DataFrame
1805 Examples
1806 --------
1807 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
1808 ... 'ham', 'ham'],
1809 ... 'value1': [1, 5, 5, 2, 5, 5],
1810 ... 'value2': list('abbaxy')})
1811 >>> df
1812 id value1 value2
1813 0 spam 1 a
1814 1 egg 5 b
1815 2 egg 5 b
1816 3 spam 2 a
1817 4 ham 5 x
1818 5 ham 5 y
1820 >>> df.groupby('id').nunique()
1821 id value1 value2
1822 id
1823 egg 1 1 1
1824 ham 1 1 2
1825 spam 1 2 1
1827 Check for rows with the same id but conflicting values:
1829 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
1830 id value1 value2
1831 0 spam 1 a
1832 3 spam 2 a
1833 4 ham 5 x
1834 5 ham 5 y
1835 """
1837 obj = self._selected_obj
1839 def groupby_series(obj, col=None):
1840 return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
1841 dropna=dropna
1842 )
1844 if isinstance(obj, Series):
1845 results = groupby_series(obj)
1846 else:
1847 # TODO: this is duplicative of how GroupBy naturally works
1848 # Try to consolidate with normal wrapping functions
1849 from pandas.core.reshape.concat import concat
1851 axis_number = obj._get_axis_number(self.axis)
1852 other_axis = int(not axis_number)
1853 if axis_number == 0:
1854 iter_func = obj.items
1855 else:
1856 iter_func = obj.iterrows
1858 results = [groupby_series(content, label) for label, content in iter_func()]
1859 results = concat(results, axis=1)
1861 if axis_number == 1:
1862 results = results.T
1864 results._get_axis(other_axis).names = obj._get_axis(other_axis).names
1866 if not self.as_index:
1867 results.index = ibase.default_index(len(results))
1868 return results
1870 boxplot = boxplot_frame_groupby
1873def _is_multi_agg_with_relabel(**kwargs) -> bool:
1874 """
1875 Check whether kwargs passed to .agg look like multi-agg with relabeling.
1877 Parameters
1878 ----------
1879 **kwargs : dict
1881 Returns
1882 -------
1883 bool
1885 Examples
1886 --------
1887 >>> _is_multi_agg_with_relabel(a='max')
1888 False
1889 >>> _is_multi_agg_with_relabel(a_max=('a', 'max'),
1890 ... a_min=('a', 'min'))
1891 True
1892 >>> _is_multi_agg_with_relabel()
1893 False
1894 """
1895 return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and (
1896 len(kwargs) > 0
1897 )
1900def _normalize_keyword_aggregation(kwargs):
1901 """
1902 Normalize user-provided "named aggregation" kwargs.
1904 Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
1905 to the old ``Dict[str, List[scalar]]``.
1907 Parameters
1908 ----------
1909 kwargs : dict
1911 Returns
1912 -------
1913 aggspec : dict
1914 The transformed kwargs.
1915 columns : List[str]
1916 The user-provided keys.
1917 col_idx_order : List[int]
1918 List of columns indices.
1920 Examples
1921 --------
1922 >>> _normalize_keyword_aggregation({'output': ('input', 'sum')})
1923 ({'input': ['sum']}, ('output',), [('input', 'sum')])
1924 """
1925 # Normalize the aggregation functions as Mapping[column, List[func]],
1926 # process normally, then fixup the names.
1927 # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
1928 # May be hitting https://github.com/python/mypy/issues/5958
1929 # saying it doesn't have an attribute __name__
1930 aggspec = defaultdict(list)
1931 order = []
1932 columns, pairs = list(zip(*kwargs.items()))
1934 for name, (column, aggfunc) in zip(columns, pairs):
1935 aggspec[column].append(aggfunc)
1936 order.append((column, com.get_callable_name(aggfunc) or aggfunc))
1938 # uniquify aggfunc name if duplicated in order list
1939 uniquified_order = _make_unique(order)
1941 # GH 25719: because aggspec changes the order of assigned columns in aggregation,
1942 # uniquified_aggspec stores the uniquified order list and compares it with 'order'
1943 # based on index
1944 aggspec_order = [
1945 (column, com.get_callable_name(aggfunc) or aggfunc)
1946 for column, aggfuncs in aggspec.items()
1947 for aggfunc in aggfuncs
1948 ]
1949 uniquified_aggspec = _make_unique(aggspec_order)
1951 # get the new index of columns by comparison
1952 col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
1953 return aggspec, columns, col_idx_order
1956def _make_unique(seq):
1957 """Uniquify aggfunc name of the pairs in the order list
1959 Examples:
1960 --------
1961 >>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')])
1962 [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
1963 """
1964 return [
1965 (pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
1966 if seq.count(pair) > 1
1967 else pair
1968 for i, pair in enumerate(seq)
1969 ]
1972# TODO: Can't use, because mypy doesn't like us setting __name__
1973# error: "partial[Any]" has no attribute "__name__"
1974# the type is:
1975# typing.Sequence[Callable[..., ScalarResult]]
1976# -> typing.Sequence[Callable[..., ScalarResult]]:
1979 def _mangle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
1980 """
1981 Possibly mangle a list of aggfuncs.
1983 Parameters
1984 ----------
1985 aggfuncs : Sequence
1987 Returns
1988 -------
1989 mangled: list-like
1990 A new AggSpec sequence, where lambdas have been converted
1991 to have unique names.
1993 Notes
1994 -----
1995 If just one aggfunc is passed, the name will not be mangled.
1996 """
1997 if len(aggfuncs) <= 1:
1998 # don't mangle for .agg([lambda x: .])
1999 return aggfuncs
2000 i = 0
2001 mangled_aggfuncs = []
2002 for aggfunc in aggfuncs:
2003 if com.get_callable_name(aggfunc) == "<lambda>":
2004 aggfunc = partial(aggfunc)
2005 aggfunc.__name__ = f"<lambda_{i}>"
2006 i += 1
2007 mangled_aggfuncs.append(aggfunc)
2009 return mangled_aggfuncs
2012def _maybe_mangle_lambdas(agg_spec: Any) -> Any:
2013 """
2014 Make new lambdas with unique names.
2016 Parameters
2017 ----------
2018 agg_spec : Any
2019 An argument to GroupBy.agg.
2020 Non-dict-like `agg_spec` is passed through as-is.
2021 For dict-like `agg_spec` a new spec is returned
2022 with name-mangled lambdas.
2024 Returns
2025 -------
2026 mangled : Any
2027 Same type as the input.
2029 Examples
2030 --------
2031 >>> _maybe_mangle_lambdas('sum')
2032 'sum'
2034 >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP
2035 [<function __main__.<lambda_0>,
2036 <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
2037 """
2038 is_dict = is_dict_like(agg_spec)
2039 if not (is_dict or is_list_like(agg_spec)):
2040 return agg_spec
2041 mangled_aggspec = type(agg_spec)() # dict or OrderedDict
2043 if is_dict:
2044 for key, aggfuncs in agg_spec.items():
2045 if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
2046 mangled_aggfuncs = _mangle_lambda_list(aggfuncs)
2047 else:
2048 mangled_aggfuncs = aggfuncs
2050 mangled_aggspec[key] = mangled_aggfuncs
2051 else:
2052 mangled_aggspec = _mangle_lambda_list(agg_spec)
2054 return mangled_aggspec
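# A small sketch of the dict-like branch, assuming two lambdas for one column:
# >>> spec = _maybe_mangle_lambdas({'a': [lambda x: x.min(), lambda x: x.max()]})
# >>> [f.__name__ for f in spec['a']]
# ['<lambda_0>', '<lambda_1>']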
2057def _recast_datetimelike_result(result: DataFrame) -> DataFrame:
2058 """
2059 If we have datetime-like values in the original, then coerce dates,
2060 since stacking can easily produce object dtypes here.
2062 Parameters
2063 ----------
2064 result : DataFrame
2066 Returns
2067 -------
2068 DataFrame
2070 Notes
2071 -----
2072 - Assumes Groupby._selected_obj has ndim==2 and at least one
2073 datetimelike column
2074 """
2075 result = result.copy()
2077 obj_cols = [
2078 idx
2079 for idx in range(len(result.columns))
2080 if is_object_dtype(result.dtypes.iloc[idx])
2081 ]
2083 # See GH#26285
2084 for n in obj_cols:
2085 converted = maybe_convert_objects(
2086 result.iloc[:, n].values, convert_numeric=False
2087 )
2089 result.iloc[:, n] = converted
2090 return result