Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/groupby/grouper.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Provide user facing operators for doing the split part of the
3split-apply-combine paradigm.
4"""
6from typing import Dict, Hashable, List, Optional, Tuple
8import numpy as np
10from pandas._typing import FrameOrSeries
11from pandas.util._decorators import cache_readonly
13from pandas.core.dtypes.common import (
14 ensure_categorical,
15 is_categorical_dtype,
16 is_datetime64_dtype,
17 is_list_like,
18 is_scalar,
19 is_timedelta64_dtype,
20)
21from pandas.core.dtypes.generic import ABCSeries
23import pandas.core.algorithms as algorithms
24from pandas.core.arrays import Categorical, ExtensionArray
25import pandas.core.common as com
26from pandas.core.frame import DataFrame
27from pandas.core.groupby import ops
28from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby
29from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex
30from pandas.core.series import Series
32from pandas.io.formats.printing import pprint_thing
class Grouper:
    """
    Describe a groupby instruction that can be applied to an object.

    The instruction picks a column through ``key``; when ``level`` and/or
    ``axis`` are supplied instead, it picks a level of the target object's
    index.

    When ``axis`` and/or ``level`` are given both here and to ``groupby``,
    the values given to ``Grouper`` win.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, selecting the grouping column of the target.
    level : name/number, defaults to None
        Level of the target index to group on.
    freq : str / frequency object, defaults to None
        Group by this frequency when the selected target (via key or level)
        is datetime-like. The available frequencies are listed `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether the resulting labels are sorted.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary used for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.
    base : int, default 0
        Only when `freq` parameter is passed.
    loffset : str, DateOffset, timedelta object
        Only when `freq` parameter is passed.

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------

    Syntactic sugar for ``df.groupby('A')``

    >>> df.groupby(Grouper(key='A'))

    Specify a resample operation on the column 'date'

    >>> df.groupby(Grouper(key='date', freq='60s'))

    Specify a resample operation on the level 'date' on the columns axis
    with a frequency of 60s

    >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
    """

    _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort")

    def __new__(cls, *args, **kwargs):
        # Without a frequency this is an ordinary Grouper.
        if kwargs.get("freq") is None:
            return super().__new__(cls)

        # A frequency keyword switches the instance to TimeGrouper; the
        # import is local, as in the original layout of this module.
        from pandas.core.resample import TimeGrouper

        return super().__new__(TimeGrouper)

    def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
        # user-supplied specification
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        # state filled in once the specification is bound to an object
        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None
        self._grouper = None

    @property
    def ax(self):
        """Alias for the current ``grouper`` attribute."""
        return self.grouper

    def _get_grouper(self, obj, validate: bool = True):
        """
        Bind this specification to ``obj`` and build the grouper for it.

        Parameters
        ----------
        obj : the subject object
        validate : boolean, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        built, _, bound_obj = get_grouper(
            self.obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
        )
        self.grouper = built
        self.obj = bound_obj
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
        """
        Resolve the key/level of this specification against ``obj`` and store
        the resulting axis on ``self.grouper`` (and ``obj`` on ``self.obj``).

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Remember the first grouper value before it gets overwritten below.
        if self._grouper is None:
            self._grouper = self.grouper

        key = self.key
        if key is None:
            ax = obj._get_axis(self.axis)
            level = self.level
            if level is not None:
                # a level must be a MultiIndex level, or else refer to the
                # axis itself (position 0 or the axis name)
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])
                elif level not in (0, ax.name):
                    raise ValueError(f"The level {level} is not valid")
        else:
            # the key must be a valid info item
            already_on = getattr(self.grouper, "name", None) == key and isinstance(
                obj, ABCSeries
            )
            if already_on:
                # the 'on' is already defined: realign the stored grouper
                ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # a stable sort keeps first, last and nth meaningful
            order = ax.argsort(kind="mergesort")
            self.indexer = order
            ax = ax.take(order)
            obj = obj.take(order, axis=self.axis)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    @property
    def groups(self):
        """The ``groups`` attribute of the current grouper."""
        return self.grouper.groups

    def __repr__(self) -> str:
        # Only attributes that are set (not None) appear in the repr.
        shown = []
        for attr_name in self._attributes:
            value = getattr(self, attr_name)
            if value is not None:
                shown.append(f"{attr_name}={value!r}")
        return f"{type(self).__name__}({', '.join(shown)})"
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
        The axis being grouped. A grouper that is mapped over it must yield
        a result of the same length.
    grouper :
        The grouping specification. It is first normalised by
        ``_convert_grouper`` and may afterwards be, among other things,
        None, a ``Grouper``, a list/tuple, a categorical, another
        ``Grouping`` or something that ``index.map`` accepts.
    obj Union[DataFrame, Series]:
        Object used to look up ``obj[name]`` when no grouper is given, and
        handed to a passed ``Grouper``.
    name :
        Name of the grouping. When None it is taken from a Series/Index
        grouper, from the index level, or from a passed ``Grouper``.
    level :
        Position (int) or name of a level of ``index``.
    sort : bool, default True
        Passed to ``recode_for_groupby`` / ``algorithms.factorize``.
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list

    Returns
    -------
    **Attributes**:
      * indices : dict of {group -> index_list}
      * codes : ndarray, group codes
      * group_index : unique groups
      * groups : dict of {group -> label_list}
    """

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: Optional[FrameOrSeries] = None,
        name=None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
    ):
        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        # only set for categorical groupers (second value returned by
        # recode_for_groupby); consulted by result_index
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis

        # right place for this?
        if isinstance(grouper, (Series, Index)) and name is None:
            self.name = grouper.name

        if isinstance(grouper, MultiIndex):
            self.grouper = grouper.values

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        if level is not None:
            # a non-int level is treated as a level name and translated to
            # its position in index.names
            if not isinstance(level, int):
                if level not in index.names:
                    raise AssertionError(f"Level {level} not in index")
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            # the index supplies grouper, codes and group index for the level
            (
                self.grouper,
                self._codes,
                self._group_index,
            ) = index._get_grouper_for_level(self.grouper, level)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(self.grouper, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
            if self.name is None:
                self.name = grouper.result_index.name
            # the Grouper may have replaced obj (e.g. after sorting)
            self.obj = self.grouper.obj
            self.grouper = grouper._get_grouper()

        else:
            # no grouper given: fall back to the column/item called `name`
            if self.grouper is None and self.name is not None and self.obj is not None:
                self.grouper = self.obj[self.name]

            elif isinstance(self.grouper, (list, tuple)):
                self.grouper = com.asarray_tuplesafe(self.grouper)

            # a passed Categorical
            elif is_categorical_dtype(self.grouper):

                self.grouper, self.all_grouper = recode_for_groupby(
                    self.grouper, self.sort, observed
                )
                categories = self.grouper.categories

                # we make a CategoricalIndex out of the cat grouper
                # preserving the categories / ordered attributes
                self._codes = self.grouper.codes
                if observed:
                    # keep only codes that occur, dropping the -1 code
                    codes = algorithms.unique1d(self.grouper.codes)
                    codes = codes[codes != -1]
                    if sort or self.grouper.ordered:
                        codes = np.sort(codes)
                else:
                    # one code per category, whether it occurs or not
                    codes = np.arange(len(categories))

                self._group_index = CategoricalIndex(
                    Categorical.from_codes(
                        codes=codes, categories=categories, ordered=self.grouper.ordered
                    ),
                    name=self.name,
                )

            # we are done
            if isinstance(self.grouper, Grouping):
                self.grouper = self.grouper.grouper

            # no level passed
            elif not isinstance(
                self.grouper, (Series, Index, ExtensionArray, np.ndarray)
            ):
                # anything left is mapped over the index; it must be 1-d and
                # produce one label per index entry
                if getattr(self.grouper, "ndim", 1) != 1:
                    t = self.name or str(type(self.grouper))
                    raise ValueError(f"Grouper for '{t}' not 1-dimensional")
                self.grouper = self.index.map(self.grouper)
                if not (
                    hasattr(self.grouper, "__len__")
                    and len(self.grouper) == len(self.index)
                ):
                    grper = pprint_thing(self.grouper)
                    errmsg = (
                        "Grouper result violates len(labels) == "
                        f"len(data)\nresult: {grper}"
                    )
                    self.grouper = None  # Try for sanity
                    raise AssertionError(errmsg)

        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        if getattr(self.grouper, "dtype", None) is not None:
            if is_datetime64_dtype(self.grouper):
                self.grouper = self.grouper.astype("datetime64[ns]")
            elif is_timedelta64_dtype(self.grouper):

                self.grouper = self.grouper.astype("timedelta64[ns]")

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self):
        # iterates over whatever `indices` yields when iterated
        return iter(self.indices)

    # Class-level defaults; __init__ or _make_codes replaces them per instance.
    _codes: Optional[np.ndarray] = None
    _group_index: Optional[Index] = None

    @property
    def ngroups(self) -> int:
        """Number of entries in ``group_index``."""
        return len(self.group_index)

    @cache_readonly
    def indices(self):
        """
        Indices per group: delegated to the BaseGrouper when the grouper is
        one, otherwise computed from the grouper converted to categorical.
        """
        # we have a list of groupers
        if isinstance(self.grouper, ops.BaseGrouper):
            return self.grouper.indices

        values = ensure_categorical(self.grouper)
        return values._reverse_indexer()

    @property
    def codes(self) -> np.ndarray:
        """Group codes, computed by ``_make_codes`` on first access."""
        if self._codes is None:
            self._make_codes()
        return self._codes

    @cache_readonly
    def result_index(self) -> Index:
        """
        ``group_index``, passed through ``recode_from_groupby`` when
        ``all_grouper`` is set.
        """
        if self.all_grouper is not None:
            return recode_from_groupby(self.all_grouper, self.sort, self.group_index)
        return self.group_index

    @property
    def group_index(self) -> Index:
        """Index of the groups, computed by ``_make_codes`` on first access."""
        if self._group_index is None:
            self._make_codes()
        assert self._group_index is not None
        return self._group_index

    def _make_codes(self) -> None:
        """
        Fill ``_codes`` and ``_group_index`` when either is still None. Both
        are overwritten together.
        """
        if self._codes is None or self._group_index is None:
            # we have a list of groupers
            if isinstance(self.grouper, ops.BaseGrouper):
                codes = self.grouper.codes_info
                uniques = self.grouper.result_index
            else:
                codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
                uniques = Index(uniques, name=self.name)
            self._codes = codes
            self._group_index = uniques

    @cache_readonly
    def groups(self) -> Dict[Hashable, np.ndarray]:
        """Result of grouping ``index`` by a Categorical built from codes and group_index."""
        return self.index.groupby(Categorical.from_codes(self.codes, self.group_index))
def get_grouper(
    obj: FrameOrSeries,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    Parameters
    ----------
    obj : Series or DataFrame
        Object whose axis ``obj._get_axis(axis)`` is grouped.
    key : optional
        A ``Grouper``, a ``BaseGrouper``, a list of keys, or a single key
        (which is wrapped in a one-element list).
    axis : int, default 0
    level : optional
        Level(s) of the group axis. On a non-MultiIndex axis only a single
        level (0, -1 or the axis name) is accepted.
    sort : bool, default True
        Forwarded to each ``Grouping`` and to the ``BaseGrouper``.
    observed : bool, default False
        Forwarded to each ``Grouping``.
    mutated : bool, default False
        Forwarded to the ``BaseGrouper``.
    validate : bool, default True
        When True, a key found in ``obj`` is checked with
        ``obj._check_label_or_level_ambiguity``.

    Returns
    -------
    tuple of (grouper, exclusions, obj)
        ``exclusions`` collects the names of groupers taken from ``obj``
        and the keys of passed ``Grouper`` objects.

    Raises
    ------
    ValueError
        For an invalid ``level`` on a non-MultiIndex axis, for a categorical
        grouper whose length differs from the axis, or when no group keys
        are passed for a non-empty ``obj``.
    KeyError
        When a label-like key is neither in ``obj`` nor a level reference.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            # unwrap a one-element list-like level
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, [key.key], obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, [], obj

    # normalise to a list of keys; only a real list can be compared to the
    # axis length
    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        # if some entry is not a column/index name, the list as a whole is
        # used as one array of labels
        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    # pair every key with a level
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: List[Grouping] = []
    exclusions: List[Hashable] = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        # label-like keys always count; anything else must be locatable
        # in obj._data.items
        if not _is_label_like(key):
            items = obj._data.items
            try:
                items.get_loc(key)
            except (KeyError, TypeError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            # identity check: gpr must be the very object obj returns
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, ValueError):
            # TODO: ValueError: Given date string not likely a datetime.
            # should be KeyError?
            return False

    # NOTE: `gpr` and `level` are rebound inside the loop body before the
    # Grouping is built
    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr, axis=axis):
                # the key names a level: group by that level, not a column
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) "
                "must be same length"
            )

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                name=name,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        # empty obj and no keys: use a single empty Grouping
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
643def _is_label_like(val) -> bool:
644 return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
647def _convert_grouper(axis: Index, grouper):
648 if isinstance(grouper, dict):
649 return grouper.get
650 elif isinstance(grouper, Series):
651 if grouper.index.equals(axis):
652 return grouper._values
653 else:
654 return grouper.reindex(axis)._values
655 elif isinstance(grouper, (list, Series, Index, np.ndarray)):
656 if len(grouper) != len(axis):
657 raise ValueError("Grouper and axis must be same length")
658 return grouper
659 else:
660 return grouper