Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Provide user facing operators for doing the split part of the 

3split-apply-combine paradigm. 

4""" 

5 

6from typing import Dict, Hashable, List, Optional, Tuple 

7 

8import numpy as np 

9 

10from pandas._typing import FrameOrSeries 

11from pandas.util._decorators import cache_readonly 

12 

13from pandas.core.dtypes.common import ( 

14 ensure_categorical, 

15 is_categorical_dtype, 

16 is_datetime64_dtype, 

17 is_list_like, 

18 is_scalar, 

19 is_timedelta64_dtype, 

20) 

21from pandas.core.dtypes.generic import ABCSeries 

22 

23import pandas.core.algorithms as algorithms 

24from pandas.core.arrays import Categorical, ExtensionArray 

25import pandas.core.common as com 

26from pandas.core.frame import DataFrame 

27from pandas.core.groupby import ops 

28from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby 

29from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex 

30from pandas.core.series import Series 

31 

32from pandas.io.formats.printing import pprint_thing 

33 

34 

class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.
    base : int, default 0
        Only when `freq` parameter is passed.
    loffset : str, DateOffset, timedelta object
        Only when `freq` parameter is passed.

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------

    Syntactic sugar for ``df.groupby('A')``

    >>> df.groupby(Grouper(key='A'))

    Specify a resample operation on the column 'date'

    >>> df.groupby(Grouper(key='date', freq='60s'))

    Specify a resample operation on the level 'date' on the columns axis
    with a frequency of 60s

    >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
    """

    # Attribute names rendered by ``__repr__`` (only those that are not None).
    _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort")

    def __new__(cls, *args, **kwargs):
        # A non-None ``freq`` keyword switches the instantiated class to
        # TimeGrouper.  Only the keyword form is inspected; positional
        # arguments are not examined here.
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
        # User-supplied specification.
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        # State filled in later by ``_set_grouper`` / ``_get_grouper``.
        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None
        self._grouper = None

    @property
    def ax(self):
        """Return the currently stored ``grouper`` attribute."""
        return self.grouper

    def _get_grouper(self, obj, validate: bool = True):
        """
        Resolve this specification against ``obj``.

        Parameters
        ----------
        obj : the subject object
        validate : boolean, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """

        # ``_set_grouper`` stores the (possibly reordered) object on
        # ``self.obj``; that stored object is what gets handed on below.
        self._set_grouper(obj)
        self.grouper, _, self.obj = get_grouper(
            self.obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
        )
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted

        Returns
        -------
        The axis-like object stored on ``self.grouper``.

        Raises
        ------
        ValueError
            If both ``key`` and ``level`` are set, or if ``level`` is given
            for a non-MultiIndex axis and is neither 0 nor the axis name.
        KeyError
            If ``key`` is not found in ``obj._info_axis``.
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self.grouper value before overriding
        if self._grouper is None:
            self._grouper = self.grouper

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(self.grouper, "name", None) == key and isinstance(
                obj, ABCSeries
            ):
                ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        # ``is_monotonic_increasing`` is used instead of the ``is_monotonic``
        # alias, which newer pandas releases no longer provide.
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            indexer = self.indexer = ax.argsort(kind="mergesort")
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    @property
    def groups(self):
        """Return ``self.grouper.groups``."""
        return self.grouper.groups

    def __repr__(self) -> str:
        # Render only the attributes from ``_attributes`` that are not None.
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"

216 

217 

class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
        Axis being grouped; used to resolve ``level`` and as the target of
        ``index.map`` / ``index.groupby``.
    grouper :
        The grouping key; first normalised through ``_convert_grouper``.
    obj : DataFrame or Series, optional
        Object being grouped; used to look up ``obj[name]`` when no grouper
        is given and as the subject handed to a nested ``Grouper``.
    name :
        Name of the grouping; when None it is taken from a Series/Index
        grouper, from the index level, or from a nested Grouper's result.
    level :
        Level position or level name within ``index``.
    sort : bool, default True
        Passed to factorization / categorical recoding.
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list

    Returns
    -------
    **Attributes**:
      * indices : dict of {group -> index_list}
      * codes : ndarray, group codes
      * group_index : unique groups
      * groups : dict of {group -> label_list}
    """

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: Optional[FrameOrSeries] = None,
        name=None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
    ):
        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        # Set only on the categorical path below; consumed by ``result_index``.
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis

        # right place for this?
        if isinstance(grouper, (Series, Index)) and name is None:
            self.name = grouper.name

        if isinstance(grouper, MultiIndex):
            self.grouper = grouper.values

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        if level is not None:
            # Translate a level name into its position in ``index.names``.
            if not isinstance(level, int):
                if level not in index.names:
                    raise AssertionError(f"Level {level} not in index")
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            (
                self.grouper,
                self._codes,
                self._group_index,
            ) = index._get_grouper_for_level(self.grouper, level)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(self.grouper, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
            if self.name is None:
                self.name = grouper.result_index.name
            self.obj = self.grouper.obj
            self.grouper = grouper._get_grouper()

        else:
            if self.grouper is None and self.name is not None and self.obj is not None:
                self.grouper = self.obj[self.name]

            elif isinstance(self.grouper, (list, tuple)):
                self.grouper = com.asarray_tuplesafe(self.grouper)

            # a passed Categorical
            elif is_categorical_dtype(self.grouper):

                self.grouper, self.all_grouper = recode_for_groupby(
                    self.grouper, self.sort, observed
                )
                categories = self.grouper.categories

                # we make a CategoricalIndex out of the cat grouper
                # preserving the categories / ordered attributes
                self._codes = self.grouper.codes
                if observed:
                    # Only codes actually present, excluding the -1 code.
                    codes = algorithms.unique1d(self.grouper.codes)
                    codes = codes[codes != -1]
                    if sort or self.grouper.ordered:
                        codes = np.sort(codes)
                else:
                    # One code per category, whether observed or not.
                    codes = np.arange(len(categories))

                self._group_index = CategoricalIndex(
                    Categorical.from_codes(
                        codes=codes, categories=categories, ordered=self.grouper.ordered
                    ),
                    name=self.name,
                )

            # we are done
            if isinstance(self.grouper, Grouping):
                self.grouper = self.grouper.grouper

            # no level passed
            elif not isinstance(
                self.grouper, (Series, Index, ExtensionArray, np.ndarray)
            ):
                if getattr(self.grouper, "ndim", 1) != 1:
                    t = self.name or str(type(self.grouper))
                    raise ValueError(f"Grouper for '{t}' not 1-dimensional")
                # Anything else is applied to the index via ``Index.map``.
                self.grouper = self.index.map(self.grouper)
                if not (
                    hasattr(self.grouper, "__len__")
                    and len(self.grouper) == len(self.index)
                ):
                    grper = pprint_thing(self.grouper)
                    errmsg = (
                        "Grouper result violates len(labels) == "
                        f"len(data)\nresult: {grper}"
                    )
                    self.grouper = None  # Try for sanity
                    raise AssertionError(errmsg)

        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        if getattr(self.grouper, "dtype", None) is not None:
            if is_datetime64_dtype(self.grouper):
                self.grouper = self.grouper.astype("datetime64[ns]")
            elif is_timedelta64_dtype(self.grouper):

                self.grouper = self.grouper.astype("timedelta64[ns]")

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self):
        # Iterates over ``self.indices``.
        return iter(self.indices)

    # Class-level defaults; instances overwrite them in ``__init__`` (level and
    # categorical paths) or lazily in ``_make_codes``.
    _codes: Optional[np.ndarray] = None
    _group_index: Optional[Index] = None

    @property
    def ngroups(self) -> int:
        """Number of entries in ``group_index``."""
        return len(self.group_index)

    @cache_readonly
    def indices(self):
        """
        Return the ``indices`` of a BaseGrouper grouper, otherwise the
        reverse indexer of the grouper converted via ``ensure_categorical``.
        """
        # we have a list of groupers
        if isinstance(self.grouper, ops.BaseGrouper):
            return self.grouper.indices

        values = ensure_categorical(self.grouper)
        return values._reverse_indexer()

    @property
    def codes(self) -> np.ndarray:
        """Group codes, computed by ``_make_codes`` if not already set."""
        if self._codes is None:
            self._make_codes()
        return self._codes

    @cache_readonly
    def result_index(self) -> Index:
        """
        Return ``group_index``, passed through ``recode_from_groupby`` when
        ``all_grouper`` was set on the categorical path.
        """
        if self.all_grouper is not None:
            return recode_from_groupby(self.all_grouper, self.sort, self.group_index)
        return self.group_index

    @property
    def group_index(self) -> Index:
        """Unique groups, computed by ``_make_codes`` if not already set."""
        if self._group_index is None:
            self._make_codes()
            assert self._group_index is not None
        return self._group_index

    def _make_codes(self) -> None:
        """
        Populate ``_codes`` and ``_group_index`` if either is still None.

        A BaseGrouper grouper supplies ``codes_info`` and ``result_index``;
        anything else is factorized with ``sort=self.sort``.
        """
        if self._codes is None or self._group_index is None:
            # we have a list of groupers
            if isinstance(self.grouper, ops.BaseGrouper):
                codes = self.grouper.codes_info
                uniques = self.grouper.result_index
            else:
                codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
                uniques = Index(uniques, name=self.name)
            self._codes = codes
            self._group_index = uniques

    @cache_readonly
    def groups(self) -> Dict[Hashable, np.ndarray]:
        """
        Group ``self.index`` by a Categorical built from ``codes`` and
        ``group_index``.
        """
        return self.index.groupby(Categorical.from_codes(self.codes, self.group_index))

424 

425 

def get_grouper(
    obj: FrameOrSeries,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    Parameters
    ----------
    obj : Series or DataFrame
        Object whose ``axis`` is grouped.
    key : optional
        A Grouper, an ``ops.BaseGrouper``, a list of keys, or a single key.
    axis : int, default 0
        Axis of ``obj`` to group along.
    level : optional
        Level (or list-like of levels) of the grouping axis.
    sort : bool, default True
        Forwarded to every Grouping and to the BaseGrouper.
    observed : bool, default False
        Forwarded to every Grouping.
    mutated : bool, default False
        Forwarded to the BaseGrouper.
    validate : bool, default True
        If True, call ``obj._check_label_or_level_ambiguity`` for keys found
        in ``obj``.

    Returns
    -------
    tuple of (grouper, exclusions, obj)
        ``exclusions`` lists the names collected while resolving the keys;
        ``obj`` is the object returned by a passed Grouper, else the input.

    Raises
    ------
    ValueError
        For an invalid ``level`` on a non-MultiIndex axis, a categorical
        grouper whose length differs from the axis, or when no group keys
        are passed for a non-empty ``obj``.
    KeyError
        If a label-like key is neither in ``obj`` nor a level reference.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, [key.key], obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, [], obj

    # Normalise ``key`` to a list; only a real list can be treated as a
    # candidate replacement for the whole axis.
    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        # Not every entry names a column/level: treat the list as one
        # array of group labels instead of a list of keys.
        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    # Pair every key with a level: one level per key when a list/tuple of
    # levels was given, otherwise the same level repeated.
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: List[Grouping] = []
    exclusions: List[Hashable] = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            items = obj._data.items
            try:
                items.get_loc(key)
            except (KeyError, TypeError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, ValueError):
            # TODO: ValueError: Given date string not likely a datetime.
            # should be KeyError?
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr, axis=axis):
                # The label names a level: group by that level, no key.
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) "
                "must be same length"
            )

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                name=name,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        # Empty object and no keys: fall back to a single empty Grouping.
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj

641 

642 

def _is_label_like(val) -> bool:
    """
    Return True if ``val`` is a str, a tuple, or a non-None scalar.
    """
    if isinstance(val, (str, tuple)):
        return True
    # None is rejected explicitly before the scalar check.
    if val is None:
        return False
    return is_scalar(val)

645 

646 

def _convert_grouper(axis: Index, grouper):
    """
    Normalise a user-supplied grouper relative to ``axis``.

    * dict -> its bound ``get`` method
    * Series -> its ``_values``, reindexed to ``axis`` first when the
      Series index does not equal ``axis``
    * list / Index / ndarray -> returned unchanged after a length check
    * anything else -> returned unchanged

    Raises
    ------
    ValueError
        If a list, Index or ndarray grouper differs in length from ``axis``.
    """
    if isinstance(grouper, dict):
        return grouper.get

    if isinstance(grouper, Series):
        aligned = grouper if grouper.index.equals(axis) else grouper.reindex(axis)
        return aligned._values

    # Series never reaches this point, so only the remaining array-likes
    # need the length check.
    needs_length_check = isinstance(grouper, (list, Index, np.ndarray))
    if needs_length_check and len(grouper) != len(axis):
        raise ValueError("Grouper and axis must be same length")

    return grouper

660 return grouper