1""" 

2Define the SeriesGroupBy and DataFrameGroupBy 

3classes that hold the groupby interfaces (and some implementations). 

4 

5These are user facing as the result of the ``df.groupby(...)`` operations, 

6which here return a DataFrameGroupBy object. 

7""" 

8from collections import abc, defaultdict, namedtuple 

9import copy 

10from functools import partial 

11from textwrap import dedent 

12import typing 

13from typing import ( 

14 TYPE_CHECKING, 

15 Any, 

16 Callable, 

17 Dict, 

18 FrozenSet, 

19 Iterable, 

20 List, 

21 Mapping, 

22 Sequence, 

23 Tuple, 

24 Type, 

25 Union, 

26 cast, 

27) 

28import warnings 

29 

30import numpy as np 

31 

32from pandas._libs import Timestamp, lib 

33from pandas._typing import FrameOrSeries 

34from pandas.util._decorators import Appender, Substitution 

35 

36from pandas.core.dtypes.cast import ( 

37 maybe_convert_objects, 

38 maybe_downcast_numeric, 

39 maybe_downcast_to_dtype, 

40) 

41from pandas.core.dtypes.common import ( 

42 ensure_int64, 

43 ensure_platform_int, 

44 is_bool, 

45 is_dict_like, 

46 is_integer_dtype, 

47 is_interval_dtype, 

48 is_list_like, 

49 is_numeric_dtype, 

50 is_object_dtype, 

51 is_scalar, 

52 needs_i8_conversion, 

53) 

54from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna 

55 

56import pandas.core.algorithms as algorithms 

57from pandas.core.base import DataError, SpecificationError 

58import pandas.core.common as com 

59from pandas.core.construction import create_series_with_explicit_dtype 

60from pandas.core.frame import DataFrame 

61from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs 

62from pandas.core.groupby import base 

63from pandas.core.groupby.groupby import ( 

64 GroupBy, 

65 _apply_docs, 

66 _transform_template, 

67 get_groupby, 

68) 

69from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same 

70import pandas.core.indexes.base as ibase 

71from pandas.core.internals import BlockManager, make_block 

72from pandas.core.series import Series 

73 

74from pandas.plotting import boxplot_frame_groupby 

75 

76if TYPE_CHECKING: 

77 from pandas.core.internals import Block 

78 

79 

80NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) 

81# TODO(typing) the return value on this callable should be any *scalar*. 

82AggScalar = Union[str, Callable[..., Any]] 

83# TODO: validate types on ScalarResult and move to _typing 

84# Blocked from using by https://github.com/python/mypy/issues/1484 

85# See note at _mangle_lambda_list 

86ScalarResult = typing.TypeVar("ScalarResult") 
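# Usage sketch for ``NamedAgg`` (mirrors the named-aggregation examples in the
# DataFrameGroupBy.aggregate docstring below; the frame here is hypothetical):
#
#   >>> df = pd.DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})
#   >>> df.groupby("A").agg(b_min=NamedAgg(column="B", aggfunc="min"))
#      b_min
#   A
#   1      1
#   2      3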

87 

88 

89def generate_property(name: str, klass: Type[FrameOrSeries]): 

90 """ 

91 Create a property for a GroupBy subclass to dispatch to DataFrame/Series. 

92 

93 Parameters 

94 ---------- 

95 name : str 

96 klass : {DataFrame, Series} 

97 

98 Returns 

99 ------- 

100 property 

101 """ 

102 

103 def prop(self): 

104 return self._make_wrapper(name) 

105 

106 parent_method = getattr(klass, name) 

107 prop.__doc__ = parent_method.__doc__ or "" 

108 prop.__name__ = name 

109 return property(prop) 

110 

111 

112def pin_whitelisted_properties(klass: Type[FrameOrSeries], whitelist: FrozenSet[str]): 

113 """ 

114 Create GroupBy member defs for DataFrame/Series names in a whitelist. 

115 

116 Parameters 

117 ---------- 

118 klass : DataFrame or Series class 

119 class where members are defined. 

120 whitelist : frozenset[str] 

121 Set of names of klass methods to be constructed 

122 

123 Returns 

124 ------- 

125 class decorator 

126 

127 Notes 

128 ----- 

129 Since we don't want to override methods explicitly defined in the 

130 base class, any such name is skipped. 

131 """ 

132 

133 def pinner(cls): 

134 for name in whitelist: 

135 if hasattr(cls, name): 

136 # don't override anything that was explicitly defined 

137 # in the base class 

138 continue 

139 

140 prop = generate_property(name, klass) 

141 setattr(cls, name, prop) 

142 

143 return cls 

144 

145 return pinner 
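# ``pin_whitelisted_properties`` is applied as a class decorator below, e.g.
#
#     @pin_whitelisted_properties(Series, base.series_apply_whitelist)
#     class SeriesGroupBy(GroupBy):
#         ...
#
# so that each whitelisted name not already defined on the GroupBy subclass
# becomes a property dispatching to the underlying Series/DataFrame method via
# ``_make_wrapper`` (a sketch of the mechanism, not an exhaustive description).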

146 

147 

148@pin_whitelisted_properties(Series, base.series_apply_whitelist) 

149class SeriesGroupBy(GroupBy): 

150 _apply_whitelist = base.series_apply_whitelist 

151 

152 def _iterate_slices(self) -> Iterable[Series]: 

153 yield self._selected_obj 

154 

155 @property 

156 def _selection_name(self): 

157 """ 

158 Since we are a Series, we by definition only have 

159 a single name, which may be the result of a selection or 

160 the name of our object. 

161 """ 

162 if self._selection is None: 

163 return self.obj.name 

164 else: 

165 return self._selection 

166 

167 _agg_see_also_doc = dedent( 

168 """ 

169 See Also 

170 -------- 

171 pandas.Series.groupby.apply 

172 pandas.Series.groupby.transform 

173 pandas.Series.aggregate 

174 """ 

175 ) 

176 

177 _agg_examples_doc = dedent( 

178 """ 

179 Examples 

180 -------- 

181 >>> s = pd.Series([1, 2, 3, 4]) 

182 

183 >>> s 

184 0 1 

185 1 2 

186 2 3 

187 3 4 

188 dtype: int64 

189 

190 >>> s.groupby([1, 1, 2, 2]).min() 

191 1 1 

192 2 3 

193 dtype: int64 

194 

195 >>> s.groupby([1, 1, 2, 2]).agg('min') 

196 1 1 

197 2 3 

198 dtype: int64 

199 

200 >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) 

201 min max 

202 1 1 2 

203 2 3 4 

204 

205 The output column names can be controlled by passing 

206 the desired column names and aggregations as keyword arguments. 

207 

208 >>> s.groupby([1, 1, 2, 2]).agg( 

209 ... minimum='min', 

210 ... maximum='max', 

211 ... ) 

212 minimum maximum 

213 1 1 2 

214 2 3 4 

215 """ 

216 ) 

217 

218 @Appender( 

219 _apply_docs["template"].format( 

220 input="series", examples=_apply_docs["series_examples"] 

221 ) 

222 ) 

223 def apply(self, func, *args, **kwargs): 

224 return super().apply(func, *args, **kwargs) 

225 

226 @Substitution( 

227 see_also=_agg_see_also_doc, 

228 examples=_agg_examples_doc, 

229 versionadded="", 

230 klass="Series", 

231 axis="", 

232 ) 

233 @Appender(_shared_docs["aggregate"]) 

234 def aggregate(self, func=None, *args, **kwargs): 

235 

236 relabeling = func is None 

237 columns = None 

238 no_arg_message = "Must provide 'func' or named aggregation **kwargs." 

239 if relabeling: 

240 columns = list(kwargs) 

241 func = [kwargs[col] for col in columns] 

242 kwargs = {} 

243 if not columns: 

244 raise TypeError(no_arg_message) 

245 

246 if isinstance(func, str): 

247 return getattr(self, func)(*args, **kwargs) 

248 

249 elif isinstance(func, abc.Iterable): 

250 # Catch instances of lists / tuples 

251 # but not the class list / tuple itself. 

252 func = _maybe_mangle_lambdas(func) 

253 ret = self._aggregate_multiple_funcs(func) 

254 if relabeling: 

255 ret.columns = columns 

256 else: 

257 cyfunc = self._get_cython_func(func) 

258 if cyfunc and not args and not kwargs: 

259 return getattr(self, cyfunc)() 

260 

261 if self.grouper.nkeys > 1: 

262 return self._python_agg_general(func, *args, **kwargs) 

263 

264 try: 

265 return self._python_agg_general(func, *args, **kwargs) 

266 except (ValueError, KeyError): 

267 # TODO: KeyError is raised in _python_agg_general, 

268 # see test_groupby.test_basic 

269 result = self._aggregate_named(func, *args, **kwargs) 

270 

271 index = Index(sorted(result), name=self.grouper.names[0]) 

272 ret = create_series_with_explicit_dtype( 

273 result, index=index, dtype_if_empty=object 

274 ) 

275 

276 if not self.as_index: # pragma: no cover 

277 print("Warning, ignoring as_index=True") 

278 

279 if isinstance(ret, dict): 

280 from pandas import concat 

281 

282 ret = concat(ret, axis=1) 

283 return ret 

284 

285 agg = aggregate 

286 

287 def _aggregate_multiple_funcs(self, arg): 

288 if isinstance(arg, dict): 

289 

290 # show the deprecation, but only if we 

291 # have not shown a higher level one 

292 # GH 15931 

293 if isinstance(self._selected_obj, Series): 

294 raise SpecificationError("nested renamer is not supported") 

295 

296 columns = list(arg.keys()) 

297 arg = arg.items() 

298 elif any(isinstance(x, (tuple, list)) for x in arg): 

299 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] 

300 

301 # indicated column order 

302 columns = next(zip(*arg)) 

303 else: 

304 # list of functions / function names 

305 columns = [] 

306 for f in arg: 

307 columns.append(com.get_callable_name(f) or f) 

308 

309 arg = zip(columns, arg) 

310 

311 results = {} 

312 for name, func in arg: 

313 obj = self 

314 

315 # reset the cache so that we 

316 # only include the named selection 

317 if name in self._selected_obj: 

318 obj = copy.copy(obj) 

319 obj._reset_cache() 

320 obj._selection = name 

321 results[name] = obj.aggregate(func) 

322 

323 if any(isinstance(x, DataFrame) for x in results.values()): 

324 # let higher level handle 

325 return results 

326 

327 return DataFrame(results, columns=columns) 

328 

329 def _wrap_series_output( 

330 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index 

331 ) -> Union[Series, DataFrame]: 

332 """ 

333 Wraps the output of a SeriesGroupBy operation into the expected result. 

334 

335 Parameters 

336 ---------- 

337 output : Mapping[base.OutputKey, Union[Series, np.ndarray]] 

338 Data to wrap. 

339 index : pd.Index 

340 Index to apply to the output. 

341 

342 Returns 

343 ------- 

344 Series or DataFrame 

345 

346 Notes 

347 ----- 

348 In the vast majority of cases output and columns will only contain one 

349 element. The exception is operations that expand dimensions, like ohlc. 

350 """ 

351 indexed_output = {key.position: val for key, val in output.items()} 

352 columns = Index(key.label for key in output) 

353 

354 result: Union[Series, DataFrame] 

355 if len(output) > 1: 

356 result = DataFrame(indexed_output, index=index) 

357 result.columns = columns 

358 else: 

359 result = Series(indexed_output[0], index=index, name=columns[0]) 

360 

361 return result 

362 

363 def _wrap_aggregated_output( 

364 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] 

365 ) -> Union[Series, DataFrame]: 

366 """ 

367 Wraps the output of a SeriesGroupBy aggregation into the expected result. 

368 

369 Parameters 

370 ---------- 

371 output : Mapping[base.OutputKey, Union[Series, np.ndarray]] 

372 Data to wrap. 

373 

374 Returns 

375 ------- 

376 Series or DataFrame 

377 

378 Notes 

379 ----- 

380 In the vast majority of cases output will only contain one element. 

381 The exception is operations that expand dimensions, like ohlc. 

382 """ 

383 result = self._wrap_series_output( 

384 output=output, index=self.grouper.result_index 

385 ) 

386 return self._reindex_output(result)._convert(datetime=True) 

387 

388 def _wrap_transformed_output( 

389 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] 

390 ) -> Series: 

391 """ 

392 Wraps the output of a SeriesGroupBy aggregation into the expected result. 

393 

394 Parameters 

395 ---------- 

396 output : dict[base.OutputKey, Union[Series, np.ndarray]] 

397 Dict with a sole key of 0 and a value of the result values. 

398 

399 Returns 

400 ------- 

401 Series 

402 

403 Notes 

404 ----- 

405 output should always contain one element. It is specified as a dict 

406 for consistency with DataFrame methods and _wrap_aggregated_output. 

407 """ 

408 assert len(output) == 1 

409 result = self._wrap_series_output(output=output, index=self.obj.index) 

410 

411 # No transformations increase the ndim of the result 

412 assert isinstance(result, Series) 

413 return result 

414 

415 def _wrap_applied_output(self, keys, values, not_indexed_same=False): 

416 if len(keys) == 0: 

417 # GH #6265 

418 return Series([], name=self._selection_name, index=keys, dtype=np.float64) 

419 

420 def _get_index() -> Index: 

421 if self.grouper.nkeys > 1: 

422 index = MultiIndex.from_tuples(keys, names=self.grouper.names) 

423 else: 

424 index = Index(keys, name=self.grouper.names[0]) 

425 return index 

426 

427 if isinstance(values[0], dict): 

428 # GH #823 #24880 

429 index = _get_index() 

430 result = self._reindex_output(DataFrame(values, index=index)) 

431 # if self.observed is False, 

432 # keep all-NaN rows created while re-indexing 

433 result = result.stack(dropna=self.observed) 

434 result.name = self._selection_name 

435 return result 

436 

437 if isinstance(values[0], Series): 

438 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) 

439 elif isinstance(values[0], DataFrame): 

440 # possible that Series -> DataFrame by applied function 

441 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) 

442 else: 

443 # GH #6265 #24880 

444 result = Series(data=values, index=_get_index(), name=self._selection_name) 

445 return self._reindex_output(result) 

446 

447 def _aggregate_named(self, func, *args, **kwargs): 

448 result = {} 

449 

450 for name, group in self: 

451 group.name = name 

452 output = func(group, *args, **kwargs) 

453 if isinstance(output, (Series, Index, np.ndarray)): 

454 raise ValueError("Must produce aggregated value") 

455 result[name] = output 

456 

457 return result 

458 

459 @Substitution(klass="Series", selected="A.") 

460 @Appender(_transform_template) 

461 def transform(self, func, *args, **kwargs): 

462 func = self._get_cython_func(func) or func 

463 

464 if not isinstance(func, str): 

465 return self._transform_general(func, *args, **kwargs) 

466 

467 elif func not in base.transform_kernel_whitelist: 

468 msg = f"'{func}' is not a valid function name for transform(name)" 

469 raise ValueError(msg) 

470 elif func in base.cythonized_kernels: 

471 # cythonized transform or canned "agg+broadcast" 

472 return getattr(self, func)(*args, **kwargs) 

473 

474 # If func is a reduction, we need to broadcast the 

475 # result to the whole group. Compute func result 

476 # and deal with possible broadcasting below. 

477 result = getattr(self, func)(*args, **kwargs) 

478 return self._transform_fast(result, func) 

479 

480 def _transform_general(self, func, *args, **kwargs): 

481 """ 

482 Transform with a non-str `func`. 

483 """ 

484 klass = type(self._selected_obj) 

485 

486 results = [] 

487 for name, group in self: 

488 object.__setattr__(group, "name", name) 

489 res = func(group, *args, **kwargs) 

490 

491 if isinstance(res, (ABCDataFrame, ABCSeries)): 

492 res = res._values 

493 

494 indexer = self._get_index(name) 

495 ser = klass(res, indexer) 

496 results.append(ser) 

497 

498 # check for empty "results" to avoid concat ValueError 

499 if results: 

500 from pandas.core.reshape.concat import concat 

501 

502 result = concat(results).sort_index() 

503 else: 

504 result = Series(dtype=np.float64) 

505 

506 # we will only try to coerce the result type if 

507 # we have a numeric dtype, as these are *always* user-defined funcs 

508 # the cython functions take a different path (and casting) 

509 dtype = self._selected_obj.dtype 

510 if is_numeric_dtype(dtype): 

511 result = maybe_downcast_to_dtype(result, dtype) 

512 

513 result.name = self._selected_obj.name 

514 result.index = self._selected_obj.index 

515 return result 

516 

517 def _transform_fast(self, result, func_nm: str) -> Series: 

518 """ 

519 fast version of transform, only applicable to 

520 builtin/cythonizable functions 

521 """ 

522 ids, _, ngroup = self.grouper.group_info 

523 cast = self._transform_should_cast(func_nm) 

524 out = algorithms.take_1d(result._values, ids) 

525 if cast: 

526 out = self._try_cast(out, self.obj) 

527 return Series(out, index=self.obj.index, name=self.obj.name) 
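    # Rough sketch of the broadcast step above (the ids and per-group result
    # are hypothetical): with ids = [0, 0, 1] and an aggregated result of
    # [10, 20], take_1d(result._values, ids) yields [10, 10, 20], i.e. every
    # row receives its group's aggregated value.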

528 

529 def filter(self, func, dropna=True, *args, **kwargs): 

530 """ 

531 Return a copy of a Series excluding elements from groups that 

532 do not satisfy the boolean criterion specified by func. 

533 

534 Parameters 

535 ---------- 

536 func : function 

537 To apply to each group. Should return True or False. 

538 dropna : bool, default True 

539 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs. 

540 

541 Examples 

542 -------- 

543 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

544 ... 'foo', 'bar'], 

545 ... 'B' : [1, 2, 3, 4, 5, 6], 

546 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

547 >>> grouped = df.groupby('A') 

548 >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) 

549 1 2 

550 3 4 

551 5 6 

552 Name: B, dtype: int64 

553 

554 Returns 

555 ------- 

556 filtered : Series 

557 """ 

558 if isinstance(func, str): 

559 wrapper = lambda x: getattr(x, func)(*args, **kwargs) 

560 else: 

561 wrapper = lambda x: func(x, *args, **kwargs) 

562 

563 # Interpret np.nan as False. 

564 def true_and_notna(x, *args, **kwargs) -> bool: 

565 b = wrapper(x, *args, **kwargs) 

566 return b and notna(b) 

567 

568 try: 

569 indices = [ 

570 self._get_index(name) for name, group in self if true_and_notna(group) 

571 ] 

572 except (ValueError, TypeError): 

573 raise TypeError("the filter must return a boolean result") 

574 

575 filtered = self._apply_filter(indices, dropna) 

576 return filtered 

577 

578 def nunique(self, dropna: bool = True) -> Series: 

579 """ 

580 Return number of unique elements in the group. 

581 

582 Returns 

583 ------- 

584 Series 

585 Number of unique values within each group. 

586 """ 

587 ids, _, _ = self.grouper.group_info 

588 

589 val = self.obj._internal_get_values() 

590 

591 codes, _ = algorithms.factorize(val, sort=False) 

592 sorter = np.lexsort((codes, ids)) 

593 codes = codes[sorter] 

594 ids = ids[sorter] 

595 

596 # group boundaries are where group ids change 

597 # unique observations are where sorted values change 

598 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] 

599 inc = np.r_[1, codes[1:] != codes[:-1]] 

600 

601 # 1st item of each group is a new unique observation 

602 mask = codes == -1 

603 if dropna: 

604 inc[idx] = 1 

605 inc[mask] = 0 

606 else: 

607 inc[mask & np.r_[False, mask[:-1]]] = 0 

608 inc[idx] = 1 

609 

610 out = np.add.reduceat(inc, idx).astype("int64", copy=False) 

611 if len(ids): 

612 # NaN/NaT group exists if the head of ids is -1, 

613 # so remove it from res and exclude its index from idx 

614 if ids[0] == -1: 

615 res = out[1:] 

616 idx = idx[np.flatnonzero(idx)] 

617 else: 

618 res = out 

619 else: 

620 res = out[1:] 

621 ri = self.grouper.result_index 

622 

623 # we might have duplications among the bins 

624 if len(res) != len(ri): 

625 res, out = np.zeros(len(ri), dtype=out.dtype), res 

626 res[ids[idx]] = out 

627 

628 result = Series(res, index=ri, name=self._selection_name) 

629 return self._reindex_output(result, fill_value=0) 
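    # Worked sketch of the counting scheme above (hypothetical inputs): with
    # sorted ids = [0, 0, 0, 1] and factorized codes = [0, 0, 1, 0], we get
    # idx = [0, 3] (group starts) and inc = [1, 0, 1, 1] (new values, with
    # inc[idx] forced to 1), so np.add.reduceat(inc, idx) = [2, 1]: group 0
    # has two distinct values and group 1 has one.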

630 

631 @Appender(Series.describe.__doc__) 

632 def describe(self, **kwargs): 

633 result = self.apply(lambda x: x.describe(**kwargs)) 

634 if self.axis == 1: 

635 return result.T 

636 return result.unstack() 

637 

638 def value_counts( 

639 self, normalize=False, sort=True, ascending=False, bins=None, dropna=True 

640 ): 

641 

642 from pandas.core.reshape.tile import cut 

643 from pandas.core.reshape.merge import _get_join_indexers 

644 

645 if bins is not None and not np.iterable(bins): 

646 # scalar bins cannot be done at top level 

647 # in a backward compatible way 

648 return self.apply( 

649 Series.value_counts, 

650 normalize=normalize, 

651 sort=sort, 

652 ascending=ascending, 

653 bins=bins, 

654 ) 

655 

656 ids, _, _ = self.grouper.group_info 

657 val = self.obj._internal_get_values() 

658 

659 # groupby removes null keys from groupings 

660 mask = ids != -1 

661 ids, val = ids[mask], val[mask] 

662 

663 if bins is None: 

664 lab, lev = algorithms.factorize(val, sort=True) 

665 llab = lambda lab, inc: lab[inc] 

666 else: 

667 

668 # lab is a Categorical with categories an IntervalIndex 

669 lab = cut(Series(val), bins, include_lowest=True) 

670 lev = lab.cat.categories 

671 lab = lev.take(lab.cat.codes) 

672 llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] 

673 

674 if is_interval_dtype(lab): 

675 # TODO: should we do this inside II? 

676 sorter = np.lexsort((lab.left, lab.right, ids)) 

677 else: 

678 sorter = np.lexsort((lab, ids)) 

679 

680 ids, lab = ids[sorter], lab[sorter] 

681 

682 # group boundaries are where group ids change 

683 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] 

684 

685 # new values are where sorted labels change 

686 lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) 

687 inc = np.r_[True, lchanges] 

688 inc[idx] = True # group boundaries are also new values 

689 out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts 

690 

691 # num. of times each group should be repeated 

692 rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) 

693 

694 # multi-index components 

695 codes = self.grouper.reconstructed_codes 

696 codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] 

697 levels = [ping.group_index for ping in self.grouper.groupings] + [lev] 

698 names = self.grouper.names + [self._selection_name] 

699 

700 if dropna: 

701 mask = codes[-1] != -1 

702 if mask.all(): 

703 dropna = False 

704 else: 

705 out, codes = out[mask], [level_codes[mask] for level_codes in codes] 

706 

707 if normalize: 

708 out = out.astype("float") 

709 d = np.diff(np.r_[idx, len(ids)]) 

710 if dropna: 

711 m = ids[lab == -1] 

712 np.add.at(d, m, -1) 

713 acc = rep(d)[mask] 

714 else: 

715 acc = rep(d) 

716 out /= acc 

717 

718 if sort and bins is None: 

719 cat = ids[inc][mask] if dropna else ids[inc] 

720 sorter = np.lexsort((out if ascending else -out, cat)) 

721 out, codes[-1] = out[sorter], codes[-1][sorter] 

722 

723 if bins is None: 

724 mi = MultiIndex( 

725 levels=levels, codes=codes, names=names, verify_integrity=False 

726 ) 

727 

728 if is_integer_dtype(out): 

729 out = ensure_int64(out) 

730 return Series(out, index=mi, name=self._selection_name) 

731 

732 # for compat. with libgroupby.value_counts need to ensure every 

733 # bin is present at every index level, null filled with zeros 

734 diff = np.zeros(len(out), dtype="bool") 

735 for level_codes in codes[:-1]: 

736 diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] 

737 

738 ncat, nbin = diff.sum(), len(levels[-1]) 

739 

740 left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] 

741 

742 right = [diff.cumsum() - 1, codes[-1]] 

743 

744 _, idx = _get_join_indexers(left, right, sort=False, how="left") 

745 out = np.where(idx != -1, out[idx], 0) 

746 

747 if sort: 

748 sorter = np.lexsort((out if ascending else -out, left[0])) 

749 out, left[-1] = out[sorter], left[-1][sorter] 

750 

751 # build the multi-index w/ full levels 

752 def build_codes(lev_codes: np.ndarray) -> np.ndarray: 

753 return np.repeat(lev_codes[diff], nbin) 

754 

755 codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] 

756 codes.append(left[-1]) 

757 

758 mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) 

759 

760 if is_integer_dtype(out): 

761 out = ensure_int64(out) 

762 return Series(out, index=mi, name=self._selection_name) 
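    # Doctest-style sketch of the result shape (hypothetical data); the index
    # is a MultiIndex of the group keys plus the observed (or binned) values:
    #
    #   >>> s = pd.Series([1, 1, 2])
    #   >>> s.groupby([0, 0, 1]).value_counts()
    #   0  1    2
    #   1  2    1
    #   dtype: int64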

763 

764 def count(self) -> Series: 

765 """ 

766 Compute count of group, excluding missing values. 

767 

768 Returns 

769 ------- 

770 Series 

771 Count of values within each group. 

772 """ 

773 ids, _, ngroups = self.grouper.group_info 

774 val = self.obj._internal_get_values() 

775 

776 mask = (ids != -1) & ~isna(val) 

777 ids = ensure_platform_int(ids) 

778 minlength = ngroups or 0 

779 out = np.bincount(ids[mask], minlength=minlength) 

780 

781 result = Series( 

782 out, 

783 index=self.grouper.result_index, 

784 name=self._selection_name, 

785 dtype="int64", 

786 ) 

787 return self._reindex_output(result, fill_value=0) 
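    # Sketch of the bincount step above (hypothetical inputs): with
    # ids = [0, 0, 1] and values = [1.0, np.nan, 2.0], the mask keeps rows
    # [True, False, True] and np.bincount(ids[mask], minlength=2) == [1, 1],
    # i.e. one non-missing value per group.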

788 

789 def _apply_to_column_groupbys(self, func): 

790 """ return a pass thru """ 

791 return func(self) 

792 

793 def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): 

794 """Calculate pct_change of each value to previous entry in group""" 

795 # TODO: Remove this conditional when #23918 is fixed 

796 if freq: 

797 return self.apply( 

798 lambda x: x.pct_change( 

799 periods=periods, fill_method=fill_method, limit=limit, freq=freq 

800 ) 

801 ) 

802 if fill_method is None: # GH30463 

803 fill_method = "pad" 

804 limit = 0 

805 filled = getattr(self, fill_method)(limit=limit) 

806 fill_grp = filled.groupby(self.grouper.codes) 

807 shifted = fill_grp.shift(periods=periods, freq=freq) 

808 

809 return (filled / shifted) - 1 
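    # Arithmetic sketch of the formula above (hypothetical group): for group
    # values [1, 2, 4] with periods=1, ``filled / shifted - 1`` gives
    # [NaN, 2/1 - 1, 4/2 - 1] == [NaN, 1.0, 1.0].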

810 

811 

812@pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) 

813class DataFrameGroupBy(GroupBy): 

814 

815 _apply_whitelist = base.dataframe_apply_whitelist 

816 

817 _agg_see_also_doc = dedent( 

818 """ 

819 See Also 

820 -------- 

821 pandas.DataFrame.groupby.apply 

822 pandas.DataFrame.groupby.transform 

823 pandas.DataFrame.aggregate 

824 """ 

825 ) 

826 

827 _agg_examples_doc = dedent( 

828 """ 

829 Examples 

830 -------- 

831 

832 >>> df = pd.DataFrame({'A': [1, 1, 2, 2], 

833 ... 'B': [1, 2, 3, 4], 

834 ... 'C': np.random.randn(4)}) 

835 

836 >>> df 

837 A B C 

838 0 1 1 0.362838 

839 1 1 2 0.227877 

840 2 2 3 1.267767 

841 3 2 4 -0.562860 

842 

843 The aggregation is for each column. 

844 

845 >>> df.groupby('A').agg('min') 

846 B C 

847 A 

848 1 1 0.227877 

849 2 3 -0.562860 

850 

851 Multiple aggregations 

852 

853 >>> df.groupby('A').agg(['min', 'max']) 

854 B C 

855 min max min max 

856 A 

857 1 1 2 0.227877 0.362838 

858 2 3 4 -0.562860 1.267767 

859 

860 Select a column for aggregation 

861 

862 >>> df.groupby('A').B.agg(['min', 'max']) 

863 min max 

864 A 

865 1 1 2 

866 2 3 4 

867 

868 Different aggregations per column 

869 

870 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) 

871 B C 

872 min max sum 

873 A 

874 1 1 2 0.590716 

875 2 3 4 0.704907 

876 

877 To control the output names with different aggregations per column, 

878 pandas supports "named aggregation" 

879 

880 >>> df.groupby("A").agg( 

881 ... b_min=pd.NamedAgg(column="B", aggfunc="min"), 

882 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) 

883 b_min c_sum 

884 A 

885 1 1 -1.956929 

886 2 3 -0.322183 

887 

888 - The keywords are the *output* column names 

889 - The values are tuples whose first element is the column to select 

890 and the second element is the aggregation to apply to that column. 

891 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields 

892 ``['column', 'aggfunc']`` to make it clearer what the arguments are. 

893 As usual, the aggregation can be a callable or a string alias. 

894 

895 See :ref:`groupby.aggregate.named` for more. 

896 """ 

897 ) 

898 

899 @Substitution( 

900 see_also=_agg_see_also_doc, 

901 examples=_agg_examples_doc, 

902 versionadded="", 

903 klass="DataFrame", 

904 axis="", 

905 ) 

906 @Appender(_shared_docs["aggregate"]) 

907 def aggregate(self, func=None, *args, **kwargs): 

908 

909 relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) 

910 if relabeling: 

911 func, columns, order = _normalize_keyword_aggregation(kwargs) 

912 

913 kwargs = {} 

914 elif isinstance(func, list) and len(func) > len(set(func)): 

915 

916 # GH 28426 will raise error if duplicated function names are used and 

917 # there is no reassigned name 

918 raise SpecificationError( 

919 "Function names must be unique if there is no new column " 

920 "names assigned" 

921 ) 

922 elif func is None: 

923 # nicer error message 

924 raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") 

925 

926 func = _maybe_mangle_lambdas(func) 

927 

928 result, how = self._aggregate(func, *args, **kwargs) 

929 if how is None: 

930 return result 

931 

932 if result is None: 

933 

934 # grouper specific aggregations 

935 if self.grouper.nkeys > 1: 

936 return self._python_agg_general(func, *args, **kwargs) 

937 elif args or kwargs: 

938 result = self._aggregate_frame(func, *args, **kwargs) 

939 

940 elif self.axis == 1: 

941 # _aggregate_multiple_funcs does not allow self.axis == 1 

942 result = self._aggregate_frame(func) 

943 

944 else: 

945 

946 # try to treat as if we are passing a list 

947 try: 

948 result = self._aggregate_multiple_funcs([func], _axis=self.axis) 

949 except ValueError as err: 

950 if "no results" not in str(err): 

951 # raised directly by _aggregate_multiple_funcs 

952 raise 

953 result = self._aggregate_frame(func) 

954 else: 

955 # select everything except for the last level, which is the one 

956 # containing the name of the function(s), see GH 32040 

957 result.columns = result.columns.rename( 

958 [self._selected_obj.columns.name] * result.columns.nlevels 

959 ).droplevel(-1) 

960 

961 if not self.as_index: 

962 self._insert_inaxis_grouper_inplace(result) 

963 result.index = np.arange(len(result)) 

964 

965 if relabeling: 

966 

967 # use the reordered index of columns 

968 result = result.iloc[:, order] 

969 result.columns = columns 

970 

971 return result._convert(datetime=True) 

972 

973 agg = aggregate 

974 

975 def _iterate_slices(self) -> Iterable[Series]: 

976 obj = self._selected_obj 

977 if self.axis == 1: 

978 obj = obj.T 

979 

980 if isinstance(obj, Series) and obj.name not in self.exclusions: 

981 # Occurs when doing DataFrameGroupBy(...)["X"] 

982 yield obj 

983 else: 

984 for label, values in obj.items(): 

985 if label in self.exclusions: 

986 continue 

987 

988 yield values 

989 

990 def _cython_agg_general( 

991 self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 

992 ) -> DataFrame: 

993 agg_blocks, agg_items = self._cython_agg_blocks( 

994 how, alt=alt, numeric_only=numeric_only, min_count=min_count 

995 ) 

996 return self._wrap_agged_blocks(agg_blocks, items=agg_items) 

997 

998 def _cython_agg_blocks( 

999 self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 

1000 ) -> "Tuple[List[Block], Index]": 

1001 # TODO: the actual managing of mgr_locs is a PITA 

1002 # here, it should happen via BlockManager.combine 

1003 

1004 data: BlockManager = self._get_data_to_aggregate() 

1005 

1006 if numeric_only: 

1007 data = data.get_numeric_data(copy=False) 

1008 

1009 agg_blocks: List[Block] = [] 

1010 new_items: List[np.ndarray] = [] 

1011 deleted_items: List[np.ndarray] = [] 

1012 # Some object-dtype blocks might be split into List[Block[T], Block[U]] 

1013 split_items: List[np.ndarray] = [] 

1014 split_frames: List[DataFrame] = [] 

1015 

1016 no_result = object() 

1017 for block in data.blocks: 

1018 # Avoid inheriting result from earlier in the loop 

1019 result = no_result 

1020 locs = block.mgr_locs.as_array 

1021 try: 

1022 result, _ = self.grouper.aggregate( 

1023 block.values, how, axis=1, min_count=min_count 

1024 ) 

1025 except NotImplementedError: 

1026 # generally if we have numeric_only=False 

1027 # and non-applicable functions 

1028 # try to python agg 

1029 

1030 if alt is None: 

1031 # we cannot perform the operation 

1032 # in an alternate way, exclude the block 

1033 assert how == "ohlc" 

1034 deleted_items.append(locs) 

1035 continue 

1036 

1037 # call our grouper again with only this block 

1038 obj = self.obj[data.items[locs]] 

1039 if obj.shape[1] == 1: 

1040 # Avoid call to self.values that can occur in DataFrame 

1041 # reductions; see GH#28949 

1042 obj = obj.iloc[:, 0] 

1043 

1044 s = get_groupby(obj, self.grouper) 

1045 try: 

1046 result = s.aggregate(lambda x: alt(x, axis=self.axis)) 

1047 except TypeError: 

1048 # we may have an exception in trying to aggregate 

1049 # continue and exclude the block 

1050 deleted_items.append(locs) 

1051 continue 

1052 else: 

1053 result = cast(DataFrame, result) 

1054 # unwrap DataFrame to get array 

1055 if len(result._data.blocks) != 1: 

1056 # We've split an object block! Everything we've assumed 

1057 # about a single block input returning a single block output 

1058 # is a lie. To keep the code-path for the typical non-split case 

1059 # clean, we choose to clean up this mess later on. 

1060 split_items.append(locs) 

1061 split_frames.append(result) 

1062 continue 

1063 

1064 assert len(result._data.blocks) == 1 

1065 result = result._data.blocks[0].values 

1066 if isinstance(result, np.ndarray) and result.ndim == 1: 

1067 result = result.reshape(1, -1) 

1068 

1069 assert not isinstance(result, DataFrame) 

1070 

1071 if result is not no_result: 

1072 # see if we can cast the block back to the original dtype 

1073 result = maybe_downcast_numeric(result, block.dtype) 

1074 

1075 if block.is_extension and isinstance(result, np.ndarray): 

1076 # e.g. block.values was an IntegerArray 

1077 # (1, N) case can occur if block.values was Categorical 

1078 # and result is ndarray[object] 

1079 assert result.ndim == 1 or result.shape[0] == 1 

1080 try: 

1081 # Cast back if feasible 

1082 result = type(block.values)._from_sequence( 

1083 result.ravel(), dtype=block.values.dtype 

1084 ) 

1085 except (ValueError, TypeError): 

1086 # reshape to be valid for non-Extension Block 

1087 result = result.reshape(1, -1) 

1088 

1089 agg_block: Block = block.make_block(result) 

1090 

1091 new_items.append(locs) 

1092 agg_blocks.append(agg_block) 

1093 

1094 if not (agg_blocks or split_frames): 

1095 raise DataError("No numeric types to aggregate") 

1096 

1097 if split_items: 

1098 # Clean up the mess left over from split blocks. 

1099 for locs, result in zip(split_items, split_frames): 

1100 assert len(locs) == result.shape[1] 

1101 for i, loc in enumerate(locs): 

1102 new_items.append(np.array([loc], dtype=locs.dtype)) 

1103 agg_blocks.append(result.iloc[:, [i]]._data.blocks[0]) 

1104 

1105 # reset the locs in the blocks to correspond to our 

1106 # current ordering 

1107 indexer = np.concatenate(new_items) 

1108 agg_items = data.items.take(np.sort(indexer)) 

1109 

1110 if deleted_items: 

1111 

1112 # we need to adjust the indexer to account for the 

1113 # items we have removed 

1114 # really should be done in internals :< 

1115 

1116 deleted = np.concatenate(deleted_items) 

1117 ai = np.arange(len(data)) 

1118 mask = np.zeros(len(data)) 

1119 mask[deleted] = 1 

1120 indexer = (ai - mask.cumsum())[indexer] 

1121 

1122 offset = 0 

1123 for blk in agg_blocks: 

1124 loc = len(blk.mgr_locs) 

1125 blk.mgr_locs = indexer[offset : (offset + loc)] 

1126 offset += loc 

1127 

1128 return agg_blocks, agg_items 
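    # Worked sketch of the indexer adjustment above (hypothetical positions):
    # with 4 original items and deleted_items == [1], mask.cumsum() is
    # [0, 1, 1, 1], so an indexer of [0, 2, 3] becomes [0, 1, 2]; positions
    # are renumbered as if the deleted item never existed.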

1129 

1130 def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: 

1131 if self.grouper.nkeys != 1: 

1132 raise AssertionError("Number of keys must be 1") 

1133 

1134 axis = self.axis 

1135 obj = self._obj_with_exclusions 

1136 

1137 result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} 

1138 if axis != obj._info_axis_number: 

1139 for name, data in self: 

1140 fres = func(data, *args, **kwargs) 

1141 result[name] = fres 

1142 else: 

1143 for name in self.indices: 

1144 data = self.get_group(name, obj=obj) 

1145 fres = func(data, *args, **kwargs) 

1146 result[name] = fres 

1147 

1148 return self._wrap_frame_output(result, obj) 

1149 

1150 def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: 

1151 # only for axis==0 

1152 

1153 obj = self._obj_with_exclusions 

1154 result: Dict[Union[int, str], NDFrame] = {} 

1155 cannot_agg = [] 

1156 for item in obj: 

1157 data = obj[item] 

1158 colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) 

1159 

1160 cast = self._transform_should_cast(func) 

1161 try: 

1162 result[item] = colg.aggregate(func, *args, **kwargs) 

1163 

1164 except ValueError as err: 

1165 if "Must produce aggregated value" in str(err): 

1166 # raised in _aggregate_named, handle at higher level 

1167 # see test_apply_with_mutated_index 

1168 raise 

1169 # otherwise we get here from an AttributeError in _make_wrapper 

1170 cannot_agg.append(item) 

1171 continue 

1172 

1173 else: 

1174 if cast: 

1175 result[item] = self._try_cast(result[item], data) 

1176 

1177 result_columns = obj.columns 

1178 if cannot_agg: 

1179 result_columns = result_columns.drop(cannot_agg) 

1180 

1181 return DataFrame(result, columns=result_columns) 

1182 

1183 def _wrap_applied_output(self, keys, values, not_indexed_same=False): 

1184 if len(keys) == 0: 

1185 return DataFrame(index=keys) 

1186 

1187 key_names = self.grouper.names 

1188 

1189 # GH12824. 

1190 def first_not_none(values): 

1191 try: 

1192 return next(com.not_none(*values)) 

1193 except StopIteration: 

1194 return None 

1195 

1196 v = first_not_none(values) 

1197 

1198 if v is None: 

1199 # GH9684. If all values are None, then this will throw an error. 

1200 # We'd prefer it return an empty dataframe. 

1201 return DataFrame() 

1202 elif isinstance(v, DataFrame): 

1203 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) 

1204 elif self.grouper.groupings is not None: 

1205 if len(self.grouper.groupings) > 1: 

1206 key_index = self.grouper.result_index 

1207 

1208 else: 

1209 ping = self.grouper.groupings[0] 

1210 if len(keys) == ping.ngroups: 

1211 key_index = ping.group_index 

1212 key_index.name = key_names[0] 

1213 

1214 key_lookup = Index(keys) 

1215 indexer = key_lookup.get_indexer(key_index) 

1216 

1217 # reorder the values 

1218 values = [values[i] for i in indexer] 

1219 else: 

1220 

1221 key_index = Index(keys, name=key_names[0]) 

1222 

1223 # don't use the key indexer 

1224 if not self.as_index: 

1225 key_index = None 

1226 

1227 # make Nones an empty object 

1228 v = first_not_none(values) 

1229 if v is None: 

1230 return DataFrame() 

1231 elif isinstance(v, NDFrame): 

1232 

1233 # this is to silence a DeprecationWarning 

1234 # TODO: Remove when default dtype of empty Series is object 

1235 kwargs = v._construct_axes_dict() 

1236 if v._constructor is Series: 

1237 backup = create_series_with_explicit_dtype( 

1238 **kwargs, dtype_if_empty=object 

1239 ) 

1240 else: 

1241 backup = v._constructor(**kwargs) 

1242 

1243 values = [x if (x is not None) else backup for x in values] 

1244 

1245 v = values[0] 

1246 

1247 if isinstance(v, (np.ndarray, Index, Series)): 

1248 if isinstance(v, Series): 

1249 applied_index = self._selected_obj._get_axis(self.axis) 

1250 all_indexed_same = all_indexes_same([x.index for x in values]) 

1251 singular_series = len(values) == 1 and applied_index.nlevels == 1 

1252 

1253 # GH3596 

1254 # provide a reduction (Frame -> Series) if groups are 

1255 # unique 

1256 if self.squeeze: 

1257 # assign the name to this series 

1258 if singular_series: 

1259 values[0].name = keys[0] 

1260 

1261 # GH2893 

1262 # we have series in the values array, we want to 

1263 # produce a series: 

1264 # if any of the sub-series are not indexed the same 

1265 # OR we don't have a multi-index and we have only a 

1266 # single values 

1267 return self._concat_objects( 

1268 keys, values, not_indexed_same=not_indexed_same 

1269 ) 

1270 

1271 # still a series 

1272 # path added as of GH 5545 

1273 elif all_indexed_same: 

1274 from pandas.core.reshape.concat import concat 

1275 

1276 return concat(values) 

1277 

1278 if not all_indexed_same: 

1279 # GH 8467 

1280 return self._concat_objects(keys, values, not_indexed_same=True) 

1281 

1282 if self.axis == 0 and isinstance(v, ABCSeries): 

1283 # GH6124 if the list of Series have a consistent name, 

1284 # then propagate that name to the result. 

1285 index = v.index.copy() 

1286 if index.name is None: 

1287 # Only propagate the series name to the result 

1288 # if all series have a consistent name. If the 

1289 # series do not have a consistent name, do 

1290 # nothing. 

1291 names = {v.name for v in values} 

1292 if len(names) == 1: 

1293 index.name = list(names)[0] 

1294 

1295 # normally use vstack as it's faster than concat 

1296 # and if we have mi-columns 

1297 if ( 

1298 isinstance(v.index, MultiIndex) 

1299 or key_index is None 

1300 or isinstance(key_index, MultiIndex) 

1301 ): 

1302 stacked_values = np.vstack([np.asarray(v) for v in values]) 

1303 result = DataFrame( 

1304 stacked_values, index=key_index, columns=index 

1305 ) 

1306 else: 

1307 # GH5788 instead of stacking; concat gets the 

1308 # dtypes correct 

1309 from pandas.core.reshape.concat import concat 

1310 

1311 result = concat( 

1312 values, 

1313 keys=key_index, 

1314 names=key_index.names, 

1315 axis=self.axis, 

1316 ).unstack() 

1317 result.columns = index 

1318 elif isinstance(v, ABCSeries): 

1319 stacked_values = np.vstack([np.asarray(v) for v in values]) 

1320 result = DataFrame( 

1321 stacked_values.T, index=v.index, columns=key_index 

1322 ) 

1323 else: 

1324 # GH#1738: values is list of arrays of unequal lengths 

1325 # fall through to the outer else clause 

1326 # TODO: is this right? we used to do this 

1327 # after raising AttributeError above 

1328 return Series(values, index=key_index, name=self._selection_name) 

1329 

1330 # if we have date/time like in the original, then coerce dates 

1331 # as we are stacking can easily have object dtypes here 

1332 so = self._selected_obj 

1333 if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): 

1334 result = _recast_datetimelike_result(result) 

1335 else: 

1336 result = result._convert(datetime=True) 

1337 

1338 return self._reindex_output(result) 

1339 

1340 # values are not series or array-like but scalars 

1341 else: 

1342 # only coerce dates if we find at least 1 datetime 

1343 should_coerce = any(isinstance(x, Timestamp) for x in values) 

1344 # self._selection_name not passed through to Series as the 

1345 # result should not take the name of original selection 

1346 # of columns 

1347 return Series(values, index=key_index)._convert( 

1348 datetime=True, coerce=should_coerce 

1349 ) 

1350 

1351 else: 

1352 # Handle cases like BinGrouper 

1353 return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) 

1354 

1355 def _transform_general(self, func, *args, **kwargs): 

1356 from pandas.core.reshape.concat import concat 

1357 

1358 applied = [] 

1359 obj = self._obj_with_exclusions 

1360 gen = self.grouper.get_iterator(obj, axis=self.axis) 

1361 fast_path, slow_path = self._define_paths(func, *args, **kwargs) 

1362 

1363 path = None 

1364 for name, group in gen: 

1365 object.__setattr__(group, "name", name) 

1366 

1367 if path is None: 

1368 # Try slow path and fast path. 

1369 try: 

1370 path, res = self._choose_path(fast_path, slow_path, group) 

1371 except TypeError: 

1372 return self._transform_item_by_item(obj, fast_path) 

1373 except ValueError: 

1374 msg = "transform must return a scalar value for each group" 

1375 raise ValueError(msg) 

1376 else: 

1377 res = path(group) 

1378 

1379 if isinstance(res, Series): 

1380 

1381 # we need to broadcast across the 

1382 # other dimension; this will preserve dtypes 

1383 # GH14457 

1384 if not np.prod(group.shape): 

1385 continue 

1386 elif res.index.is_(obj.index): 

1387 r = concat([res] * len(group.columns), axis=1) 

1388 r.columns = group.columns 

1389 r.index = group.index 

1390 else: 

1391 r = DataFrame( 

1392 np.concatenate([res.values] * len(group.index)).reshape( 

1393 group.shape 

1394 ), 

1395 columns=group.columns, 

1396 index=group.index, 

1397 ) 

1398 

1399 applied.append(r) 

1400 else: 

1401 applied.append(res) 

1402 

1403 concat_index = obj.columns if self.axis == 0 else obj.index 

1404 other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 

1405 concatenated = concat(applied, axis=self.axis, verify_integrity=False) 

1406 concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) 

1407 return self._set_result_index_ordered(concatenated) 

1408 

1409 @Substitution(klass="DataFrame", selected="") 

1410 @Appender(_transform_template) 

1411 def transform(self, func, *args, **kwargs): 

1412 

1413 # optimized transforms 

1414 func = self._get_cython_func(func) or func 

1415 

1416 if not isinstance(func, str): 

1417 return self._transform_general(func, *args, **kwargs) 

1418 

1419 elif func not in base.transform_kernel_whitelist: 

1420 msg = f"'{func}' is not a valid function name for transform(name)" 

1421 raise ValueError(msg) 

1422 elif func in base.cythonized_kernels: 

1423 # cythonized transformation or canned "reduction+broadcast" 

1424 return getattr(self, func)(*args, **kwargs) 

1425 

1426 # If func is a reduction, we need to broadcast the 

1427 # result to the whole group. Compute func result 

1428 # and deal with possible broadcasting below. 

1429 result = getattr(self, func)(*args, **kwargs) 

1430 

1431 # a reduction transform 

1432 if not isinstance(result, DataFrame): 

1433 return self._transform_general(func, *args, **kwargs) 

1434 

1435 obj = self._obj_with_exclusions 

1436 

1437 # nuisance columns 

1438 if not result.columns.equals(obj.columns): 

1439 return self._transform_general(func, *args, **kwargs) 

1440 

1441 return self._transform_fast(result, func) 

1442 

1443 def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: 

1444 """ 

1445 Fast transform path for aggregations 

1446 """ 

1447 # if there were groups with no observations (Categorical only?) 

1448 # try casting data to original dtype 

1449 cast = self._transform_should_cast(func_nm) 

1450 

1451 obj = self._obj_with_exclusions 

1452 

1453 # for each col, reshape to the size of the original frame 

1454 # by take operation 

1455 ids, _, ngroup = self.grouper.group_info 

1456 output = [] 

1457 for i, _ in enumerate(result.columns): 

1458 res = algorithms.take_1d(result.iloc[:, i].values, ids) 

1459 # TODO: we have no test cases that get here with EA dtypes; 

1460 # try_cast may not be needed if EAs never get here 

1461 if cast: 

1462 res = self._try_cast(res, obj.iloc[:, i]) 

1463 output.append(res) 

1464 

1465 return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) 

1466 

1467 def _define_paths(self, func, *args, **kwargs): 

1468 if isinstance(func, str): 

1469 fast_path = lambda group: getattr(group, func)(*args, **kwargs) 

1470 slow_path = lambda group: group.apply( 

1471 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis 

1472 ) 

1473 else: 

1474 fast_path = lambda group: func(group, *args, **kwargs) 

1475 slow_path = lambda group: group.apply( 

1476 lambda x: func(x, *args, **kwargs), axis=self.axis 

1477 ) 

1478 return fast_path, slow_path 

1479 

1480 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): 

1481 path = slow_path 

1482 res = slow_path(group) 

1483 

1484 # if we make it here, test if we can use the fast path 

1485 try: 

1486 res_fast = fast_path(group) 

1487 except AssertionError: 

1488 raise 

1489 except Exception: 

1490 # GH#29631 For a user-defined function, we can't predict what may be 

1491 # raised; see test_transform.test_transform_fastpath_raises 

1492 return path, res 

1493 

1494 # verify fast path does not change columns (and names), otherwise 

1495 # its results cannot be joined with those of the slow path 

1496 if not isinstance(res_fast, DataFrame): 

1497 return path, res 

1498 

1499 if not res_fast.columns.equals(group.columns): 

1500 return path, res 

1501 

1502 if res_fast.equals(res): 

1503 path = fast_path 

1504 

1505 return path, res 

1506 

1507 def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: 

1508 # iterate through columns 

1509 output = {} 

1510 inds = [] 

1511 for i, col in enumerate(obj): 

1512 try: 

1513 output[col] = self[col].transform(wrapper) 

1514 except TypeError: 

1515 # e.g. trying to call nanmean with string values 

1516 pass 

1517 else: 

1518 inds.append(i) 

1519 

1520 if len(output) == 0: 

1521 raise TypeError("Transform function invalid for data types") 

1522 

1523 columns = obj.columns 

1524 if len(output) < len(obj.columns): 

1525 columns = columns.take(inds) 

1526 

1527 return DataFrame(output, index=obj.index, columns=columns) 

1528 

1529 def filter(self, func, dropna=True, *args, **kwargs): 

1530 """ 

1531 Return a copy of a DataFrame excluding elements from groups that 

1532 do not satisfy the boolean criterion specified by func. 

1533 

1534 Parameters 

1535 ---------- 

1536 func : function 

1537 Function to apply to each subframe. Should return True or False. 

1538 dropna : bool, default True 

1539 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs. 

1540 

1541 Returns 

1542 ------- 

1543 filtered : DataFrame 

1544 

1545 Notes 

1546 ----- 

1547 Each subframe is endowed with the attribute 'name' in case you need to know 

1548 which group you are working on. 

1549 

1550 Examples 

1551 -------- 

1552 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

1553 ... 'foo', 'bar'], 

1554 ... 'B' : [1, 2, 3, 4, 5, 6], 

1555 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

1556 >>> grouped = df.groupby('A') 

1557 >>> grouped.filter(lambda x: x['B'].mean() > 3.) 

1558 A B C 

1559 1 bar 2 5.0 

1560 3 bar 4 1.0 

1561 5 bar 6 9.0 

1562 """ 

1563 

1564 indices = [] 

1565 

1566 obj = self._selected_obj 

1567 gen = self.grouper.get_iterator(obj, axis=self.axis) 

1568 

1569 for name, group in gen: 

1570 object.__setattr__(group, "name", name) 

1571 

1572 res = func(group, *args, **kwargs) 

1573 

1574 try: 

1575 res = res.squeeze() 

1576 except AttributeError: # allow e.g., scalars and frames to pass 

1577 pass 

1578 

1579 # interpret the result of the filter 

1580 if is_bool(res) or (is_scalar(res) and isna(res)): 

1581 if res and notna(res): 

1582 indices.append(self._get_index(name)) 

1583 else: 

1584 # non scalars aren't allowed 

1585 raise TypeError( 

1586 f"filter function returned a {type(res).__name__}, " 

1587 "but expected a scalar bool" 

1588 ) 

1589 

1590 return self._apply_filter(indices, dropna) 

1591 

1592 def __getitem__(self, key): 

1593 # per GH 23566 

1594 if isinstance(key, tuple) and len(key) > 1: 

1595 # if len == 1, then it becomes a SeriesGroupBy and this is actually 

1596 # valid syntax, so don't raise warning 

1597 warnings.warn( 

1598 "Indexing with multiple keys (implicitly converted to a tuple " 

1599 "of keys) will be deprecated, use a list instead.", 

1600 FutureWarning, 

1601 stacklevel=2, 

1602 ) 

1603 return super().__getitem__(key) 

1604 

1605 def _gotitem(self, key, ndim: int, subset=None): 

1606 """ 

1607 sub-classes to define 

1608 return a sliced object 

1609 

1610 Parameters 

1611 ---------- 

1612 key : string / list of selections 

1613 ndim : 1,2 

1614 requested ndim of result 

1615 subset : object, default None 

1616 subset to act on 

1617 """ 

1618 

1619 if ndim == 2: 

1620 if subset is None: 

1621 subset = self.obj 

1622 return DataFrameGroupBy( 

1623 subset, 

1624 self.grouper, 

1625 selection=key, 

1626 grouper=self.grouper, 

1627 exclusions=self.exclusions, 

1628 as_index=self.as_index, 

1629 observed=self.observed, 

1630 ) 

1631 elif ndim == 1: 

1632 if subset is None: 

1633 subset = self.obj[key] 

1634 return SeriesGroupBy( 

1635 subset, selection=key, grouper=self.grouper, observed=self.observed 

1636 ) 

1637 

1638 raise AssertionError("invalid ndim for _gotitem") 

1639 

1640 def _wrap_frame_output(self, result, obj) -> DataFrame: 

1641 result_index = self.grouper.levels[0] 

1642 

1643 if self.axis == 0: 

1644 return DataFrame(result, index=obj.columns, columns=result_index).T 

1645 else: 

1646 return DataFrame(result, index=obj.index, columns=result_index) 

1647 

1648 def _get_data_to_aggregate(self) -> BlockManager: 

1649 obj = self._obj_with_exclusions 

1650 if self.axis == 1: 

1651 return obj.T._data 

1652 else: 

1653 return obj._data 

1654 

1655 def _insert_inaxis_grouper_inplace(self, result): 

1656 # zip in reverse so we can always insert at loc 0 

1657 izip = zip( 

1658 *map( 

1659 reversed, 

1660 ( 

1661 self.grouper.names, 

1662 self.grouper.get_group_levels(), 

1663 [grp.in_axis for grp in self.grouper.groupings], 

1664 ), 

1665 ) 

1666 ) 

1667 

1668 for name, lev, in_axis in izip: 

1669 if in_axis: 

1670 result.insert(0, name, lev) 
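    # Small sketch of the reversed insert above (hypothetical grouper names):
    # with names ["A", "B"], iterating in reverse inserts "B" at loc 0 and
    # then "A" at loc 0, leaving the final column order ["A", "B", ...].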

1671 

1672 def _wrap_aggregated_output( 

1673 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] 

1674 ) -> DataFrame: 

1675 """ 

1676 Wraps the output of DataFrameGroupBy aggregations into the expected result. 

1677 

1678 Parameters 

1679 ---------- 

1680 output : Mapping[base.OutputKey, Union[Series, np.ndarray]] 

1681 Data to wrap. 

1682 

1683 Returns 

1684 ------- 

1685 DataFrame 

1686 """ 

1687 indexed_output = {key.position: val for key, val in output.items()} 

1688 columns = Index(key.label for key in output) 

1689 

1690 result = DataFrame(indexed_output) 

1691 result.columns = columns 

1692 

1693 if not self.as_index: 

1694 self._insert_inaxis_grouper_inplace(result) 

1695 result = result._consolidate() 

1696 else: 

1697 index = self.grouper.result_index 

1698 result.index = index 

1699 

1700 if self.axis == 1: 

1701 result = result.T 

1702 

1703 return self._reindex_output(result)._convert(datetime=True) 

1704 

1705 def _wrap_transformed_output( 

1706 self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] 

1707 ) -> DataFrame: 

1708 """ 

1709 Wraps the output of DataFrameGroupBy transformations into the expected result. 

1710 

1711 Parameters 

1712 ---------- 

1713 output : Mapping[base.OutputKey, Union[Series, np.ndarray]] 

1714 Data to wrap. 

1715 

1716 Returns 

1717 ------- 

1718 DataFrame 

1719 """ 

1720 indexed_output = {key.position: val for key, val in output.items()} 

1721 columns = Index(key.label for key in output) 

1722 

1723 result = DataFrame(indexed_output) 

1724 result.columns = columns 

1725 result.index = self.obj.index 

1726 

1727 return result 

1728 

1729 def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: 

1730 if not self.as_index: 

1731 index = np.arange(blocks[0].values.shape[-1]) 

1732 mgr = BlockManager(blocks, axes=[items, index]) 

1733 result = DataFrame(mgr) 

1734 

1735 self._insert_inaxis_grouper_inplace(result) 

1736 result = result._consolidate() 

1737 else: 

1738 index = self.grouper.result_index 

1739 mgr = BlockManager(blocks, axes=[items, index]) 

1740 result = DataFrame(mgr) 

1741 

1742 if self.axis == 1: 

1743 result = result.T 

1744 

1745 return self._reindex_output(result)._convert(datetime=True) 

1746 

1747 def _iterate_column_groupbys(self): 

1748 for i, colname in enumerate(self._selected_obj.columns): 

1749 yield colname, SeriesGroupBy( 

1750 self._selected_obj.iloc[:, i], 

1751 selection=colname, 

1752 grouper=self.grouper, 

1753 exclusions=self.exclusions, 

1754 ) 

1755 

1756 def _apply_to_column_groupbys(self, func): 

1757 from pandas.core.reshape.concat import concat 

1758 

1759 return concat( 

1760 (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), 

1761 keys=self._selected_obj.columns, 

1762 axis=1, 

1763 ) 

1764 

1765 def count(self): 

1766 """ 

1767 Compute count of group, excluding missing values. 

1768 

1769 Returns 

1770 ------- 

1771 DataFrame 

1772 Count of values within each group. 

1773 """ 

1774 data = self._get_data_to_aggregate() 

1775 ids, _, ngroups = self.grouper.group_info 

1776 mask = ids != -1 

1777 

1778 vals = ( 

1779 (mask & ~_isna_ndarraylike(np.atleast_2d(blk.get_values()))) 

1780 for blk in data.blocks 

1781 ) 

1782 locs = (blk.mgr_locs for blk in data.blocks) 

1783 

1784 counted = ( 

1785 lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals 

1786 ) 

1787 blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] 

1788 

1789 return self._wrap_agged_blocks(blocks, items=data.items) 

1790 

1791 def nunique(self, dropna: bool = True): 

1792 """ 

1793 Return DataFrame with number of distinct observations per group for 

1794 each column. 

1795 

1796 Parameters 

1797 ---------- 

1798 dropna : bool, default True 

1799 Don't include NaN in the counts. 

1800 

1801 Returns 

1802 ------- 

1803 nunique : DataFrame 

1804 

1805 Examples 

1806 -------- 

1807 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 

1808 ... 'ham', 'ham'], 

1809 ... 'value1': [1, 5, 5, 2, 5, 5], 

1810 ... 'value2': list('abbaxy')}) 

1811 >>> df 

1812 id value1 value2 

1813 0 spam 1 a 

1814 1 egg 5 b 

1815 2 egg 5 b 

1816 3 spam 2 a 

1817 4 ham 5 x 

1818 5 ham 5 y 

1819 

1820 >>> df.groupby('id').nunique() 

1821 id value1 value2 

1822 id 

1823 egg 1 1 1 

1824 ham 1 1 2 

1825 spam 1 2 1 

1826 

1827 Check for rows with the same id but conflicting values: 

1828 

1829 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) 

1830 id value1 value2 

1831 0 spam 1 a 

1832 3 spam 2 a 

1833 4 ham 5 x 

1834 5 ham 5 y 

1835 """ 

1836 

1837 obj = self._selected_obj 

1838 

1839 def groupby_series(obj, col=None): 

1840 return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( 

1841 dropna=dropna 

1842 ) 

1843 

1844 if isinstance(obj, Series): 

1845 results = groupby_series(obj) 

1846 else: 

1847 # TODO: this is duplicative of how GroupBy naturally works 

1848 # Try to consolidate with normal wrapping functions 

1849 from pandas.core.reshape.concat import concat 

1850 

1851 axis_number = obj._get_axis_number(self.axis) 

1852 other_axis = int(not axis_number) 

1853 if axis_number == 0: 

1854 iter_func = obj.items 

1855 else: 

1856 iter_func = obj.iterrows 

1857 

1858 results = [groupby_series(content, label) for label, content in iter_func()] 

1859 results = concat(results, axis=1) 

1860 

1861 if axis_number == 1: 

1862 results = results.T 

1863 

1864 results._get_axis(other_axis).names = obj._get_axis(other_axis).names 

1865 

1866 if not self.as_index: 

1867 results.index = ibase.default_index(len(results)) 

1868 return results 

1869 
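    # Illustrative sketch (toy frame): ``dropna`` is forwarded to each
    # per-column SeriesGroupBy, so missing values can optionally be counted
    # as their own distinct value.
    #
    #   >>> df = pd.DataFrame({"id": ["a", "a"], "v": [1.0, None]})
    #   >>> df.groupby("id")["v"].nunique()
    #   id
    #   a    1
    #   Name: v, dtype: int64
    #   >>> df.groupby("id")["v"].nunique(dropna=False)
    #   id
    #   a    2
    #   Name: v, dtype: int64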

1870 boxplot = boxplot_frame_groupby 

1871 

1872 

1873def _is_multi_agg_with_relabel(**kwargs) -> bool: 

1874 """ 

1875 Check whether kwargs passed to .agg look like multi-agg with relabeling. 

1876 

1877 Parameters 

1878 ---------- 

1879 **kwargs : dict 

1880 

1881 Returns 

1882 ------- 

1883 bool 

1884 

1885 Examples 

1886 -------- 

1887 >>> _is_multi_agg_with_relabel(a='max') 

1888 False 

1889 >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), 

1890 ... a_min=('a', 'min')) 

1891 True 

1892 >>> _is_multi_agg_with_relabel() 

1893 False 

1894 """ 

1895 return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( 

1896 len(kwargs) > 0 

1897 ) 

1898 
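# Illustrative sketch (toy frame): this predicate detects the "named
# aggregation" calling convention, i.e. keyword arguments whose values are
# ``(column, aggfunc)`` pairs.
#
#   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
#   >>> df.groupby("key").agg(val_min=("val", "min"), val_max=("val", "max"))
#        val_min  val_max
#   key
#   a          1        2
#   b          3        3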

1899 

1900def _normalize_keyword_aggregation(kwargs): 

1901 """ 

1902 Normalize user-provided "named aggregation" kwargs. 

1903 

1904 Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs 

1905 to the old Dict[str, List[scalar]]. 

1906 

1907 Parameters 

1908 ---------- 

1909 kwargs : dict 

1910 

1911 Returns 

1912 ------- 

1913 aggspec : dict 

1914 The transformed kwargs. 

1915 columns : List[str] 

1916 The user-provided keys. 

1917 col_idx_order : List[int] 

1918 List of column indices. 

1919 

1920 Examples 

1921 -------- 

1922 >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) 

1923 (defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0])) 

1924 """ 

1925 # Normalize the aggregation functions as Mapping[column, List[func]], 

1926 # process normally, then fixup the names. 

1927 # TODO: aggspec type: typing.Dict[str, List[AggScalar]] 

1928 # May be hitting https://github.com/python/mypy/issues/5958 

1929 # saying it doesn't have an attribute __name__ 

1930 aggspec = defaultdict(list) 

1931 order = [] 

1932 columns, pairs = list(zip(*kwargs.items())) 

1933 

1934 for name, (column, aggfunc) in zip(columns, pairs): 

1935 aggspec[column].append(aggfunc) 

1936 order.append((column, com.get_callable_name(aggfunc) or aggfunc)) 

1937 

1938 # uniquify aggfunc name if duplicated in order list 

1939 uniquified_order = _make_unique(order) 

1940 

1941 # GH 25719: because aggspec can change the order of the assigned columns in the 

1942 # aggregation, uniquified_aggspec stores the uniquified order list so that it 

1943 # can be compared with ``order`` by positional index 

1944 aggspec_order = [ 

1945 (column, com.get_callable_name(aggfunc) or aggfunc) 

1946 for column, aggfuncs in aggspec.items() 

1947 for aggfunc in aggfuncs 

1948 ] 

1949 uniquified_aggspec = _make_unique(aggspec_order) 

1950 

1951 # get the new indices of the columns by comparison 

1952 col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) 

1953 return aggspec, columns, col_idx_order 

1954 
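# Illustrative sketch: with two named aggregations on the same column the spec
# is collapsed into a single entry per column, while ``columns`` and
# ``col_idx_order`` remember how to restore the user's requested output order.
#
#   >>> _normalize_keyword_aggregation({"lo": ("val", "min"), "hi": ("val", "max")})
#   (defaultdict(<class 'list'>, {'val': ['min', 'max']}), ('lo', 'hi'), array([0, 1]))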

1955 

1956def _make_unique(seq): 

1957 """Uniquify aggfunc name of the pairs in the order list 

1958 

1959 Examples 

1960 -------- 

1961 >>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]) 

1962 [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')] 

1963 """ 

1964 return [ 

1965 (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) 

1966 if seq.count(pair) > 1 

1967 else pair 

1968 for i, pair in enumerate(seq) 

1969 ] 

1970 

1971 

1972# TODO: Can't use, because mypy doesn't like us setting __name__ 

1973# error: "partial[Any]" has no attribute "__name__" 

1974# the type is: 

1975# typing.Sequence[Callable[..., ScalarResult]] 

1976# -> typing.Sequence[Callable[..., ScalarResult]]: 

1977 

1978 

1979def _mangle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: 

1980 """ 

1981 Possibly mangle a list of aggfuncs. 

1982 

1983 Parameters 

1984 ---------- 

1985 aggfuncs : Sequence 

1986 

1987 Returns 

1988 ------- 

1989 mangled: list-like 

1990 A new AggSpec sequence, where lambdas have been converted 

1991 to have unique names. 

1992 

1993 Notes 

1994 ----- 

1995 If just one aggfunc is passed, the name will not be mangled. 

1996 """ 

1997 if len(aggfuncs) <= 1: 

1998 # don't mangle for .agg([lambda x: ...]) 

1999 return aggfuncs 

2000 i = 0 

2001 mangled_aggfuncs = [] 

2002 for aggfunc in aggfuncs: 

2003 if com.get_callable_name(aggfunc) == "<lambda>": 

2004 aggfunc = partial(aggfunc) 

2005 aggfunc.__name__ = f"<lambda_{i}>" 

2006 i += 1 

2007 mangled_aggfuncs.append(aggfunc) 

2008 

2009 return mangled_aggfuncs 

2010 
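# Illustrative sketch: duplicate ``<lambda>`` names get positionally suffixed
# replacements, while a single lambda is left untouched.
#
#   >>> funcs = _mangle_lambda_list([lambda x: x.min(), lambda x: x.max()])
#   >>> [f.__name__ for f in funcs]
#   ['<lambda_0>', '<lambda_1>']
#   >>> _mangle_lambda_list([lambda x: x.min()])[0].__name__
#   '<lambda>'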

2011 

2012def _maybe_mangle_lambdas(agg_spec: Any) -> Any: 

2013 """ 

2014 Make new lambdas with unique names. 

2015 

2016 Parameters 

2017 ---------- 

2018 agg_spec : Any 

2019 An argument to GroupBy.agg. 

2020 Non-dict-like `agg_spec` is passed through as is. 

2021 For dict-like `agg_spec` a new spec is returned 

2022 with name-mangled lambdas. 

2023 

2024 Returns 

2025 ------- 

2026 mangled : Any 

2027 Same type as the input. 

2028 

2029 Examples 

2030 -------- 

2031 >>> _maybe_mangle_lambdas('sum') 

2032 'sum' 

2033 

2034 >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP 

2035 [<function __main__.<lambda_0>, 

2036 <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>] 

2037 """ 

2038 is_dict = is_dict_like(agg_spec) 

2039 if not (is_dict or is_list_like(agg_spec)): 

2040 return agg_spec 

2041 mangled_aggspec = type(agg_spec)()  # dict or OrderedDict 

2042 

2043 if is_dict: 

2044 for key, aggfuncs in agg_spec.items(): 

2045 if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): 

2046 mangled_aggfuncs = _mangle_lambda_list(aggfuncs) 

2047 else: 

2048 mangled_aggfuncs = aggfuncs 

2049 

2050 mangled_aggspec[key] = mangled_aggfuncs 

2051 else: 

2052 mangled_aggspec = _mangle_lambda_list(agg_spec) 

2053 

2054 return mangled_aggspec 

2055 
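# Illustrative sketch: for dict-like specs the mangling is applied per column,
# which is what lets ``.agg({'a': [lambda x: x.min(), lambda x: x.max()]})``
# produce two distinct result columns instead of a duplicate-name error.
#
#   >>> spec = _maybe_mangle_lambdas({"a": [lambda x: x.min(), lambda x: x.max()]})
#   >>> [f.__name__ for f in spec["a"]]
#   ['<lambda_0>', '<lambda_1>']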

2056 

2057def _recast_datetimelike_result(result: DataFrame) -> DataFrame: 

2058 """ 

2059 If the original frame had any date/time-like values, coerce dates back, 

2060 since the stacking done here can easily produce object dtypes. 

2061 

2062 Parameters 

2063 ---------- 

2064 result : DataFrame 

2065 

2066 Returns 

2067 ------- 

2068 DataFrame 

2069 

2070 Notes 

2071 ----- 

2072 - Assumes GroupBy._selected_obj has ndim==2 and at least one 

2073 datetimelike column 

2074 """ 

2075 result = result.copy() 

2076 

2077 obj_cols = [ 

2078 idx 

2079 for idx in range(len(result.columns)) 

2080 if is_object_dtype(result.dtypes.iloc[idx]) 

2081 ] 

2082 

2083 # See GH#26285 

2084 for n in obj_cols: 

2085 converted = maybe_convert_objects( 

2086 result.iloc[:, n].values, convert_numeric=False 

2087 ) 

2088 

2089 result.iloc[:, n] = converted 

2090 return result
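# Illustrative sketch: the per-column coercion above is analogous to the public
# ``infer_objects``; datetime-like values stored in an object column come back
# as a proper datetime64 column.
#
#   >>> s = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")],
#   ...               dtype=object)
#   >>> s.infer_objects().dtype
#   dtype('<M8[ns]')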