Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2 

3from pandas.core.algorithms import unique1d 

4from pandas.core.arrays.categorical import ( 

5 Categorical, 

6 CategoricalDtype, 

7 _recode_for_categories, 

8) 

9 

10 

def recode_for_groupby(c: Categorical, sort: bool, observed: bool):
    """
    Prepare a Categorical so that it can be used as a groupby key.

    Three cases are handled:

    * ``observed=True``: build a new Categorical whose categories are
      limited to those that actually occur in the data.
    * ``observed=False`` and ``sort=True``: ``c`` is returned untouched,
      since its categories already define the group order.
    * ``observed=False`` and ``sort=False``: return a copy of ``c`` whose
      categories are ordered as returned by ``c.unique()``, followed by
      the categories that never occur in the data.

    The last case exists solely so that the categorical index of the
    GroupBy result lists categories in order of appearance (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    New Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    if observed:
        # Distinct codes present in the data, minus the -1 code that
        # Categorical uses for missing values.
        present = unique1d(c.codes)
        present = present[present != -1]
        if c.ordered:
            # Ordered categoricals keep their original category order.
            present = np.sort(present)

        # Re-express the codes against the reduced set of categories.
        kept = c.categories.take(present)
        new_codes = _recode_for_categories(c.codes, c.categories, kept)

        # Wrap the new codes/categories; the original is handed back as
        # the second element of the result.
        new_dtype = CategoricalDtype(kept, ordered=c.ordered)
        return Categorical(new_codes, dtype=new_dtype, fastpath=True), c

    if sort:
        # Groups are already ordered according to c.categories.
        return c, None

    # sort=False: groups should come out in as-encountered order (GH-8868).
    encountered = c.unique()

    # groupby needs every category to be present, including those absent
    # from the data (GH-13179), so append whatever .unique() left out.
    not_seen = ~c.categories.isin(encountered.categories)
    full = encountered.add_categories(c.categories[not_seen])

    return c.reorder_categories(full.categories), None

75 

76 

def recode_from_groupby(c: Categorical, sort: bool, ci):
    """
    Undo the recoding of ``recode_for_groupby`` on a result index.

    Restores onto ``ci`` the categories of the original Categorical,
    taking sort / observed into account.

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
    """
    if not sort:
        # Not sorting: keep ci's category order and append the categories
        # of ``c`` that ci does not have yet.
        absent = ~c.categories.isin(ci.categories)
        return ci.add_categories(c.categories[absent])

    # Sorting: go back to the original category ordering.
    return ci.set_categories(c.categories)

99 return ci.add_categories(c.categories[~c.categories.isin(ci.categories)])