Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" miscellaneous sorting / groupby utilities """ 

2import numpy as np 

3 

4from pandas._libs import algos, hashtable, lib 

5from pandas._libs.hashtable import unique_label_indices 

6 

7from pandas.core.dtypes.common import ( 

8 ensure_int64, 

9 ensure_platform_int, 

10 is_categorical_dtype, 

11 is_extension_array_dtype, 

12) 

13from pandas.core.dtypes.missing import isna 

14 

15import pandas.core.algorithms as algorithms 

16from pandas.core.construction import extract_array 

17 

18_INT64_MAX = np.iinfo(np.int64).max 

19 

20 

def get_group_index(labels, shape, sort: bool, xnull: bool):
    """
    Encode label combinations as offsets into the hypothetical, totally
    ordered cartesian product of all possible label combinations.

    As long as that product space fits within int64 bounds the returned
    ids can also be deconstructed back into per-level labels; beyond that
    point they still identify unique combinations but can no longer be
    decomposed.

    - If `sort`, the rank of the returned ids preserves the lexical
      ranks of the labels, i.e. the ids can be used to lexsort.
    - If `xnull`, nulls (-1 labels) are passed through as -1.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : sequence of ints
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true nulls are excluded. i.e. -1 values in the labels are
        passed through.

    Returns
    -------
    An array of type int64 where two elements are equal if their
    corresponding labels are equal at all locations.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _num_levels_without_overflow(level_sizes) -> int:
        # Largest prefix of levels whose size product stays below the
        # int64 maximum.
        limit = np.iinfo(np.int64).max
        prod = 1
        for pos, size in enumerate(level_sizes):
            prod *= int(size)
            if prod >= limit:
                return pos
        return len(level_sizes)

    def _promote_nulls(lab, size):
        # Shift labels up by one when -1 (null) markers are present so
        # that every encoded value is non-negative.
        if (lab == -1).any():
            return lab + 1, size + 1
        return lab, size

    labels = map(ensure_int64, labels)
    if not xnull:
        labels, shape = map(list, zip(*map(_promote_nulls, labels, shape)))

    labels = list(labels)
    shape = list(shape)

    # Encode the levels chunk by chunk, compressing between chunks, so
    # that intermediate ids never require more than int64 range.
    while True:
        # how many levels can be encoded without overflowing int64
        nlev = _num_levels_without_overflow(shape)

        # flat ids for the first `nlev` levels (mixed-radix encoding)
        stride = np.prod(shape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            stride = 0 if shape[i] == 0 else stride // shape[i]
            out += labels[i] * stride

        if xnull:
            # propagate nulls: any -1 among the contributing levels
            # yields a -1 id
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(shape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow;
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
        shape = [len(obs_ids)] + shape[nlev:]

    return out

108 

109 

def get_compressed_ids(labels, sizes):
    """
    Compress cartesian-product group offsets down to observed groups.

    Group_index is offsets into cartesian product of all possible labels;
    that space can be huge, so this function compresses it by computing
    offsets (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : list of size of the levels

    Returns
    -------
    tuple of (comp_ids, obs_group_ids)
    """
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    comp_ids, obs_group_ids = compress_group_index(group_index, sort=True)
    return comp_ids, obs_group_ids

127 

128 

def is_int64_overflow_possible(shape) -> bool:
    """Return True if the product of `shape` does not fit in int64."""
    total = 1
    for dim in shape:
        total *= int(dim)
    return total >= np.iinfo(np.int64).max

135 

136 

def decons_group_index(comp_labels, shape):
    """
    Reconstruct per-level labels from flat cartesian-product group ids.

    Inverse of the encoding performed by `get_group_index`; only valid
    while the id space fits within int64.
    """
    if is_int64_overflow_possible(shape):
        # at some point group indices are factorized,
        # and may not be deconstructed here! wrong path!
        raise ValueError("cannot deconstruct factorized group indices!")

    out = []
    factor = 1
    offset = 0
    for i in reversed(range(len(shape))):
        # peel off level i: subtract the previously decoded level's
        # contribution, then extract this level's mixed-radix digit
        labels = (comp_labels - offset) % (factor * shape[i]) // factor
        np.putmask(labels, comp_labels < 0, -1)
        out.append(labels)
        offset = labels * factor
        factor *= shape[i]
    out.reverse()
    return out

155 

156 

157def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): 

158 """ 

159 Reconstruct labels from observed group ids. 

160 

161 Parameters 

162 ---------- 

163 xnull : bool 

164 If nulls are excluded; i.e. -1 labels are passed through. 

165 """ 

166 if not xnull: 

167 lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") 

168 shape = np.asarray(shape, dtype="i8") + lift 

169 

170 if not is_int64_overflow_possible(shape): 

171 # obs ids are deconstructable! take the fast route! 

172 out = decons_group_index(obs_ids, shape) 

173 return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] 

174 

175 i = unique_label_indices(comp_ids) 

176 i8copy = lambda a: a.astype("i8", subok=False, copy=True) 

177 return [i8copy(lab[i]) for lab in labels] 

178 

179 

def indexer_from_factorized(labels, shape, compress: bool = True):
    """
    Return an indexer that sorts rows by their encoded group ids.

    When `compress` is True, ids are first compressed to the observed
    groups, bounding ngroups by the number of distinct combinations.
    """
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if compress:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)
    else:
        # `ids.size and ids.max()` guards the empty case (max of 0)
        ngroups = (ids.size and ids.max()) + 1

    return get_group_index_sorter(ids, ngroups)

190 

191 

def lexsort_indexer(keys, orders=None, na_position: str = "last"):
    """
    Return an indexer that lexically sorts by the given keys.

    Parameters
    ----------
    keys : sequence of arrays to sort by, highest priority first
    orders : bool, list of bool, or None, default None
        Ascending (True) / descending (False) flag per key; a single bool
        applies to all keys; None means all ascending.
    na_position : {'first', 'last'}, default 'last'
        Where NA values are placed in the sorted order.

    Returns
    -------
    Integer indexer array that lexsorts `keys`.

    Raises
    ------
    ValueError
        If `na_position` is not 'first' or 'last'.
    """
    from pandas.core.arrays import Categorical

    # Validate once, up front, instead of once per key inside the loop.
    # This also rejects a bad na_position when `keys` is empty, which the
    # in-loop check silently missed.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            cat = key

        # create the Categorical; ordered so codes reflect sort rank
        else:
            cat = Categorical(key, ordered=True)

        n = len(cat.categories)
        codes = cat.codes.copy()

        mask = cat.codes == -1  # -1 codes mark NA values
        if order:  # ascending
            if na_position == "last":
                # NAs get code n, past every real category
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                # shift everything up so NA (-1) becomes 0, sorting first
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # reserve the extra slot used for the NA code
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)

241 

242 

def nargsort(
    items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last"
):
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending and na_position parameters.

    (GH #6399, #5231)

    Parameters
    ----------
    items : array-like or ExtensionArray
    kind : str, default 'quicksort'
        Sort algorithm, passed through to ndarray.argsort.
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'

    Returns
    -------
    np.ndarray
        Indexer into `items`; NA positions are grouped at the requested end.

    Raises
    ------
    ValueError
        If `na_position` is not 'first' or 'last'.
    """
    # Fail fast on a bad na_position instead of discovering it only after
    # the full argsort has already been computed.
    if na_position not in {"last", "first"}:
        raise ValueError(f"invalid na_position: {na_position}")

    items = extract_array(items)
    mask = np.asarray(isna(items))

    if is_extension_array_dtype(items):
        # EAs provide their own sortable ndarray representation
        items = items._values_for_argsort()
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]
    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        # reverse before a stable-ish sort, reverse again after, so ties
        # keep their original relative order under descending sort
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position (already validated above)
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    else:
        indexer = np.concatenate([nan_idx, indexer])
    return indexer

286 

287 

class _KeyMapper:
    """
    Map compressed group id -> key tuple.

    One Int64HashTable per level records, for each compressed id, that
    level's label; `get_key` then resolves an id back into its full key
    tuple of level values.
    """

    def __init__(self, comp_ids, ngroups: int, levels, labels):
        self.levels = levels
        self.labels = labels
        self.comp_ids = comp_ids.astype(np.int64)
        self.k = len(labels)
        self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)]
        self._populate_tables()

    def _populate_tables(self):
        # per level, record the label observed for each compressed id
        for i in range(self.k):
            self.tables[i].map(self.comp_ids, self.labels[i].astype(np.int64))

    def get_key(self, comp_id):
        # look up this id's label in every level and resolve to values
        parts = []
        for table, level in zip(self.tables, self.levels):
            parts.append(level[table.get_item(comp_id)])
        return tuple(parts)

312 

313 

def get_flattened_iterator(comp_ids, ngroups, levels, labels):
    """Return the full key tuple for every group id in a multi-key groupby."""
    mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
    return list(map(mapper.get_key, range(ngroups)))

318 

319 

def get_indexer_dict(label_list, keys):
    """
    Map each observed key combination to the integer positions where it occurs.

    Parameters
    ----------
    label_list : list of label arrays, one per grouping level
    keys : list of arrays of unique level values, one per level

    Returns
    -------
    dict
        Labels mapped to indexers.
    """
    # number of unique values at each level
    shape = [len(x) for x in keys]

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    # when the cartesian space overflows int64 the flat product size is
    # meaningless, so bound ngroups by the maximum observed id instead
    # (`size and max()` guards the empty case)
    ngroups = (
        ((group_index.size and group_index.max()) + 1)
        if is_int64_overflow_possible(shape)
        else np.prod(shape, dtype="i8")
    )

    sorter = get_group_index_sorter(group_index, ngroups)

    # sort labels and ids together so each group occupies a contiguous slice
    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)

342 

343 

344# ---------------------------------------------------------------------- 

345# sorting levels...cleverly? 

346 

347 

def get_group_index_sorter(group_index, ngroups: int):
    """
    Return an indexer that stably sorts `group_index`.

    Picks between two stable algorithms by estimated cost:
    algos.groupsort_indexer implements `counting sort` and is at least
    O(ngroups), where ngroups = prod(shape) and shape = map(len, keys) —
    linear in the number of combinations (cartesian product) of unique
    groupby key values, which can be huge for multi-key groupbys;
    np.argsort(kind='mergesort') is O(count * log(count)) where count is
    the length of the data-frame. Both are stable sorts, which is
    necessary for correctness of groupby operations, e.g.
    df.groupby(key)[col].transform('first').
    """
    count = len(group_index)
    # cost-model constants: taking the complexities literally; there may
    # be some room for fine-tuning these parameters
    alpha = 0.0
    beta = 1.0
    if count > 0 and alpha + beta * ngroups < count * np.log(count):
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
        return ensure_platform_int(sorter)
    return group_index.argsort(kind="mergesort")

371 

372 

373def compress_group_index(group_index, sort: bool = True): 

374 """ 

375 Group_index is offsets into cartesian product of all possible labels. This 

376 space can be huge, so this function compresses it, by computing offsets 

377 (comp_ids) into the list of unique labels (obs_group_ids). 

378 """ 

379 

380 size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) 

381 table = hashtable.Int64HashTable(size_hint) 

382 

383 group_index = ensure_int64(group_index) 

384 

385 # note, group labels come out ascending (ie, 1,2,3 etc) 

386 comp_ids, obs_group_ids = table.get_labels_groupby(group_index) 

387 

388 if sort and len(obs_group_ids) > 0: 

389 obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) 

390 

391 return comp_ids, obs_group_ids 

392 

393 

394def _reorder_by_uniques(uniques, labels): 

395 # sorter is index where elements ought to go 

396 sorter = uniques.argsort() 

397 

398 # reverse_indexer is where elements came from 

399 reverse_indexer = np.empty(len(sorter), dtype=np.int64) 

400 reverse_indexer.put(sorter, np.arange(len(sorter))) 

401 

402 mask = labels < 0 

403 

404 # move labels to right locations (ie, unsort ascending labels) 

405 labels = algorithms.take_nd(reverse_indexer, labels, allow_fill=False) 

406 np.putmask(labels, mask, -1) 

407 

408 # sort observed ids 

409 uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) 

410 

411 return uniques, labels