# pandas/core/sorting.py

1""" miscellaneous sorting / groupby utilities """
2import numpy as np
4from pandas._libs import algos, hashtable, lib
5from pandas._libs.hashtable import unique_label_indices
7from pandas.core.dtypes.common import (
8 ensure_int64,
9 ensure_platform_int,
10 is_categorical_dtype,
11 is_extension_array_dtype,
12)
13from pandas.core.dtypes.missing import isna
15import pandas.core.algorithms as algorithms
16from pandas.core.construction import extract_array
18_INT64_MAX = np.iinfo(np.int64).max


def get_group_index(labels, shape, sort: bool, xnull: bool):
    """
    For the given label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though the group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, the ranks of the returned ids preserve the lexical ranks of
      the labels, i.e. the returned ids can be used to do a lexical sort on
      the labels;
    - If `xnull`, nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : sequence of ints
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true, nulls are excluded: -1 values in the labels are passed
        through unchanged rather than being promoted to a valid level.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all locations.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < _INT64_MAX:
                return i
        return len(shape)

    def maybe_lift(lab, size):
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = map(ensure_int64, labels)
    if not xnull:
        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))

    labels = list(labels)
    shape = list(shape)

    # Iteratively process all the labels in chunks sized so that fewer
    # than _INT64_MAX unique int ids are required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(shape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(shape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if shape[i] == 0:
                stride = 0
            else:
                stride //= shape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(shape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow;
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
        shape = [len(obs_ids)] + shape[nlev:]

    return out
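
# A minimal usage sketch (illustration only, not part of the original
# module): two label arrays over a 2 x 2 cartesian product of levels.
# Each flat id is label0 * 2 + label1, so the label pairs
# (0, 1), (1, 0), (0, 0) map to ids 1, 2, 0.
#
#   >>> import numpy as np
#   >>> get_group_index(
#   ...     [np.array([0, 1, 0]), np.array([1, 0, 0])], (2, 2),
#   ...     sort=True, xnull=True,
#   ... )
#   array([1, 2, 0])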


def get_compressed_ids(labels, sizes):
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : list of sizes of the levels

    Returns
    -------
    tuple of (comp_ids, obs_group_ids)
    """
    ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(ids, sort=True)
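
# Illustration (assumed example, not from the original source): the same
# labels as above yield flat ids [1, 2, 0]; compressing maps them onto
# dense ids over the sorted observed group ids.
#
#   >>> get_compressed_ids(
#   ...     [np.array([0, 1, 0]), np.array([1, 0, 0])], (2, 2)
#   ... )
#   (array([1, 2, 0]), array([0, 1, 2]))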


def is_int64_overflow_possible(shape) -> bool:
    the_prod = 1
    for x in shape:
        the_prod *= int(x)

    return the_prod >= _INT64_MAX
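
# For example (illustrative values): a 2 ** 32 x 2 ** 32 product of levels
# cannot be encoded in int64 flat ids, while modest shapes can.
#
#   >>> is_int64_overflow_possible((1000, 1000))
#   False
#   >>> is_int64_overflow_possible((2 ** 32, 2 ** 32))
#   True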


def decons_group_index(comp_labels, shape):
    # reconstruct labels
    if is_int64_overflow_possible(shape):
        # at some point group indices are factorized,
        # and may not be deconstructed here! wrong path!
        raise ValueError("cannot deconstruct factorized group indices!")

    label_list = []
    factor = 1
    y = 0
    x = comp_labels
    for i in reversed(range(len(shape))):
        labels = (x - y) % (factor * shape[i]) // factor
        np.putmask(labels, comp_labels < 0, -1)
        label_list.append(labels)
        y = labels * factor
        factor *= shape[i]
    return label_list[::-1]
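
# This inverts get_group_index (sketch, assuming no int64 overflow): the
# flat ids [1, 2, 0] over shape (2, 2) decompose back into the original
# per-level labels.
#
#   >>> decons_group_index(np.array([1, 2, 0]), (2, 2))
#   [array([0, 1, 0]), array([1, 0, 0])]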


def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool):
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    xnull : bool
        If nulls are excluded: -1 labels are passed through unchanged.
    """
    if not xnull:
        lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
        shape = np.asarray(shape, dtype="i8") + lift

    if not is_int64_overflow_possible(shape):
        # obs ids are deconstructable! take the fast route!
        out = decons_group_index(obs_ids, shape)
        return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

    i = unique_label_indices(comp_ids)
    i8copy = lambda a: a.astype("i8", subok=False, copy=True)
    return [i8copy(lab[i]) for lab in labels]
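
# Sketch of the fast path (assumed inputs): with no overflow and
# xnull=True, the observed ids are decomposed directly, one array per
# level of the observed groups.
#
#   >>> comp_ids = np.array([1, 2, 0])
#   >>> obs_ids = np.array([0, 1, 2])
#   >>> labels = [np.array([0, 1, 0]), np.array([1, 0, 0])]
#   >>> decons_obs_group_ids(comp_ids, obs_ids, (2, 2), labels, xnull=True)
#   [array([0, 0, 1]), array([0, 1, 0])]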


def indexer_from_factorized(labels, shape, compress: bool = True):
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if not compress:
        ngroups = (ids.size and ids.max()) + 1
    else:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)

    return get_group_index_sorter(ids, ngroups)
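
# The result is a stable lexical sort order over the factorized keys
# (illustrative example): the tuples (0, 1), (1, 0), (0, 0) sort as
# (0, 0) < (0, 1) < (1, 0), i.e. positions 2, 0, 1.
#
#   >>> indexer_from_factorized(
#   ...     [np.array([0, 1, 0]), np.array([1, 0, 0])], (2, 2)
#   ... )
#   array([2, 0, 1])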


def lexsort_indexer(keys, orders=None, na_position: str = "last"):
    """
    Parameters
    ----------
    na_position : {'first', 'last'}, default 'last'
    """
    from pandas.core.arrays import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            cat = key

        # create the Categorical
        else:
            cat = Categorical(key, ordered=True)

        if na_position not in ["last", "first"]:
            raise ValueError(f"invalid na_position: {na_position}")

        n = len(cat.categories)
        codes = cat.codes.copy()

        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
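
# Lexical sort by several keys (illustration): sort primarily by letter,
# then by number. The rows ('b', 1), ('a', 2), ('b', 0) order as
# ('a', 2) < ('b', 0) < ('b', 1).
#
#   >>> lexsort_indexer([np.array(["b", "a", "b"]), np.array([1, 2, 0])])
#   array([1, 2, 0])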


def nargsort(
    items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last"
):
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending and na_position parameters.

    (GH #6399, #5231)

    Parameters
    ----------
    kind : str, default 'quicksort'
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'
    """
    items = extract_array(items)
    mask = np.asarray(isna(items))

    if is_extension_array_dtype(items):
        items = items._values_for_argsort()
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]

    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == "first":
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError(f"invalid na_position: {na_position}")
    return indexer
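
# Unlike np.argsort, NaNs can be forced to either end (illustration):
#
#   >>> nargsort(np.array([3.0, np.nan, 1.0]))
#   array([2, 0, 1])
#   >>> nargsort(np.array([3.0, np.nan, 1.0]), na_position="first")
#   array([1, 2, 0])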


class _KeyMapper:
    """
    Map compressed group id -> key tuple.
    """

    def __init__(self, comp_ids, ngroups: int, levels, labels):
        self.levels = levels
        self.labels = labels
        self.comp_ids = comp_ids.astype(np.int64)

        self.k = len(labels)
        self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)]

        self._populate_tables()

    def _populate_tables(self):
        for labs, table in zip(self.labels, self.tables):
            table.map(self.comp_ids, labs.astype(np.int64))

    def get_key(self, comp_id):
        return tuple(
            level[table.get_item(comp_id)]
            for table, level in zip(self.tables, self.levels)
        )


def get_flattened_iterator(comp_ids, ngroups, levels, labels):
    # provide "flattened" iterator for multi-group setting
    mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
    return [mapper.get_key(i) for i in range(ngroups)]
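
# Sketch (assumed inputs): each compressed group id is mapped back to the
# tuple of level values it represents, one hash-table lookup per level,
# giving roughly the following for three observed groups.
#
#   >>> levels = [np.array(["a", "b"]), np.array([0, 1])]
#   >>> labels = [np.array([0, 1, 0]), np.array([1, 0, 0])]
#   >>> comp_ids = np.array([1, 2, 0])
#   >>> get_flattened_iterator(comp_ids, 3, levels, labels)
#   [('a', 0), ('a', 1), ('b', 0)]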


def get_indexer_dict(label_list, keys):
    """
    Returns
    -------
    dict
        Labels mapped to indexers.
    """
    shape = [len(x) for x in keys]

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    ngroups = (
        ((group_index.size and group_index.max()) + 1)
        if is_int64_overflow_possible(shape)
        else np.prod(shape, dtype="i8")
    )

    sorter = get_group_index_sorter(group_index, ngroups)

    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
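
# Sketch of the result shape (assumed inputs, not from the original
# source): for a single key whose labels are [0, 0, 1] over an
# Index(['a', 'b']), the returned dict maps each key value to the
# positions where it occurs, roughly
# {'a': array([0, 1]), 'b': array([2])}.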


# ----------------------------------------------------------------------
# sorting levels...cleverly?


def get_group_index_sorter(group_index, ngroups: int):
    """
    algos.groupsort_indexer implements `counting sort` and is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of the groupby keys. This can be huge when doing a multi-key groupby.
    np.argsort(kind='mergesort') is O(count * log(count)) where count is the
    length of the data-frame.
    Both algorithms are stable sorts, which is necessary for correctness of
    groupby operations, e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
        return ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind="mergesort")
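
# Either path yields a stable sort by group id (illustration): group ids
# [1, 0, 1, 0] sort to positions [1, 3, 0, 2], with ties kept in their
# original order.
#
#   >>> get_group_index_sorter(np.array([1, 0, 1, 0]), 2)
#   array([1, 3, 0, 2])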


def compress_group_index(group_index, sort: bool = True):
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT)
    table = hashtable.Int64HashTable(size_hint)

    group_index = ensure_int64(group_index)

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

    if sort and len(obs_group_ids) > 0:
        obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)

    return comp_ids, obs_group_ids
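
# Sparse flat ids become dense (illustration): ids [5, 3, 5] compress to
# comp_ids [1, 0, 1] over the sorted observed ids [3, 5].
#
#   >>> compress_group_index(np.array([5, 3, 5]))
#   (array([1, 0, 1]), array([3, 5]))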


def _reorder_by_uniques(uniques, labels):
    # sorter is index where elements ought to go
    sorter = uniques.argsort()

    # reverse_indexer is where elements came from
    reverse_indexer = np.empty(len(sorter), dtype=np.int64)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = labels < 0

    # move labels to right locations (ie, unsort ascending labels)
    labels = algorithms.take_nd(reverse_indexer, labels, allow_fill=False)
    np.putmask(labels, mask, -1)

    # sort observed ids
    uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)

    return uniques, labels
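
# Illustration: sorting uniques [5, 3] to [3, 5] remaps the labels so each
# still points at the same unique value; -1 (null) labels are preserved.
#
#   >>> _reorder_by_uniques(np.array([5, 3]), np.array([0, 1, 0]))
#   (array([3, 5]), array([1, 0, 1]))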