Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/util/hashing.py : 13%

1"""
2data hash pandas / numpy objects
3"""
4import itertools
5from typing import Optional
7import numpy as np
9from pandas._libs import Timestamp
10import pandas._libs.hashing as hashing
12from pandas.core.dtypes.cast import infer_dtype_from_scalar
13from pandas.core.dtypes.common import (
14 is_categorical_dtype,
15 is_extension_array_dtype,
16 is_list_like,
17)
18from pandas.core.dtypes.generic import (
19 ABCDataFrame,
20 ABCIndexClass,
21 ABCMultiIndex,
22 ABCSeries,
23)
24from pandas.core.dtypes.missing import isna
26# 16 byte long hashing key
27_default_hash_key = "0123456789123456"
30def _combine_hash_arrays(arrays, num_items: int):
31 """
32 Parameters
33 ----------
34 arrays : generator
35 num_items : int
37 Should be the same as CPython's tupleobject.c
38 """
39 try:
40 first = next(arrays)
41 except StopIteration:
42 return np.array([], dtype=np.uint64)
44 arrays = itertools.chain([first], arrays)
46 mult = np.uint64(1000003)
47 out = np.zeros_like(first) + np.uint64(0x345678)
48 for i, a in enumerate(arrays):
49 inverse_i = num_items - i
50 out ^= a
51 out *= mult
52 mult += np.uint64(82520 + inverse_i + inverse_i)
53 assert i + 1 == num_items, "Fed in wrong num_items"
54 out += np.uint64(97531)
55 return out
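
# Usage sketch (illustrative comment, not part of the original module):
# _combine_hash_arrays consumes a generator of equal-length uint64 arrays
# (one array per item/column) and folds them element-wise into a single
# uint64 array, mirroring CPython's tuple hash. The arrays below are
# made-up stand-ins for real hash_array output.
#
# >>> a = np.array([1, 2, 3], dtype="uint64")
# >>> b = np.array([4, 5, 6], dtype="uint64")
# >>> out = _combine_hash_arrays(iter([a, b]), 2)
# >>> out.dtype, len(out)
# (dtype('uint64'), 3)
# >>> _combine_hash_arrays(iter([]), 0)  # exhausted generator -> empty result
# array([], dtype=uint64)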


def hash_pandas_object(
    obj,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: Optional[str] = _default_hash_key,
    categorize: bool = True,
):
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        h = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                ).values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = _combine_hash_arrays(arrays, 2)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (hash_array(series.values) for _, series in obj.items())
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                ).values  # noqa
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = _combine_hash_arrays(hashes, num_items)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")
    return h
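
# Usage sketch (illustrative comment, not part of the original module):
# hash_pandas_object is the public entry point; it returns a uint64 Series
# aligned with the input. With index=True (the default) the row index is
# folded into each row's hash, so identical values under a different index
# hash differently.
#
# >>> import pandas as pd
# >>> s = pd.Series(["a", "b", "c"])
# >>> h = pd.util.hash_pandas_object(s)            # uint64 Series, len(h) == len(s)
# >>> h_data_only = pd.util.hash_pandas_object(s, index=False)
# >>> h.equals(pd.util.hash_pandas_object(s))      # deterministic for equal input
# True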


def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
    """
    Hash a MultiIndex / list-of-tuples efficiently

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray of hashed values
    """
    is_tuple = False
    if isinstance(vals, tuple):
        vals = [vals]
        is_tuple = True
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import Categorical, MultiIndex

    if not isinstance(vals, ABCMultiIndex):
        vals = MultiIndex.from_tuples(vals)

    # create a list-of-Categoricals
    vals = [
        Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
        for level in range(vals.nlevels)
    ]

    # hash the list-of-ndarrays
    hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
    )
    h = _combine_hash_arrays(hashes, len(vals))
    if is_tuple:
        h = h[0]

    return h
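
# Usage sketch (illustrative comment, not part of the original module):
# hash_tuples hashes each MultiIndex level as a Categorical and combines the
# per-level hashes. A list of tuples yields one uint64 per tuple; a single
# tuple yields a single uint64 (the is_tuple branch above).
#
# >>> import pandas as pd
# >>> mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
# >>> hash_tuples(mi)                        # ndarray of uint64, length 2
# >>> hash_tuples([("a", 1), ("b", 2)])      # list-of-tuples goes through MultiIndex.from_tuples
# >>> hash_tuples(("a", 1))                  # single uint64 value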


def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key):
    """
    Hash a single tuple efficiently

    Parameters
    ----------
    val : single tuple
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    hash

    """
    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val)

    h = _combine_hash_arrays(hashes, len(val))[0]

    return h
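
# Usage sketch (illustrative comment, not part of the original module):
# hash_tuple hashes each element with _hash_scalar (a length-1 uint64 array)
# and combines them with _combine_hash_arrays, returning a single value.
#
# >>> hash_tuple(("a", 1))                          # one numpy.uint64 value
# >>> hash_tuple(("a", 1)) == hash_tuple(("a", 1))  # deterministic
# True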


def _hash_categorical(c, encoding: str, hash_key: str):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes

    Parameters
    ----------
    c : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray of hashed values, same size as len(c)
    """
    # Convert ExtensionArrays to ndarrays
    values = np.asarray(c.categories.values)
    hashed = hash_array(values, encoding, hash_key, categorize=False)

    # we have uint64, as we don't directly support missing values
    # we don't want to use take_nd which will coerce to float
    # instead, directly construct the result with a
    # max(np.uint64) as the missing value indicator
    #
    # TODO: GH 15362

    mask = c.isna()
    if len(hashed):
        result = hashed.take(c.codes)
    else:
        result = np.zeros(len(mask), dtype="uint64")

    if mask.any():
        result[mask] = np.iinfo(np.uint64).max

    return result
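
# Usage sketch (illustrative comment, not part of the original module):
# _hash_categorical hashes only the (unique) categories and then maps each
# code to its category's hash, which is why categorizing repetitive object
# arrays first is cheaper. Missing values map to the maximum uint64.
#
# >>> import pandas as pd
# >>> cat = pd.Categorical(["a", "b", "a", None])
# >>> out = _hash_categorical(cat, "utf8", _default_hash_key)
# >>> out[0] == out[2]                      # equal categories share a hash
# True
# >>> out[3] == np.iinfo(np.uint64).max     # missing-value sentinel
# True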


def hash_array(
    vals,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """

    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif isinstance(dtype, np.bool):
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
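
# Usage sketch (illustrative comment, not part of the original module):
# hash_array works on a bare 1d array. Numeric, boolean and datetime-like
# dtypes are bit-reinterpreted as uint64 and scrambled by the final
# xor/multiply mixing steps; object arrays go through the categorize or
# hash_object_array paths above.
#
# >>> import pandas as pd
# >>> pd.util.hash_array(np.array([1, 2, 3]))       # 1d uint64 array, length 3
# >>> pd.util.hash_array(np.array(["x", "y"], dtype=object), categorize=False)
# >>> hash_array(np.array([1.0]))                   # hashes the float's bit pattern,
# ...                                               # so it differs from hash_array(np.array([1]))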


def _hash_scalar(
    val, encoding: str = "utf8", hash_key: str = _default_hash_key
) -> np.ndarray:
    """
    Hash scalar value.

    Parameters
    ----------
    val : scalar
    encoding : str, default "utf8"
    hash_key : str, default _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """

    if isna(val):
        # this is to be consistent with the _hash_categorical implementation
        return np.array([np.iinfo(np.uint64).max], dtype="u8")

    if getattr(val, "tzinfo", None) is not None:
        # for tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do)
        if not isinstance(val, Timestamp):
            val = Timestamp(val)
        val = val.tz_convert(None)

    dtype, val = infer_dtype_from_scalar(val)
    vals = np.array([val], dtype=dtype)

    return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False)
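
# Usage sketch (illustrative comment, not part of the original module):
# _hash_scalar wraps a single value in a length-1 array (after normalising
# tz-aware datetimes to naive UTC) and defers to hash_array; missing values
# get the same max-uint64 sentinel used elsewhere in this module.
#
# >>> import pandas as pd
# >>> _hash_scalar("a")                     # array of one uint64
# >>> _hash_scalar(np.nan)
# array([18446744073709551615], dtype=uint64)
# >>> _hash_scalar(pd.Timestamp("2020-01-01", tz="UTC"))  # hashed via its naive UTC value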