Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/dtypes/concat.py : 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Utility functions related to concat
3"""
5import numpy as np
7from pandas._libs import tslib, tslibs
9from pandas.core.dtypes.common import (
10 _NS_DTYPE,
11 _TD_DTYPE,
12 is_bool_dtype,
13 is_categorical_dtype,
14 is_datetime64_dtype,
15 is_datetime64tz_dtype,
16 is_dtype_equal,
17 is_extension_array_dtype,
18 is_object_dtype,
19 is_sparse,
20 is_timedelta64_dtype,
21)
22from pandas.core.dtypes.generic import (
23 ABCCategoricalIndex,
24 ABCDatetimeArray,
25 ABCIndexClass,
26 ABCRangeIndex,
27 ABCSeries,
28)
def get_dtype_kinds(l):
    """
    Classify each array in *l* by dtype "kind".

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    set of str
        A set of kinds that exist in this list of arrays.
    """

    def _kind_of(arr):
        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            return "category"
        if is_sparse(arr):
            return "sparse"
        if isinstance(arr, ABCRangeIndex):
            return "range"
        if is_datetime64tz_dtype(arr):
            # keep the full dtype string so that differing timezones
            # produce distinct entries (the concat result must then be object)
            return str(arr.dtype)
        if is_datetime64_dtype(dtype):
            return "datetime"
        if is_timedelta64_dtype(dtype):
            return "timedelta"
        if is_object_dtype(dtype):
            return "object"
        if is_bool_dtype(dtype):
            return "bool"
        if is_extension_array_dtype(dtype):
            return str(arr.dtype)
        return dtype.kind

    return {_kind_of(arr) for arr in l}
def concat_compat(to_concat, axis: int = 0):
    """
    Concatenate an array of arrays, each of which is a single 'normalized'
    dtype (in that for example, if it's object, then it is a non-datetimelike),
    and provide a combined dtype for the resulting array that preserves the
    overall dtype if possible.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    def _has_data(arr) -> bool:
        # arrays with ndim <= axis always count as non-empty here
        return arr.ndim <= axis or arr.shape[axis] > 0

    typs = get_dtype_kinds(to_concat)
    has_datetime = any(t.startswith("datetime") for t in typs)
    has_period = any(t.startswith("period") for t in typs)

    if "category" in typs:
        # must be dispatched before the datetime path,
        # to support Categorical + datetime-like
        return concat_categorical(to_concat, axis=axis)

    if has_datetime or has_period or "timedelta" in typs:
        return concat_datetime(to_concat, axis=axis, typs=typs)

    # this path is mandated to handle empties as well
    if "sparse" in typs:
        return _concat_sparse(to_concat, axis=axis, typs=typs)

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    every_empty = not any(_has_data(arr) for arr in to_concat)

    if axis == 1 and any(is_extension_array_dtype(arr) for arr in to_concat):
        to_concat = [np.atleast_2d(arr.astype("object")) for arr in to_concat]

    if every_empty:
        # all empties: may need to coerce the result dtype to object when we
        # have non-numeric type operands (numpy would otherwise cast to float)
        typs = get_dtype_kinds(to_concat)
        if len(typs) != 1:
            numeric_only = not (typs - {"i", "u", "f"})
            bool_int_only = not (typs - {"bool", "i", "u"})
            if not (numeric_only or bool_int_only):
                # numpy's own coercion rules are not acceptable here
                to_concat = [arr.astype("object") for arr in to_concat]

    return np.concatenate(to_concat, axis=axis)
def concat_categorical(to_concat, axis: int = 0):
    """
    Concatenate an object/categorical array of arrays, each of which is a
    single dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation; in the current implementation this is
        always 0, e.g. we only have 1D categoricals.

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes.
    """
    # we could have object blocks and categoricals here;
    # if we only have categoricals then combine everything,
    # else it's a non-compat categorical
    cats = [arr for arr in to_concat if is_categorical_dtype(arr.dtype)]

    # fastpath: every input is categorical and all dtypes match
    if len(cats) == len(to_concat):
        head = to_concat[0]
        if all(head.is_dtype_equal(rest) for rest in to_concat[1:]):
            return union_categoricals(cats)

    def _coerce(arr):
        # extract categorical values & coerce everything else to object/1-D
        if is_categorical_dtype(arr.dtype):
            return arr._internal_get_values()
        if is_datetime64tz_dtype(arr):
            return np.asarray(arr.astype(object))
        return np.asarray(arr).ravel()

    result = concat_compat([_coerce(arr) for arr in to_concat])
    if axis == 1:
        result = result.reshape(1, len(result))
    return result
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_ordered=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical
    from pandas.core.arrays.categorical import _recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # CategoricalIndex / Series -> the underlying Categorical;
        # anything that is not Categorical-like is rejected
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x.values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    # the *categories* (not the codes) must share one dtype
    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        if all(first.categories.equals(other.categories) for other in to_union[1:]):
            # categories are not just set-equal but in the same order:
            # codes can be concatenated directly
            new_codes = np.concatenate([c.codes for c in to_union])
        else:
            # same category sets but differing order: recode everything
            # onto the first input's category order
            codes = [first.codes] + [
                _recode_for_categories(other.codes, other.categories, first.categories)
                for other in to_union[1:]
            ]
            new_codes = np.concatenate(codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # indexer maps each old-category position to its position in the
            # sorted categories
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_1d

            # remap: take indexer[code] for each code; fill_value keeps
            # missing (-1) codes as -1
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            _recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
360def _concatenate_2d(to_concat, axis: int):
361 # coerce to 2d if needed & concatenate
362 if axis == 1:
363 to_concat = [np.atleast_2d(x) for x in to_concat]
364 return np.concatenate(to_concat, axis=axis)
def concat_datetime(to_concat, axis=0, typs=None):
    """
    Concatenate a datetimelike array of arrays, each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if typs is None:
        typs = get_dtype_kinds(to_concat)

    if len(typs) != 1:
        # multiple types, need to coerce to object
        objs = [_convert_datetimelike_to_object(arr) for arr in to_concat]
        return _concatenate_2d(objs, axis=axis)

    # from here on, a single dtype
    if any(typ.startswith("datetime") for typ in typs):
        if "datetime" in typs:
            # tz-naive: concatenate as int64 and view back as M8[ns]
            as_ints = [arr.astype(np.int64, copy=False) for arr in to_concat]
            return _concatenate_2d(as_ints, axis=axis).view(_NS_DTYPE)
        # tz-aware; mixed tz would have yielded len(typs) > 1 above,
        # thus no need to care here
        return _concat_datetimetz(to_concat)

    if "timedelta" in typs:
        as_ints = [arr.view(np.int64) for arr in to_concat]
        return _concatenate_2d(as_ints, axis=axis).view(_TD_DTYPE)

    if any(typ.startswith("period") for typ in typs):
        assert len(typs) == 1
        cls = to_concat[0]
        return cls._concat_same_type(to_concat)
def _convert_datetimelike_to_object(x):
    """Coerce a datetimelike array to object dtype (Timestamps/Timedeltas)."""
    if x.dtype.kind == _NS_DTYPE.kind:
        if getattr(x, "tz", None) is not None:
            # tz-aware: astype(object) handles localization
            x = np.asarray(x.astype(object))
        else:
            orig_shape = x.shape
            flat = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp")
            x = flat.reshape(orig_shape)
    elif x.dtype == _TD_DTYPE:
        orig_shape = x.shape
        flat = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
        x = flat.reshape(orig_shape)
    return x
def _concat_datetimetz(to_concat, name=None):
    """
    Concatenate DatetimeIndex-like inputs that all share the same tz.

    All inputs must be DatetimeIndex (or DatetimeArray); it is used in
    DatetimeIndex.append as well.
    """
    # Right now, internals will pass a List[DatetimeArray] here
    # for reductions like quantile. I would like to disentangle
    # all this before we get here.
    first = to_concat[0]
    if isinstance(first, ABCIndexClass):
        return first._concat_same_dtype(to_concat, name=name)
    if isinstance(first, ABCDatetimeArray):
        return first._concat_same_type(to_concat)
452def _concat_sparse(to_concat, axis=0, typs=None):
453 """
454 provide concatenation of an sparse/dense array of arrays each of which is a
455 single dtype
457 Parameters
458 ----------
459 to_concat : array of arrays
460 axis : axis to provide concatenation
461 typs : set of to_concat dtypes
463 Returns
464 -------
465 a single array, preserving the combined dtypes
466 """
468 from pandas.core.arrays import SparseArray
470 fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
471 fill_value = fill_values[0]
473 # TODO: Fix join unit generation so we aren't passed this.
474 to_concat = [
475 x
476 if isinstance(x, SparseArray)
477 else SparseArray(x.squeeze(), fill_value=fill_value)
478 for x in to_concat
479 ]
481 return SparseArray._concat_same_type(to_concat)