Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

"""
Utility functions related to concat
"""

4 

5import numpy as np 

6 

7from pandas._libs import tslib, tslibs 

8 

9from pandas.core.dtypes.common import ( 

10 _NS_DTYPE, 

11 _TD_DTYPE, 

12 is_bool_dtype, 

13 is_categorical_dtype, 

14 is_datetime64_dtype, 

15 is_datetime64tz_dtype, 

16 is_dtype_equal, 

17 is_extension_array_dtype, 

18 is_object_dtype, 

19 is_sparse, 

20 is_timedelta64_dtype, 

21) 

22from pandas.core.dtypes.generic import ( 

23 ABCCategoricalIndex, 

24 ABCDatetimeArray, 

25 ABCIndexClass, 

26 ABCRangeIndex, 

27 ABCSeries, 

28) 

29 

30 

def get_dtype_kinds(l):
    """
    Classify each array in ``l`` by a dtype "kind" label.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    set
        The set of kind strings that exist in this list of arrays.
    """

    def _classify(arr):
        # NOTE: the order of checks matters — categorical/sparse/range are
        # recognized before the plain numpy-dtype checks below.
        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            return "category"
        if is_sparse(arr):
            return "sparse"
        if isinstance(arr, ABCRangeIndex):
            return "range"
        if is_datetime64tz_dtype(arr):
            # tz-aware datetimes keep their full dtype string so that
            # inputs with different tz show up as distinct entries
            # (and the concat result becomes object dtype)
            return str(arr.dtype)
        if is_datetime64_dtype(dtype):
            return "datetime"
        if is_timedelta64_dtype(dtype):
            return "timedelta"
        if is_object_dtype(dtype):
            return "object"
        if is_bool_dtype(dtype):
            return "bool"
        if is_extension_array_dtype(dtype):
            return str(arr.dtype)
        return dtype.kind

    return {_classify(arr) for arr in l}

70 

71 

def concat_compat(to_concat, axis: int = 0):
    """
    Concatenate an array of arrays, each of which is a single 'normalized'
    dtype (in that, for example, if it's object, then it is a
    non-datetimelike), providing a combined dtype for the resulting array
    that preserves the overall dtype if possible.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    # filter empty arrays; 1-d dtypes always are included here
    def _has_data(x) -> bool:
        # arrays with fewer dims than `axis` always count as non-empty
        return True if x.ndim <= axis else x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.

    typs = get_dtype_kinds(to_concat)
    any_datetime = any(t.startswith("datetime") for t in typs)
    any_period = any(t.startswith("period") for t in typs)

    if "category" in typs:
        # must run prior to concat_datetime,
        # to support Categorical + datetime-like
        return concat_categorical(to_concat, axis=axis)

    if any_datetime or any_period or "timedelta" in typs:
        return concat_datetime(to_concat, axis=axis, typs=typs)

    if "sparse" in typs:
        # mandated to handle empties as well
        return _concat_sparse(to_concat, axis=axis, typs=typs)

    # compute emptiness BEFORE any object-coercion below
    all_empty = not any(_has_data(x) for x in to_concat)
    if axis == 1 and any(is_extension_array_dtype(x) for x in to_concat):
        # 2-D concat with extension arrays goes through object dtype
        to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]

    if all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        typs = get_dtype_kinds(to_concat)
        if len(typs) != 1:
            numeric_only = not (typs - {"i", "u", "f"})
            bool_int_only = not (typs - {"bool", "i", "u"})
            if numeric_only or bool_int_only:
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]

    return np.concatenate(to_concat, axis=axis)

138 

139 

def concat_categorical(to_concat, axis: int = 0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation; in the current implementation this is
        always 0, e.g. we only have 1D categoricals.

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes.
    """

    # we could have object blocks and categoricals here:
    # if everything is categorical with identical dtype, combine directly;
    # otherwise it's a non-compat categorical and we fall back to object
    cats = [x for x in to_concat if is_categorical_dtype(x.dtype)]
    if len(cats) == len(to_concat):
        head = to_concat[0]
        if all(head.is_dtype_equal(other) for other in to_concat[1:]):
            # all categories identical — fastpath
            return union_categoricals(cats)

    # extract the categoricals & coerce to object if needed
    def _to_object_array(x):
        if is_categorical_dtype(x.dtype):
            return x._internal_get_values()
        if is_datetime64tz_dtype(x):
            # keep localized Timestamps by going through astype(object)
            return np.asarray(x.astype(object))
        return np.asarray(x).ravel()

    result = concat_compat([_to_object_array(x) for x in to_concat])
    if axis == 1:
        result = result.reshape(1, len(result))
    return result

184 

185 

def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_ordered=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical
    from pandas.core.arrays.categorical import _recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Accept CategoricalIndex / Series[category] by extracting the
        # underlying Categorical; reject everything else.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x.values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    # the categories themselves must share a dtype before any union is possible
    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    # is_dtype_equal on CategoricalDtype compares categories AND ordered,
    # so this branch means all inputs have identical categorical dtype
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        if all(first.categories.equals(other.categories) for other in to_union[1:]):
            # categories are in the same order everywhere: codes concat directly
            new_codes = np.concatenate([c.codes for c in to_union])
        else:
            # same category sets but different orderings: recode the tail
            # onto the first input's category order
            codes = [first.codes] + [
                _recode_for_categories(other.codes, other.categories, first.categories)
                for other in to_union[1:]
            ]
            new_codes = np.concatenate(codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # map old category positions -> positions in the sorted categories
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_1d

            # remap codes through the indexer; -1 keeps NaN codes as NaN
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            _recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)

358 

359 

360def _concatenate_2d(to_concat, axis: int): 

361 # coerce to 2d if needed & concatenate 

362 if axis == 1: 

363 to_concat = [np.atleast_2d(x) for x in to_concat] 

364 return np.concatenate(to_concat, axis=axis) 

365 

366 

def concat_datetime(to_concat, axis=0, typs=None):
    """
    provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if typs is None:
        typs = get_dtype_kinds(to_concat)

    if len(typs) != 1:
        # multiple types (incl. different timezones): coerce every operand
        # to object dtype and concatenate that
        coerced = [_convert_datetimelike_to_object(x) for x in to_concat]
        return _concatenate_2d(coerced, axis=axis)

    # must be a single dtype from here on
    if any(t.startswith("datetime") for t in typs):
        if "datetime" in typs:
            # tz-naive: concatenate as i8 then reinterpret as M8[ns]
            as_i8 = [x.astype(np.int64, copy=False) for x in to_concat]
            return _concatenate_2d(as_i8, axis=axis).view(_NS_DTYPE)
        # tz-aware with one tz; different tzs imply len(typs) > 1,
        # thus no need to care here
        return _concat_datetimetz(to_concat)

    if "timedelta" in typs:
        as_i8 = [x.view(np.int64) for x in to_concat]
        return _concatenate_2d(as_i8, axis=axis).view(_TD_DTYPE)

    if any(t.startswith("period") for t in typs):
        assert len(typs) == 1
        # first operand (an array, not a class) dispatches the concat
        sample = to_concat[0]
        return sample._concat_same_type(to_concat)

413 

414 

def _convert_datetimelike_to_object(x):
    """Coerce a datetimelike array to an object-dtype array of scalars."""
    # dtype kind "M" covers datetime64, possibly tz-aware
    if x.dtype.kind == _NS_DTYPE.kind:
        if getattr(x, "tz", None) is not None:
            # tz-aware: astype(object) keeps Timestamps localized
            return np.asarray(x.astype(object))
        original_shape = x.shape
        boxed = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp")
        return boxed.reshape(original_shape)

    if x.dtype == _TD_DTYPE:
        original_shape = x.shape
        boxed = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
        return boxed.reshape(original_shape)

    # anything else passes through unchanged
    return x

433 

434 

def _concat_datetimetz(to_concat, name=None):
    """
    concat DatetimeIndex with the same tz
    all inputs must be DatetimeIndex
    it is used in DatetimeIndex.append also
    """
    # Right now, internals will pass a List[DatetimeArray] here
    # for reductions like quantile. I would like to disentangle
    # all this before we get here.
    head = to_concat[0]
    if isinstance(head, ABCIndexClass):
        return head._concat_same_dtype(to_concat, name=name)
    if isinstance(head, ABCDatetimeArray):
        return head._concat_same_type(to_concat)

450 

451 

452def _concat_sparse(to_concat, axis=0, typs=None): 

453 """ 

454 provide concatenation of an sparse/dense array of arrays each of which is a 

455 single dtype 

456 

457 Parameters 

458 ---------- 

459 to_concat : array of arrays 

460 axis : axis to provide concatenation 

461 typs : set of to_concat dtypes 

462 

463 Returns 

464 ------- 

465 a single array, preserving the combined dtypes 

466 """ 

467 

468 from pandas.core.arrays import SparseArray 

469 

470 fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] 

471 fill_value = fill_values[0] 

472 

473 # TODO: Fix join unit generation so we aren't passed this. 

474 to_concat = [ 

475 x 

476 if isinstance(x, SparseArray) 

477 else SparseArray(x.squeeze(), fill_value=fill_value) 

478 for x in to_concat 

479 ] 

480 

481 return SparseArray._concat_same_type(to_concat)