1""" 

2concat routines 

3""" 

4 

5from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload 

6 

7import numpy as np 

8 

9from pandas._typing import FrameOrSeriesUnion 

10 

11from pandas import DataFrame, Index, MultiIndex, Series 

12from pandas.core.arrays.categorical import ( 

13 factorize_from_iterable, 

14 factorize_from_iterables, 

15) 

16import pandas.core.common as com 

17from pandas.core.generic import NDFrame 

18from pandas.core.indexes.api import ( 

19 all_indexes_same, 

20 ensure_index, 

21 get_consensus_names, 

22 get_objs_combined_axis, 

23) 

24import pandas.core.indexes.base as ibase 

25from pandas.core.internals import concatenate_block_managers 

26 

27# --------------------------------------------------------------------- 

28# Concatenate DataFrame objects 

29 

30 

@overload
def concat(
    objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]],
    axis=0,
    join: str = "outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> "DataFrame":
    ...


@overload
def concat(
    objs: Union[
        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
    ],
    axis=0,
    join: str = "outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> FrameOrSeriesUnion:
    ...

def concat(
    objs: Union[
        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
    ],
    axis=0,
    join="outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> FrameOrSeriesUnion:
79 """ 

80 Concatenate pandas objects along a particular axis with optional set logic 

81 along the other axes. 

82 

83 Can also add a layer of hierarchical indexing on the concatenation axis, 

84 which may be useful if the labels are the same (or overlapping) on 

85 the passed axis number. 

86 

87 Parameters 

88 ---------- 

89 objs : a sequence or mapping of Series or DataFrame objects 

90 If a dict is passed, the sorted keys will be used as the `keys` 

91 argument, unless it is passed, in which case the values will be 

92 selected (see below). Any None objects will be dropped silently unless 

93 they are all None in which case a ValueError will be raised. 

94 axis : {0/'index', 1/'columns'}, default 0 

95 The axis to concatenate along. 

96 join : {'inner', 'outer'}, default 'outer' 

97 How to handle indexes on other axis (or axes). 

98 ignore_index : bool, default False 

99 If True, do not use the index values along the concatenation axis. The 

100 resulting axis will be labeled 0, ..., n - 1. This is useful if you are 

101 concatenating objects where the concatenation axis does not have 

102 meaningful indexing information. Note the index values on the other 

103 axes are still respected in the join. 

104 keys : sequence, default None 

105 If multiple levels passed, should contain tuples. Construct 

106 hierarchical index using the passed keys as the outermost level. 

107 levels : list of sequences, default None 

108 Specific levels (unique values) to use for constructing a 

109 MultiIndex. Otherwise they will be inferred from the keys. 

110 names : list, default None 

111 Names for the levels in the resulting hierarchical index. 

112 verify_integrity : bool, default False 

113 Check whether the new concatenated axis contains duplicates. This can 

114 be very expensive relative to the actual data concatenation. 

115 sort : bool, default False 

116 Sort non-concatenation axis if it is not already aligned when `join` 

117 is 'outer'. 

118 This has no effect when ``join='inner'``, which already preserves 

119 the order of the non-concatenation axis. 

120 

121 .. versionadded:: 0.23.0 

122 .. versionchanged:: 1.0.0 

123 

124 Changed to not sort by default. 

125 

126 copy : bool, default True 

127 If False, do not copy data unnecessarily. 

128 

129 Returns 

130 ------- 

131 object, type of objs 

132 When concatenating all ``Series`` along the index (axis=0), a 

133 ``Series`` is returned. When ``objs`` contains at least one 

134 ``DataFrame``, a ``DataFrame`` is returned. When concatenating along 

135 the columns (axis=1), a ``DataFrame`` is returned. 

136 

137 See Also 

138 -------- 

139 Series.append : Concatenate Series. 

140 DataFrame.append : Concatenate DataFrames. 

141 DataFrame.join : Join DataFrames using indexes. 

142 DataFrame.merge : Merge DataFrames by indexes or columns. 

143 

144 Notes 

145 ----- 

146 The keys, levels, and names arguments are all optional. 

147 

148 A walkthrough of how this method fits in with other tools for combining 

149 pandas objects can be found `here 

150 <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__. 

151 

152 Examples 

153 -------- 

154 Combine two ``Series``. 

155 

156 >>> s1 = pd.Series(['a', 'b']) 

157 >>> s2 = pd.Series(['c', 'd']) 

158 >>> pd.concat([s1, s2]) 

159 0 a 

160 1 b 

161 0 c 

162 1 d 

163 dtype: object 

164 

165 Clear the existing index and reset it in the result 

166 by setting the ``ignore_index`` option to ``True``. 

167 

168 >>> pd.concat([s1, s2], ignore_index=True) 

169 0 a 

170 1 b 

171 2 c 

172 3 d 

173 dtype: object 

174 

175 Add a hierarchical index at the outermost level of 

176 the data with the ``keys`` option. 

177 

178 >>> pd.concat([s1, s2], keys=['s1', 's2']) 

179 s1 0 a 

180 1 b 

181 s2 0 c 

182 1 d 

183 dtype: object 

184 

185 Label the index keys you create with the ``names`` option. 

186 

187 >>> pd.concat([s1, s2], keys=['s1', 's2'], 

188 ... names=['Series name', 'Row ID']) 

189 Series name Row ID 

190 s1 0 a 

191 1 b 

192 s2 0 c 

193 1 d 

194 dtype: object 

195 
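    Passing a dict instead of a list uses its sorted keys as the ``keys``
    argument, as described above.

    >>> pd.concat({'s1': s1, 's2': s2})
    s1  0    a
        1    b
    s2  0    c
        1    d
    dtype: object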

    Combine two ``DataFrame`` objects with identical columns.

    >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df1
      letter  number
    0      a       1
    1      b       2
    >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> df2
      letter  number
    0      c       3
    1      d       4
    >>> pd.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4

    Combine ``DataFrame`` objects with overlapping columns
    and return everything. Columns outside the intersection will
    be filled with ``NaN`` values.

    >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
    ...                    columns=['letter', 'number', 'animal'])
    >>> df3
      letter  number animal
    0      c       3    cat
    1      d       4    dog
    >>> pd.concat([df1, df3], sort=False)
      letter  number animal
    0      a       1    NaN
    1      b       2    NaN
    0      c       3    cat
    1      d       4    dog

    Combine ``DataFrame`` objects with overlapping columns
    and return only those that are shared by passing ``inner`` to
    the ``join`` keyword argument.

    >>> pd.concat([df1, df3], join="inner")
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4

    Combine ``DataFrame`` objects horizontally along the x axis by
    passing in ``axis=1``.

    >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
    ...                    columns=['animal', 'name'])
    >>> pd.concat([df1, df4], axis=1)
      letter  number  animal    name
    0      a       1    bird   polly
    1      b       2  monkey  george

    Prevent the result from including duplicate index values with the
    ``verify_integrity`` option.

    >>> df5 = pd.DataFrame([1], index=['a'])
    >>> df5
       0
    a  1
    >>> df6 = pd.DataFrame([2], index=['a'])
    >>> df6
       0
    a  2
    >>> pd.concat([df5, df6], verify_integrity=True)
    Traceback (most recent call last):
        ...
    ValueError: Indexes have overlapping values: ['a']
    """

    op = _Concatenator(
        objs,
        axis=axis,
        ignore_index=ignore_index,
        join=join,
        keys=keys,
        levels=levels,
        names=names,
        verify_integrity=verify_integrity,
        copy=copy,
        sort=sort,
    )

    return op.get_result()

class _Concatenator:
    """
    Orchestrates a concatenation operation for BlockManagers.
    """

    def __init__(
        self,
        objs,
        axis=0,
        join: str = "outer",
        keys=None,
        levels=None,
        names=None,
        ignore_index: bool = False,
        verify_integrity: bool = False,
        copy: bool = True,
        sort=False,
    ):
        if isinstance(objs, (NDFrame, str)):
            raise TypeError(
                "first argument must be an iterable of pandas "
                "objects, you passed an object of type "
                '"{name}"'.format(name=type(objs).__name__)
            )

        if join == "outer":
            self.intersect = False
        elif join == "inner":
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError(
                'join must be "inner" or "outer"; can only inner (intersect) '
                "or outer (union) join the other axis"
            )

        if isinstance(objs, dict):
            if keys is None:
                keys = list(objs.keys())
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if len(objs) == 0:
            raise ValueError("No objects to concatenate")

        if keys is None:
            objs = list(com.not_none(*objs))
        else:
            # #1649
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            name = getattr(keys, "name", None)
            keys = Index(clean_keys, name=name)
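        # at this point any None entries have been dropped (e.g., with
        # hypothetical inputs, concat({"a": s1, "b": None}) keeps only the
        # "a" block), and the surviving keys stay aligned with the objects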

        if len(objs) == 0:
            raise ValueError("All objects passed were None")

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, (Series, DataFrame)):
                msg = (
                    "cannot concatenate object of type '{typ}'; "
                    "only Series and DataFrame objs are valid".format(typ=type(obj))
                )
                raise TypeError(msg)

            # consolidate
            obj._consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and it must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break

        else:
            # filter out the empties if we don't have any multi-index
            # possibilities
            # note: keep empty Series, as they affect the result columns / name
            non_empties = [
                obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series)
            ]

            if len(non_empties) and (
                keys is None and names is None and levels is None and not self.intersect
            ):
                objs = non_empties
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Standardize axis parameter to int
        if isinstance(sample, Series):
            axis = DataFrame._get_axis_number(axis)
        else:
            axis = sample._get_axis_number(axis)

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0
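            # (a DataFrame's blocks are stored transposed, so the user-facing
            # row axis 0 corresponds to BlockManager axis 1, and vice versa)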

        self._is_series = isinstance(sample, Series)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError(
                "axis must be between 0 and {ndim}, input was "
                "{axis}".format(ndim=sample.ndim, axis=axis)
            )

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:

                ndim = obj.ndim
                if ndim == max_ndim:
                    pass

                elif ndim != max_ndim - 1:
                    raise ValueError(
                        "cannot concatenate unaligned mixed "
                        "dimensional NDFrame objects"
                    )

                else:
                    name = getattr(obj, "name", None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)
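        # e.g. concat([df, s], axis=1) arrives here with the Series wrapped
        # as a one-column DataFrame keyed by its name (or by a positional
        # integer when unnamed or ignore_index is set); for a row-wise
        # concat the column is instead forced to 0 so the frames line up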

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.keys = keys
        self.names = names or getattr(keys, "names", None)
        self.levels = levels
        self.sort = sort

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity
        self.copy = copy

        self.new_axes = self._get_new_axes()

    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                name = com.consensus_name_attr(self.objs)

                mgr = self.objs[0]._data.concat(
                    [x._data for x in self.objs], self.new_axes
                )
                cons = self.objs[0]._constructor
                return cons(mgr, name=name).__finalize__(self, method="concat")

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                cons = DataFrame

                index, columns = self.new_axes
                df = cons(data, index=index)
                df.columns = columns
                return df.__finalize__(self, method="concat")

        # combine block managers
        else:
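            # reindex each non-concat axis of each input to the combined
            # axes below, then stitch the blocks together along the
            # (BlockManager) concat axis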

            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy
            )
            if not self.copy:
                new_data._consolidate_inplace()

            cons = self.objs[0]._constructor
            return cons._from_axes(new_data, self.new_axes).__finalize__(
                self, method="concat"
            )

    def _get_result_dim(self) -> int:
        if self._is_series and self.axis == 1:
            return 2
        else:
            return self.objs[0].ndim

    def _get_new_axes(self) -> List[Index]:
        ndim = self._get_result_dim()
        return [
            self._get_concat_axis() if i == self.axis else self._get_comb_axis(i)
            for i in range(ndim)
        ]

    def _get_comb_axis(self, i: int) -> Index:
        data_axis = self.objs[0]._get_block_manager_axis(i)
        return get_objs_combined_axis(
            self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort
        )

    def _get_concat_axis(self) -> Index:
        """
        Return index to be used along concatenation axis.
        """
        if self._is_series:
            if self.axis == 0:
                indexes = [x.index for x in self.objs]
            elif self.ignore_index:
                idx = ibase.default_index(len(self.objs))
                return idx
            elif self.keys is None:
                names: List[Optional[Hashable]] = [None] * len(self.objs)
                num = 0
                has_names = False
                for i, x in enumerate(self.objs):
                    if not isinstance(x, Series):
                        raise TypeError(
                            "Cannot concatenate type 'Series' with "
                            f"object of type '{type(x).__name__}'"
                        )
                    if x.name is not None:
                        names[i] = x.name
                        has_names = True
                    else:
                        names[i] = num
                        num += 1
                if has_names:
                    return Index(names)
                else:
                    return ibase.default_index(len(self.objs))
            else:
                return ensure_index(self.keys).set_names(self.names)
        else:
            indexes = [x._data.axes[self.axis] for x in self.objs]

        if self.ignore_index:
            idx = ibase.default_index(sum(len(i) for i in indexes))
            return idx

        if self.keys is None:
            concat_axis = _concat_indexes(indexes)
        else:
            concat_axis = _make_concat_multiindex(
                indexes, self.keys, self.levels, self.names
            )

        self._maybe_check_integrity(concat_axis)

        return concat_axis

    def _maybe_check_integrity(self, concat_index: Index):
        if self.verify_integrity:
            if not concat_index.is_unique:
                overlap = concat_index[concat_index.duplicated()].unique()
                raise ValueError(
                    "Indexes have overlapping values: "
                    "{overlap!s}".format(overlap=overlap)
                )

def _concat_indexes(indexes) -> Index:
    return indexes[0].append(indexes[1:])

def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:

    if (levels is None and isinstance(keys[0], tuple)) or (
        levels is not None and len(levels) > 1
    ):
        zipped = list(zip(*keys))
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]
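    # at this point ``zipped`` holds one iterable per outer level, e.g.
    # keys=[("a", 1), ("a", 2)] zips to [("a", "a"), (1, 2)]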

    if not all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to the MultiIndex constructor

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError(
                        "Key {key!s} not in level {level!s}".format(
                            key=key, level=level
                        )
                    )

                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))
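        # each key's level code is repeated once per row of its input index,
        # so the outer-level codes line up with the concatenated indexes below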

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len({idx.nlevels for idx in indexes}) == 1:
                raise AssertionError(
                    "Cannot concat indices that do "
                    "not have the same number of levels"
                )

            # also copies
            names = names + get_consensus_names(indexes)

        return MultiIndex(
            levels=levels, codes=codes_list, names=names, verify_integrity=False
        )

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # fast path: all input indexes are identical, so codes can be built
    # directly with repeat/tile

    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError(
                "Values not found in passed level: {hlevel!s}".format(
                    hlevel=hlevel[mask]
                )
            )

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))
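    # with identical input indexes the result is an outer product: each key
    # code is repeated n times and the shared index codes are tiled kpieces
    # times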

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )