Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2SQL-style merge routines 

3""" 

4 

5import copy 

6import datetime 

7from functools import partial 

8import string 

9from typing import TYPE_CHECKING, Optional, Tuple, Union 

10import warnings 

11 

12import numpy as np 

13 

14from pandas._libs import Timedelta, hashtable as libhashtable, lib 

15import pandas._libs.join as libjoin 

16from pandas._typing import FrameOrSeries 

17from pandas.errors import MergeError 

18from pandas.util._decorators import Appender, Substitution 

19 

20from pandas.core.dtypes.common import ( 

21 ensure_float64, 

22 ensure_int64, 

23 ensure_object, 

24 is_array_like, 

25 is_bool, 

26 is_bool_dtype, 

27 is_categorical_dtype, 

28 is_datetime64tz_dtype, 

29 is_dtype_equal, 

30 is_extension_array_dtype, 

31 is_float_dtype, 

32 is_integer, 

33 is_integer_dtype, 

34 is_list_like, 

35 is_number, 

36 is_numeric_dtype, 

37 is_object_dtype, 

38 needs_i8_conversion, 

39) 

40from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries 

41from pandas.core.dtypes.missing import isna, na_value_for_dtype 

42 

43from pandas import Categorical, Index, MultiIndex 

44from pandas.core import groupby 

45import pandas.core.algorithms as algos 

46from pandas.core.arrays.categorical import _recode_for_categories 

47import pandas.core.common as com 

48from pandas.core.frame import _merge_doc 

49from pandas.core.internals import _transform_index, concatenate_block_managers 

50from pandas.core.sorting import is_int64_overflow_possible 

51 

52if TYPE_CHECKING: 

53 from pandas import DataFrame, Series # noqa:F401 

54 

55 

@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
    left,
    right,
    how: str = "inner",
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes=("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate=None,
) -> "DataFrame":
    # Thin public wrapper: all argument validation and the actual join are
    # delegated to _MergeOperation; this function only forwards its arguments.
    merge_kwargs = dict(
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        copy=copy,
        indicator=indicator,
        validate=validate,
    )
    return _MergeOperation(left, right, **merge_kwargs).get_result()

89 

90 

if __debug__:
    # Populate merge.__doc__ from the shared template at import time.  The
    # substitution is skipped under ``python -O`` (where __debug__ is False),
    # avoiding the string formatting when it is not needed.
    merge.__doc__ = _merge_doc % "\nleft : DataFrame"

93 

94 

def _groupby_and_merge(
    by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True
):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by : field to group
    on : duplicates field
    left : left frame
    right : right frame
    _merge_pieces : function for merging
    check_duplicates : bool, default True
        should we check & clean duplicates

    Returns
    -------
    tuple of (concatenated merge result, groupby object of the left frame)
    """

    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: Optional[groupby.DataFrameGroupBy] = None

    # if we can groupby the rhs
    # then we can get vastly better perf

    # we will check & remove duplicates if indicated
    if check_duplicates:
        if on is None:
            on = []
        elif not isinstance(on, (list, tuple)):
            on = [on]

        if right.duplicated(by + on).any():
            # keep="last" retains the most recent duplicate row
            _right = right.drop_duplicates(by + on, keep="last")
            # TODO: use overload to refine return type of drop_duplicates
            assert _right is not None  # needed for mypy
            right = _right
    # grouping the right side may fail when the ``by`` keys are not columns
    # of ``right``; fall back to merging each piece against the whole frame
    try:
        rby = right.groupby(by, sort=False)
    except KeyError:
        pass

    for key, lhs in lby:

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # this group key has no rows on the right side: keep the
                # left rows and append the right-only columns (all-NaN via
                # reindex), then move on to the next group
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = _merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # (broadcast the group key back into its column)
        # TODO, should _merge_pieces do this?
        for k in by:
            try:
                if k in merged:
                    merged[k] = key
            except KeyError:
                pass

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby

175 

176 

def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> "DataFrame":
    """
    Perform merge with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : Sequence, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will the be same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge
    merge_asof

    Examples
    --------
    >>> A
       key  lvalue group
    0    a       1     a
    1    c       2     a
    2    e       3     a
    3    a       1     b
    4    c       2     b
    5    e       3     b

    >>> B
       Key  rvalue
    0    b       1
    1    c       2
    2    d       3

    >>> merge_ordered(A, B, fill_method='ffill', left_by='group')
      group key  lvalue  rvalue
    0     a   a       1     NaN
    1     a   b       1     1.0
    2     a   c       2     2.0
    3     a   d       2     3.0
    4     a   e       3     3.0
    5     b   a       1     NaN
    6     b   b       1     1.0
    7     b   c       2     2.0
    8     b   d       2     3.0
    9     b   e       3     3.0
    """

    def _merger(x, y):
        # Run a single ordered merge between one pair of frames.
        return _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        ).get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")

    if left_by is not None:
        # Group the left frame and merge the right frame into each piece.
        result, _ = _groupby_and_merge(
            left_by, on, left, right, _merger, check_duplicates=False
        )
        return result

    if right_by is not None:
        # Group the right frame instead; swap the operands back inside the
        # merger so the ordered merge still sees (left, right) in order.
        result, _ = _groupby_and_merge(
            right_by,
            on,
            right,
            left,
            lambda x, y: _merger(y, x),
            check_duplicates=False,
        )
        return result

    return _merger(left, right)

305 

306 

def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> "DataFrame":
    """
    Perform an asof merge. This is similar to a left-join except that we
    match on nearest key rather than equal keys.

    Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible in versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge
    merge_ordered

    Examples
    --------
    >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
    ...                       'right_val': [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on='a')
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on='a', direction='forward')
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on='a', direction='nearest')
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]},
    ...                      index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world times-series example

    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker')
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('2ms'))
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('10ms'),
    ...               allow_exact_matches=False)
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    # Delegate to _AsOfMerge with how="asof"; the operation subclass performs
    # the actual nearest-key join and its parameter validation.
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()

542 

543 

544# TODO: transformations?? 

545# TODO: only copy DataFrames when modification necessary 

546class _MergeOperation: 

547 """ 

548 Perform a database (SQL) merge operation between two DataFrame or Series 

549 objects using either columns as keys or their row indexes 

550 """ 

551 

552 _merge_type = "merge" 

553 

    def __init__(
        self,
        left: Union["Series", "DataFrame"],
        right: Union["Series", "DataFrame"],
        how: str = "inner",
        on=None,
        left_on=None,
        right_on=None,
        axis=1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes=("_x", "_y"),
        copy: bool = True,
        indicator: bool = False,
        validate=None,
    ):
        """
        Validate the operands and merge configuration, resolve the join
        keys, and optionally run the `validate` uniqueness check.
        """
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        # keep the originals around: _maybe_restore_index_levels needs them,
        # while self.left/self.right may be re-assigned during key resolution
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how
        self.axis = axis

        # normalize scalar/None key specifications to lists
        self.on = com.maybe_make_list(on)
        self.left_on = com.maybe_make_list(left_on)
        self.right_on = com.maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # indicator may be given as a bool (use "_merge") or as the
        # desired column name itself
        self.indicator_name: Optional[str]
        if isinstance(self.indicator, str):
            self.indicator_name = self.indicator
        elif isinstance(self.indicator, bool):
            self.indicator_name = "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

        if not is_bool(left_index):
            raise ValueError(
                "left_index parameter must be of type bool, not "
                "{left_index}".format(left_index=type(left_index))
            )
        if not is_bool(right_index):
            raise ValueError(
                "right_index parameter must be of type bool, not "
                "{right_index}".format(right_index=type(right_index))
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels can give an unintended "
                "result ({left} levels on the left, {right} on the right)"
            ).format(left=_left.columns.nlevels, right=_right.columns.nlevels)
            warnings.warn(msg, UserWarning)

        self._validate_specification()

        # note this function has side effects
        # (it may drop key columns/levels from self.left/self.right)
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompat dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

638 

    def get_result(self):
        """
        Execute the configured merge and return the joined DataFrame.
        """
        # tag both operands with helper columns so the indicator column can
        # be derived after the join
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        # row indexers describing which rows of each side end up in the result
        join_index, left_indexer, right_indexer = self._get_join_info()

        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        # disambiguate overlapping column labels with the configured suffixes
        llabels, rlabels = _items_overlap_with_suffix(
            ldata.items, lsuf, rdata.items, rsuf
        )

        # axis 1 of the block managers is the row axis; a missing indexer
        # means that side's rows are taken as-is
        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        # use the left operand's constructor so DataFrame subclasses survive
        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        # both of these modify ``result`` in-place
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        return result

673 

674 def _indicator_pre_merge( 

675 self, left: "DataFrame", right: "DataFrame" 

676 ) -> Tuple["DataFrame", "DataFrame"]: 

677 

678 columns = left.columns.union(right.columns) 

679 

680 for i in ["_left_indicator", "_right_indicator"]: 

681 if i in columns: 

682 raise ValueError( 

683 "Cannot use `indicator=True` option when " 

684 "data contains a column named {name}".format(name=i) 

685 ) 

686 if self.indicator_name in columns: 

687 raise ValueError( 

688 "Cannot use name of an existing column for indicator column" 

689 ) 

690 

691 left = left.copy() 

692 right = right.copy() 

693 

694 left["_left_indicator"] = 1 

695 left["_left_indicator"] = left["_left_indicator"].astype("int8") 

696 

697 right["_right_indicator"] = 2 

698 right["_right_indicator"] = right["_right_indicator"].astype("int8") 

699 

700 return left, right 

701 

702 def _indicator_post_merge(self, result): 

703 

704 result["_left_indicator"] = result["_left_indicator"].fillna(0) 

705 result["_right_indicator"] = result["_right_indicator"].fillna(0) 

706 

707 result[self.indicator_name] = Categorical( 

708 (result["_left_indicator"] + result["_right_indicator"]), 

709 categories=[1, 2, 3], 

710 ) 

711 result[self.indicator_name] = result[self.indicator_name].cat.rename_categories( 

712 ["left_only", "right_only", "both"] 

713 ) 

714 

715 result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) 

716 return result 

717 

718 def _maybe_restore_index_levels(self, result): 

719 """ 

720 Restore index levels specified as `on` parameters 

721 

722 Here we check for cases where `self.left_on` and `self.right_on` pairs 

723 each reference an index level in their respective DataFrames. The 

724 joined columns corresponding to these pairs are then restored to the 

725 index of `result`. 

726 

727 **Note:** This method has side effects. It modifies `result` in-place 

728 

729 Parameters 

730 ---------- 

731 result: DataFrame 

732 merge result 

733 

734 Returns 

735 ------- 

736 None 

737 """ 

738 names_to_restore = [] 

739 for name, left_key, right_key in zip( 

740 self.join_names, self.left_on, self.right_on 

741 ): 

742 if ( 

743 self.orig_left._is_level_reference(left_key) 

744 and self.orig_right._is_level_reference(right_key) 

745 and name not in result.index.names 

746 ): 

747 

748 names_to_restore.append(name) 

749 

750 if names_to_restore: 

751 result.set_index(names_to_restore, inplace=True) 

752 

    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """
        Fill in the merged key columns (or index levels) of ``result`` when
        either indexer contains -1, i.e. rows with no match on that side.

        **Note:** modifies ``result`` in-place.
        """

        # lazily computed: whether each indexer contains any -1 entries
        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            # values to take from each side to rebuild the key column;
            # None means "use result[name] as-is" for that side
            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            # the result column may have been upcast; re-take
                            # the original left values in that case
                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # -1 entries in the indexer become the NA value for the dtype
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values
                mask = left_indexer == -1
                if mask.all():
                    key_col = rvals
                else:
                    key_col = Index(lvals).where(~mask, rvals)

                if result._is_label_reference(name):
                    result[name] = key_col
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        # rebuild the index, substituting the repaired level
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    # unnamed array-like key: insert under a generated name
                    result.insert(i, name or "key_{i}".format(i=i), key_col)

837 

838 def _get_join_indexers(self): 

839 """ return the join indexers """ 

840 return _get_join_indexers( 

841 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how 

842 ) 

843 

    def _get_join_info(self):
        """
        Compute the result index and the row indexers for both sides.

        Returns
        -------
        (join_index, left_indexer, right_indexer)
            ``left_indexer``/``right_indexer`` may be None when that side's
            rows are used as-is.
        """
        left_ax = self.left._data.axes[self.axis]
        right_ax = self.right._data.axes[self.axis]

        # index-on-index join: delegate to Index.join directly
        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )
        # column keys on one side against the other side's index
        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            # same helper with the operands swapped; indexers swap accordingly
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            # general key-based join
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        right_indexer,
                        how="right",
                    )
                else:
                    # empty left frame: result rows come entirely from right
                    join_index = self.right.index.take(right_indexer)
                    left_indexer = np.array([-1] * len(join_index))
            elif self.left_index:
                if len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        left_indexer,
                        how="left",
                    )
                else:
                    # empty right frame: result rows come entirely from left
                    join_index = self.left.index.take(left_indexer)
                    right_indexer = np.array([-1] * len(join_index))
            else:
                # neither side joined on its index: default RangeIndex-like
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer

894 

895 def _create_join_index( 

896 self, 

897 index: Index, 

898 other_index: Index, 

899 indexer, 

900 other_indexer, 

901 how: str = "left", 

902 ): 

903 """ 

904 Create a join index by rearranging one index to match another 

905 

906 Parameters 

907 ---------- 

908 index: Index being rearranged 

909 other_index: Index used to supply values not found in index 

910 indexer: how to rearrange index 

911 how: replacement is only necessary if indexer based on other_index 

912 

913 Returns 

914 ------- 

915 join_index 

916 """ 

917 if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): 

918 # if final index requires values in other_index but not target 

919 # index, indexer may hold missing (-1) values, causing Index.take 

920 # to take the final value in target index. So, we set the last 

921 # element to be the desired fill value. We do not use allow_fill 

922 # and fill_value because it throws a ValueError on integer indices 

923 mask = indexer == -1 

924 if np.any(mask): 

925 fill_value = na_value_for_dtype(index.dtype, compat=False) 

926 index = index.append(Index([fill_value])) 

927 return index.take(indexer) 

928 

    def _get_merge_keys(self):
        """
        Resolve the configured key specifications into concrete key arrays.

        Note: has side effects (copy/delete key columns) -- duplicate key
        columns may be dropped from ``self.left``/``self.right``.

        Returns
        -------
        (left_keys, right_keys, join_names)
            Lists of key arrays for each side and the resolved key names
            (None for anonymous array-like keys).
        """
        left_keys = []
        right_keys = []
        join_names = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        # an array-like whose length matches the frame is used directly as a
        # key; anything else is treated as a column/level label
        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_specification() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            # keys specified on both sides: resolve each pair
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:
                            # same label on both sides: keep only one copy
                            # to avoid duplicate columns in the result
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    if lk is not None:
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            # left keys against the right frame's index
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                # materialize each level of the MultiIndex as a key array
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            # right keys against the left frame's index (mirror of the above)
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

1044 

    def _maybe_coerce_merge_keys(self):
        """
        Coerce the left/right join keys to compatible dtypes, in place.

        Walks the paired join keys and, when their dtypes are incompatible,
        either silently allows the merge (compatible numeric kinds, matching
        inferred dtypes), raises ValueError (string vs non-string object
        keys, mismatched datetimelike/tz dtypes), or rewrites the offending
        column on ``self.left`` / ``self.right`` to a common dtype
        (category dtype when one side is categorical, otherwise object).
        """
        # we have valid mergees but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            # one side empty: nothing meaningful to compare
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk)
            rk_is_cat = is_categorical_dtype(rk)
            lk_is_object = is_object_dtype(lk)
            rk_is_object = is_object_dtype(rk)

            # if either left or right is a categorical
            # then the must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk.is_dtype_equal(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                # mixed categorical/non-categorical: fall through to the
                # coercion at the bottom of the loop
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            # error message used by the incompatible-dtype branches below
            msg = (
                "You are trying to merge on {lk_dtype} and "
                "{rk_dtype} columns. If you wish to proceed "
                "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype)
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk) and is_numeric_dtype(rk):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk) and is_float_dtype(lk):
                    # warn (but proceed) when float keys lose information
                    # if treated as ints
                    if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int "
                            "representation",
                            UserWarning,
                        )
                    continue

                elif is_float_dtype(rk) and is_integer_dtype(lk):
                    if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int "
                            "representation",
                            UserWarning,
                        )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk)) or (
                is_bool_dtype(lk) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk)) or (
                is_numeric_dtype(lk) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk) and not needs_i8_conversion(rk):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk) and needs_i8_conversion(rk):
                raise ValueError(msg)
            elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk):
                raise ValueError(msg)
            elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.assign(**{name: self.left[name].astype(typ)})
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.assign(**{name: self.right[name].astype(typ)})

1174 

    def _validate_specification(self):
        """
        Validate and normalize the on/left_on/right_on/index key spec.

        Side effects: populates ``self.left_on`` and ``self.right_on`` so
        that downstream code always sees two equal-length key lists
        (entries are None where an index level is used instead).

        Raises
        ------
        MergeError
            For contradictory or missing key specifications.
        ValueError
            When left_on/right_on lengths do not match the other side.
        """
        # Hm, any way to make this logic less complicated??
        if self.on is None and self.left_on is None and self.right_on is None:

            if self.left_index and self.right_index:
                self.left_on, self.right_on = (), ()
            elif self.left_index:
                # NOTE(review): within this branch right_on is necessarily
                # None (outer condition), so this always raises — looks
                # intentional as "left_index alone is not enough".
                if self.right_on is None:
                    raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                if self.left_on is None:
                    raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                common_cols = self.left.columns.intersection(self.right.columns)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        "Merge options: left_on={lon}, right_on={ron}, "
                        "left_index={lidx}, right_index={ridx}".format(
                            lon=self.left_on,
                            ron=self.right_on,
                            lidx=self.left_index,
                            ridx=self.right_index,
                        )
                    )
                if not common_cols.is_unique:
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                self.left_on = self.right_on = common_cols
        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            self.left_on = self.right_on = self.on
        elif self.left_on is not None:
            n = len(self.left_on)
            if self.right_index:
                if len(self.left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                # None placeholders: the right side joins on its index
                self.right_on = [None] * n
        elif self.right_on is not None:
            n = len(self.right_on)
            if self.left_index:
                if len(self.right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                self.left_on = [None] * n
        if len(self.right_on) != len(self.left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

1231 

1232 def _validate(self, validate: str): 

1233 

1234 # Check uniqueness of each 

1235 if self.left_index: 

1236 left_unique = self.orig_left.index.is_unique 

1237 else: 

1238 left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique 

1239 

1240 if self.right_index: 

1241 right_unique = self.orig_right.index.is_unique 

1242 else: 

1243 right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique 

1244 

1245 # Check data integrity 

1246 if validate in ["one_to_one", "1:1"]: 

1247 if not left_unique and not right_unique: 

1248 raise MergeError( 

1249 "Merge keys are not unique in either left " 

1250 "or right dataset; not a one-to-one merge" 

1251 ) 

1252 elif not left_unique: 

1253 raise MergeError( 

1254 "Merge keys are not unique in left dataset; " 

1255 "not a one-to-one merge" 

1256 ) 

1257 elif not right_unique: 

1258 raise MergeError( 

1259 "Merge keys are not unique in right dataset; " 

1260 "not a one-to-one merge" 

1261 ) 

1262 

1263 elif validate in ["one_to_many", "1:m"]: 

1264 if not left_unique: 

1265 raise MergeError( 

1266 "Merge keys are not unique in left dataset; " 

1267 "not a one-to-many merge" 

1268 ) 

1269 

1270 elif validate in ["many_to_one", "m:1"]: 

1271 if not right_unique: 

1272 raise MergeError( 

1273 "Merge keys are not unique in right dataset; " 

1274 "not a many-to-one merge" 

1275 ) 

1276 

1277 elif validate in ["many_to_many", "m:m"]: 

1278 pass 

1279 

1280 else: 

1281 raise ValueError("Not a valid argument for validate") 

1282 

1283 

def _get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
):
    """
    Compute row indexers realizing the join of two key sets.

    Parameters
    ----------
    left_keys: ndarray, Index, Series
    right_keys: ndarray, Index, Series
    sort: bool, default False
    how: string {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    tuple of (left_indexer, right_indexer)
        indexers into the left_keys, right_keys
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_key and right_keys must be the same length"

    # factorize each key level; collect labels for both sides plus the
    # number of distinct values per level
    factorized = [
        _factorize_keys(lk, rk, sort=sort) for lk, rk in zip(left_keys, right_keys)
    ]
    llab = [labels for labels, _, _ in factorized]
    rlab = [labels for _, labels, _ in factorized]
    shape = [count for _, _, count in factorized]

    # collapse the per-level labels into flat i8 keys
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # re-factorize into a dense i8 space:
    # `count` is the num. of unique keys, set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = dict(kwargs)
    if how == "left":
        kwargs["sort"] = sort

    return _join_functions[how](lkey, rkey, count, **kwargs)

1329 

1330 

def _restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index,
    lindexer,
    rindexer,
):
    """
    *this is an internal non-public method*

    Return the levels, codes and names of a multi-index to multi-index
    join, re-attaching the non-common ("dropped") levels of the operands.
    The method relies on lindexer, rindexer which hold the index positions
    of left and right, where a join was feasible.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array
        left indexer
    rindexer : intp array
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : intp array
        labels of combined multiindexes
    names : str array
        names of combined multiindexes
    """

    def _ensure_multiindex(index) -> MultiIndex:
        # For multi-multi joins with one overlapping level the result comes
        # back as a plain Index; lift it to a MultiIndex so that the
        # dropped levels can be appended uniformly.
        if isinstance(index, MultiIndex):
            return index
        return MultiIndex.from_arrays([index.values], names=[index.name])

    join_index = _ensure_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # A None indexer means the join used every position on that side.
    if lindexer is None:
        lindexer = range(left.size)
    if rindexer is None:
        rindexer = range(right.size)

    # Re-attach each dropped level from whichever side declares its name.
    for level_name in dropped_level_names:
        if level_name in left.names:
            idx, indexer = left, lindexer
        else:
            idx, indexer = right, rindexer

        # position of the level to be restored on that side
        name_idx = idx.names.index(level_name)

        restore_levels = idx.levels[name_idx]
        # positions where no join was possible (indexer[i] == -1) get
        # code -1 (missing)
        restore_codes = algos.take_nd(idx.codes[name_idx], indexer, fill_value=-1)

        join_levels = join_levels + [restore_levels]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [level_name]

    return join_levels, join_codes, join_names

1423 

1424 

class _OrderedMerge(_MergeOperation):
    """
    Merge operation that always sorts the join keys (``sort=True``) and
    optionally forward-fills unmatched rows via ``fill_method="ffill"``.
    Backs ``merge_ordered``; also the base class for _AsOfMerge.
    """

    _merge_type = "ordered_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "outer",
    ):

        # fill_method: None or "ffill" — consumed in get_result below
        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self):
        """
        Execute the merge and return the resulting frame.

        Joins the two block managers along the computed indexers,
        forward-filling the indexers first when fill_method == "ffill".
        """
        join_index, left_indexer, right_indexer = self._get_join_info()

        # this is a bit kludgy
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        # disambiguate overlapping column labels with the suffixes
        llabels, rlabels = _items_overlap_with_suffix(
            ldata.items, lsuf, rdata.items, rsuf
        )

        if self.fill_method == "ffill":
            # propagate the last matched position forward through gaps
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        # axis-1 reindexers for the block managers (empty dict = take all)
        lindexers = {1: left_join_indexer} if left_join_indexer is not None else {}
        rindexers = {1: right_join_indexer} if right_join_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        # materialize join keys as columns where required
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result

1494 

1495 

1496def _asof_function(direction: str): 

1497 name = "asof_join_{dir}".format(dir=direction) 

1498 return getattr(libjoin, name, None) 

1499 

1500 

1501def _asof_by_function(direction: str): 

1502 name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) 

1503 return getattr(libjoin, name, None) 

1504 

1505 

# Casters used to upcast asof "by" values for the hash-table based join
# routines; keyed by the C type names returned by _get_cython_type_upcast.
_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}

1511 

1512 

1513def _get_cython_type_upcast(dtype): 

1514 """ Upcast a dtype to 'int64_t', 'double', or 'object' """ 

1515 if is_integer_dtype(dtype): 

1516 return "int64_t" 

1517 elif is_float_dtype(dtype): 

1518 return "double" 

1519 else: 

1520 return "object" 

1521 

1522 

class _AsOfMerge(_OrderedMerge):
    """
    Merge operation backing ``merge_asof``: match each left key to the
    nearest right key (backward/forward/nearest), optionally within
    ``tolerance`` and within groups given by ``by``/``left_by``/``right_by``.
    """

    _merge_type = "asof_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ):

        # asof-specific options; validated in _validate_specification and
        # _get_merge_keys below
        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_specification(self):
        """
        Validate the asof key spec on top of the base checks; folds the
        'by' keys into left_on/right_on (side effect) and checks direction.
        """
        super()._validate_specification()

        # we only allow on to be a single item for on
        if len(self.left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(self.right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            # 'by' keys are prepended; the asof key stays last (_asof_key)
            self.left_on = self.left_by + list(self.left_on)
            self.right_on = self.right_by + list(self.right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(
                "direction invalid: {direction}".format(direction=self.direction)
            )

    @property
    def _asof_key(self):
        """ This is our asof key, the 'on' """
        return self.left_on[-1]

    def _get_merge_keys(self):
        """
        Compute join keys, then validate key dtypes, tolerance and
        allow_exact_matches for the asof variant.
        """

        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        "incompatible merge keys [{i}] {lkdtype} and "
                        "{rkdtype}, both sides category, but not equal ones".format(
                            i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
                        )
                    )
                else:
                    msg = (
                        "incompatible merge keys [{i}] {lkdtype} and "
                        "{rkdtype}, must be the same type".format(
                            i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
                        )
                    )
                raise MergeError(msg)

        # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:

            if self.left_index:
                lt = self.left.index
            else:
                lt = left_join_keys[-1]

            msg = (
                "incompatible tolerance {tolerance}, must be compat "
                "with type {lkdtype}".format(
                    tolerance=type(self.tolerance), lkdtype=repr(lt.dtype)
                )
            )

            # tolerance's type must match the asof key's dtype family
            if needs_i8_conversion(lt):
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = "allow_exact_matches must be boolean, passed {passed}"
            raise MergeError(msg.format(passed=self.allow_exact_matches))

        return left_join_keys, right_join_keys, join_names

    def _get_join_indexers(self):
        """ return the join indexers """

        def flip(xs):
            """ unlike np.transpose, this returns an array of tuples """
            xs = [
                x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs
            ]
            # build a structured dtype: one synthetic field ('a', 'b', ...)
            # per input array
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index.values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index.values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        msg_sorted = "{side} keys must be sorted"
        msg_missings = "Merge keys contain null values on {side} side"

        if not Index(left_values).is_monotonic:
            if isna(left_values).any():
                raise ValueError(msg_missings.format(side="left"))
            else:
                raise ValueError(msg_sorted.format(side="left"))

        if not Index(right_values).is_monotonic:
            if isna(right_values).any():
                raise ValueError(msg_missings.format(side="right"))
            else:
                raise ValueError(msg_sorted.format(side="right"))

        # initial type conversion as needed
        if needs_i8_conversion(left_values):
            # datetimelike keys are compared as raw i8; tolerance follows
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
            if tolerance is not None:
                tolerance = Timedelta(tolerance)
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            left_by_values = by_type_caster(left_by_values)
            right_by_values = by_type_caster(right_by_values)

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_function(self.direction)
            return func(left_values, right_values, self.allow_exact_matches, tolerance)

1780 

1781 

def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool):
    """
    Left-outer-join ``join_keys`` against the levels of a MultiIndex.

    Parameters
    ----------
    join_keys : list of array-like
        One key array per level of ``index``.
    index : MultiIndex
        The right-hand index being joined against.
    sort : bool
        Whether to sort the factorized keys.

    Returns
    -------
    tuple of (left_indexer, right_indexer)
        Positional indexers produced by ``libjoin.left_outer_join``.
    """

    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = [list(x) for x in zipped]
    if sort:
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                # no NaN group yet on the left: widen the level by one so
                # the right-side nulls get their own group
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)

1816 

1817 

def _get_single_indexer(join_key, index, sort: bool = False):
    """
    Left-outer-join a single key array against an index.

    Returns the (left_indexer, right_indexer) pair from
    ``libjoin.left_outer_join`` over the factorized keys.
    """
    lkey, rkey, count = _factorize_keys(join_key, index, sort=sort)

    return libjoin.left_outer_join(
        ensure_int64(lkey), ensure_int64(rkey), count, sort=sort
    )

1826 

1827 

def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False):
    """
    Left join ``join_keys`` against ``right_ax``, keeping ``left_ax`` as
    the frame of reference.

    Returns
    -------
    (join_index, left_indexer, right_indexer)
        ``left_indexer`` is None when the left index is preserved as-is
        (no sorting requested and every left row matched exactly once).
    """
    if len(join_keys) > 1:
        multi_ok = (
            isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
        )
        if not multi_ok:
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of "
                "levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        left_indexer, right_indexer = _get_single_indexer(
            join_keys[0], right_ax, sort=sort
        )

    if sort or len(left_ax) != len(left_indexer):
        # sorting requested, or 1-to-many matches changed the row count:
        # materialize the joined index
        return left_ax.take(left_indexer), left_indexer, right_indexer

    # left frame preserves order & length of its index
    return left_ax, None, right_indexer

1855 

1856 

def _right_outer_join(x, y, max_groups):
    """
    Right outer join, implemented by swapping the operands of a left
    outer join and swapping the returned indexers back.
    """
    rindexer, lindexer = libjoin.left_outer_join(y, x, max_groups)
    return lindexer, rindexer

1860 

1861 

# Dispatch table: merge `how` -> libjoin routine producing
# (left_indexer, right_indexer) from factorized keys.
_join_functions = {
    "inner": libjoin.inner_join,
    "left": libjoin.left_outer_join,
    "right": _right_outer_join,
    "outer": libjoin.full_outer_join,
}

1868 

1869 

def _factorize_keys(lk, rk, sort=True):
    """
    Encode left/right key arrays as matching integer labels.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, default True
        Relabel so that label order follows the sorted unique values.

    Returns
    -------
    llab : ndarray
        Labels for ``lk``; equal values across lk/rk share a label.
    rlab : ndarray
        Labels for ``rk``.
    count : int
        Number of distinct labels (NA, when present on either side, is
        folded into its own trailing group).
    """
    # Some pre-processing for non-ndarray lk / rk
    if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
        # factorize the underlying i8 data, ignoring the (equal) tz
        lk = getattr(lk, "_values", lk)._data
        rk = getattr(rk, "_values", rk)._data

    elif (
        is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
    ):
        if lk.categories.equals(rk.categories):
            # if we exactly match in categories, allow us to factorize on codes
            rk = rk.codes
        else:
            # Same categories in different orders -> recode
            rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk)

    elif (
        is_extension_array_dtype(lk.dtype)
        and is_extension_array_dtype(rk.dtype)
        and lk.dtype == rk.dtype
    ):
        # matching EA dtypes: factorize on the EA's internal representation
        lk, _ = lk._values_for_factorize()
        rk, _ = rk._values_for_factorize()

    # Choose the hashtable flavor matching the (possibly pre-processed) data.
    if is_integer_dtype(lk) and is_integer_dtype(rk):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        # and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass(
        rk.dtype.type, (np.timedelta64, np.datetime64)
    ):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    else:
        klass = libhashtable.Factorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    # one shared factorizer: equal values on both sides get equal labels
    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        # promote NA (-1) to its own trailing group so it joins with itself
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count

1940 

1941 

def _sort_labels(uniques: np.ndarray, left, right):
    """
    Relabel two label arrays so labels follow the sort order of ``uniques``.

    ``left`` and ``right`` are relabeled jointly (against the same sorted
    uniques) and returned as int64 arrays; -1 stays the NA sentinel.
    """
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    llength = len(left)
    combined = np.concatenate([left, right])

    _, relabeled = algos.safe_sort(uniques, combined, na_sentinel=-1)
    relabeled = ensure_int64(relabeled)

    return relabeled[:llength], relabeled[llength:]

1955 

1956 

1957def _get_join_keys(llab, rlab, shape, sort: bool): 

1958 

1959 # how many levels can be done without overflow 

1960 pred = lambda i: not is_int64_overflow_possible(shape[:i]) 

1961 nlev = next(filter(pred, range(len(shape), 0, -1))) 

1962 

1963 # get keys for the first `nlev` levels 

1964 stride = np.prod(shape[1:nlev], dtype="i8") 

1965 lkey = stride * llab[0].astype("i8", subok=False, copy=False) 

1966 rkey = stride * rlab[0].astype("i8", subok=False, copy=False) 

1967 

1968 for i in range(1, nlev): 

1969 with np.errstate(divide="ignore"): 

1970 stride //= shape[i] 

1971 lkey += llab[i] * stride 

1972 rkey += rlab[i] * stride 

1973 

1974 if nlev == len(shape): # all done! 

1975 return lkey, rkey 

1976 

1977 # densify current keys to avoid overflow 

1978 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

1979 

1980 llab = [lkey] + llab[nlev:] 

1981 rlab = [rkey] + rlab[nlev:] 

1982 shape = [count] + shape[nlev:] 

1983 

1984 return _get_join_keys(llab, rlab, shape, sort) 

1985 

1986 

1987def _should_fill(lname, rname) -> bool: 

1988 if not isinstance(lname, str) or not isinstance(rname, str): 

1989 return True 

1990 return lname == rname 

1991 

1992 

1993def _any(x) -> bool: 

1994 return x is not None and com.any_not_none(*x) 

1995 

1996 

1997def _validate_operand(obj: FrameOrSeries) -> "DataFrame": 

1998 if isinstance(obj, ABCDataFrame): 

1999 return obj 

2000 elif isinstance(obj, ABCSeries): 

2001 if obj.name is None: 

2002 raise ValueError("Cannot merge a Series without a name") 

2003 else: 

2004 return obj.to_frame() 

2005 else: 

2006 raise TypeError( 

2007 "Can only merge Series or DataFrame objects, " 

2008 "a {obj} was passed".format(obj=type(obj)) 

2009 ) 

2010 

2011 

def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
    """
    Suffix the entries of two indices that overlap.

    Labels present in both ``left`` and ``right`` get ``lsuffix`` /
    ``rsuffix`` appended on their respective side; all other labels are
    returned unchanged.  A ``None`` suffix leaves that side alone.

    Raises
    ------
    ValueError
        When the indices overlap but neither suffix was supplied.
    """
    overlap = left.intersection(right)
    if not len(overlap):
        # nothing shared — nothing to rename
        return left, right

    if not lsuffix and not rsuffix:
        raise ValueError(
            "columns overlap but no suffix specified: "
            "{rename}".format(rename=overlap)
        )

    def _suffixed(label, suffix):
        """
        Append *suffix* to *label* when it is one of the overlapping
        entries and a suffix was given; otherwise return it unchanged.
        """
        if label in overlap and suffix is not None:
            return "{x}{suffix}".format(x=label, suffix=suffix)
        return label

    return (
        _transform_index(left, partial(_suffixed, suffix=lsuffix)),
        _transform_index(right, partial(_suffixed, suffix=rsuffix)),
    )