
1""" 

2Generic data algorithms. This module is experimental at the moment and not 

3intended for public consumption 

4""" 

5import operator 

6from textwrap import dedent 

7from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union 

8from warnings import catch_warnings, simplefilter, warn 

9 

10import numpy as np 

11 

12from pandas._libs import Timestamp, algos, hashtable as htable, lib 

13from pandas._libs.tslib import iNaT 

14from pandas.util._decorators import Appender, Substitution 

15 

16from pandas.core.dtypes.cast import ( 

17 construct_1d_object_array_from_listlike, 

18 infer_dtype_from_array, 

19 maybe_promote, 

20) 

21from pandas.core.dtypes.common import ( 

22 ensure_float64, 

23 ensure_int64, 

24 ensure_object, 

25 ensure_platform_int, 

26 ensure_uint64, 

27 is_array_like, 

28 is_bool_dtype, 

29 is_categorical_dtype, 

30 is_complex_dtype, 

31 is_datetime64_any_dtype, 

32 is_datetime64_dtype, 

33 is_datetime64_ns_dtype, 

34 is_extension_array_dtype, 

35 is_float_dtype, 

36 is_integer, 

37 is_integer_dtype, 

38 is_list_like, 

39 is_numeric_dtype, 

40 is_object_dtype, 

41 is_period_dtype, 

42 is_scalar, 

43 is_signed_integer_dtype, 

44 is_timedelta64_dtype, 

45 is_unsigned_integer_dtype, 

46 needs_i8_conversion, 

47) 

48from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries 

49from pandas.core.dtypes.missing import isna, na_value_for_dtype 

50 

51import pandas.core.common as com 

52from pandas.core.construction import array, extract_array 

53from pandas.core.indexers import validate_indices 

54 

55if TYPE_CHECKING: 

56 from pandas import Series 

57 

58_shared_docs: Dict[str, str] = {} 

59 

60 

61# --------------- # 

62# dtype access # 

63# --------------- # 

64def _ensure_data(values, dtype=None): 

65 """ 

66 routine to ensure that our data is of the correct 

67 input dtype for lower-level routines 

68 

69 This will coerce: 

70 - ints -> int64 

71 - uint -> uint64 

72 - bool -> uint64 (TODO this should be uint8) 

73 - datetimelike -> i8 

74 - datetime64tz -> i8 (in local tz) 

75 - categorical -> codes 

76 

77 Parameters 

78 ---------- 

79 values : array-like 

80 dtype : pandas_dtype, optional 

81 coerce to this dtype 

82 

83 Returns 

84 ------- 

85 values : ndarray 

86 pandas_dtype : str or dtype 

87 """ 

88 

89 # we check some simple dtypes first 

90 if is_object_dtype(dtype): 

91 return ensure_object(np.asarray(values)), "object" 

92 elif is_object_dtype(values) and dtype is None: 

93 return ensure_object(np.asarray(values)), "object" 

94 

95 try: 

96 if is_bool_dtype(values) or is_bool_dtype(dtype): 

97 # we are actually coercing to uint64 

98 # until our algos support uint8 directly (see TODO) 

99 return np.asarray(values).astype("uint64"), "bool" 

100 elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): 

101 return ensure_int64(values), "int64" 

102 elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): 

103 return ensure_uint64(values), "uint64" 

104 elif is_float_dtype(values) or is_float_dtype(dtype): 

105 return ensure_float64(values), "float64" 

106 elif is_complex_dtype(values) or is_complex_dtype(dtype): 

107 

108 # ignore the fact that we are casting to float 

109 # which discards complex parts 

110 with catch_warnings(): 

111 simplefilter("ignore", np.ComplexWarning) 

112 values = ensure_float64(values) 

113 return values, "float64" 

114 

115 except (TypeError, ValueError, OverflowError): 

116 # if we are trying to coerce to a dtype 

117 # and it is incompat this will fall through to here 

118 return ensure_object(values), "object" 

119 

120 # datetimelike 

121 if ( 

122 needs_i8_conversion(values) 

123 or is_period_dtype(dtype) 

124 or is_datetime64_any_dtype(dtype) 

125 or is_timedelta64_dtype(dtype) 

126 ): 

127 if is_period_dtype(values) or is_period_dtype(dtype): 

128 from pandas import PeriodIndex 

129 

130 values = PeriodIndex(values) 

131 dtype = values.dtype 

132 elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): 

133 from pandas import TimedeltaIndex 

134 

135 values = TimedeltaIndex(values) 

136 dtype = values.dtype 

137 else: 

138 # Datetime 

139 if values.ndim > 1 and is_datetime64_ns_dtype(values): 

140 # Avoid calling the DatetimeIndex constructor as it is 1D only 

141 # Note: this is reached by DataFrame.rank calls GH#27027 

142 asi8 = values.view("i8") 

143 dtype = values.dtype 

144 return asi8, dtype 

145 

146 from pandas import DatetimeIndex 

147 

148 values = DatetimeIndex(values) 

149 dtype = values.dtype 

150 

151 return values.asi8, dtype 

152 

153 elif is_categorical_dtype(values) and ( 

154 is_categorical_dtype(dtype) or dtype is None 

155 ): 

156 values = getattr(values, "values", values) 

157 values = values.codes 

158 dtype = "category" 

159 

160 # we are actually coercing to int64 

161 # until our algos support int* directly (not all do) 

162 values = ensure_int64(values) 

163 

164 return values, dtype 

165 

166 # we have failed, return object 

167 values = np.asarray(values, dtype=np.object) 

168 return ensure_object(values), "object" 

169 

170 
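# Illustrative sketch (not part of the module): how _ensure_data coerces a
# couple of common dtypes, per the table in its docstring. Outputs shown are
# assumptions based on that table, written doctest-style.
#
# >>> _ensure_data(np.array([True, False]))
# (array([1, 0], dtype=uint64), 'bool')
# >>> _ensure_data(np.array([1, 2], dtype="int32"))
# (array([1, 2]), 'int64')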

def _reconstruct_data(values, dtype, original):
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : ndarray
    dtype : pandas_dtype
    original : ndarray-like

    Returns
    -------
    Index for extension types, otherwise ndarray casted to dtype
    """

    if is_extension_array_dtype(dtype):
        values = dtype.construct_array_type()._from_sequence(values)
    elif is_bool_dtype(dtype):
        values = values.astype(dtype, copy=False)

        # we only support object-dtype bool Index
        if isinstance(original, ABCIndexClass):
            values = values.astype(object, copy=False)
    elif dtype is not None:
        if is_datetime64_dtype(dtype):
            dtype = "datetime64[ns]"
        elif is_timedelta64_dtype(dtype):
            dtype = "timedelta64[ns]"

        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values):
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "unicode"]:
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    "float64": htable.Float64HashTable,
    "uint64": htable.UInt64HashTable,
    "int64": htable.Int64HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values):
    """
    Parameters
    ----------
    values : arraylike

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values, _ = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables[ndtype]
    return htable, values


def _get_values_for_rank(values):
    if is_categorical_dtype(values):
        values = values._values_for_rank()

    values, _ = _ensure_data(values)
    return values


def _get_data_algo(values):
    values = _get_values_for_rank(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables.get(ndtype, _hashtables["object"])

    return htable, values


def _check_object_for_strings(values) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype

# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Hash table-based unique. Uniques are returned in order
    of appearance. This does NOT sort.

    Significantly faster than numpy.unique. Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

    See Also
    --------
    Index.unique
    Series.unique

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
    ...                      pd.Timestamp('20160101')]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
          dtype=object)

    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
    ...           dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(list('baabc'))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'),
    ...                                    ordered=True)))
    [b, a, c]
    Categories (3, object): [a < b < c]

    An array of tuples

    >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """

    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    htable, values = _get_hashtable_algo(values)

    table = htable(len(values))
    uniques = table.unique(values)
    uniques = _reconstruct_data(uniques, original.dtype, original)
    return uniques

unique1d = unique


def isin(comps, values) -> np.ndarray:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = construct_1d_object_array_from_listlike(list(values))

    if is_categorical_dtype(comps):
        # TODO(extension)
        # handle categoricals
        return comps._values.isin(values)

    comps = com.values_from_object(comps)

    comps, dtype = _ensure_data(comps)
    values, _ = _ensure_data(values, dtype=dtype)

    # faster for larger cases to use np.in1d
    f = htable.ismember_object

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    if len(comps) > 1_000_000 and not is_object_dtype(comps):
        f = np.in1d
    elif is_integer_dtype(comps):
        try:
            values = values.astype("int64", copy=False)
            comps = comps.astype("int64", copy=False)
            f = htable.ismember_int64
        except (TypeError, ValueError, OverflowError):
            values = values.astype(object)
            comps = comps.astype(object)

    elif is_float_dtype(comps):
        try:
            values = values.astype("float64", copy=False)
            comps = comps.astype("float64", copy=False)
            f = htable.ismember_float64
        except (TypeError, ValueError):
            values = values.astype(object)
            comps = comps.astype(object)

    return f(comps, values)
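# Illustrative usage (an assumption based on the docstring above): the result
# is a boolean mask aligned with `comps`.
#
# >>> isin(np.array([1, 2, 3]), np.array([2, 4]))
# array([False,  True, False])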

def _factorize_array(
    values, na_sentinel: int = -1, size_hint=None, na_value=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Factorize an array-like to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).

    Returns
    -------
    codes : ndarray
    uniques : ndarray
    """
    hash_klass, values = _get_data_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)

    codes = ensure_platform_int(codes)
    return codes, uniques

_shared_docs[
    "factorize"
] = """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    %(values)s%(sort)s
    na_sentinel : int, default -1
        Value to mark "not found".
    %(size_hint)s\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note ::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    Missing values are indicated in `codes` with `na_sentinel`
    (``-1`` by default). Note that missing values are never
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    [a, c]
    Categories (3, object): [a, b, c]

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')
    """

@Substitution(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
@Appender(_shared_docs["factorize"])
def factorize(
    values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    values = _ensure_arraylike(values)
    original = values

    if is_extension_array_dtype(values):
        values = extract_array(values)
        codes, uniques = values.factorize(na_sentinel=na_sentinel)
        dtype = original.dtype
    else:
        values, dtype = _ensure_data(values)

        if original.dtype.kind in ["m", "M"]:
            na_value = na_value_for_dtype(original.dtype)
        else:
            na_value = None

        codes, uniques = _factorize_array(
            values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    uniques = _reconstruct_data(uniques, dtype, original)

    # box the uniques to match the input's container type
    if isinstance(original, ABCIndexClass):
        uniques = original._shallow_copy(uniques, name=None)
    elif isinstance(original, ABCSeries):
        from pandas import Index

        uniques = Index(uniques)

    return codes, uniques

def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> "Series":
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas.core.series import Series

    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result.values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result.values

        else:
            keys, counts = _value_counts_arraylike(values, dropna)

            result = Series(counts, index=keys, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
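# Illustrative usage (assumption): counts come back as a Series, sorted by
# count in descending order by default.
#
# >>> value_counts(np.array([1, 1, 2]))
# 1    2
# 2    1
# dtype: int64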

def _value_counts_arraylike(values, dropna: bool):
    """
    Parameters
    ----------
    values : arraylike
    dropna : bool

    Returns
    -------
    uniques : np.ndarray or ExtensionArray
    counts : np.ndarray
    """
    values = _ensure_arraylike(values)
    original = values
    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

    else:
        # ndarray like

        # TODO: handle uint8
        f = getattr(htable, f"value_count_{ndtype}")
        keys, counts = f(values, dropna)

        mask = isna(values)
        if not dropna and mask.any():
            if not isna(keys).any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

    keys = _reconstruct_data(keys, original.dtype, original)

    return keys, counts

def duplicated(values, keep="first") -> np.ndarray:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    values, _ = _ensure_data(values)
    ndtype = values.dtype.name
    f = getattr(htable, f"duplicated_{ndtype}")
    return f(values, keep=keep)
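# Illustrative usage (assumption): with the default keep='first', only later
# occurrences are flagged.
#
# >>> duplicated(np.array([1, 2, 1, 3]))
# array([False, False,  True, False])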

def mode(values, dropna: bool = True) -> "Series":
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array from which to compute the mode(s).
    dropna : boolean, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    mode : Series
    """
    from pandas import Series

    values = _ensure_arraylike(values)
    original = values

    # categorical is a fast-path
    if is_categorical_dtype(values):
        if isinstance(values, Series):
            return Series(values.values.mode(dropna=dropna), name=values.name)
        return values.mode(dropna=dropna)

    if dropna and needs_i8_conversion(values.dtype):
        mask = values.isnull()
        values = values[~mask]

    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    f = getattr(htable, f"mode_{ndtype}")
    result = f(values, dropna=dropna)
    try:
        result = np.sort(result)
    except TypeError as err:
        warn(f"Unable to sort modes: {err}")

    result = _reconstruct_data(result, original.dtype, original)
    return Series(result)
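# Illustrative usage (assumption): all tied modal values are returned,
# sorted when the dtype allows it.
#
# >>> mode(np.array([1, 2, 2, 3]))
# 0    2
# dtype: int64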

def rank(
    values,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : array-like
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that it is
          ranked at the top
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    if values.ndim == 1:
        values = _get_values_for_rank(values)
        ranks = algos.rank_1d(
            values,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        values = _get_values_for_rank(values)
        ranks = algos.rank_2d(
            values,
            axis=axis,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks
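# Illustrative usage (assumption): average ranks for a 1-D float array.
#
# >>> rank(np.array([3.0, 1.0, 2.0]))
# array([3., 1., 2.])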

def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
    else:
        to_raise = (
            ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
            or (
                (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
            ).any()
        )

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b
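# Illustrative usage (assumption): the sum is returned when it is safe,
# otherwise OverflowError is raised.
#
# >>> checked_add_with_arr(np.array([1, 2], dtype="int64"), 1)
# array([2, 3])
# >>> checked_add_with_arr(np.array([np.iinfo(np.int64).max]), 1)
# Traceback (most recent call last):
#     ...
# OverflowError: Overflow in int64 addition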

def quantile(x, q, interpolation_method="fraction"):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Percentile at which to extract score.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5

    """
    x = np.asarray(x)
    mask = isna(x)

    x = x[~mask]

    values = np.sort(x)

    def _interpolate(a, b, fraction):
        """
        Returns the point at the given fraction between a and b, where
        'fraction' must be between 0 and 1.
        """
        return a + (b - a) * fraction

    def _get_score(at):
        if len(values) == 0:
            return np.nan

        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            score = values[int(idx)]
        else:
            if interpolation_method == "fraction":
                score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
            elif interpolation_method == "lower":
                score = values[int(np.floor(idx))]
            elif interpolation_method == "higher":
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError(
                    "interpolation_method can only be 'fraction' "
                    ", 'lower' or 'higher'"
                )

        return score

    if is_scalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        result = [_get_score(x) for x in q]
        result = np.array(result, dtype=np.float64)
        return result
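# Illustrative usage (assumption): fractional interpolation between order
# statistics, scalar or array `q`.
#
# >>> quantile(np.arange(100), 0.5)
# 49.5
# >>> quantile(np.arange(100), [0.25, 0.75])
# array([24.75, 74.25])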

# --------------- #
# select n        #
# --------------- #


class SelectN:
    def __init__(self, obj, n: int, keep: str):
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')

    def nlargest(self):
        return self.compute("nlargest")

    def nsmallest(self):
        return self.compute("nsmallest")

    @staticmethod
    def is_valid_dtype_n_method(dtype) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        return (
            is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
        ) or needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method):

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()

        # slow method
        if n >= len(self.obj):
            reverse_it = self.keep == "last" or method == "nlargest"
            ascending = method == "nsmallest"
            slc = np.s_[::-1] if reverse_it else np.s_[:]
            return dropped[slc].sort_values(ascending=ascending).head(n)

        # fast method
        arr, pandas_dtype = _ensure_data(dropped.values)
        if method == "nlargest":
            arr = -arr
            if is_integer_dtype(pandas_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(pandas_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            arr = arr[::-1]

        narr = len(arr)
        n = min(n, narr)

        kth_val = algos.kth_smallest(arr.copy(), n - 1)
        (ns,) = np.nonzero(arr <= kth_val)
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return dropped.iloc[inds]

class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj, n: int, keep: str, columns):
        super().__init__(obj, n, keep)
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]
        columns = list(columns)
        self.columns = columns

    def compute(self, method):

        from pandas import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        return frame.sort_values(columns, ascending=ascending, kind="mergesort")

# ---- #
# take #
# ---- #


def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        if arr_dtype is not None:
            arr = arr.view(arr_dtype)
        if out_dtype is not None:
            out = out.view(out_dtype)
        if fill_wrap is not None:
            fill_value = fill_wrap(fill_value)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


def _convert_wrapper(f, conv_dtype):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        arr = arr.astype(conv_dtype)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
    # this is not ideal, performance-wise, but it's better than raising
    # an exception (best to optimize in Cython to avoid getting here)
    row_idx, col_idx = indexer
    if mask_info is not None:
        (row_mask, col_mask), (row_needs, col_needs) = mask_info
    else:
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
    if fill_value is not None:
        if row_needs:
            out[row_mask, :] = fill_value
        if col_needs:
            out[:, col_mask] = fill_value
    for i in range(len(row_idx)):
        u_ = row_idx[i]
        for j in range(len(col_idx)):
            v = col_idx[j]
            out[i, j] = arr[u_, v]


def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value


_take_1d_dict = {
    ("int8", "int8"): algos.take_1d_int8_int8,
    ("int8", "int32"): algos.take_1d_int8_int32,
    ("int8", "int64"): algos.take_1d_int8_int64,
    ("int8", "float64"): algos.take_1d_int8_float64,
    ("int16", "int16"): algos.take_1d_int16_int16,
    ("int16", "int32"): algos.take_1d_int16_int32,
    ("int16", "int64"): algos.take_1d_int16_int64,
    ("int16", "float64"): algos.take_1d_int16_float64,
    ("int32", "int32"): algos.take_1d_int32_int32,
    ("int32", "int64"): algos.take_1d_int32_int64,
    ("int32", "float64"): algos.take_1d_int32_float64,
    ("int64", "int64"): algos.take_1d_int64_int64,
    ("int64", "float64"): algos.take_1d_int64_float64,
    ("float32", "float32"): algos.take_1d_float32_float32,
    ("float32", "float64"): algos.take_1d_float32_float64,
    ("float64", "float64"): algos.take_1d_float64_float64,
    ("object", "object"): algos.take_1d_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_1d_int64_int64, np.int64, np.int64, np.int64
    ),
}

_take_2d_axis0_dict = {
    ("int8", "int8"): algos.take_2d_axis0_int8_int8,
    ("int8", "int32"): algos.take_2d_axis0_int8_int32,
    ("int8", "int64"): algos.take_2d_axis0_int8_int64,
    ("int8", "float64"): algos.take_2d_axis0_int8_float64,
    ("int16", "int16"): algos.take_2d_axis0_int16_int16,
    ("int16", "int32"): algos.take_2d_axis0_int16_int32,
    ("int16", "int64"): algos.take_2d_axis0_int16_int64,
    ("int16", "float64"): algos.take_2d_axis0_int16_float64,
    ("int32", "int32"): algos.take_2d_axis0_int32_int32,
    ("int32", "int64"): algos.take_2d_axis0_int32_int64,
    ("int32", "float64"): algos.take_2d_axis0_int32_float64,
    ("int64", "int64"): algos.take_2d_axis0_int64_int64,
    ("int64", "float64"): algos.take_2d_axis0_int64_float64,
    ("float32", "float32"): algos.take_2d_axis0_float32_float32,
    ("float32", "float64"): algos.take_2d_axis0_float32_float64,
    ("float64", "float64"): algos.take_2d_axis0_float64_float64,
    ("object", "object"): algos.take_2d_axis0_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

_take_2d_axis1_dict = {
    ("int8", "int8"): algos.take_2d_axis1_int8_int8,
    ("int8", "int32"): algos.take_2d_axis1_int8_int32,
    ("int8", "int64"): algos.take_2d_axis1_int8_int64,
    ("int8", "float64"): algos.take_2d_axis1_int8_float64,
    ("int16", "int16"): algos.take_2d_axis1_int16_int16,
    ("int16", "int32"): algos.take_2d_axis1_int16_int32,
    ("int16", "int64"): algos.take_2d_axis1_int16_int64,
    ("int16", "float64"): algos.take_2d_axis1_int16_float64,
    ("int32", "int32"): algos.take_2d_axis1_int32_int32,
    ("int32", "int64"): algos.take_2d_axis1_int32_int64,
    ("int32", "float64"): algos.take_2d_axis1_int32_float64,
    ("int64", "int64"): algos.take_2d_axis1_int64_int64,
    ("int64", "float64"): algos.take_2d_axis1_int64_float64,
    ("float32", "float32"): algos.take_2d_axis1_float32_float32,
    ("float32", "float64"): algos.take_2d_axis1_float32_float64,
    ("float64", "float64"): algos.take_2d_axis1_float64_float64,
    ("object", "object"): algos.take_2d_axis1_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

_take_2d_multi_dict = {
    ("int8", "int8"): algos.take_2d_multi_int8_int8,
    ("int8", "int32"): algos.take_2d_multi_int8_int32,
    ("int8", "int64"): algos.take_2d_multi_int8_int64,
    ("int8", "float64"): algos.take_2d_multi_int8_float64,
    ("int16", "int16"): algos.take_2d_multi_int16_int16,
    ("int16", "int32"): algos.take_2d_multi_int16_int32,
    ("int16", "int64"): algos.take_2d_multi_int16_int64,
    ("int16", "float64"): algos.take_2d_multi_int16_float64,
    ("int32", "int32"): algos.take_2d_multi_int32_int32,
    ("int32", "int64"): algos.take_2d_multi_int32_int64,
    ("int32", "float64"): algos.take_2d_multi_int32_float64,
    ("int64", "int64"): algos.take_2d_multi_int64_int64,
    ("int64", "float64"): algos.take_2d_multi_int64_float64,
    ("float32", "float32"): algos.take_2d_multi_float32_float32,
    ("float32", "float64"): algos.take_2d_multi_float32_float64,
    ("float64", "float64"): algos.take_2d_multi_float64_float64,
    ("object", "object"): algos.take_2d_multi_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

def _get_take_nd_function(
    ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None
):
    if ndim <= 2:
        tup = (arr_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            return func

        tup = (out_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            func = _convert_wrapper(func, out_dtype)
            return func

    def func2(arr, indexer, out, fill_value=np.nan):
        indexer = ensure_int64(indexer)
        _take_nd_object(
            arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
        )

    return func2

def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None):
    """
    Take elements from an array.

    .. versionadded:: 0.23.0

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_1d(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result

def take_nd(
    arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True
):
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : array-like
        Input array.
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : array-like
        May be the same type as the input, or cast to an ndarray.
    """
    mask_info = None

    if is_extension_array_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = extract_array(arr)
    arr = np.asarray(arr)

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                mask = indexer == -1
                needs_masking = mask.any()
                mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError("Incompatible type for fill_value")
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape_ = list(arr.shape)
        out_shape_[axis] = len(indexer)
        out_shape = tuple(out_shape_)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order="F")
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out


take_1d = take_nd
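# Illustrative usage (assumption): -1 entries in the indexer are filled; with
# the default np.nan fill, integer input is promoted to float64.
#
# >>> take_nd(np.array([10, 20, 30]), np.array([0, -1]))
# array([10., nan])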

1672def take_2d_multi(arr, indexer, fill_value=np.nan): 

1673 """ 

1674 Specialized Cython take which sets NaN values in one pass. 

1675 """ 

1676 # This is only called from one place in DataFrame._reindex_multi, 

1677 # so we know indexer is well-behaved. 

1678 assert indexer is not None 

1679 assert indexer[0] is not None 

1680 assert indexer[1] is not None 

1681 

1682 row_idx, col_idx = indexer 

1683 

1684 row_idx = ensure_int64(row_idx) 

1685 col_idx = ensure_int64(col_idx) 

1686 indexer = row_idx, col_idx 

1687 mask_info = None 

1688 

1689 # check for promotion based on types only (do this first because 

1690 # it's faster than computing a mask) 

1691 dtype, fill_value = maybe_promote(arr.dtype, fill_value) 

1692 if dtype != arr.dtype: 

1693 # check if promotion is actually required based on indexer 

1694 row_mask = row_idx == -1 

1695 col_mask = col_idx == -1 

1696 row_needs = row_mask.any() 

1697 col_needs = col_mask.any() 

1698 mask_info = (row_mask, col_mask), (row_needs, col_needs) 

1699 

1700 if not (row_needs or col_needs): 

1701 # if not, then depromote, set fill_value to dummy 

1702 # (it won't be used but we don't want the cython code 

1703 # to crash when trying to cast it to dtype) 

1704 dtype, fill_value = arr.dtype, arr.dtype.type() 

1705 

1706 # at this point, it's guaranteed that dtype can hold both the arr values 

1707 # and the fill_value 

1708 out_shape = len(row_idx), len(col_idx) 

1709 out = np.empty(out_shape, dtype=dtype) 

1710 

1711 func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) 

1712 if func is None and arr.dtype != out.dtype: 

1713 func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) 

1714 if func is not None: 

1715 func = _convert_wrapper(func, out.dtype) 

1716 if func is None: 

1717 

1718 def func(arr, indexer, out, fill_value=np.nan): 

1719 _take_2d_multi_object( 

1720 arr, indexer, out, fill_value=fill_value, mask_info=mask_info 

1721 ) 

1722 

1723 func(arr, indexer, out=out, fill_value=fill_value) 

1724 return out 

1725 

1726 

1727# ------------ # 

1728# searchsorted # 

1729# ------------ # 

1730 

1731 

1732def searchsorted(arr, value, side="left", sorter=None): 

1733 """ 

1734 Find indices where elements should be inserted to maintain order. 

1735 

1736 .. versionadded:: 0.25.0 

1737 

1738 Find the indices into a sorted array `arr` (a) such that, if the 

1739 corresponding elements in `value` were inserted before the indices, 

1740 the order of `arr` would be preserved. 

1741 

1742 Assuming that `arr` is sorted: 

1743 

1744 ====== ================================ 

1745 `side` returned index `i` satisfies 

1746 ====== ================================ 

1747 left ``arr[i-1] < value <= arr[i]`` 

1748 right ``arr[i-1] <= value < arr[i]`` 

1749 ====== ================================ 

1750 

1751 Parameters 

1752 ---------- 

1753 arr : array-like 

1754 Input array. If `sorter` is None, then it must be sorted in 

1755 ascending order, otherwise `sorter` must be an array of indices 

1756 that sort it. 

1757 value : array_like 

1758 Values to insert into `arr`. 

1759 side : {'left', 'right'}, optional 

1760 If 'left', the index of the first suitable location found is given. 

1761 If 'right', return the last such index. If there is no suitable 

1762 index, return either 0 or N (where N is the length of `arr`). 

1763 sorter : 1-D array_like, optional 

1764 Optional array of integer indices that sort `arr` into ascending 

1765 order. They are typically the result of argsort. 

1766 

1767 Returns 

1768 ------- 

1769 array of ints 

1770 Array of insertion points with the same shape as `value`. 

1771 

1772 See Also 

1773 -------- 

1774 numpy.searchsorted : Similar method from NumPy. 
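    Examples
    --------
    A minimal sketch (illustrative, not part of the original docstring):

    >>> import numpy as np
    >>> searchsorted(np.array([1, 2, 3, 5]), 4)
    3
    >>> searchsorted(np.array([1, 2, 3, 5]), 3, side="right")
    3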

1775 """ 

1776 if sorter is not None: 

1777 sorter = ensure_platform_int(sorter) 

1778 

1779 if ( 

1780 isinstance(arr, np.ndarray) 

1781 and is_integer_dtype(arr) 

1782 and (is_integer(value) or is_integer_dtype(value)) 

1783 ): 

1784 # if `arr` and `value` have different dtypes, `arr` would be 

1785 # recast by numpy, causing a slow search. 

1786 # Before searching below, we therefore try to give `value` the 

1787 # same dtype as `arr`, while guarding against integer overflows. 

1788 iinfo = np.iinfo(arr.dtype.type) 

1789 value_arr = np.array([value]) if is_scalar(value) else np.array(value) 

1790 if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): 

1791 # value within bounds, so no overflow, so can convert value dtype 

1792 # to dtype of arr 

1793 dtype = arr.dtype 

1794 else: 

1795 dtype = value_arr.dtype 

1796 

1797 if is_scalar(value): 

1798 value = dtype.type(value) 

1799 else: 

1800 value = array(value, dtype=dtype) 

1801 elif not ( 

1802 is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) 

1803 ): 

1804 # E.g. if `arr` is an array with dtype='datetime64[ns]' 

1805 # and `value` is a pd.Timestamp, we may need to convert value 

1806 value_ser = array([value]) if is_scalar(value) else array(value) 

1807 value = value_ser[0] if is_scalar(value) else value_ser 

1808 if isinstance(value, Timestamp) and value.tzinfo is None: 

1809 value = value.to_datetime64() 

1810 

1811 result = arr.searchsorted(value, side=side, sorter=sorter) 

1812 return result 
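# Note on the integer fast path above (added illustration): searching an
# int8 array for the Python int 300 would overflow int8, so ``value`` keeps
# the wider dtype inferred by numpy and numpy recasts ``arr`` instead; for
# an in-range value such as 3, ``value`` is cast down to int8 and the
# search runs against the original array without any recast.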

1813 

1814 

1815# ---- # 

1816# diff # 

1817# ---- # 

1818 

1819_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} 

1820 

1821 

1822def diff(arr, n: int, axis: int = 0, stacklevel=3): 

1823 """ 

1824 Compute the difference of ``n`` periods along the given axis, 

1825 analogous to ``s - s.shift(n)``. 

1826 

1827 Parameters 

1828 ---------- 

1829 arr : ndarray 

1830 n : int 

1831 Number of periods to shift. 

1832 axis : int 

1833 Axis to shift on. 

1834 stacklevel : int 

1835 The stacklevel for the lost dtype warning. 

1836 

1837 Returns 

1838 ------- 

1839 shifted : ndarray 

1840 """ 

1841 from pandas.core.arrays import PandasDtype 

1842 

1843 n = int(n) 

1844 na = np.nan 

1845 dtype = arr.dtype 

1846 

1847 if dtype.kind == "b": 

1848 op = operator.xor 

1849 else: 

1850 op = operator.sub 

1851 

1852 if isinstance(dtype, PandasDtype): 

1853 # PandasArray cannot necessarily hold shifted versions of itself. 

1854 arr = np.asarray(arr) 

1855 dtype = arr.dtype 

1856 

1857 if is_extension_array_dtype(dtype): 

1858 if hasattr(arr, f"__{op.__name__}__"): 

1859 return op(arr, arr.shift(n)) 

1860 else: 

1861 warn( 

1862 "dtype lost in 'diff()'. In the future this will raise a " 

1863 "TypeError. Convert to a suitable dtype prior to calling 'diff'.", 

1864 FutureWarning, 

1865 stacklevel=stacklevel, 

1866 ) 

1867 arr = np.asarray(arr) 

1868 dtype = arr.dtype 

1869 

1870 is_timedelta = False 

1871 is_bool = False 

1872 if needs_i8_conversion(arr): 

1873 dtype = np.float64 

1874 arr = arr.view("i8") 

1875 na = iNaT 

1876 is_timedelta = True 
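        # NB (added note): despite the flag name, this branch covers all
        # datetimelike values (datetime64 as well as timedelta64); both are
        # viewed as int64 for the subtraction, and the result is
        # reinterpreted as timedelta64[ns] at the end of this function.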

1877 

1878 elif is_bool_dtype(dtype): 

1879 dtype = np.object_ 

1880 is_bool = True 

1881 

1882 elif is_integer_dtype(dtype): 

1883 dtype = np.float64 

1884 

1885 dtype = np.dtype(dtype) 

1886 out_arr = np.empty(arr.shape, dtype=dtype) 

1887 

1888 na_indexer = [slice(None)] * arr.ndim 

1889 na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) 

1890 out_arr[tuple(na_indexer)] = na 
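    # Worked example (illustrative): for a 1-d array with n=2, na_indexer
    # is [slice(None, 2)], so out_arr[:2] is pre-filled with ``na``; for
    # n=-2 it is [slice(-2, None)], filling the trailing two positions.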

1891 

1892 if arr.ndim == 2 and arr.dtype.name in _diff_special: 

1893 # TODO: can diff_2d dtype specialization troubles be fixed by defining 

1894 # out_arr inside diff_2d? 

1895 algos.diff_2d(arr, out_arr, n, axis) 

1896 else: 

1897 # To keep mypy happy, _res_indexer is a list while res_indexer is 

1898 # a tuple, ditto for lag_indexer. 

1899 _res_indexer = [slice(None)] * arr.ndim 

1900 _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) 

1901 res_indexer = tuple(_res_indexer) 

1902 

1903 _lag_indexer = [slice(None)] * arr.ndim 

1904 _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) 

1905 lag_indexer = tuple(_lag_indexer) 
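        # Illustration (added note): in the 1-d case with n > 0 this reduces
        # to out_arr[n:] = arr[n:] - arr[:-n], i.e. each element minus the
        # element n positions before it.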

1906 

1907 # need to make sure that we account for na for datelike/timedelta 

1908 # we don't actually want to subtract these i8 numbers 

1909 if is_timedelta: 

1910 res = arr[res_indexer] 

1911 lag = arr[lag_indexer] 

1912 

1913 mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na) 

1914 if mask.any(): 

1915 res = res.copy() 

1916 res[mask] = 0 

1917 lag = lag.copy() 

1918 lag[mask] = 0 

1919 

1920 result = res - lag 

1921 result[mask] = na 

1922 out_arr[res_indexer] = result 

1923 elif is_bool: 

1924 out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] 

1925 else: 

1926 out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] 

1927 

1928 if is_timedelta: 

1929 out_arr = out_arr.astype("int64").view("timedelta64[ns]") 

1930 

1931 return out_arr 
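# Usage sketch (illustrative only):
#
#     import numpy as np
#     from pandas.core.algorithms import diff
#
#     diff(np.array([1, 3, 6, 10], dtype=np.int64), 1)
#
# Integer input is upcast to float64 so the leading positions can hold
# NaN: array([nan, 2., 3., 4.]).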

1932 

1933 

1934# -------------------------------------------------------------------- 

1935# Helper functions 

1936 

1937# Note: safe_sort is in algorithms.py instead of sorting.py because it is 

1938 # low-dependency, is used in this module, and uses private methods from 

1939 # this module. 

1940def safe_sort( 

1941 values, 

1942 codes=None, 

1943 na_sentinel: int = -1, 

1944 assume_unique: bool = False, 

1945 verify: bool = True, 

1946) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: 

1947 """ 

1948 Sort ``values`` and reorder corresponding ``codes``. 

1949 

1950 ``values`` should be unique if ``codes`` is not None. 

1951 Safe for use with mixed types (int, str), orders ints before strs. 

1952 

1953 Parameters 

1954 ---------- 

1955 values : list-like 

1956 Sequence; must be unique if ``codes`` is not None. 

1957 codes : list_like, optional 

1958 Indices to ``values``. All out-of-bounds indices are treated as 

1959 "not found" and will be masked with ``na_sentinel``. 

1960 na_sentinel : int, default -1 

1961 Value in ``codes`` to mark "not found". 

1962 Ignored when ``codes`` is None. 

1963 assume_unique : bool, default False 

1964 When True, ``values`` are assumed to be unique, which can speed up 

1965 the calculation. Ignored when ``codes`` is None. 

1966 verify : bool, default True 

1967 Check whether codes are out of bounds for the values and mask 

1968 out-of-bounds codes with na_sentinel. If ``verify=False``, it is 

1969 assumed there are no out-of-bounds codes. Ignored when ``codes`` is None. 

1970 

1971 .. versionadded:: 0.25.0 

1972 

1973 Returns 

1974 ------- 

1975 ordered : ndarray 

1976 Sorted ``values`` 

1977 new_codes : ndarray 

1978 Reordered ``codes``; returned when ``codes`` is not None. 

1979 

1980 Raises 

1981 ------ 

1982 TypeError 

1983 * If ``values`` is not list-like or if ``codes`` is neither None 

1984 nor list-like 

1985 * If ``values`` cannot be sorted 

1986 ValueError 

1987 * If ``codes`` is not None and ``values`` contain duplicates. 

1988 """ 

1989 if not is_list_like(values): 

1990 raise TypeError( 

1991 "Only list-like objects are allowed to be passed to safe_sort as values" 

1992 ) 

1993 

1994 if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): 

1995 # don't convert to string types 

1996 dtype, _ = infer_dtype_from_array(values) 

1997 values = np.asarray(values, dtype=dtype) 

1998 

1999 def sort_mixed(values): 

2000 # order ints before strings, safe in py3 

2001 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) 

2002 nums = np.sort(values[~str_pos]) 

2003 strs = np.sort(values[str_pos]) 

2004 return np.concatenate([nums, np.asarray(strs, dtype=object)]) 
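    # e.g. (added illustration): sort_mixed(np.array([3, "a", 1], dtype=object))
    # gives array([1, 3, 'a'], dtype=object) -- all ints first, then strings.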

2005 

2006 sorter = None 

2007 if ( 

2008 not is_extension_array_dtype(values) 

2009 and lib.infer_dtype(values, skipna=False) == "mixed-integer" 

2010 ): 

2011 # unorderable in py3 if mixed str/int 

2012 ordered = sort_mixed(values) 

2013 else: 

2014 try: 

2015 sorter = values.argsort() 

2016 ordered = values.take(sorter) 

2017 except TypeError: 

2018 # try this anyway 

2019 ordered = sort_mixed(values) 

2020 

2021 # codes: 

2022 

2023 if codes is None: 

2024 return ordered 

2025 

2026 if not is_list_like(codes): 

2027 raise TypeError( 

2028 "Only list-like objects or None are allowed to " 

2029 "be passed to safe_sort as codes" 

2030 ) 

2031 codes = ensure_platform_int(np.asarray(codes)) 

2032 

2033 from pandas import Index 

2034 

2035 if not assume_unique and not Index(values).is_unique: 

2036 raise ValueError("values should be unique if codes is not None") 

2037 

2038 if sorter is None: 

2039 # mixed types 

2040 hash_klass, values = _get_data_algo(values) 

2041 t = hash_klass(len(values)) 

2042 t.map_locations(values) 

2043 sorter = ensure_platform_int(t.lookup(ordered)) 

2044 

2045 if na_sentinel == -1: 

2046 # take_1d is faster, but only works for na_sentinels of -1 

2047 order2 = sorter.argsort() 

2048 new_codes = take_1d(order2, codes, fill_value=-1) 

2049 if verify: 

2050 mask = (codes < -len(values)) | (codes >= len(values)) 

2051 else: 

2052 mask = None 

2053 else: 

2054 reverse_indexer = np.empty(len(sorter), dtype=np.int_) 

2055 reverse_indexer.put(sorter, np.arange(len(sorter))) 

2056 # Out of bound indices will be masked with `na_sentinel` next, so we 

2057 # may deal with them here without performance loss using `mode='wrap'` 

2058 new_codes = reverse_indexer.take(codes, mode="wrap") 
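        # Illustration (added note): with sorter == [2, 0, 1],
        # reverse_indexer == [1, 2, 0]; an out-of-bounds code of 5 on
        # length-3 values wraps to position 2 here and is then masked to
        # ``na_sentinel`` below when ``verify`` is True.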

2059 

2060 mask = codes == na_sentinel 

2061 if verify: 

2062 mask = mask | (codes < -len(values)) | (codes >= len(values)) 

2063 

2064 if mask is not None: 

2065 np.putmask(new_codes, mask, na_sentinel) 

2066 

2067 return ordered, ensure_platform_int(new_codes)
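# Usage sketch (illustrative only):
#
#     import numpy as np
#     from pandas.core.algorithms import safe_sort
#
#     values = np.array([3, 1, 2])
#     codes = np.array([0, 1, -1, 2])
#     safe_sort(values, codes)
#
# returns (array([1, 2, 3]), array([ 2,  0, -1,  1])) -- values sorted,
# and codes remapped so they keep pointing at the same elements, with the
# na_sentinel (-1) preserved.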