
"""
SparseArray data structure
"""
from collections import abc
import numbers
import operator
from typing import Any, Callable
import warnings

import numpy as np

from pandas._libs import index as libindex, lib
import pandas._libs.sparse as splib
from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex
from pandas._libs.tslibs import NaT
import pandas.compat as compat
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.cast import (
    astype_nansafe,
    construct_1d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_dtype_equal,
    is_integer,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays.sparse.dtype import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import sanitize_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import interpolate_2d
import pandas.core.ops as ops
from pandas.core.ops.common import unpack_zerodim_and_defer

import pandas.io.formats.printing as printing

# ----------------------------------------------------------------------------
# Array


_sparray_doc_kwargs = dict(klass="SparseArray")


def _get_fill(arr: ABCSparseArray) -> np.ndarray:
    """
    Create a 0-dim ndarray containing the fill value

    Parameters
    ----------
    arr : SparseArray

    Returns
    -------
    fill_value : ndarray
        0-dim ndarray with just the fill value.

    Notes
    -----
    Coerce fill_value to arr.dtype if possible. An int64 SparseArray can
    have NaN as its fill_value when there are no missing values.
    """
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
    except ValueError:
        return np.asarray(arr.fill_value)


def _sparse_array_op(
    left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str
) -> Any:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
        left = left.astype(ltype)
        right = right.astype(rtype)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = np.bool_
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)


def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = np.bool_

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )

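# Illustrative sketch (editor's addition, not part of the original module):
# _sparse_array_op applies the operation to the sp_values and to the fill
# values, so a binary op between two SparseArrays stays sparse. Expected
# behavior for this pandas vintage, assuming pandas is importable as pd:
#
# >>> a = pd.arrays.SparseArray([0, 0, 1, 2])
# >>> b = pd.arrays.SparseArray([0, 1, 0, 2])
# >>> a + b
# [0, 1, 1, 4]
# Fill: 0
# IntIndex
# Indices: array([1, 2, 3], dtype=int32)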

class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
    """
    An ExtensionArray for storing sparse data.

    .. versionchanged:: 0.24.0

       Implements the ExtensionArray interface.

    Parameters
    ----------
    data : array-like
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    index : Index
    fill_value : scalar, optional
        Elements in `data` that are `fill_value` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        False
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : {'integer', 'block'}, default 'integer'
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill-value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    _pandas_ftype = "sparse"
    _subtyp = "sparse_array"  # register ABCSparseArray
    _deprecations = PandasObject._deprecations | frozenset(["get_values"])
    _sparse_index: SparseIndex

    def __init__(
        self,
        data,
        sparse_index=None,
        index=None,
        fill_value=None,
        kind="integer",
        dtype=None,
        copy=False,
    ):

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index")

        if is_scalar(data):
            if index is not None:
                if data is None:
                    data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            dtype = infer_dtype_from_scalar(data)[0]
            data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # XXX: What should the empty dtype be? Object or float?
            data = np.array([], dtype=dtype)

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            sparse_values = np.asarray(data.sp_values, dtype=dtype)
        elif sparse_index is None:
            sparse_values, sparse_index, fill_value = make_sparse(
                data, kind=kind, fill_value=fill_value, dtype=dtype
            )
        else:
            sparse_values = np.asarray(data, dtype=dtype)
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
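    # Illustrative sketch (editor's addition, not part of the original
    # module): the fill_value precedence documented above. An explicit
    # ``fill_value`` argument wins over the one carried by a SparseDtype.
    # Expected behavior, assuming pandas is importable as pd:
    #
    # >>> dtype = pd.SparseDtype("int64", fill_value=0)
    # >>> pd.arrays.SparseArray([1, 0, 0], dtype=dtype).fill_value
    # 0
    # >>> pd.arrays.SparseArray([1, 0, 0], fill_value=1, dtype=dtype).fill_value
    # 1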

    @classmethod
    def _simple_new(
        cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype
    ) -> "SparseArray":
        new = cls([])
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls, data):
        """
        Create a SparseArray from a scipy.sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.sp_matrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        arr = data.data
        idx, _ = data.nonzero()
        loc = np.argsort(idx)
        arr = arr.take(loc)
        idx.sort()

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype=None, copy=True) -> np.ndarray:
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.to_int_index().indices] = self.sp_values
        return out
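    # Illustrative sketch (editor's addition, not part of the original
    # module): ``__array__`` densifies by writing sp_values into an array
    # full of fill values. Expected behavior, assuming pandas as pd and
    # numpy as np:
    #
    # >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
    # >>> np.asarray(arr)
    # array([0, 0, 1, 2])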

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self):
        """
        The SparseIndex containing the location of non-``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self):
        """
        An ndarray containing the non-``fill_value`` values.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self):
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value):
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> str:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self):
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self):
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value):
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self):
        """
        The percent of non-``fill_value`` points, as a decimal.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        r = float(self.sp_index.npoints) / float(self.sp_index.length)
        return r

    @property
    def npoints(self) -> int:
        """
        The number of non-``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)

    def fillna(self, value=None, method=None, limit=None):
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as the entire array is first converted to an
               in-memory ndarray.

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This preserves the
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(msg, PerformanceWarning)
            filled = interpolate_2d(np.asarray(self), method=method, limit=limit)
            return type(self)(filled, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

            return self._simple_new(new_values, self._sparse_index, new_dtype)
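    # Illustrative sketch (editor's addition, not part of the original
    # module): filling with a value when ``self.fill_value`` is NA just
    # swaps the fill value in the dtype, keeping memory use flat.
    # Expected behavior, assuming pandas as pd and numpy as np:
    #
    # >>> arr = pd.arrays.SparseArray([np.nan, 1.0, np.nan])
    # >>> arr.fillna(0)
    # [0.0, 1.0, 0.0]
    # Fill: 0.0
    # IntIndex
    # Indices: array([1], dtype=int32)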

    def shift(self, periods=1, fill_value=None):

        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])
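    # Illustrative sketch (editor's addition, not part of the original
    # module): with the default ``fill_value=None``, shifted-in positions
    # get ``self.dtype.na_value``. Expected behavior, assuming pandas as pd:
    #
    # >>> pd.arrays.SparseArray([1.0, 2.0, 3.0]).shift(1)
    # [nan, 1.0, 2.0]
    # Fill: nan
    # IntIndex
    # Indices: array([1, 2], dtype=int32)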

    def _first_fill_value_loc(self):
        """
        Get the location of the first missing value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.to_int_index().indices
        if not len(indices) or indices[0] > 0:
            return 0

        diff = indices[1:] - indices[:-1]
        return np.searchsorted(diff, 2) + 1

    def unique(self):
        uniques = list(algos.unique(self.sp_values))
        fill_loc = self._first_fill_value_loc()
        if fill_loc >= 0:
            uniques.insert(fill_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)
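    # Illustrative sketch (editor's addition, not part of the original
    # module): ``unique`` inserts the fill value at the position where it
    # first occurs in dense order. Expected behavior, assuming pandas as pd:
    #
    # >>> pd.arrays.SparseArray([1, 0, 0, 2], fill_value=0).unique()
    # [1, 0, 2]
    # Fill: 0
    # IntIndex
    # Indices: array([0, 2], dtype=int32)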

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(self, na_sentinel=-1):
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
        uniques = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques

    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import Index, Series

        keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0:
            if self._null_fill_value and dropna:
                pass
            else:
                if self._null_fill_value:
                    mask = isna(keys)
                else:
                    mask = keys == self.fill_value

                if mask.any():
                    counts[mask] += fcounts
                else:
                    keys = np.insert(keys, 0, self.fill_value)
                    counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndexClass):
            keys = Index(keys)
        result = Series(counts, index=keys)
        return result
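    # Illustrative sketch (editor's addition, not part of the original
    # module): the gap count (``ngaps``) is folded into the count for the
    # fill value. Expected behavior, assuming pandas as pd:
    #
    # >>> pd.arrays.SparseArray([0, 0, 1, 1]).value_counts()
    # 0    2
    # 1    2
    # dtype: int64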

    # --------
    # Indexing
    # --------

    def __getitem__(self, key):
        # avoid mypy issues when importing at the top-level
        from pandas.core.indexing import check_bool_indexer

        if isinstance(key, tuple):
            if len(key) > 1:
                raise IndexError("too many indices for array.")
            key = key[0]

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            data_slice = self.to_dense()[key]
        elif isinstance(key, slice):
            # special case to preserve dtypes
            if key == slice(None):
                return self.copy()
            # TODO: this logic is surely elsewhere
            # TODO: this could be more efficient
            indices = np.arange(len(self), dtype=np.int32)[key]
            return self.take(indices)
        else:
            # TODO: I think we can avoid densifying when masking a
            # boolean SparseArray with another. Need to look at the
            # key's fill_value for True / False, and then do an intersection
            # on the indices of the sp_values.
            if isinstance(key, SparseArray):
                if is_bool_dtype(key):
                    key = key.to_dense()
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                key = check_bool_indexer(self, key)

                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)

    def _get_val_at(self, loc):
        n = len(self)
        if loc < 0:
            loc += n

        if loc >= n or loc < 0:
            raise IndexError("Out of bounds access")

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            return libindex.get_value_at(self.sp_values, sp_loc)

    def take(self, indices, allow_fill=False, fill_value=None):
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        if indices.size == 0:
            result = []
            kwargs = {"dtype": self.dtype}
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
            kwargs = {}
        else:
            result = self._take_without_fill(indices)
            kwargs = {"dtype": self.dtype}

        return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs)
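    # Illustrative sketch (editor's addition, not part of the original
    # module): with ``allow_fill=True``, an index of -1 means "insert
    # ``fill_value``" (defaulting to ``self.dtype.na_value``), which may
    # coerce the subtype. Expected behavior, assuming pandas as pd:
    #
    # >>> arr = pd.arrays.SparseArray([0, 1, 2])
    # >>> arr.take([1, -1], allow_fill=True)
    # [1.0, nan]
    # Fill: 0
    # IntIndex
    # Indices: array([0, 1], dtype=int32)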

    def _take_with_fill(self, indices, fill_value=None):
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        sp_indexer = self.sp_index.lookup_array(indices)

        if self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            taken = np.full(
                sp_indexer.shape,
                fill_value=fill_value,
                dtype=np.result_type(type(fill_value)),
            )
        else:
            taken = self.sp_values.take(sp_indexer)

            # sp_indexer may be -1 for two reasons
            # 1.) we took for an index of -1 (new)
            # 2.) we took a value that was self.fill_value (old)
            new_fill_indices = indices == -1
            old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken

    def _take_without_fill(self, indices):
        to_shift = indices < 0
        indices = indices.copy()

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices[to_shift] += n

        if self.sp_index.npoints == 0:
            # edge case in take...
            # I think just return
            out = np.full(
                indices.shape,
                self.fill_value,
                dtype=np.result_type(type(self.fill_value)),
            )
            arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
            return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)

        sp_indexer = self.sp_index.lookup_array(indices)
        taken = self.sp_values.take(sp_indexer)
        fillable = sp_indexer < 0

        if fillable.any():
            # TODO: may need to coerce array to fill value
            result_type = np.result_type(taken, type(self.fill_value))
            taken = taken.astype(result_type)
            taken[fillable] = self.fill_value

        return taken

    def searchsorted(self, v, side="left", sorter=None):
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=2)
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self):
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(cls, to_concat):
        fill_values = [x.fill_value for x in to_concat]

        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all-NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=6,
            )
            keep = to_concat[0]
            to_concat2 = [keep]

            for arr in to_concat[1:]:
                to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))

            to_concat = to_concat2

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                idx = arr.sp_index.to_int_index().indices.copy()
                idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(idx)

            data = np.concatenate(values)
            indices = np.concatenate(indices)
            sp_index = IntIndex(length, indices)

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(idx.blocs.copy() + length)
                blengths.append(idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs = np.concatenate(blocs)
            blengths = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs, blengths)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
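    # Illustrative sketch (editor's addition, not part of the original
    # module): concatenating arrays whose fill values differ keeps the
    # first fill value, emits a PerformanceWarning, and re-sparsifies the
    # rest against it. Expected behavior, assuming pandas as pd:
    #
    # >>> import warnings
    # >>> a = pd.arrays.SparseArray([0, 1], fill_value=0)
    # >>> b = pd.arrays.SparseArray([1, 2], fill_value=1)
    # >>> with warnings.catch_warnings():
    # ...     warnings.simplefilter("ignore")
    # ...     pd.arrays.SparseArray._concat_same_type([a, b])
    # [0, 1, 1, 2]
    # Fill: 0
    # IntIndex
    # Indices: array([1, 2, 3], dtype=int32)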

    def astype(self, dtype=None, copy=True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(np.dtype('int32'))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(np.dtype('float64'))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0, 0, 1.0, 2.0]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Use a SparseDtype if you wish to change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        dtype = self.dtype.update_dtype(dtype)
        subtype = dtype._subtype_with_str
        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
        if sp_values is self.sp_values and copy:
            sp_values = sp_values.copy()

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self, mapper):
        """
        Map categories using input correspondence (dict, Series, or function).

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_type" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self):
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    _internal_get_values = to_dense

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state):
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self):
        if self.fill_value == 0:
            return (self.sp_index.to_int_index().indices,)
        else:
            return (self.sp_index.to_int_index().indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name, skipna=True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        # we don't support these kwargs.
        # They should only be present when called via pandas, so do it here.
        # instead of in `any` / `all` (which will raise if they're present,
        # thanks to nv.validate)
        kwargs.pop("filter_type", None)
        kwargs.pop("numeric_only", None)
        kwargs.pop("op", None)
        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis=0, *args, **kwargs):
        """
        Tests whether at least one of elements evaluate True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(self, axis=0, *args, **kwargs):
        """
        Sum of non-NA/null values

        Returns
        -------
        sum : float
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        if self._null_fill_value:
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            return sp_sum + self.fill_value * nsparse
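    # Illustrative sketch (editor's addition, not part of the original
    # module): with a non-NA fill value, the dense sum is recovered as
    # sp_sum + fill_value * ngaps without materializing the gaps.
    # Expected behavior, assuming pandas as pd:
    #
    # >>> arr = pd.arrays.SparseArray([1, 1, 2, 2], fill_value=1)
    # >>> arr.sum()  # sp_values [2, 2] sum to 4, plus 1 * 2 gaps
    # 6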

    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis=0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def transpose(self, *axes):
        """
        Returns the SparseArray.
        """
        return self

    @property
    def T(self):
        """
        Returns the SparseArray.
        """
        return self

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if isinstance(sp_values, tuple):
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif is_scalar(sp_values):
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if type(result) is tuple:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)

    def __abs__(self):
        return np.abs(self)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    @classmethod
    def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]:
        def sparse_unary_method(self) -> "SparseArray":
            fill_value = op(np.array(self.fill_value)).item()
            values = op(self.sp_values)
            dtype = SparseDtype(values.dtype, fill_value)
            return cls._simple_new(values, self.sp_index, dtype)

        name = f"__{op.__name__}__"
        return compat.set_function_name(sparse_unary_method, name, cls)

    @classmethod
    def _create_arithmetic_method(cls, op):
        op_name = op.__name__

        @unpack_zerodim_and_defer(op_name)
        def sparse_arithmetic_method(self, other):

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)

            elif is_scalar(other):
                with np.errstate(all="ignore"):
                    fill = op(_get_fill(self), np.asarray(other))
                    result = op(self.sp_values, other)

                if op_name == "divmod":
                    left, right = result
                    lfill, rfill = fill
                    return (
                        _wrap_result(op_name, left, self.sp_index, lfill),
                        _wrap_result(op_name, right, self.sp_index, rfill),
                    )

                return _wrap_result(op_name, result, self.sp_index, fill)

            else:
                other = np.asarray(other)
                with np.errstate(all="ignore"):
                    # TODO: look into _wrap_result
                    if len(self) != len(other):
                        raise AssertionError(
                            f"length mismatch: {len(self)} vs. {len(other)}"
                        )
                    if not isinstance(other, SparseArray):
                        dtype = getattr(other, "dtype", None)
                        other = SparseArray(
                            other, fill_value=self.fill_value, dtype=dtype
                        )
                    return _sparse_array_op(self, other, op, op_name)

        name = f"__{op.__name__}__"
        return compat.set_function_name(sparse_arithmetic_method, name, cls)

    @classmethod
    def _create_comparison_method(cls, op):
        op_name = op.__name__
        if op_name in {"and_", "or_"}:
            op_name = op_name[:-1]

        @unpack_zerodim_and_defer(op_name)
        def cmp_method(self, other):

            if not is_scalar(other) and not isinstance(other, type(self)):
                # convert list-like to ndarray
                other = np.asarray(other)

            if isinstance(other, np.ndarray):
                # TODO: make this more flexible than just ndarray...
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                other = SparseArray(other, fill_value=self.fill_value)

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)
            else:
                with np.errstate(all="ignore"):
                    fill_value = op(self.fill_value, other)
                    result = op(self.sp_values, other)

                return type(self)(
                    result,
                    sparse_index=self.sp_index,
                    fill_value=fill_value,
                    dtype=np.bool_,
                )

        name = f"__{op.__name__}__"
        return compat.set_function_name(cmp_method, name, cls)

    @classmethod
    def _add_unary_ops(cls):
        cls.__pos__ = cls._create_unary_method(operator.pos)
        cls.__neg__ = cls._create_unary_method(operator.neg)
        cls.__invert__ = cls._create_unary_method(operator.invert)

    @classmethod
    def _add_comparison_ops(cls):
        cls.__and__ = cls._create_comparison_method(operator.and_)
        cls.__or__ = cls._create_comparison_method(operator.or_)
        cls.__xor__ = cls._create_arithmetic_method(operator.xor)
        super()._add_comparison_ops()

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


SparseArray._add_arithmetic_ops()
SparseArray._add_comparison_ops()
SparseArray._add_unary_ops()


def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = com.values_from_object(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # NumPy's element-wise equality does not take the element type
            # into account, e.g. 0, 0.0, and False compare equal, so we
            # have to check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
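# Illustrative sketch (editor's addition, not part of the original module):
# make_sparse splits a dense array into the non-fill values plus a sparse
# index describing where they live; the fill value defaults to
# na_value_for_dtype (0 for int64). Expected behavior, assuming numpy as np:
#
# >>> sp_values, sp_index, fill_value = make_sparse(
# ...     np.array([0, 0, 1, 2]), kind="integer"
# ... )
# >>> sp_values
# array([1, 2])
# >>> sp_index
# IntIndex
# Indices: array([2, 3], dtype=int32)
# >>> fill_value
# 0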


def _make_index(length, indices, kind):

    if kind == "block" or isinstance(kind, BlockIndex):
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer" or isinstance(kind, IntIndex):
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index