Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import functools 

2import itertools 

3import operator 

4from typing import Any, Optional, Tuple, Union 

5 

6import numpy as np 

7 

8from pandas._config import get_option 

9 

10from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib 

11from pandas.compat._optional import import_optional_dependency 

12 

13from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask 

14from pandas.core.dtypes.common import ( 

15 _get_dtype, 

16 is_any_int_dtype, 

17 is_bool_dtype, 

18 is_complex, 

19 is_datetime64_dtype, 

20 is_datetime64tz_dtype, 

21 is_datetime_or_timedelta_dtype, 

22 is_float, 

23 is_float_dtype, 

24 is_integer, 

25 is_integer_dtype, 

26 is_numeric_dtype, 

27 is_object_dtype, 

28 is_scalar, 

29 is_timedelta64_dtype, 

30 pandas_dtype, 

31) 

32from pandas.core.dtypes.dtypes import DatetimeTZDtype 

33from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna 

34 

# Soft-import bottleneck: do not raise if missing, only warn on an
# incompatible version; `bn` is None when unavailable.
bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn")
_BOTTLENECK_INSTALLED = bn is not None
# Whether to actually dispatch reductions to bottleneck;
# toggled via set_use_bottleneck() below.
_USE_BOTTLENECK = False

38 

39 

def set_use_bottleneck(v=True):
    """Enable/disable dispatching to bottleneck; no-op when it is not installed."""
    global _USE_BOTTLENECK
    if not _BOTTLENECK_INSTALLED:
        return
    _USE_BOTTLENECK = v

45 

46 

# Initialize the bottleneck switch from the pandas option at import time.
set_use_bottleneck(get_option("compute.use_bottleneck"))

48 

49 

class disallow:
    """
    Decorator class that raises TypeError when the wrapped reduction is
    called with any argument whose dtype matches one of the given dtypes.
    """

    def __init__(self, *dtypes):
        super().__init__()
        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)

    def check(self, obj) -> bool:
        # only dtype-bearing arguments can match a disallowed dtype
        return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)

    def __call__(self, f):
        @functools.wraps(f)
        def _f(*args, **kwargs):
            for obj in itertools.chain(args, kwargs.values()):
                if self.check(obj):
                    f_name = f.__name__.replace("nan", "")
                    raise TypeError(
                        f"reduction operation '{f_name}' not allowed for this dtype"
                    )
            try:
                with np.errstate(invalid="ignore"):
                    return f(*args, **kwargs)
            except ValueError as e:
                # we want to transform an object array
                # ValueError message to the more typical TypeError
                # e.g. this is normally a disallowed function on
                # object arrays that contain strings
                if is_object_dtype(args[0]):
                    raise TypeError(e)
                raise

        return _f

80 

81 

class bottleneck_switch:
    # Decorator: wrap a numpy-based reduction `alt` so that the same-named
    # bottleneck function is used instead when bottleneck is enabled and
    # safe for the input dtype.

    def __init__(self, name=None, **kwargs):
        # name: bottleneck function name, if different from alt.__name__
        # kwargs: default keyword arguments injected into every call
        self.name = name
        self.kwargs = kwargs

    def __call__(self, alt):
        bn_name = self.name or alt.__name__

        try:
            # `bn` is the (possibly missing) bottleneck module
            bn_func = getattr(bn, bn_name)
        except (AttributeError, NameError):  # pragma: no cover
            bn_func = None

        @functools.wraps(alt)
        def f(values, axis=None, skipna=True, **kwds):
            # fill in decorator-level default kwargs without clobbering
            # explicitly passed ones
            if len(self.kwargs) > 0:
                for k, v in self.kwargs.items():
                    if k not in kwds:
                        kwds[k] = v

            if values.size == 0 and kwds.get("min_count") is None:
                # We are empty, returning NA for our type
                # Only applies for the default `min_count` of None
                # since that affects how empty arrays are handled.
                # TODO(GH-18976) update all the nanops methods to
                # correctly handle empty inputs and remove this check.
                # It *may* just be `var`
                return _na_for_min_count(values, axis)

            if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
                if kwds.get("mask", None) is None:
                    # `mask` is not recognised by bottleneck, would raise
                    # TypeError if called
                    kwds.pop("mask", None)
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    # a mask was supplied: bottleneck cannot use it
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            else:
                result = alt(values, axis=axis, skipna=skipna, **kwds)

            return result

        return f

130 

131 

def _bn_ok_dtype(dt, name: str) -> bool:
    """Return True if bottleneck's `name` function may be used for dtype `dt`."""
    # Bottleneck chokes on datetime64
    if is_object_dtype(dt):
        return False
    if is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt):
        return False

    # GH 15507
    # bottleneck does not properly upcast during the sum
    # so can overflow
    #
    # GH 9422
    # further we also want to preserve NaN when all elements
    # are NaN, unlinke bottleneck/numpy which consider this
    # to be 0
    return name not in ("nansum", "nanprod")

151 

152 

153def _has_infs(result) -> bool: 

154 if isinstance(result, np.ndarray): 

155 if result.dtype == "f8": 

156 return lib.has_infs_f8(result.ravel()) 

157 elif result.dtype == "f4": 

158 return lib.has_infs_f4(result.ravel()) 

159 try: 

160 return np.isinf(result).any() 

161 except (TypeError, NotImplementedError): 

162 # if it doesn't support infs, then it can't have infs 

163 return False 

164 

165 

def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
    """ return the correct fill value for the dtype of the values """
    # an explicit fill value always wins
    if fill_value is not None:
        return fill_value

    if _na_ok_dtype(dtype):
        # NaN-capable dtype: NaN or the requested infinity
        if fill_value_typ is None:
            return np.nan
        return np.inf if fill_value_typ == "+inf" else -np.inf

    # integer-like dtype: iNaT stands in for NA
    if fill_value_typ == "+inf":
        # need the max int here
        return _int64_max
    return iNaT

187 

188 

def _maybe_get_mask(
    values: np.ndarray, skipna: bool, mask: Optional[np.ndarray]
) -> Optional[np.ndarray]:
    """
    Compute a NaN-mask only when one is actually needed.

    A precomputed mask is returned untouched. For boolean or integer
    dtypes — which cannot hold NaN — no mask is ever needed and None is
    returned. Otherwise a mask is computed with isna() when skipna is
    True; when skipna is False no new mask is computed.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    mask : Optional[ndarray]
        nan-mask if known

    Returns
    -------
    Optional[np.ndarray]
    """
    if mask is not None:
        return mask

    if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
        # Boolean data cannot contain nulls, so signal via mask being None
        return None

    if skipna:
        return isna(values)

    return None

232 

233 

def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Utility to get the values view, mask, dtype, dtype_max, and fill_value.

    If both mask and fill_value/fill_value_typ are not None and skipna is True,
    the values array will be copied.

    For input arrays of boolean or integer dtypes, copies will only occur if a
    precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : dtype
        dtype for values
    dtype_max : dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """

    # In _get_values is only called from within nanops, and in all cases
    # with scalar fill_value. This guarantee is important for the
    # maybe_upcast_putmask call below
    assert is_scalar(fill_value)

    # the mask must be computed before datetimelike values are viewed
    # as int64 (see below), otherwise the NaT positions would be lost
    mask = _maybe_get_mask(values, skipna, mask)

    if is_datetime64tz_dtype(values):
        # lib.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = lib.values_from_object(values)
        dtype = values.dtype

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    # only copy (and fill) when there is actually something to fill in
    copy = (mask is not None) and (fill_value is not None)

    if skipna and copy:
        values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value

327 

328 

329def _na_ok_dtype(dtype): 

330 # TODO: what about datetime64tz? PeriodDtype? 

331 return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) 

332 

333 

def _wrap_results(result, dtype, fill_value=None):
    """ wrap our results if needed """
    # For datetimelike dtypes the reduction was computed on an int64 view;
    # re-box scalars as Timestamp/Timedelta and view arrays back to `dtype`.

    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if fill_value is None:
            # GH#24293
            fill_value = iNaT
        if not isinstance(result, np.ndarray):
            tz = getattr(dtype, "tz", None)
            assert not isna(fill_value), "Expected non-null fill_value"
            if result == fill_value:
                # the fill value survived the reduction -> everything was NA
                result = np.nan
            result = Timestamp(result, tz=tz)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            if result == fill_value:
                result = np.nan

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = Timedelta(result, unit="ns")
        else:
            result = result.astype("m8[ns]").view(dtype)

    return result

363 

364 

def _na_for_min_count(values, axis: Optional[int]):
    """
    Return the missing value for `values`.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction, required if values.ndim > 1.

    Returns
    -------
    result : scalar or ndarray
        For 1-D values, returns a scalar of the correct missing type.
        For 2-D values, returns a 1-D array where each element is missing.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype("float64")
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1:
        return fill_value

    assert axis is not None  # assertion to make mypy happy
    # drop the reduced axis and fill the remaining shape with NA
    result_shape = values.shape[:axis] + values.shape[axis + 1 :]
    return np.full(result_shape, fill_value, dtype=values.dtype)

394 

395 

def nanany(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if any elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2])
    >>> nanops.nanany(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([np.nan])
    >>> nanops.nanany(s)
    False
    """
    # NaNs are filled with False so they can never count as truthy
    filled = _get_values(values, skipna, fill_value=False, mask=mask)[0]
    return filled.any(axis)

426 

427 

def nanall(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if all elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    # NaNs are filled with True so they can never fail the all()
    filled = _get_values(values, skipna, fill_value=True, mask=mask)[0]
    return filled.all(axis)

458 

459 

@disallow("M8")
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Sum the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    # NaNs contribute 0 to the sum
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )

    # accumulate in the original precision for floats, in float64 for
    # timedeltas, otherwise in the platform-independent max dtype
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    else:
        dtype_sum = dtype_max

    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)

    return _wrap_results(the_sum, dtype)

497 

498 

@disallow("M8", DatetimeTZDtype)
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
    """
    Compute the mean of the element along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    # NaNs contribute 0 to the sum; the count below excludes them
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )
    dtype_sum = dtype_max
    dtype_count = np.float64
    # accumulate in float64 for int/datetimelike input; float input keeps
    # its own precision for both the sum and the count
    if (
        is_integer_dtype(dtype)
        or is_timedelta64_dtype(dtype)
        or is_datetime64_dtype(dtype)
        or is_datetime64tz_dtype(dtype)
    ):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, "ndim", False):
        with np.errstate(all="ignore"):
            # suppress division by zero warnings
            the_mean = the_sum / count
        # all-NA slices have a zero count -> result is NaN, not inf
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)

555 

556 

@disallow("M8")
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
    """
    Compute the median along an axis, ignoring NaNs when skipna.

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """

    def get_median(x):
        # median of one 1-D slice; NaN if any NA present and not skipna
        mask = notna(x)
        if not skipna and not mask.all():
            return np.nan
        return np.nanmedian(x[mask])

    values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
    if not is_float_dtype(values):
        # cast so NA positions can be represented as NaN for nanmedian
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:

        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                return _wrap_results(
                    np.apply_along_axis(get_median, axis, values), dtype
                )

            # fastpath for the skipna case
            return _wrap_results(np.nanmedian(values, axis), dtype)

        # must return the correct shape, but median is not defined for the
        # empty set so return nans of shape "everything but the passed axis"
        # since "axis" is where the reduction would occur if we had a nonempty
        # array
        shp = np.array(values.shape)
        dims = np.arange(values.ndim)
        ret = np.empty(shp[dims != axis])
        ret.fill(np.nan)
        return _wrap_results(ret, dtype)

    # otherwise return a scalar value
    return _wrap_results(get_median(values) if notempty else np.nan, dtype)

625 

626 

def _get_counts_nanvar(
    value_counts: Tuple[int, ...],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    ddof: int,
    dtype=float,
) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
    """ Get the count of non-null values along an axis, accounting
    for degrees of freedom.

    Parameters
    ----------
    value_counts : Tuple[int, ...]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
        number of non-null observations
    d : scalar or array
        count minus ddof, i.e. the divisor used for the variance
    """
    dtype = _get_dtype(dtype)
    count = _get_counts(value_counts, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            # too few observations for the requested ddof
            count = np.nan
            d = np.nan
    else:
        mask2: np.ndarray = count <= ddof
        if mask2.any():
            np.putmask(d, mask2, np.nan)
            np.putmask(count, mask2, np.nan)
    return count, d

670 

671 

@disallow("M8")
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard deviation along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanstd(s)
    1.0
    """
    orig_dtype = values.dtype
    values, mask, _, _, _ = _get_values(values, skipna, mask=mask)

    # std is simply the square root of the variance
    var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
    return _wrap_results(np.sqrt(var), orig_dtype)

707 

708 

@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the variance along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = lib.values_from_object(values)
    dtype = values.dtype
    mask = _maybe_get_mask(values, skipna, mask)
    if is_any_int_dtype(values):
        # cast so NA positions can be represented as NaN
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)

    if skipna and mask is not None:
        # zero out NA positions so they do not contribute to the sums
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)
    sqr = _ensure_numeric((avg - values) ** 2)
    if mask is not None:
        np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)

776 

777 

@disallow("M8", "m8")
def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard error in the mean along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nansem(s)
    0.5773502691896258
    """

    # This checks if non-numeric-like data is passed with numeric_only=False
    # and raises a TypeError otherwise
    nanvar(values, axis, skipna, ddof=ddof, mask=mask)

    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")

    count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    # Fix: propagate `mask` here; previously it was dropped, so a
    # caller-supplied mask was ignored and the mask was recomputed.
    var = nanvar(values, axis, skipna, ddof=ddof, mask=mask)

    return np.sqrt(var) / np.sqrt(count)

820 

821 

def _nanminmax(meth, fill_value_typ):
    # Factory producing nanmin/nanmax: `meth` is "min" or "max" and
    # `fill_value_typ` the infinity used to neutralize NaN positions
    # (+inf for min, -inf for max).
    @bottleneck_switch(name="nan" + meth)
    def reduction(values, axis=None, skipna=True, mask=None):

        values, mask, dtype, dtype_max, fill_value = _get_values(
            values, skipna, fill_value_typ=fill_value_typ, mask=mask
        )

        if (axis is not None and values.shape[axis] == 0) or values.size == 0:
            # empty reduction: numpy would raise, so produce NaN(s) instead
            try:
                result = getattr(values, meth)(axis, dtype=dtype_max)
                result.fill(np.nan)
            except (AttributeError, TypeError, ValueError):
                result = np.nan
        else:
            result = getattr(values, meth)(axis)

        result = _wrap_results(result, dtype, fill_value)
        return _maybe_null_out(result, axis, mask, values.shape)

    return reduction


nanmin = _nanminmax("min", fill_value_typ="+inf")
nanmax = _nanminmax("max", fill_value_typ="-inf")

847 

848 

@disallow("O")
def nanargmax(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of max value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmax(s)
    4
    """
    # NaNs are filled with -inf so they can never win the argmax
    values, mask, _, _, _ = _get_values(
        values, True, fill_value_typ="-inf", mask=mask
    )
    return _maybe_arg_null_out(values.argmax(axis), axis, mask, skipna)

878 

879 

@disallow("O")
def nanargmin(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of min value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmin(s)
    0
    """
    # NaNs are filled with +inf so they can never win the argmin
    values, mask, _, _, _ = _get_values(
        values, True, fill_value_typ="+inf", mask=mask
    )
    return _maybe_arg_null_out(values.argmin(axis), axis, mask, skipna)

909 

910 

@disallow("M8", "m8")
def nanskew(values, axis=None, skipna=True, mask=None):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1,np.nan, 1, 2])
    >>> nanops.nanskew(s)
    1.7320508075688787
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        # zero out NA positions so they do not contribute to the moments
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        # re-zero NA positions: they picked up -mean from the subtraction
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_skew follow this behavior
    # to fix the fperr to treat m2 <1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid="ignore", divide="ignore"):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    # skewness is 0 for constant data, undefined for fewer than 3 obs
    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result

987 

988 

@disallow("M8", "m8")
def nankurt(values, axis=None, skipna=True, mask=None):
    """
    Compute the sample excess kurtosis

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1,np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        # zero out NA positions so they do not contribute to the moments
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        # re-zero NA positions: they picked up -mean from the subtraction
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid="ignore", divide="ignore"):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2 ** 2

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
    # to fix the fperr to treat denom <1e-14 as zero
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    with np.errstate(invalid="ignore", divide="ignore"):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    # kurtosis is 0 for constant data, undefined for fewer than 4 obs
    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result

1075 

1076 

@disallow("M8", "m8")
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Compute the product along the given axis, treating NaNs as 1.

    Parameters
    ----------
    values : ndarray[dtype]
    axis: int, optional
    skipna : bool, default True
    min_count: int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype
        The product of all elements on a given axis. ( NaNs are treated as 1)

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan])
    >>> nanops.nanprod(s)
    6.0
    """
    mask = _maybe_get_mask(values, skipna, mask)

    # Neutralize missing entries so they do not affect the product.
    if skipna and mask is not None:
        values = values.copy()
        values[mask] = 1

    prod = values.prod(axis)
    return _maybe_null_out(prod, axis, mask, values.shape, min_count=min_count)

1111 

1112 

1113def _maybe_arg_null_out( 

1114 result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool 

1115) -> Union[np.ndarray, int]: 

1116 # helper function for nanargmin/nanargmax 

1117 if mask is None: 

1118 return result 

1119 

1120 if axis is None or not getattr(result, "ndim", False): 

1121 if skipna: 

1122 if mask.all(): 

1123 result = -1 

1124 else: 

1125 if mask.any(): 

1126 result = -1 

1127 else: 

1128 if skipna: 

1129 na_mask = mask.all(axis) 

1130 else: 

1131 na_mask = mask.any(axis) 

1132 if na_mask.any(): 

1133 result[na_mask] = -1 

1134 return result 

1135 

1136 

def _get_counts(
    values_shape: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    dtype=float,
) -> Union[int, np.ndarray]:
    """ Get the count of non-null values along an axis

    Parameters
    ----------
    values_shape : Tuple[int]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    """
    dtype = _get_dtype(dtype)

    # Full reduction: one scalar count for the entire array.
    if axis is None:
        n = mask.size - mask.sum() if mask is not None else np.prod(values_shape)
        return dtype.type(n)

    # Axis reduction: count per slice along `axis`.
    if mask is None:
        count = values_shape[axis]
    else:
        count = mask.shape[axis] - mask.sum(axis)

    if is_scalar(count):
        return dtype.type(count)
    try:
        return count.astype(dtype)
    except AttributeError:
        # `count` has no .astype (e.g. plain int sequence); build an array.
        return np.array(count, dtype=dtype)

1179 

1180 

def _maybe_null_out(
    result: np.ndarray,
    axis: Optional[int],
    mask: Optional[np.ndarray],
    shape: Tuple,
    min_count: int = 1,
) -> np.ndarray:
    """
    Null out result entries that were computed from fewer than ``min_count``
    non-null values.

    Parameters
    ----------
    result : ndarray or scalar
        reduction result to (possibly) null out
    axis : Optional[int]
        axis the reduction was taken along
    mask : Optional[ndarray[bool]]
        locations in the original values that were missing
    shape : Tuple
        shape of the original values; used for the count when mask is None
    min_count : int, default 1
        minimum number of non-null contributors required to keep a result

    Returns
    -------
    result : ndarray or scalar with insufficient-count entries set to NA
    """
    if mask is not None and axis is not None and getattr(result, "ndim", False):
        # Per-slice non-null count minus min_count; negative means too few.
        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
        if np.any(null_mask):
            if is_numeric_dtype(result):
                # Upcast so the array can actually hold NaN.
                if np.iscomplexobj(result):
                    result = result.astype("c16")
                else:
                    result = result.astype("f8")
                result[null_mask] = np.nan
            else:
                # GH12941, use None to auto cast null
                result[null_mask] = None
    elif result is not NaT:
        # Scalar result: here null_mask is the total non-null count.
        if mask is not None:
            null_mask = mask.size - mask.sum()
        else:
            null_mask = np.prod(shape)
        if null_mask < min_count:
            result = np.nan

    return result

1209 

1210 

1211def _zero_out_fperr(arg): 

1212 # #18044 reference this behavior to fix rolling skew/kurt issue 

1213 if isinstance(arg, np.ndarray): 

1214 with np.errstate(invalid="ignore"): 

1215 return np.where(np.abs(arg) < 1e-14, 0, arg) 

1216 else: 

1217 return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg 

1218 

1219 

@disallow("M8", "m8")
def nancorr(a, b, method="pearson", min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError("Operands to nancorr must have same size")

    min_periods = 1 if min_periods is None else min_periods

    # Drop positions where either operand is missing.
    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return get_corr_func(method)(a, b)

1241 

1242 

def get_corr_func(method):
    """
    Map a correlation method name to a function of two ndarrays.

    Parameters
    ----------
    method : {'pearson', 'kendall', 'spearman'} or callable
        A callable is returned unchanged; 'kendall' and 'spearman'
        require scipy.

    Returns
    -------
    func : callable
        Function of two 1d arrays returning the correlation statistic.

    Raises
    ------
    ValueError
        If ``method`` is not a recognized name and not callable.
    """
    if callable(method):
        return method

    if method == "pearson":

        def _pearson(a, b):
            return np.corrcoef(a, b)[0, 1]

        return _pearson

    if method == "kendall":
        # Lazy import: scipy is only needed for this method.
        from scipy.stats import kendalltau

        def _kendall(a, b):
            # kendalltau returns a tuple of the tau statistic and pvalue
            rs = kendalltau(a, b)
            return rs[0]

        return _kendall

    if method == "spearman":
        from scipy.stats import spearmanr

        def _spearman(a, b):
            return spearmanr(a, b)[0]

        return _spearman

    # BUG FIX: the old message had a typo ("Unkown") and omitted
    # 'pearson' and callables from the list of accepted values.
    raise ValueError(
        f"Unknown method '{method}', expected one of "
        "'pearson', 'kendall', 'spearman' or a callable"
    )

1268 

1269 

@disallow("M8", "m8")
def nancov(a, b, min_periods=None):
    """Covariance of two same-length ndarrays, ignoring missing pairs."""
    if len(a) != len(b):
        raise AssertionError("Operands to nancov must have same size")

    min_periods = 1 if min_periods is None else min_periods

    # Keep only positions where both operands are present.
    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]

1287 

1288 

def _ensure_numeric(x):
    """
    Coerce ``x`` (ndarray or scalar) to a numeric type, raising TypeError
    if no numeric interpretation exists.
    """
    if isinstance(x, np.ndarray):
        if is_integer_dtype(x) or is_bool_dtype(x):
            x = x.astype(np.float64)
        elif is_object_dtype(x):
            # Try the widest numeric type first; fall back to float.
            try:
                x = x.astype(np.complex128)
            except (TypeError, ValueError):
                x = x.astype(np.float64)
            else:
                # Complex cast succeeded; drop a spurious imaginary part.
                if not np.any(np.imag(x)):
                    x = x.real
    elif not (is_float(x) or is_integer(x) or is_complex(x)):
        # Non-numeric scalar: attempt float, then complex conversion.
        try:
            x = float(x)
        except ValueError:
            # e.g. "1+1j" or "foo"
            try:
                x = complex(x)
            except ValueError:
                # e.g. "foo"
                raise TypeError(f"Could not convert {x} to numeric")
    return x

1312 

1313 

1314# NA-friendly array comparisons 

1315 

1316 

def make_nancomp(op):
    """
    Wrap comparison operator ``op`` so that positions where either operand
    is missing come out as NaN instead of the raw comparison result.
    """

    def f(x, y):
        # Missing in either operand -> missing in the result.
        mask = isna(x) | isna(y)

        with np.errstate(all="ignore"):
            result = op(x, y)

        if mask.any():
            # Boolean arrays cannot hold NaN; upcast to object first.
            if is_bool_dtype(result):
                result = result.astype("O")
            np.putmask(result, mask, np.nan)

        return result

    return f

1334 

1335 

# NA-propagating comparison wrappers around the standard operators.
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)

1342 

1343 

def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]; drop the missing entries.
    values = values[~mask]

    if len(values) > 0:
        return np.percentile(values, q, interpolation=interpolation)

    # Nothing left to take a percentile of: return NA (one per quantile).
    if lib.is_scalar(q):
        return na_value
    return np.array([na_value] * len(q), dtype=values.dtype)

1373 

1374 

def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if values.dtype.kind in ["m", "M"]:
        # datetime64/timedelta64: recurse on the i8 view,
        # need to cast to integer to avoid rounding errors in numpy
        result = nanpercentile(
            values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation
        )

        # Note: we have to do `astype` and not view because in general we
        # have float result at this point, not i8
        return result.astype(values.dtype)

    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(
                values, mask, q, na_value, interpolation=interpolation
            )
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                # Transpose so we can iterate row-wise below.
                values = values.T
                mask = mask.T
            # Apply the 1d helper to each row with its own mask.
            result = [
                _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
                for (val, m) in zip(list(values), list(mask))
            ]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        # No missing values: defer directly to numpy.
        return np.percentile(values, q, axis=axis, interpolation=interpolation)