Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from typing import TYPE_CHECKING 

2 

3import numpy as np 

4 

5from pandas._libs import lib, missing as libmissing 

6 

7from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype 

8from pandas.core.dtypes.missing import isna, notna 

9 

10from pandas.core.algorithms import take 

11from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin 

12from pandas.core.indexers import check_array_indexer 

13 

14if TYPE_CHECKING: 

15 from pandas._typing import Scalar 

16 

17 

class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based
    """

    # Underlying values; entries at masked positions are placeholders.
    _data: np.ndarray
    # Boolean array of the same length as ``_data``; True marks a missing value.
    _mask: np.ndarray

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: "Scalar"

    def __getitem__(self, item):
        """
        Select a single element or a sub-array.

        An integer ``item`` returns the stored scalar, or ``self.dtype.na_value``
        when that position is masked. Any other indexer is validated with
        ``check_array_indexer`` and applied to both the data and the mask,
        returning a new array of the same type.
        """
        if is_integer(item):
            if self._mask[item]:
                return self.dtype.na_value
            return self._data[item]

        item = check_array_indexer(self, item)

        return type(self)(self._data[item], self._mask[item])

    def __iter__(self):
        """
        Iterate over the elements, yielding ``self.dtype.na_value`` for
        masked positions and the stored value otherwise.
        """
        for value, is_missing in zip(self._data, self._mask):
            if is_missing:
                yield self.dtype.na_value
            else:
                yield value

    def __len__(self) -> int:
        """Return the number of elements (the length of the data array)."""
        return len(self._data)

    def __invert__(self):
        """
        Return a new array of the same type holding the inverted data.

        The mask is copied so that the result does not share its mask
        buffer with ``self``; mutating one array must not affect the other.
        """
        return type(self)(~self._data, self._mask.copy())

    def to_numpy(
        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
    ):
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary. A no-copy result
            is typically only possible when no missing values are present
            and `dtype` is the equivalent numpy dtype.
        na_value : scalar, optional
            Scalar missing value indicator to use in numpy array. Defaults
            to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Raises
        ------
        ValueError
            If missing values are present, `dtype` is neither an object nor
            a string dtype, and `na_value` is left as :attr:`NA`.

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, NA], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, NA]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            if (
                not (is_object_dtype(dtype) or is_string_dtype(dtype))
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data

    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us

    def __array__(self, dtype=None) -> np.ndarray:
        """
        the array interface, return my values
        We return an object array here to preserve our scalar values
        """
        return self.to_numpy(dtype=dtype)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        return pa.array(self._data, mask=self._mask, type=type)

    @property
    def _hasna(self) -> bool:
        """Whether any position in the mask is set."""
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..
        return self._mask.any()

    def isna(self):
        """
        Return a boolean array that is True where values are missing.

        A copy of the internal mask is returned so that callers mutating
        the result cannot corrupt this array's state.
        """
        return self._mask.copy()

    @property
    def _na_value(self):
        """The missing-value scalar defined by this array's dtype."""
        return self.dtype.na_value

    @property
    def nbytes(self):
        """Combined byte size of the data array and the mask array."""
        return self._data.nbytes + self._mask.nbytes

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate several arrays of this type by concatenating their
        data arrays and their masks separately.
        """
        data = np.concatenate([x._data for x in to_concat])
        mask = np.concatenate([x._mask for x in to_concat])
        return cls(data, mask)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements at the positions given by ``indexer``.

        Parameters
        ----------
        indexer : sequence of int
            Positions to take. When ``allow_fill`` is True, ``-1`` marks a
            position to be filled.
        allow_fill : bool, default False
            Passed through to ``pandas.core.algorithms.take`` for both the
            data and the mask.
        fill_value : scalar, optional
            Value for filled positions. If it is NA, filled positions are
            marked missing in the resulting mask.

        Returns
        -------
        Same type as ``self``.
        """
        # for an NA fill_value we fill the data with '_internal_fill_value'
        # to avoid upcasting; the mask marks those positions as missing
        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
        result = take(
            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
        )

        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

        # if we are filling
        # we only fill where the indexer is null
        # not existing missing values
        # TODO(jreback) what if we have a non-na float as a fill value?
        if allow_fill and notna(fill_value):
            fill_mask = np.asarray(indexer) == -1
            result[fill_mask] = fill_value
            # filled positions were taken with mask=True above; XOR clears them
            mask = mask ^ fill_mask

        return type(self)(result, mask, copy=False)

    def copy(self):
        """Return a new array of the same type with copied data and mask."""
        data, mask = self._data, self._mask
        data = data.copy()
        mask = mask.copy()
        return type(self)(data, mask, copy=False)

    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import Index, Series
        from pandas.arrays import IntegerArray

        # compute counts on the data with no nans
        data = self._data[~self._mask]
        value_counts = Index(data).value_counts()

        # TODO(extension)
        # if we have allow Index to hold an ExtensionArray
        # this is easier
        index = value_counts.index.values.astype(object)

        # if we want nans, count the mask
        if dropna:
            counts = value_counts.values
        else:
            # one extra trailing slot holds the number of missing values
            counts = np.empty(len(value_counts) + 1, dtype="int64")
            counts[:-1] = value_counts
            counts[-1] = self._mask.sum()

            index = Index(
                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
                dtype=object,
            )

        # the counts themselves are never missing
        mask = np.zeros(len(counts), dtype="bool")
        counts = IntegerArray(counts, mask)

        return Series(counts, index=index)