Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Extend pandas with custom array types""" 

2from typing import Any, List, Optional, Tuple, Type 

3 

4import numpy as np 

5 

6from pandas.errors import AbstractMethodError 

7 

8from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries 

9 

10 

class ExtensionDtype:
    """
    A custom data type, to be paired with an ExtensionArray.

    .. versionadded:: 0.23.0

    See Also
    --------
    extensions.register_extension_dtype
    extensions.ExtensionArray

    Notes
    -----
    The interface includes the following abstract methods that must
    be implemented by subclasses:

    * type
    * name
    * construct_from_string

    The following attributes influence the behavior of the dtype in
    pandas operations:

    * _is_numeric
    * _is_boolean

    Optionally one can override construct_array_type for construction
    with the name of this dtype via the Registry. See
    :meth:`extensions.register_extension_dtype`.

    * construct_array_type

    The `na_value` class attribute can be used to set the default NA value
    for this type. :attr:`numpy.nan` is used by default.

    ExtensionDtypes are required to be hashable. The base class provides
    a default implementation, which relies on the ``_metadata`` class
    attribute. ``_metadata`` should be a tuple containing the strings
    that define your data type. For example, with ``PeriodDtype`` that's
    the ``freq`` attribute.

    **If you have a parametrized dtype you should set the ``_metadata``
    class property**.

    Ideally, the attributes in ``_metadata`` will match the
    parameters to your ``ExtensionDtype.__init__`` (if any). If any of
    the attributes in ``_metadata`` don't implement the standard
    ``__eq__`` or ``__hash__``, the default implementations here will not
    work.

    .. versionchanged:: 0.24.0

       Added ``_metadata``, ``__hash__``, and changed the default definition
       of ``__eq__``.

    For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
    can be implemented: this method receives a pyarrow Array or ChunkedArray
    as only argument and is expected to return the appropriate pandas
    ExtensionArray for this dtype and the passed values::

        class ExtensionDtype:

            def __from_arrow__(
                self, array: pyarrow.Array/ChunkedArray
            ) -> ExtensionArray:
                ...

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.
    """

    # Names of the instance attributes that parametrize the dtype. The default
    # ``__eq__`` and ``__hash__`` below are computed from exactly these.
    _metadata: Tuple[str, ...] = ()

    def __str__(self) -> str:
        """Return the dtype's ``name``."""
        return self.name

    def __eq__(self, other: Any) -> bool:
        """
        Check whether 'other' is equal to self.

        By default, 'other' is considered equal if either

        * it's a string matching 'self.name'.
        * it's an instance of this type and all of
          the attributes in ``self._metadata`` are equal between
          `self` and `other`.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            # Turn the string into a dtype instance first so that the
            # attribute-wise comparison below handles both cases.
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
        if isinstance(other, type(self)):
            return all(
                getattr(self, attr) == getattr(other, attr) for attr in self._metadata
            )
        return False

    def __hash__(self) -> int:
        """Hash the tuple of attribute values named in ``_metadata``."""
        return hash(tuple(getattr(self, attr) for attr in self._metadata))

    def __ne__(self, other: Any) -> bool:
        """Return the negation of ``__eq__``."""
        return not self.__eq__(other)

    @property
    def na_value(self):
        """
        Default NA value to use for this type.

        This is used in e.g. ExtensionArray.take. This should be the
        user-facing "boxed" version of the NA value, not the physical NA value
        for storage.  e.g. for JSONArray, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> Type:
        """
        The scalar type for the array, e.g. ``int``

        It's expected ``ExtensionArray[item]`` returns an instance
        of ``ExtensionDtype.type`` for scalar ``item``, assuming
        that value is valid (not NA). NA values do not need to be
        instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        This should match the NumPy dtype used when the array is
        converted to an ndarray, which is probably 'O' for object if
        the extension type cannot be represented as a built-in NumPy
        type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> Optional[List[str]]:
        """
        Ordered list of field names, or None if there are no fields.

        This is for compatibility with NumPy arrays, and may be removed in the
        future.
        """
        return None

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses override this method.
        """
        raise NotImplementedError

    @classmethod
    def construct_from_string(cls, string: str):
        r"""
        Construct this type from a string.

        This is useful mainly for data types that accept parameters.
        For example, a period dtype accepts a frequency parameter that
        can be set as ``period[H]`` (where H means hourly frequency).

        By default, in the abstract class, just the name of the type is
        expected. But subclasses can overwrite this method to accept
        parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(f"Expects a string, got {type(string).__name__}")

        # error: Non-overlapping equality check (left operand type: "str", right
        #  operand type: "Callable[[ExtensionDtype], str]")  [comparison-overlap]
        # The default implementation only works when ``name`` is a plain string
        # on the class (not a property), hence the check on ``cls.name``.
        assert isinstance(cls.name, str), (cls, type(cls.name))
        if string != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def is_dtype(cls, dtype) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``cls.construct_from_string(dtype)`` is an instance
           of ``cls``.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        # Unwrap array-likes: compare against their dtype, not the container.
        dtype = getattr(dtype, "dtype", dtype)

        if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)):
            # https://github.com/pandas-dev/pandas/issues/22960
            # avoid passing data to `construct_from_string`. This could
            # cause a FutureWarning from numpy about failing elementwise
            # comparison from, e.g., comparing DataFrame == 'category'.
            return False
        elif dtype is None:
            return False
        elif isinstance(dtype, cls):
            return True
        if isinstance(dtype, str):
            try:
                return cls.construct_from_string(dtype) is not None
            except TypeError:
                return False
        return False

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        By default ExtensionDtypes are assumed to be non-numeric.
        They'll be excluded from operations that exclude non-numeric
        columns, like (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        By default, ExtensionDtypes are assumed to be non-boolean.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False