Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/arrays/sparse/dtype.py : 29%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Sparse Dtype"""
3import re
4from typing import Any, Tuple
6import numpy as np
8from pandas._typing import Dtype
10from pandas.core.dtypes.base import ExtensionDtype
11from pandas.core.dtypes.cast import astype_nansafe
12from pandas.core.dtypes.common import (
13 is_bool_dtype,
14 is_object_dtype,
15 is_scalar,
16 is_string_dtype,
17 pandas_dtype,
18)
19from pandas.core.dtypes.dtypes import register_extension_dtype
20from pandas.core.dtypes.missing import isna, na_value_for_dtype
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Raises
    ------
    ValueError
        If `fill_value` is not a scalar.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):

        # Copy-construct from another SparseDtype: reuse its subtype and,
        # unless the caller overrides it, its fill value.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        # String subtypes are stored as object dtype.
        if is_string_dtype(dtype):
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        """
        Compare with another SparseDtype, or with a string that parses to one.

        Two SparseDtypes are equal when their subtypes are equal and their
        fill values match. NA fill values match only when both are NA and
        one is an instance of the other's type.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                #   SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                #   SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The parentheses matter: ``and`` binds tighter than ``or``,
                # so without them a non-NA fill value of a compatible type
                # (e.g. 0.0 against nan) would compare equal, and the
                # comparison would not be symmetric.
                fill_value = other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value))
                    or isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether the fill value is considered missing by ``isna``."""
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """Whether the subtype is anything other than object dtype."""
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """Whether the subtype is a boolean dtype."""
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """
        The ``kind`` attribute of the subtype (``self.subtype.kind``).
        """
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` attribute of the subtype (``self.subtype.type``)."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the underlying array storing the non-fill values."""
        return self._dtype

    @property
    def name(self):
        """String form of the dtype: ``Sparse[<subtype name>, <fill value>]``."""
        return f"Sparse[{self.subtype.name}, {self.fill_value}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported here rather than at module level.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Can take the following forms.

            ============================ ==============================
            string                       dtype
            ============================ ==============================
            'Sparse'                     SparseDtype[np.float64, nan]
            'Sparse[int]'                SparseDtype[np.int64, 0]
            'Sparse[subtype, fill]'      accepted only if identical to
                                         ``str()`` of the default dtype
            ============================ ==============================

            It is not possible to specify non-default fill values
            with a string. When a fill value is present, the whole string
            is compared against ``str()`` of the dtype built with the
            default fill value, so an argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` is not a ``str``, does not start with ``'Sparse'``,
            cannot be parsed, or carries a fill value that does not match
            the default for the subtype.
        """
        # Reject non-strings with TypeError so that callers which probe
        # dtypes by catching TypeError do not see an AttributeError from
        # ``.startswith`` instead.
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if not string.startswith("Sparse"):
            raise TypeError(msg)

        try:
            sub_type, has_fill_value = cls._parse_subtype(string)
        except ValueError as err:
            raise TypeError(msg) from err

        result = SparseDtype(sub_type)
        if has_fill_value and str(result) != string:
            raise TypeError(
                f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
                "looks like the fill_value in the string is not "
                "the default for the dtype. Non-default fill_values "
                "are not supported. Use the 'SparseDtype()' "
                "constructor instead."
            )
        return result

    @staticmethod
    def _parse_subtype(dtype: str) -> Tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse
            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text of the subtype; ``"float64"`` for a bare ``"Sparse"``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            groups = m.groupdict()
            subtype = groups["subtype"]
            has_fill_value = bool(groups["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` is accepted as a sparse dtype.

        Parameters
        ----------
        dtype : object
            A dtype, a string, or an object exposing a ``dtype`` attribute
            (in which case that attribute is inspected instead).

        Returns
        -------
        bool
        """
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            # NOTE(review): a string that starts with "Sparse" but cannot be
            # parsed lets the ValueError from _parse_subtype propagate.
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): as written, any numpy dtype passed in directly is
        # also accepted here - confirm this is intended.
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype):
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).


        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            # Cast the fill value through a 0-d array, then unwrap it back
            # to a scalar with ``.item()``.
            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        type or numpy.dtype
            ``type(self.fill_value)`` when the fill value is a ``str``,
            otherwise ``self.subtype``.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        str
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype