Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import operator 

2from typing import Type 

3 

4import numpy as np 

5 

6from pandas._libs import lib, missing as libmissing 

7 

8from pandas.core.dtypes.base import ExtensionDtype 

9from pandas.core.dtypes.common import pandas_dtype 

10from pandas.core.dtypes.dtypes import register_extension_dtype 

11from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries 

12from pandas.core.dtypes.inference import is_array_like 

13 

14from pandas import compat 

15from pandas.core import ops 

16from pandas.core.arrays import PandasArray 

17from pandas.core.construction import extract_array 

18from pandas.core.indexers import check_array_indexer 

19from pandas.core.missing import isna 

20 

21 

@register_extension_dtype
class StringDtype(ExtensionDtype):
    """
    Extension dtype for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

       In particular, StringDtype.na_value may change to no longer be
       ``pandas.NA``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    StringDtype
    """

    name = "string"

    #: StringDtype.na_value uses pandas.NA
    na_value = libmissing.NA

    @property
    def type(self) -> Type:
        """Return the scalar type of the elements, which is ``str``."""
        return str

    @classmethod
    def construct_array_type(cls) -> "Type[StringArray]":
        """Return the array class paired with this dtype (``StringArray``)."""
        return StringArray

    def __repr__(self) -> str:
        return "StringDtype"

    def __from_arrow__(self, array):
        """
        Construct StringArray from passed pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            Anything that is not a ``pyarrow.Array`` is treated as a
            chunked array and must expose a ``chunks`` attribute.

        Returns
        -------
        StringArray
            The chunks converted one by one and concatenated; an empty
            StringArray when there are no chunks at all.
        """
        import pyarrow

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        # using _from_sequence to ensure None is converted to NA
        results = [StringArray._from_sequence(np.array(arr)) for arr in chunks]

        if not results:
            # A chunked array may contain zero chunks, in which case there is
            # nothing to concatenate; build the empty result directly.
            return StringArray(np.array([], dtype="object"))

        return StringArray._concat_same_type(results)

84 

85 

class StringArray(PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings or :attr:`pandas.NA`.
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
    values.

    >>> pd.array(['1', 1], dtype="string")
    Traceback (most recent call last):
    ...
    ValueError: StringArray requires a sequence of strings or pandas.NA

    For comparison methods, this returns a :class:`pandas.BooleanArray`

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False):
        values = extract_array(values)
        # Validation is skipped when the input is already an instance of
        # this class.
        skip_validation = isinstance(values, type(self))

        super().__init__(values, copy=copy)
        self._dtype = StringDtype()
        if not skip_validation:
            self._validate()

    def _validate(self):
        """
        Validate that we only store NA or strings.

        Raises
        ------
        ValueError
            If the non-empty data fails ``lib.is_string_array`` or the
            underlying ndarray is not of object dtype.
        """
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build a StringArray from a sequence, standardizing missing values.

        Parameters
        ----------
        scalars : sequence
            Values to store; converted to an object-dtype ndarray.
        dtype : optional
            When given (truthy) it must compare equal to ``"string"``.
        copy : bool, default False
            Copy the data when the conversion returned ``scalars`` itself.

        Returns
        -------
        StringArray
        """
        if dtype:
            assert dtype == "string"

        result = np.asarray(scalars, dtype="object")
        if copy and result is scalars:
            result = result.copy()

        # Standardize all missing-like values to NA
        # TODO: it would be nice to do this in _validate / lib.is_string_array
        # We are already doing a scan over the values there.
        na_values = isna(result)
        if na_values.any():
            if result is scalars:
                # force a copy now, if we haven't already
                result = result.copy()
            result[na_values] = StringDtype.na_value

        return cls(result)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Build a StringArray from strings by delegating to ``_from_sequence``."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        Parameters
        ----------
        type : pyarrow type, optional
            Target type; ``pyarrow.string()`` is used when omitted.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # Work on a copy so the stored data keeps its NA values.
        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """
        Return a copy of the data with missing entries replaced by ``-1``.

        Returns
        -------
        tuple
            ``(values, -1)``, where ``-1`` is the value written at the
            missing positions.
        """
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = -1
        return arr, -1

    def __setitem__(self, key, value):
        """
        Set one or more elements, accepting only strings and missing values.

        Raises
        ------
        ValueError
            If a scalar key is paired with a non-scalar value, if a scalar
            value is neither missing nor a ``str``, or if an array value
            fails ``lib.is_string_array``.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                # Any missing-like scalar is stored as the dtype's NA value.
                value = StringDtype.na_value
            elif not isinstance(value, str):
                raise ValueError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

        super().__setitem__(key, value)

    def fillna(self, value=None, method=None, limit=None):
        """Fill missing values by delegating to the parent implementation."""
        # TODO: validate dtype
        return super().fillna(value, method, limit)

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        A StringDtype target returns this array (or a copy of it when
        ``copy`` is true); every other target is handled by the parent class.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, StringDtype):
            if copy:
                return self.copy()
            return self
        return super().astype(dtype, copy)

    def _reduce(self, name, skipna=True, **kwargs):
        """Reject every reduction: always raises ``TypeError``."""
        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def value_counts(self, dropna=False):
        """
        Count the values of the underlying ndarray.

        The result of ``pandas.value_counts`` is cast to ``"Int64"``.
        """
        from pandas import value_counts

        return value_counts(self._ndarray, dropna=dropna).astype("Int64")

    # Override parent because we have different return types.
    @classmethod
    def _create_arithmetic_method(cls, op):
        """
        Build the dunder method implementing ``op`` for this class.

        Arithmetic operators produce a StringArray and comparison operators
        a BooleanArray; positions where either operand is missing are
        masked out of the computation.
        """

        # Note: this handles both arithmetic and comparison methods.
        def method(self, other):
            from pandas.arrays import BooleanArray

            assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS

            if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
                return NotImplemented

            elif isinstance(other, cls):
                other = other._ndarray

            other_is_scalar = lib.is_scalar(other)
            if not other_is_scalar and len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                # Checked before the masks are combined so that a length
                # mismatch is reported through this message.
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            mask = isna(self) | isna(other)
            valid = ~mask

            if not other_is_scalar:
                other = np.asarray(other)
                other = other[valid]

            if op.__name__ in ops.ARITHMETIC_BINOPS:
                result = np.empty_like(self._ndarray, dtype="object")
                result[mask] = StringDtype.na_value
                result[valid] = op(self._ndarray[valid], other)
                return StringArray(result)
            else:
                # logical
                result = np.zeros(len(self._ndarray), dtype="bool")
                result[valid] = op(self._ndarray[valid], other)
                return BooleanArray(result, mask)

        return compat.set_function_name(method, f"__{op.__name__}__", cls)

    @classmethod
    def _add_arithmetic_ops(cls):
        """Install the supported arithmetic operators: ``+`` and ``*``."""
        cls.__add__ = cls._create_arithmetic_method(operator.add)
        cls.__radd__ = cls._create_arithmetic_method(ops.radd)

        cls.__mul__ = cls._create_arithmetic_method(operator.mul)
        cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)

    # Comparison methods are built by the same factory as arithmetic ones.
    _create_comparison_method = _create_arithmetic_method

320 

321 

# Attach the operator dunder methods to StringArray at import time.
# NOTE(review): _add_comparison_ops is not defined in this file; presumably
# it is inherited from a parent class -- confirm.
StringArray._add_arithmetic_ops()
StringArray._add_comparison_ops()