Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" define extension dtypes """ 

2import re 

3from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast 

4 

5import numpy as np 

6import pytz 

7 

8from pandas._libs.interval import Interval 

9from pandas._libs.tslibs import NaT, Period, Timestamp, timezones 

10from pandas._typing import Ordered 

11 

12from pandas.core.dtypes.base import ExtensionDtype 

13from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass 

14from pandas.core.dtypes.inference import is_bool, is_list_like 

15 

# Alias for the builtin ``str``. The dtype classes in this module define a
# class attribute that is itself named ``str``, so annotations written inside
# those class bodies use this alias to refer to the builtin type.
str_type = str

17 

18 

def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]:
    """
    Class decorator that adds an ExtensionDtype subclass to the registry.

    .. versionadded:: 0.24.0

    Once registered, the dtype can be resolved from its string name, which
    enables operations like ``.astype(name)``.

    Parameters
    ----------
    cls : type
        The ExtensionDtype subclass to register.

    Returns
    -------
    type
        ``cls`` itself, so the function can be applied as a decorator.

    Examples
    --------
    >>> from pandas.api.extensions import register_extension_dtype
    >>> from pandas.api.extensions import ExtensionDtype
    >>> @register_extension_dtype
    ... class MyExtensionDtype(ExtensionDtype):
    ...     pass
    """
    # The module-level registry validates and stores the class.
    registry.register(cls)
    return cls

43 

44 

class Registry:
    """
    Registry for dtype inference.

    The registry allows one to map a string repr of a extension
    dtype to an extension dtype. The string alias can be used in several
    places, including

    * Series and Index constructors
    * :meth:`pandas.array`
    * :meth:`pandas.Series.astype`

    Multiple extension types can be registered.
    These are tried in order.
    """

    def __init__(self):
        # Registered dtype classes, in registration order.
        self.dtypes: List[Type[ExtensionDtype]] = []

    def register(self, dtype: Type[ExtensionDtype]) -> None:
        """
        Add an ExtensionDtype subclass to the registry.

        Parameters
        ----------
        dtype : ExtensionDtype
            The dtype class (not an instance) to register.

        Raises
        ------
        ValueError
            If ``dtype`` is not a subclass of ExtensionDtype.
        """
        # issubclass() itself raises TypeError when handed a non-class, so
        # check for a class first and report every invalid input uniformly.
        if not (isinstance(dtype, type) and issubclass(dtype, ExtensionDtype)):
            raise ValueError("can only register pandas extension dtypes")

        self.dtypes.append(dtype)

    def find(
        self, dtype: Union[Type[ExtensionDtype], str]
    ) -> Optional[Type[ExtensionDtype]]:
        """
        Look up an extension dtype.

        Parameters
        ----------
        dtype : Type[ExtensionDtype] or str

        Returns
        -------
        For a non-string argument: the argument itself when it is an
        ExtensionDtype class or instance, otherwise None.
        For a string: the result of ``construct_from_string`` of the first
        registered dtype that accepts it, otherwise None.
        """
        if not isinstance(dtype, str):
            # Accept both classes and instances: compare on the class.
            dtype_type = dtype
            if not isinstance(dtype, type):
                dtype_type = type(dtype)
            if issubclass(dtype_type, ExtensionDtype):
                return dtype

            return None

        for dtype_type in self.dtypes:
            try:
                return dtype_type.construct_from_string(dtype)
            except TypeError:
                # This dtype does not recognise the string; try the next one.
                pass

        return None

103 

104 

# Module-level Registry instance; register_extension_dtype adds classes to it.
registry = Registry()

106 

107 

class PandasExtensionDtype(ExtensionDtype):
    """
    Duck-typed stand-in for ``np.dtype`` that can carry a custom dtype.

    THIS IS NOT A REAL NUMPY DTYPE
    """

    type: Any
    kind: Any
    # ``type`` and ``kind`` are annotated as Any only because mypy seems to
    # have trouble with multiple inheritance from PandasExtensionDtype and
    # ExtensionDtype's @properties in the subclasses below. Those subclasses
    # give both variables explicit types.
    subdtype = None
    str: Optional[str_type] = None
    num = 100
    shape: Tuple[int, ...] = tuple()
    itemsize = 8
    base = None
    isbuiltin = 0
    isnative = 0
    _cache: Dict[str_type, "PandasExtensionDtype"] = {}

    def __str__(self) -> str_type:
        """
        Return the dtype's ``name``.
        """
        return self.name

    def __repr__(self) -> str_type:
        """
        Return the same text as ``str(self)``.
        """
        return str(self)

    def __hash__(self) -> int:
        # Deliberately not implemented here; each subclass supplies its own.
        raise NotImplementedError("sub-classes should implement an __hash__ method")

    def __getstate__(self) -> Dict[str_type, Any]:
        # pickle support: only the fields named in _metadata are stored,
        # we don't want to pickle the cache
        state: Dict[str_type, Any] = {}
        for field in self._metadata:
            state[field] = getattr(self, field, None)
        return state

    @classmethod
    def reset_cache(cls) -> None:
        """ replace the class-level cache with a new empty dict """
        cls._cache = {}

154 

155 

class CategoricalDtypeType(type):
    """
    Metaclass serving as the ``type`` of CategoricalDtype.

    It adds nothing to ``type``; this metaclass determines subclass ability.
    """

162 

163 

@register_extension_dtype
class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
    """
    Type for categorical data with the categories and orderedness.

    .. versionchanged:: 0.21.0

    Parameters
    ----------
    categories : sequence, optional
        Must be unique, and must not contain any nulls.
    ordered : bool or None, default False
        Whether or not this categorical is treated as a ordered categorical.
        None can be used to maintain the ordered value of existing categoricals when
        used in operations that combine categoricals, e.g. astype, and will resolve to
        False if there is no existing ordered to maintain.

    Attributes
    ----------
    categories
    ordered

    Methods
    -------
    None

    See Also
    --------
    Categorical

    Notes
    -----
    This class is useful for specifying the type of a ``Categorical``
    independent of the values. See :ref:`categorical.categoricaldtype`
    for more.

    Examples
    --------
    >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
    0      a
    1      b
    2      a
    3    NaN
    dtype: category
    Categories (2, object): [b < a]
    """

    # TODO: Document public vs. private API
    name = "category"
    type: Type[CategoricalDtypeType] = CategoricalDtypeType
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    _metadata = ("categories", "ordered")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, categories=None, ordered: Ordered = False):
        # Full validation path (fastpath=False).
        self._finalize(categories, ordered, fastpath=False)

    @classmethod
    def _from_fastpath(
        cls, categories=None, ordered: Optional[bool] = None
    ) -> "CategoricalDtype":
        """
        Build an instance without calling ``__init__``.

        ``categories`` are validated with ``fastpath=True``, which skips the
        list-like, null and uniqueness checks in ``validate_categories``.
        """
        self = cls.__new__(cls)
        self._finalize(categories, ordered, fastpath=True)
        return self

    @classmethod
    def _from_categorical_dtype(
        cls, dtype: "CategoricalDtype", categories=None, ordered: Ordered = None
    ) -> "CategoricalDtype":
        """
        Return ``dtype`` with ``categories`` / ``ordered`` overridden.

        When both overrides are None, ``dtype`` itself is returned; otherwise
        a new instance is built, taking each None value from ``dtype``.
        """
        if categories is ordered is None:
            return dtype
        if categories is None:
            categories = dtype.categories
        if ordered is None:
            ordered = dtype.ordered
        return cls(categories, ordered)

    @classmethod
    def _from_values_or_dtype(
        cls,
        values=None,
        categories=None,
        ordered: Optional[bool] = None,
        dtype: Optional["CategoricalDtype"] = None,
    ) -> "CategoricalDtype":
        """
        Construct dtype from the input parameters used in :class:`Categorical`.

        This constructor method specifically does not do the factorization
        step, if that is needed to find the categories. This constructor may
        therefore return ``CategoricalDtype(categories=None, ordered=None)``,
        which may not be useful. Additional steps may therefore have to be
        taken to create the final dtype.

        The return dtype is specified from the inputs in this prioritized
        order:
        1. if dtype is a CategoricalDtype, return dtype
        2. if dtype is the string 'category', create a CategoricalDtype from
           the supplied categories and ordered parameters, and return that.
        3. if values is a categorical, use value.dtype, but override it with
           categories and ordered if either/both of those are not None.
        4. if dtype is None and values is not a categorical, construct the
           dtype from categories and ordered, even if either of those is None.

        Parameters
        ----------
        values : list-like, optional
            The list-like must be 1-dimensional.
        categories : list-like, optional
            Categories for the CategoricalDtype.
        ordered : bool, optional
            Designating if the categories are ordered.
        dtype : CategoricalDtype or the string "category", optional
            If ``CategoricalDtype``, cannot be used together with
            `categories` or `ordered`.

        Returns
        -------
        CategoricalDtype

        Raises
        ------
        ValueError
            If ``dtype`` is a string other than "category", if a non-string
            ``dtype`` is combined with `categories` or `ordered`, or if
            ``dtype`` is neither a string nor a CategoricalDtype.

        Examples
        --------
        >>> CategoricalDtype._from_values_or_dtype()
        CategoricalDtype(categories=None, ordered=None)
        >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
        ...                                        ordered=True)
        CategoricalDtype(categories=['a', 'b'], ordered=True)
        >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
        >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
        >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
        >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
        ...                                        dtype=dtype2)
        ValueError: Cannot specify `categories` or `ordered` together with
        `dtype`.

        The supplied dtype takes precedence over values' dtype:

        >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
        CategoricalDtype(categories=['x', 'y'], ordered=False)
        """
        from pandas.core.dtypes.common import is_categorical

        if dtype is not None:
            # The dtype argument takes precedence over values.dtype (if any)
            if isinstance(dtype, str):
                if dtype == "category":
                    dtype = CategoricalDtype(categories, ordered)
                else:
                    raise ValueError(f"Unknown dtype {repr(dtype)}")
            elif categories is not None or ordered is not None:
                raise ValueError(
                    "Cannot specify `categories` or `ordered` together with `dtype`."
                )
            elif not isinstance(dtype, CategoricalDtype):
                # Fail here rather than hand back an object that is not a
                # CategoricalDtype and let it break further downstream.
                raise ValueError(f"Cannot construct CategoricalDtype from {dtype}")
        elif is_categorical(values):
            # If no "dtype" was passed, use the one from "values", but honor
            # the "ordered" and "categories" arguments
            dtype = values.dtype._from_categorical_dtype(
                values.dtype, categories, ordered
            )
        else:
            # If dtype=None and values is not categorical, create a new dtype.
            # Note: This could potentially have categories=None and
            # ordered=None.
            dtype = CategoricalDtype(categories, ordered)

        return dtype

    @classmethod
    def construct_from_string(cls, string: str_type) -> "CategoricalDtype":
        """
        Construct a CategoricalDtype from a string.

        Parameters
        ----------
        string : str
            Must be the string "category" in order to be successfully constructed.

        Returns
        -------
        CategoricalDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a CategoricalDtype cannot be constructed from the input.
        """
        if not isinstance(string, str):
            raise TypeError(f"Expects a string, got {type(string)}")
        if string != cls.name:
            raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")

        # need ordered=None to ensure that operations specifying dtype="category" don't
        # override the ordered value for existing categoricals
        return cls(ordered=None)

    def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
        """
        Validate ``ordered`` and ``categories`` and store them on the instance.

        None values skip validation and are stored as-is.
        """
        if ordered is not None:
            self.validate_ordered(ordered)

        if categories is not None:
            categories = self.validate_categories(categories, fastpath=fastpath)

        self._categories = categories
        self._ordered = ordered

    def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._categories = state.pop("categories", None)
        self._ordered = state.pop("ordered", False)

    def __hash__(self) -> int:
        # _hash_categories returns a uint64, so use the negative
        # space for when we have unknown categories to avoid a conflict
        if self.categories is None:
            if self.ordered:
                return -1
            else:
                return -2
        # We *do* want to include the real self.ordered here
        return int(self._hash_categories(self.categories, self.ordered))

    def __eq__(self, other: Any) -> bool:
        """
        Rules for CDT equality:
        1) Any CDT is equal to the string 'category'
        2) Any CDT is equal to itself
        3) Any CDT is equal to a CDT with categories=None regardless of ordered
        4) A CDT with ordered=True is only equal to another CDT with
           ordered=True and identical categories in the same order
        5) A CDT with ordered={False, None} is only equal to another CDT with
           ordered={False, None} and identical categories, but same order is
           not required. There is no distinction between False/None.
        6) Any other comparison returns False
        """
        if isinstance(other, str):
            return other == self.name
        elif other is self:
            return True
        elif not (hasattr(other, "ordered") and hasattr(other, "categories")):
            return False
        elif self.categories is None or other.categories is None:
            # We're forced into a suboptimal corner thanks to math and
            # backwards compatibility. We require that `CDT(...) == 'category'`
            # for all CDTs **including** `CDT(None, ...)`. Therefore, *all*
            # CDT(., .) = CDT(None, False) and *all*
            # CDT(., .) = CDT(None, True).
            return True
        elif self.ordered or other.ordered:
            # At least one has ordered=True; equal if both have ordered=True
            # and the same values for categories in the same order.
            return (self.ordered == other.ordered) and self.categories.equals(
                other.categories
            )
        else:
            # Neither has ordered=True; equal if both have the same categories,
            # but same order is not necessary.  There is no distinction between
            # ordered=False and ordered=None: CDT(., False) and CDT(., None)
            # will be equal if they have the same categories.
            if (
                self.categories.dtype == other.categories.dtype
                and self.categories.equals(other.categories)
            ):
                # Check and see if they happen to be identical categories
                return True
            # Otherwise fall back to comparing the (order-insensitive) hashes.
            return hash(self) == hash(other)

    def __repr__(self) -> str_type:
        # The "None, " literal carries its own separator; for an Index the
        # text comes from ``_format_data``.
        tpl = "CategoricalDtype(categories={data}ordered={ordered})"
        if self.categories is None:
            data = "None, "
        else:
            data = self.categories._format_data(name=type(self).__name__)
        return tpl.format(data=data, ordered=self.ordered)

    @staticmethod
    def _hash_categories(categories, ordered: Ordered = True) -> int:
        """
        Combine the hashes of the individual categories into a single value.

        When ``ordered`` is truthy, each category's position is hashed along
        with it, so the result depends on the order of the categories.
        """
        from pandas.core.util.hashing import (
            hash_array,
            _combine_hash_arrays,
            hash_tuples,
        )
        from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all are. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            categories = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(categories)
        else:
            if categories.dtype == "O":
                if len({type(x) for x in categories}) != 1:
                    # TODO: hash_array doesn't handle mixed types. It casts
                    # everything to a str first, which means we treat
                    # {'1', '2'} the same as {'1', 2}
                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed

            if is_datetime64tz_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.astype(_NS_DTYPE)

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            # Pair every category hash with its position.
            cat_array = np.vstack(
                [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
            )
        else:
            cat_array = [cat_array]
        hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
        return np.bitwise_xor.reduce(hashed)

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas import Categorical

        return Categorical

    @staticmethod
    def validate_ordered(ordered: Ordered) -> None:
        """
        Validates that we have a valid ordered parameter. If
        it is not a boolean, a TypeError will be raised.

        Parameters
        ----------
        ordered : object
            The parameter to be verified.

        Raises
        ------
        TypeError
            If 'ordered' is not a boolean.
        """
        if not is_bool(ordered):
            raise TypeError("'ordered' must either be 'True' or 'False'")

    @staticmethod
    def validate_categories(categories, fastpath: bool = False):
        """
        Validates that we have good categories

        Parameters
        ----------
        categories : array-like
        fastpath : bool
            Whether to skip nan and uniqueness checks

        Returns
        -------
        categories : Index

        Raises
        ------
        TypeError
            If ``fastpath`` is False and ``categories`` is not list-like.
        ValueError
            If ``fastpath`` is False and ``categories`` contains nulls or
            duplicates.
        """
        from pandas.core.indexes.base import Index

        if not fastpath and not is_list_like(categories):
            raise TypeError(
                f"Parameter 'categories' must be list-like, was {repr(categories)}"
            )
        elif not isinstance(categories, ABCIndexClass):
            categories = Index(categories, tupleize_cols=False)

        if not fastpath:

            if categories.hasnans:
                raise ValueError("Categorical categories cannot be null")

            if not categories.is_unique:
                raise ValueError("Categorical categories must be unique")

        if isinstance(categories, ABCCategoricalIndex):
            # Unwrap to the underlying categories of the CategoricalIndex.
            categories = categories.categories

        return categories

    def update_dtype(
        self, dtype: Union[str_type, "CategoricalDtype"]
    ) -> "CategoricalDtype":
        """
        Returns a CategoricalDtype with categories and ordered taken from dtype
        if specified, otherwise falling back to self if unspecified

        Parameters
        ----------
        dtype : CategoricalDtype

        Returns
        -------
        new_dtype : CategoricalDtype

        Raises
        ------
        ValueError
            If ``dtype`` is not the string "category" and ``is_dtype``
            rejects it.
        """
        if isinstance(dtype, str) and dtype == "category":
            # dtype='category' should not change anything
            return self
        elif not self.is_dtype(dtype):
            raise ValueError(
                f"a CategoricalDtype must be passed to perform an update, "
                f"got {repr(dtype)}"
            )
        else:
            # from here on, dtype is a CategoricalDtype
            dtype = cast(CategoricalDtype, dtype)

        # update categories/ordered unless they've been explicitly passed as None
        new_categories = (
            dtype.categories if dtype.categories is not None else self.categories
        )
        new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered

        return CategoricalDtype(new_categories, new_ordered)

    @property
    def categories(self):
        """
        An ``Index`` containing the unique categories allowed.
        """
        return self._categories

    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.
        """
        return self._ordered

    @property
    def _is_boolean(self) -> bool:
        # Delegates to is_bool_dtype applied to the categories.
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)

607 

608 

@register_extension_dtype
class DatetimeTZDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for timezone-aware datetime data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    unit : str, default "ns"
        The precision of the datetime data. Currently limited
        to ``"ns"``.
    tz : str, int, or datetime.tzinfo
        The timezone.

    Attributes
    ----------
    unit
    tz

    Methods
    -------
    None

    Raises
    ------
    pytz.UnknownTimeZoneError
        When the requested timezone cannot be found.

    Examples
    --------
    >>> pd.DatetimeTZDtype(tz='UTC')
    datetime64[ns, UTC]

    >>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
    datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
    """

    type: Type[Timestamp] = Timestamp
    kind: str_type = "M"
    str = "|M8[ns]"
    num = 101
    base = np.dtype("M8[ns]")
    na_value = NaT
    _metadata = ("unit", "tz")
    _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, unit="ns", tz=None):
        if isinstance(unit, DatetimeTZDtype):
            # Copy-construct from an existing instance.
            unit, tz = unit.unit, unit.tz

        if unit != "ns":
            if isinstance(unit, str) and tz is None:
                # maybe a string like datetime64[ns, tz]. Parse it only to
                # name the timezone in the error message; a string that does
                # not parse raises TypeError from construct_from_string.
                tz = type(self).construct_from_string(unit).tz
                msg = (
                    f"Passing a dtype alias like 'datetime64[ns, {tz}]' "
                    "to DatetimeTZDtype is no longer supported. Use "
                    "'DatetimeTZDtype.construct_from_string()' instead."
                )
                raise ValueError(msg)
            else:
                raise ValueError("DatetimeTZDtype only supports ns units")

        if tz:
            tz = timezones.maybe_get_tz(tz)
            tz = timezones.tz_standardize(tz)
        elif tz is not None:
            # Falsy but not None (e.g. an empty string).
            raise pytz.UnknownTimeZoneError(tz)
        if tz is None:
            raise TypeError("A 'tz' is required.")

        self._unit = unit
        self._tz = tz

    @property
    def unit(self):
        """
        The precision of the datetime data.
        """
        return self._unit

    @property
    def tz(self):
        """
        The timezone.
        """
        return self._tz

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import DatetimeArray

        return DatetimeArray

    @classmethod
    def construct_from_string(cls, string: str_type):
        """
        Construct a DatetimeTZDtype from a string.

        Parameters
        ----------
        string : str
            The string alias for this DatetimeTZDtype.
            Should be formatted like ``datetime64[ns, <tz>]``,
            where ``<tz>`` is the timezone name.

        Raises
        ------
        TypeError
            If ``string`` is not a str, does not match the expected format,
            or names a unit / timezone the constructor rejects.

        Examples
        --------
        >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
        datetime64[ns, UTC]
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
        match = cls._match.match(string)
        if match:
            d = match.groupdict()
            try:
                return cls(unit=d["unit"], tz=d["tz"])
            except (KeyError, TypeError, ValueError) as err:
                # KeyError if maybe_get_tz tries and fails to get a
                #  pytz timezone (actually pytz.UnknownTimeZoneError).
                # TypeError if we pass a nonsense tz;
                # ValueError if we pass a unit other than "ns"
                raise TypeError(msg) from err
        raise TypeError(msg)

    def __str__(self) -> str_type:
        return f"datetime64[{self.unit}, {self.tz}]"

    @property
    def name(self) -> str_type:
        """A string representation of the dtype."""
        return str(self)

    def __hash__(self) -> int:
        # make myself hashable
        # TODO: update this.
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            return other == self.name

        # Timezones are compared through their string form.
        return (
            isinstance(other, DatetimeTZDtype)
            and self.unit == other.unit
            and str(self.tz) == str(other.tz)
        )

    def __setstate__(self, state):
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._tz = state["tz"]
        self._unit = state["unit"]

778 

779 

@register_extension_dtype
class PeriodDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for Period data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    freq : str or DateOffset
        The frequency of this PeriodDtype.

    Attributes
    ----------
    freq

    Methods
    -------
    None

    Examples
    --------
    >>> pd.PeriodDtype(freq='D')
    period[D]

    >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
    period[M]
    """

    type: Type[Period] = Period
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 102
    _metadata = ("freq",)
    _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, freq=None):
        """
        Parameters
        ----------
        freq : frequency
        """

        if isinstance(freq, PeriodDtype):
            return freq

        elif freq is None:
            # empty constructor for pickle compat
            u = object.__new__(cls)
            u._freq = None
            return u

        if not isinstance(freq, ABCDateOffset):
            freq = cls._parse_dtype_strict(freq)

        # Instances are cached per frequency string.
        try:
            return cls._cache[freq.freqstr]
        except KeyError:
            u = object.__new__(cls)
            u._freq = freq
            cls._cache[freq.freqstr] = u
            return u

    @property
    def freq(self):
        """
        The frequency object of this PeriodDtype.
        """
        return self._freq

    @classmethod
    def _parse_dtype_strict(cls, freq):
        """
        Convert a frequency string or ``period[<freq>]`` alias to an offset.

        Raises
        ------
        ValueError
            If ``freq`` is not a string or no offset could be obtained.
        """
        if isinstance(freq, str):
            if freq.startswith(("period[", "Period[")):
                # Strip the alias wrapper, keeping only the frequency part.
                m = cls._match.search(freq)
                if m is not None:
                    freq = m.group("freq")
            from pandas.tseries.frequencies import to_offset

            freq = to_offset(freq)
            if freq is not None:
                return freq

        raise ValueError("could not construct PeriodDtype")

    @classmethod
    def construct_from_string(cls, string):
        """
        Strict construction from a string, raise a TypeError if not
        possible
        """
        if (
            isinstance(string, str)
            and string.startswith(("period[", "Period["))
            or isinstance(string, ABCDateOffset)
        ):
            # do not parse string like U as period[U]
            # avoid tuple to be regarded as freq
            try:
                return cls(freq=string)
            except ValueError:
                # Fall through to the TypeError raised below.
                pass
        if isinstance(string, str):
            msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
        else:
            msg = f"'construct_from_string' expects a string, got {type(string)}"
        raise TypeError(msg)

    def __str__(self) -> str_type:
        return self.name

    @property
    def name(self) -> str_type:
        return f"period[{self.freq.freqstr}]"

    @property
    def na_value(self):
        return NaT

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Accept the "Period[...]" spelling as well. Only the first letter
            # is upper-cased: str.title() would also re-case the frequency
            # (e.g. "W-SUN" -> "W-Sun") and reject a valid alias.
            capitalized = self.name[:1].upper() + self.name[1:]
            return other == self.name or other == capitalized

        return isinstance(other, PeriodDtype) and self.freq == other.freq

    def __setstate__(self, state):
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._freq = state["freq"]

    @classmethod
    def is_dtype(cls, dtype) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """

        if isinstance(dtype, str):
            # PeriodDtype can be instantiated from freq string like "U",
            # but doesn't regard freq str like "U" as dtype.
            if not dtype.startswith(("period[", "Period[")):
                return False
            try:
                return cls._parse_dtype_strict(dtype) is not None
            except ValueError:
                return False
        return super().is_dtype(dtype)

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import PeriodArray

        return PeriodArray

    def __from_arrow__(self, array):
        """Construct PeriodArray from pyarrow Array/ChunkedArray."""
        import pyarrow
        from pandas.core.arrays import PeriodArray
        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        # Treat a plain Array as a single chunk.
        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64")
            parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
            # Positions where mask is False become NaT (presumably the mask
            # marks valid entries -- confirm against the helper).
            parr[~mask] = NaT
            results.append(parr)

        return PeriodArray._concat_same_type(results)

971 

972 

@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for Interval data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    subtype : str, np.dtype
        The dtype of the Interval bounds.

    Attributes
    ----------
    subtype

    Methods
    -------
    None

    Examples
    --------
    >>> pd.IntervalDtype(subtype='int64')
    interval[int64]
    """

    name = "interval"
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 103
    _metadata = ("subtype",)
    _match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, subtype=None):
        from pandas.core.dtypes.common import (
            is_categorical_dtype,
            is_string_dtype,
            pandas_dtype,
        )

        if isinstance(subtype, IntervalDtype):
            return subtype
        elif subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            u = object.__new__(cls)
            u._subtype = None
            return u
        elif isinstance(subtype, str) and subtype.lower() == "interval":
            subtype = None
        else:
            if isinstance(subtype, str):
                # Unwrap an "interval[<subtype>]" alias to its subtype part.
                m = cls._match.search(subtype)
                if m is not None:
                    subtype = m.group("subtype")

            try:
                subtype = pandas_dtype(subtype)
            except TypeError as err:
                # Keep the original failure as the explicit cause.
                raise TypeError("could not construct IntervalDtype") from err

        if is_categorical_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            msg = (
                "category, object, and string subtypes are not supported "
                "for IntervalDtype"
            )
            raise TypeError(msg)

        # Instances are cached per subtype string.
        try:
            return cls._cache[str(subtype)]
        except KeyError:
            u = object.__new__(cls)
            u._subtype = subtype
            cls._cache[str(subtype)] = u
            return u

    @property
    def subtype(self):
        """
        The dtype of the Interval bounds.
        """
        return self._subtype

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import IntervalArray

        return IntervalArray

    @classmethod
    def construct_from_string(cls, string):
        """
        attempt to construct this type from a string, raise a TypeError
        if its not possible
        """
        if not isinstance(string, str):
            raise TypeError(f"a string needs to be passed, got type {type(string)}")

        if string.lower() == "interval" or cls._match.search(string) is not None:
            return cls(string)

        msg = (
            f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include Interval or Interval[dtype] "
            "where dtype is numeric, datetime, or timedelta"
        )
        raise TypeError(msg)

    @property
    def type(self):
        return Interval

    def __str__(self) -> str_type:
        if self.subtype is None:
            return "interval"
        return f"interval[{self.subtype}]"

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Case-insensitive match on either "interval" or the full alias.
            return other.lower() in (self.name.lower(), str(self).lower())
        elif not isinstance(other, IntervalDtype):
            return False
        elif self.subtype is None or other.subtype is None:
            # None should match any subtype
            return True
        else:
            from pandas.core.dtypes.common import is_dtype_equal

            return is_dtype_equal(self.subtype, other.subtype)

    def __setstate__(self, state):
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._subtype = state["subtype"]

    @classmethod
    def is_dtype(cls, dtype) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """

        if isinstance(dtype, str):
            if not dtype.lower().startswith("interval"):
                return False
            try:
                return cls.construct_from_string(dtype) is not None
            except (ValueError, TypeError):
                return False
        return super().is_dtype(dtype)

    def __from_arrow__(self, array):
        """Construct IntervalArray from pyarrow Array/ChunkedArray."""
        import pyarrow
        from pandas.core.arrays import IntervalArray

        # Treat a plain Array as a single chunk.
        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
            right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
            # ``closed`` is read from the type of the whole input, not the chunk.
            iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
            results.append(iarr)

        return IntervalArray._concat_same_type(results)