Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import codecs 

2from functools import wraps 

3import re 

4import textwrap 

5from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union 

6import warnings 

7 

8import numpy as np 

9 

10import pandas._libs.lib as lib 

11import pandas._libs.missing as libmissing 

12import pandas._libs.ops as libops 

13from pandas._typing import ArrayLike, Dtype 

14from pandas.util._decorators import Appender 

15 

16from pandas.core.dtypes.common import ( 

17 ensure_object, 

18 is_bool_dtype, 

19 is_categorical_dtype, 

20 is_extension_array_dtype, 

21 is_integer, 

22 is_integer_dtype, 

23 is_list_like, 

24 is_object_dtype, 

25 is_re, 

26 is_scalar, 

27 is_string_dtype, 

28) 

29from pandas.core.dtypes.generic import ( 

30 ABCDataFrame, 

31 ABCIndexClass, 

32 ABCMultiIndex, 

33 ABCSeries, 

34) 

35from pandas.core.dtypes.missing import isna 

36 

37from pandas.core.algorithms import take_1d 

38from pandas.core.base import NoNewAttributesMixin 

39import pandas.core.common as com 

40from pandas.core.construction import extract_array 

41 

42if TYPE_CHECKING: 

43 from pandas.arrays import StringArray 

44 

45_cpython_optimized_encoders = ( 

46 "utf-8", 

47 "utf8", 

48 "latin-1", 

49 "latin1", 

50 "iso-8859-1", 

51 "mbcs", 

52 "ascii", 

53) 

54_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") 

55 

56_shared_docs: Dict[str, str] = dict() 

57 

58 

def cat_core(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # empty separator: summing the stacked object arrays along axis 0
        # concatenates the strings element-wise
        stacked = np.asarray(list_of_columns, dtype=object)
        return np.sum(stacked, axis=0)
    # interleave the separator between the columns before summing
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)

84 

85 

def cat_safe(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If any column contains non-string (and non-missing) values.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and return with better message
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # BUG FIX: previously, when no offending column was identified,
        # control fell through to `return result` with `result` unbound,
        # masking the real error with an UnboundLocalError. Re-raise the
        # original TypeError instead.
        raise
    return result

120 

121 

def _na_map(f, arr, na_result=None, dtype=object):
    """
    Apply ``f`` element-wise over ``arr``, propagating missing values.

    Dispatches to the StringArray path for extension dtypes (default fill
    ``pd.NA``) and to the object-dtype path otherwise (default fill
    ``np.nan``).
    """
    if not is_extension_array_dtype(arr.dtype):
        fill = np.nan if na_result is None else na_result
        return _map_object(f, arr, na_mask=True, na_value=fill, dtype=dtype)
    # extension dtype — just StringDtype reaches here
    fill = libmissing.NA if na_result is None else na_result
    return _map_stringarray(f, extract_array(arr), na_value=fill, dtype=dtype)

132 

133 

def _map_stringarray(
    func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype
) -> ArrayLike:
    """
    Map a callable over valid elements of a StringArray.

    Parameters
    ----------
    func : Callable[[str], Any]
        Apply to each valid element.
    arr : StringArray
    na_value : Any
        The value to use for missing values. By default, this is
        the original value (NA).
    dtype : Dtype
        The result dtype to use. Specifying this avoids an intermediate
        object-dtype allocation.

    Returns
    -------
    ArrayLike
        An ExtensionArray for integer or string dtypes, otherwise
        an ndarray.

    """
    from pandas.arrays import IntegerArray, StringArray, BooleanArray

    # record missing positions before dropping down to an ndarray
    mask = isna(arr)

    assert isinstance(arr, StringArray)
    arr = np.asarray(arr)

    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        # build a masked extension array so NA positions stay missing
        constructor: Union[Type[IntegerArray], Type[BooleanArray]]
        if is_integer_dtype(dtype):
            constructor = IntegerArray
        else:
            constructor = BooleanArray

        na_value_is_na = isna(na_value)
        if na_value_is_na:
            # placeholder written into masked slots; the mask (not this
            # value) is what marks them missing in the constructed result
            na_value = 1
        result = lib.map_infer_mask(
            arr,
            func,
            mask.view("uint8"),
            convert=False,
            na_value=na_value,
            dtype=np.dtype(dtype),
        )

        if not na_value_is_na:
            # caller supplied a concrete fill value, so nothing is missing
            mask[:] = False

        return constructor(result, mask)

    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        result = lib.map_infer_mask(
            arr, func, mask.view("uint8"), convert=False, na_value=na_value
        )
        return StringArray(result)
    else:
        # This is when the result type is object. We reach this when
        # -> We know the result type is truly object (e.g. .encode returns bytes
        #    or .findall returns a list).
        # -> We don't know the result type. E.g. `.get` can return anything.
        return lib.map_infer_mask(arr, func, mask.view("uint8"))

202 

203 

def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    """
    Map a callable over an array, coercing to object dtype first.

    Parameters
    ----------
    f : callable
        Applied to each element (each non-missing element when ``na_mask``).
    arr : array-like
    na_mask : bool, default False
        If True, skip missing entries and write ``na_value`` in their place.
    na_value : object, default np.nan
        Fill value for missing entries, and for elements on which ``f``
        raises TypeError/AttributeError (fallback path below).
    dtype : dtype, default object
        dtype of the result when ``arr`` is empty.

    Returns
    -------
    np.ndarray
    """
    if not len(arr):
        # empty input short-circuits to an empty result of the requested dtype
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        # only attempt dtype inference when at least one value is non-missing
        convert = not np.all(mask)
        try:
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                # FIXME: this should be totally avoidable
                raise e

            def g(x):
                # per-element fallback: substitute na_value for elements
                # that `f` cannot handle, instead of failing the whole map
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map_object(g, arr, dtype=dtype)
        if na_value is not np.nan:
            # masked slots already hold NaN; only overwrite for custom fills
            np.putmask(result, mask, na_value)
        if result.dtype == object:
            result = lib.maybe_convert_objects(result)
        return result
    else:
        return lib.map_infer(arr, f)

243 

244 

def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module, e.g. ``re.IGNORECASE``.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    eg. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64
    """
    compiled = re.compile(pat, flags=flags)

    def count_matches(x):
        return len(compiled.findall(x))

    return _na_map(count_matches, arr, dtype="int64")

312 

313 

def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.

        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    With ``na=False``, missing values are filled and the result is ``bool``:

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool
    """
    if not regex:
        # literal substring test; case-insensitivity via upper-casing both
        # the needle and the haystack
        if case:
            return _na_map(lambda x: pat in x, arr, na, dtype=bool)
        needle = pat.upper()
        uppered = _na_map(lambda x: x.upper(), arr)
        return _na_map(lambda x: needle in x, uppered, na, dtype=bool)

    if not case:
        flags |= re.IGNORECASE

    compiled = re.compile(pat, flags=flags)

    if compiled.groups > 0:
        warnings.warn(
            "This pattern has match groups. To actually get the "
            "groups, use str.extract.",
            UserWarning,
            stacklevel=3,
        )

    return _na_map(lambda x: bool(compiled.search(x)), arr, na, dtype=bool)

461 

462 

def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """

    def starts_with_pat(x):
        return x.startswith(pat)

    return _na_map(starts_with_pat, arr, na, dtype=bool)

516 

517 

def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """

    def ends_with_pat(x):
        return x.endswith(pat)

    return _na_map(ends_with_pat, arr, na, dtype=bool)

571 

572 

def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.
    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.
    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        Determines if replace is case sensitive:

        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex.

    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a
        compiled regex.
    regex : bool, default True
        Determines if assumes the passed-in pattern is a regular expression:

        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object
    """

    # Check whether repl is valid (GH 13438, GH 15055)
    if not (isinstance(repl, str) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)

    if not regex:
        # literal replacement path: compiled patterns / callables make no sense
        if is_compiled_re:
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when regex=False")
        subst = lambda x: x.replace(pat, repl, n)
    else:
        if is_compiled_re:
            # flags/case must be baked into the compiled pattern already
            if (case is not None) or (flags != 0):
                raise ValueError(
                    "case and flags cannot be set when pat is a compiled regex"
                )
        else:
            # not a compiled regex: default to case-sensitive, fold case
            # insensitivity into the flags
            if case is None:
                case = True
            if case is False:
                flags |= re.IGNORECASE
        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
            # re.sub interprets count<=0 as "replace all"; map n=-1 to 0
            count = n if n >= 0 else 0
            compiled = re.compile(pat, flags=flags)
            subst = lambda x: compiled.sub(repl=repl, string=x, count=count)
        else:
            # single literal character, no flags: plain str.replace is enough
            subst = lambda x: x.replace(pat, repl, n)

    return _na_map(subst, arr, dtype=str)

727 

728 

def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    if is_scalar(repeats):

        def scalar_rep(x):
            # use the unbound dunders so bytes elements repeat as bytes and
            # str elements as str (bytes.__mul__ raises TypeError when given
            # a non-bytes first argument, falling through to str.__mul__)
            try:
                return bytes.__mul__(x, repeats)
            except TypeError:
                return str.__mul__(x, repeats)

        return _na_map(scalar_rep, arr, dtype=str)
    else:

        def rep(x, r):
            # pass NA through untouched; element-wise pairing handled below
            if x is libmissing.NA:
                return x
            try:
                return bytes.__mul__(x, r)
            except TypeError:
                return str.__mul__(x, r)

        repeats = np.asarray(repeats, dtype=object)
        result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
        return result

791 

792 

def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    compiled = re.compile(pat, flags=flags)

    def matches_at_start(x):
        return bool(compiled.match(x))

    return _na_map(matches_at_start, arr, na, dtype=bool)

827 

828 

829def _get_single_group_name(rx): 

830 try: 

831 return list(rx.groupindex.keys()).pop() 

832 except IndexError: 

833 return None 

834 

835 

836def _groups_or_na_fun(regex): 

837 """Used in both extract_noexpand and extract_frame""" 

838 if regex.groups == 0: 

839 raise ValueError("pattern contains no capture groups") 

840 empty_row = [np.nan] * regex.groups 

841 

842 def f(x): 

843 if not isinstance(x, str): 

844 return empty_row 

845 m = regex.search(x) 

846 if m: 

847 return [np.nan if item is None else item for item in m.groups()] 

848 else: 

849 return empty_row 

850 

851 return f 

852 

853 

854def _result_dtype(arr): 

855 # workaround #27953 

856 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails 

857 # when the list of values is empty. 

858 if arr.dtype.name == "string": 

859 return "string" 

860 else: 

861 return object 

862 

863 

def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.

    Returns
    -------
    result : np.ndarray or DataFrame
        Flat object array when the pattern has a single group, else a
        DataFrame with one column per group.
    name : str or None
        The single group's name (if named), used for the Series/Index name.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        # single group -> flat object array; pick up the group's name, if any
        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        name = _get_single_group_name(regex)
    else:
        if isinstance(arr, ABCIndexClass):
            raise ValueError("only one regex group is supported with Index")
        name = None
        # map group numbers to names; unnamed groups keep their index
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            dtype = _result_dtype(arr)
            result = DataFrame(
                [groups_or_na(val) for val in arr],
                columns=columns,
                index=arr.index,
                dtype=dtype,
            )
    return result, name

897 

898 

def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.

    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    extract_row = _groups_or_na_fun(regex)
    # map group numbers to names; unnamed groups keep their integer index
    group_names = {num: name for name, num in regex.groupindex.items()}
    columns = [group_names.get(i + 1, i) for i in range(regex.groups)]

    if not len(arr):
        return DataFrame(columns=columns, dtype=object)
    result_index = getattr(arr, "index", None)
    return DataFrame(
        [extract_row(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=_result_dtype(arr),
    )

926 

927 

def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
    return arr._wrap_result(result, name=name, expand=expand)

1016 

1017 

def str_extractall(arr, pat, flags=0):
    r"""
    Extract groups from all matches of regular expression pat.

    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1
    """
    from pandas import MultiIndex

    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    # map group numbers to names; unnamed groups keep their integer index
    group_names = {num: name for name, num in regex.groupindex.items()}
    columns = [group_names.get(i + 1, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # non-strings (e.g. NaN) contribute no rows at all
        if not isinstance(subject, str):
            continue

        if not is_mi:
            subject_key = (subject_key,)

        for match_i, match_tuple in enumerate(regex.findall(subject)):
            # findall returns a bare string for single-group patterns
            if isinstance(match_tuple, str):
                match_tuple = (match_tuple,)
            # empty-string groups mean the optional group did not participate
            match_list.append(
                [np.NaN if group == "" else group for group in match_tuple]
            )
            index_list.append(tuple(subject_key + (match_i,)))

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])

    return arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=_result_dtype(arr)
    )

1128 

1129 

def str_get_dummies(arr, sep="|"):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    # Missing values become the empty string so they yield all-zero rows.
    filled = arr.fillna("")
    # Bracket every entry with the separator so a simple substring test
    # (sep + tag + sep) matches whole tags only.
    try:
        bracketed = sep + filled + sep
    except TypeError:
        bracketed = sep + filled.astype(str) + sep

    # Collect the distinct tags; drop the empty string produced by the
    # surrounding separators.
    seen = set()
    for pieces in bracketed.str.split(sep):
        seen.update(pieces)
    tags = sorted(seen - {""})

    dummies = np.empty((len(bracketed), len(tags)), dtype=np.int64)
    for col, tag in enumerate(tags):
        needle = sep + tag + sep
        dummies[:, col] = lib.map_infer(bracketed.to_numpy(), lambda x: needle in x)
    return dummies, tags

1181 

1182 

def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of
    these lists using the delimiter passed to the function; equivalent to
    :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog']])
    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    dtype: object
    """
    # sep.join is applied elementwise; non-string list items make the bound
    # method raise, which _na_map converts to NaN.
    joiner = sep.join
    return _na_map(joiner, arr, dtype=str)

1246 

1247 

def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
        means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    # Compile once so the pattern is reused across every element.
    compiled = re.compile(pat, flags=flags)
    return _na_map(compiled.findall, arr)

1340 

1341 

def str_find(arr, sub, start=0, end=None, side="left"):
    """
    Return indexes in each strings in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """
    if not isinstance(sub, str):
        msg = f"expected a string object, not {type(sub).__name__}"
        raise TypeError(msg)

    # Dispatch to the corresponding str method by name.
    if side == "left":
        method = "find"
    elif side == "right":
        method = "rfind"
    else:  # pragma: no cover
        raise ValueError("Invalid side")

    if end is None:

        def searcher(x):
            return getattr(x, method)(sub, start)

    else:

        def searcher(x):
            return getattr(x, method)(sub, start, end)

    return _na_map(searcher, arr, dtype="int64")

1381 

1382 

def str_index(arr, sub, start=0, end=None, side="left"):
    # Like str_find but dispatches to str.index/str.rindex, which raise
    # ValueError on failure instead of returning -1.
    if not isinstance(sub, str):
        msg = f"expected a string object, not {type(sub).__name__}"
        raise TypeError(msg)

    if side == "left":
        method = "index"
    elif side == "right":
        method = "rindex"
    else:  # pragma: no cover
        raise ValueError("Invalid side")

    if end is None:

        def searcher(x):
            return getattr(x, method)(sub, start)

    else:

        def searcher(x):
            return getattr(x, method)(sub, start, end)

    return _na_map(searcher, arr, dtype="int64")

1401 

1402 

def str_pad(arr, width, side="left", fillchar=" "):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills boths sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    # Validate fillchar first (type, then length), then width.
    if not isinstance(fillchar, str):
        msg = f"fillchar must be a character, not {type(fillchar).__name__}"
        raise TypeError(msg)

    if len(fillchar) != 1:
        raise TypeError("fillchar must be a character, not str")

    if not is_integer(width):
        msg = f"width must be of integer type, not {type(width).__name__}"
        raise TypeError(msg)

    if side == "left":

        def padder(x):
            return x.rjust(width, fillchar)

    elif side == "right":

        def padder(x):
            return x.ljust(width, fillchar)

    elif side == "both":

        def padder(x):
            return x.center(width, fillchar)

    else:  # pragma: no cover
        raise ValueError("Invalid side")

    return _na_map(padder, arr, dtype=str)

1477 

1478 

def str_split(arr, pat=None, n=None):
    # Whitespace (pat=None) and single-character separators use str.split;
    # longer patterns are treated as regular expressions.
    if pat is None or len(pat) == 1:
        # str.split uses -1 for "no limit".
        if n is None or n == 0:
            n = -1

        def splitter(x):
            return x.split(pat, n)

    else:
        # re.split uses 0 for "no limit".
        if n is None or n == -1:
            n = 0
        compiled = re.compile(pat)

        def splitter(x):
            return compiled.split(x, maxsplit=n)

    return _na_map(splitter, arr)

1497 

1498 

def str_rsplit(arr, pat=None, n=None):
    # str.rsplit uses -1 for "no limit"; regex patterns are not supported here.
    if n is None or n == 0:
        n = -1

    def splitter(x):
        return x.rsplit(pat, n)

    return _na_map(splitter, arr)

1506 

1507 

def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to:

    >>> s.str[0:5:3]
    0    kl
    1     f
    2    cm
    dtype: object
    """
    # Build the slice once and index every element with it.
    slicer = slice(start, stop, step)
    return _na_map(lambda x: x[slicer], arr, dtype=str)

1583 

1584 

def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    if repl is None:
        repl = ""

    def replacer(x):
        # When the requested slice is empty (e.g. start beyond the end of
        # the string), insert the replacement at `start` rather than
        # consuming up to `stop`.
        resume_at = start if x[start:stop] == "" else stop
        pieces = []
        if start is not None:
            pieces.append(x[:start])
        pieces.append(repl)
        if stop is not None:
            pieces.append(x[resume_at:])
        return "".join(pieces)

    return _na_map(replacer, arr, dtype=str)

1674 

1675 

def str_strip(arr, to_strip=None, side="both"):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
    side : {'left', 'right', 'both'}, default 'both'

    Returns
    -------
    Series or Index
    """
    # Map side -> the appropriate str method, then apply elementwise.
    strippers = {
        "both": lambda x: x.strip(to_strip),
        "left": lambda x: x.lstrip(to_strip),
        "right": lambda x: x.rstrip(to_strip),
    }
    if side not in strippers:  # pragma: no cover
        raise ValueError("Invalid side")
    return _na_map(strippers[side], arr, dtype=str)

1699 

1700 

def str_wrap(arr, width, **kwargs):
    r"""
    Wrap long strings in the Series/Index to be formatted in
    paragraphs with length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line width.
    expand_tabs : bool, optional
        If True, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If True, each whitespace character (as defined by string.whitespace)
        remaining after tab expansion will be replaced by a single space
        (default: True).
    drop_whitespace : bool, optional
        If True, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If True, then words longer than width will be broken in order to ensure
        that no lines are longer than width. If it is false, long words will
        not be broken, and some lines may be longer than width (default: True).
    break_on_hyphens : bool, optional
        If True, wrapping will occur preferably on whitespace and right after
        hyphens in compound words, as it is customary in English. If false,
        only whitespaces will be considered as potentially good places for line
        breaks, but you need to set break_long_words to false if you want truly
        insecable words (default: True).

    Returns
    -------
    Series or Index

    Notes
    -----
    Internally, this method uses a :class:`textwrap.TextWrapper` instance with
    default settings. To achieve behavior matching R's stringr library str_wrap
    function, use the arguments:

    - expand_tabs = False
    - replace_whitespace = True
    - drop_whitespace = True
    - break_long_words = False
    - break_on_hyphens = False

    Examples
    --------
    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\nwrapped
    1    another line\nto be\nwrapped
    dtype: object
    """
    # Build the wrapper once; width is a positional parameter here but a
    # keyword for TextWrapper.
    kwargs["width"] = width
    wrapper = textwrap.TextWrapper(**kwargs)

    return _na_map(lambda s: "\n".join(wrapper.wrap(s)), arr, dtype=str)

1763 

1764 

def str_translate(arr, table):
    """
    Map all characters in the string through the given mapping table.
    Equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
        None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    Series or Index
    """

    def translator(x):
        return x.translate(table)

    return _na_map(translator, arr, dtype=str)

1783 

1784 

def str_get(arr, i):
    """
    Extract element from each component at specified position.

    Extract element from lists, tuples, or strings in each element in the
    Series/Index.

    Parameters
    ----------
    i : int
        Position of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String", (1, 2, 3), ["a", "b", "c"], 123])
    >>> s.str.get(1)
    0      t
    1      2
    2      b
    3    NaN
    dtype: object

    >>> s.str.get(-1)
    0      g
    1      3
    2      c
    3    NaN
    dtype: object
    """

    def getter(x):
        # dicts use .get (missing keys give None); sequences are indexed
        # only when i is in range, otherwise NaN.
        if isinstance(x, dict):
            return x.get(i)
        if -len(x) <= i < len(x):
            return x[i]
        return np.nan

    return _na_map(getter, arr)

1845 

1846 

def str_decode(arr, encoding, errors="strict"):
    """
    Decode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
    python3.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    Series or Index
    """
    if encoding in _cpython_optimized_decoders:
        # bytes.decode has a CPython fast path for these codecs.
        def decode(x):
            return x.decode(encoding, errors)

    else:
        codec = codecs.getdecoder(encoding)

        def decode(x):
            return codec(x, errors)[0]

    return _na_map(decode, arr)

1869 

1870 

def str_encode(arr, encoding, errors="strict"):
    """
    Encode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.encode`.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    encoded : Series/Index of objects
    """
    if encoding in _cpython_optimized_encoders:
        # str.encode has a CPython fast path for these codecs.
        def encode(x):
            return x.encode(encoding, errors)

    else:
        codec = codecs.getencoder(encoding)

        def encode(x):
            return codec(x, errors)[0]

    return _na_map(encode, arr)

1892 

1893 

def forbid_nonstring_types(forbidden, name=None):
    """
    Decorator to forbid specific types for a method of StringMethods.

    For calling `.str.{method}` on a Series or Index, it is necessary to first
    initialize the :class:`StringMethods` object, and then call the method.
    However, different methods allow different input types, and so this can not
    be checked during :meth:`StringMethods.__init__`, but must be done on a
    per-method basis. This decorator exists to facilitate this process, and
    make it explicit which (inferred) types are disallowed by the method.

    :meth:`StringMethods.__init__` allows the *union* of types its different
    methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
    namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].

    The default string types ['string', 'empty'] are allowed for all methods.
    For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
    then needs to forbid the types it is not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. By default, this is
        None, in which case the name from the method being wrapped will be
        copied. However, for working with further wrappers (like _pat_wrapper
        and _noarg_wrapper), it is necessary to specify the name.

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check that
        enforces the inferred type to not be in the list of forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    # None means "forbid nothing".
    forbidden = set() if forbidden is None else set(forbidden)

    allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - forbidden

    def _forbid_nonstring_types(func):
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if self._inferred_dtype not in allowed_types:
                msg = (
                    f"Cannot use .str.{func_name} with values of "
                    f"inferred dtype '{self._inferred_dtype}'."
                )
                raise TypeError(msg)
            return func(self, *args, **kwargs)

        wrapper.__name__ = func_name
        return wrapper

    return _forbid_nonstring_types

1960 

1961 

def _noarg_wrapper(
    f,
    name=None,
    docstring=None,
    forbidden_types=["bytes"],
    returns_string=True,
    **kargs,
):
    # Build a no-argument StringMethods accessor method that maps `f` over
    # the elements; a docstring must always be supplied by the caller.
    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper(self):
        mapped = _na_map(f, self._parent, **kargs)
        return self._wrap_result(mapped, returns_string=returns_string)

    wrapper.__name__ = name if name is not None else f.__name__
    if docstring is None:
        raise ValueError("Provide docstring")
    wrapper.__doc__ = docstring

    return wrapper

1982 

1983 

def _pat_wrapper(
    f,
    flags=False,
    na=False,
    name=None,
    forbidden_types=["bytes"],
    returns_string=True,
    **kwargs,
):
    # Build a pattern-accepting accessor method; the `na`/`flags` switches
    # select which of the three signatures the resulting method exposes.
    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper1(self, pat):
        out = f(self._parent, pat)
        return self._wrap_result(out, returns_string=returns_string)

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper2(self, pat, flags=0, **kwargs):
        out = f(self._parent, pat, flags=flags, **kwargs)
        return self._wrap_result(out, returns_string=returns_string)

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper3(self, pat, na=np.nan):
        out = f(self._parent, pat, na=na)
        return self._wrap_result(out, returns_string=returns_string)

    # Precedence: na-aware signature first, then flags, then bare pattern.
    if na:
        wrapper = wrapper3
    elif flags:
        wrapper = wrapper2
    else:
        wrapper = wrapper1

    wrapper.__name__ = name if name is not None else f.__name__
    if f.__doc__:
        wrapper.__doc__ = f.__doc__

    return wrapper

2015 

2016 

def copy(source):
    """Decorator copying the docstring from `source` (if it has one)."""

    def do_copy(target):
        doc = source.__doc__
        if doc:
            target.__doc__ = doc
        return target

    return do_copy

2026 

2027 

2028class StringMethods(NoNewAttributesMixin): 

2029 """ 

2030 Vectorized string functions for Series and Index. NAs stay NA unless 

2031 handled otherwise by a particular method. Patterned after Python's string 

2032 methods, with some inspiration from R's stringr package. 

2033 

2034 Examples 

2035 -------- 

2036 >>> s.str.split('_') 

2037 >>> s.str.replace('_', '') 

2038 """ 

2039 

    def __init__(self, data):
        # Validate up front and record the inferred dtype so that the
        # per-method @forbid_nonstring_types checks can refine it later.
        self._inferred_dtype = self._validate(data)
        self._is_categorical = is_categorical_dtype(data)
        self._is_string = data.dtype.name == "string"

        # .values.categories works for both Series/Index; for categoricals
        # the string ops run on the categories and are re-expanded later.
        self._parent = data.values.categories if self._is_categorical else data
        # save orig to blow up categoricals to the right type
        self._orig = data
        self._freeze()

2050 

2051 @staticmethod 

2052 def _validate(data): 

2053 """ 

2054 Auxiliary function for StringMethods, infers and checks dtype of data. 

2055 

2056 This is a "first line of defence" at the creation of the StringMethods- 

2057 object (see _make_accessor), and just checks that the dtype is in the 

2058 *union* of the allowed types over all string methods below; this 

2059 restriction is then refined on a per-method basis using the decorator 

2060 @forbid_nonstring_types (more info in the corresponding docstring). 

2061 

2062 This really should exclude all series/index with any non-string values, 

2063 but that isn't practical for performance reasons until we have a str 

2064 dtype (GH 9343 / 13877) 

2065 

2066 Parameters 

2067 ---------- 

2068 data : The content of the Series 

2069 

2070 Returns 

2071 ------- 

2072 dtype : inferred dtype of data 

2073 """ 

2074 from pandas import StringDtype 

2075 

2076 if isinstance(data, ABCMultiIndex): 

2077 raise AttributeError( 

2078 "Can only use .str accessor with Index, not MultiIndex" 

2079 ) 

2080 

2081 # see _libs/lib.pyx for list of inferred types 

2082 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] 

2083 

2084 values = getattr(data, "values", data) # Series / Index 

2085 values = getattr(values, "categories", values) # categorical / normal 

2086 

2087 # explicitly allow StringDtype 

2088 if isinstance(values.dtype, StringDtype): 

2089 return "string" 

2090 

2091 try: 

2092 inferred_dtype = lib.infer_dtype(values, skipna=True) 

2093 except ValueError: 

2094 # GH#27571 mostly occurs with ExtensionArray 

2095 inferred_dtype = None 

2096 

2097 if inferred_dtype not in allowed_types: 

2098 raise AttributeError("Can only use .str accessor with string values!") 

2099 return inferred_dtype 

2100 

2101 def __getitem__(self, key): 

2102 if isinstance(key, slice): 

2103 return self.slice(start=key.start, stop=key.stop, step=key.step) 

2104 else: 

2105 return self.get(key) 

2106 

2107 def __iter__(self): 

2108 warnings.warn( 

2109 "Columnar iteration over characters will be deprecated in future releases.", 

2110 FutureWarning, 

2111 stacklevel=2, 

2112 ) 

2113 i = 0 

2114 g = self.get(i) 

2115 while g.notna().any(): 

2116 yield g 

2117 i += 1 

2118 g = self.get(i) 

2119 

    def _wrap_result(
        self,
        result,
        use_codes=True,
        name=None,
        expand=None,
        fill_value=np.nan,
        returns_string=True,
    ):
        # Wrap a raw string-op result back into a Series/Index/DataFrame
        # matching self._orig: re-expands categorical results via codes,
        # infers/validates `expand`, and preserves name/index/dtype.
        # `returns_string` controls whether a "string"-dtyped input keeps
        # the string dtype (False for numeric/boolean outputs).

        from pandas import Index, Series, MultiIndex

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        # But for some operations, we have to do the stuff on the full values,
        # so make it possible to skip this step as the method already did this
        # before the transformation...
        if use_codes and self._is_categorical:
            # if self._orig is a CategoricalIndex, there is no .cat-accessor
            result = take_1d(
                result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value
            )

        # Non-array-like results (e.g. a plain scalar) pass through as-is.
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        if self._is_string and returns_string:
            dtype = "string"
        else:
            dtype = None

        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1

        elif expand is True and not isinstance(self._orig, ABCIndexClass):
            # required when expand=True is explicitly specified
            # not needed when inferred

            def cons_row(x):
                # Ensure each row is list-like so it can become a DataFrame row.
                if is_list_like(x):
                    return x
                else:
                    return [x]

            result = [cons_row(x) for x in result]
            if result:
                # propagate nan values to match longest sequence (GH 18450)
                max_len = max(len(x) for x in result)
                result = [
                    x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
                ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndexClass):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            if expand:
                # Expanded result becomes a DataFrame with `name` as columns.
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            return result

2219 

2220 def _get_series_list(self, others): 

2221 """ 

2222 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input 

2223 into a list of Series (elements without an index must match the length 

2224 of the calling Series/Index). 

2225 

2226 Parameters 

2227 ---------- 

2228 others : Series, DataFrame, np.ndarray, list-like or list-like of 

2229 Objects that are either Series, Index or np.ndarray (1-dim). 

2230 

2231 Returns 

2232 ------- 

2233 list of Series 

2234 Others transformed into list of Series. 

2235 """ 

2236 from pandas import Series, DataFrame 

2237 

2238 # self._orig is either Series or Index 

2239 idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index 

2240 

2241 # Generally speaking, all objects without an index inherit the index 

2242 # `idx` of the calling Series/Index - i.e. must have matching length. 

2243 # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 

2244 if isinstance(others, ABCSeries): 

2245 return [others] 

2246 elif isinstance(others, ABCIndexClass): 

2247 return [Series(others.values, index=others)] 

2248 elif isinstance(others, ABCDataFrame): 

2249 return [others[x] for x in others] 

2250 elif isinstance(others, np.ndarray) and others.ndim == 2: 

2251 others = DataFrame(others, index=idx) 

2252 return [others[x] for x in others] 

2253 elif is_list_like(others, allow_sets=False): 

2254 others = list(others) # ensure iterators do not get read twice etc 

2255 

2256 # in case of list-like `others`, all elements must be 

2257 # either Series/Index/np.ndarray (1-dim)... 

2258 if all( 

2259 isinstance(x, (ABCSeries, ABCIndexClass)) 

2260 or (isinstance(x, np.ndarray) and x.ndim == 1) 

2261 for x in others 

2262 ): 

2263 los = [] 

2264 while others: # iterate through list and append each element 

2265 los = los + self._get_series_list(others.pop(0)) 

2266 return los 

2267 # ... or just strings 

2268 elif all(not is_list_like(x) for x in others): 

2269 return [Series(others, index=idx)] 

2270 raise TypeError( 

2271 "others must be Series, Index, DataFrame, np.ndarrary " 

2272 "or list-like (either containing only strings or " 

2273 "containing only objects of type Series/Index/" 

2274 "np.ndarray[1-dim])" 

2275 ) 

2276 

    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(self, others=None, sep=None, na_rep=None, join="left"):
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

            .. versionadded:: 0.23.0
            .. versionchanged:: 1.0.0
                Changed default of `join` from None to `'left'`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        from pandas import Index, Series, concat

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndexClass):
            # build a Series whose values double as alignment labels
            data = Series(self._orig, index=self._orig)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            data = ensure_object(data)
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # omit missing values entirely
                data = data[~na_mask]
            elif na_rep is not None and na_mask.any():
                data = np.where(na_mask, na_rep, data)
            return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            )

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        # rows where ANY column is missing
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        if isinstance(self._orig, ABCIndexClass):
            # add dtype for case that result is all-NA
            result = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            result = Series(result, dtype=dtype, index=data.index, name=self._orig.name)
        return result

2491 

    # Shared docstring template for ``split``/``rsplit``; the %(side)s and
    # %(method)s placeholders are filled in by the @Appender decorators below.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    pat : str, optional
        String or regular expression to split on.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the splitted strings into separate columns.

        * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        * If ``False``, return Series/Index, containing lists of strings.

    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).

    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`, make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.

    Examples
    --------
    >>> s = pd.Series(["this is a regular sentence",
    ...                "https://docs.python.org/3/tutorial/index.html",
    ...                np.nan])
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat = "/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3
    0                                           this    is     a  regular
    1  https://docs.python.org/3/tutorial/index.html  None  None     None
    2                                            NaN   NaN   NaN      NaN \
              4
    0  sentence
    1      None
    2       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN

    Remember to escape special characters when explicitly using regular
    expressions.

    >>> s = pd.Series(["1+1=2"])

    >>> s.str.split(r"\+|=", expand=True)
         0    1    2
    0    1    1    2
    """

2623 

2624 @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) 

2625 @forbid_nonstring_types(["bytes"]) 

2626 def split(self, pat=None, n=-1, expand=False): 

2627 result = str_split(self._parent, pat, n=n) 

2628 return self._wrap_result(result, expand=expand, returns_string=expand) 

2629 

2630 @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) 

2631 @forbid_nonstring_types(["bytes"]) 

2632 def rsplit(self, pat=None, n=-1, expand=False): 

2633 result = str_rsplit(self._parent, pat, n=n) 

2634 return self._wrap_result(result, expand=expand, returns_string=expand) 

2635 

    # Shared docstring template for ``partition``/``rpartition``; %(side)s,
    # %(return)s and %(also)s are filled in by the @Appender decorators below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                   0  1            2
    0  Linda van der            Berg
    1    George Pitt          Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               dtype='object')

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """

2717 

2718 @Appender( 

2719 _shared_docs["str_partition"] 

2720 % { 

2721 "side": "first", 

2722 "return": "3 elements containing the string itself, followed by two " 

2723 "empty strings", 

2724 "also": "rpartition : Split the string at the last occurrence of `sep`.", 

2725 } 

2726 ) 

2727 @forbid_nonstring_types(["bytes"]) 

2728 def partition(self, sep=" ", expand=True): 

2729 f = lambda x: x.partition(sep) 

2730 result = _na_map(f, self._parent) 

2731 return self._wrap_result(result, expand=expand, returns_string=expand) 

2732 

2733 @Appender( 

2734 _shared_docs["str_partition"] 

2735 % { 

2736 "side": "last", 

2737 "return": "3 elements containing two empty strings, followed by the " 

2738 "string itself", 

2739 "also": "partition : Split the string at the first occurrence of `sep`.", 

2740 } 

2741 ) 

2742 @forbid_nonstring_types(["bytes"]) 

2743 def rpartition(self, sep=" ", expand=True): 

2744 f = lambda x: x.rpartition(sep) 

2745 result = _na_map(f, self._parent) 

2746 return self._wrap_result(result, expand=expand, returns_string=expand) 

2747 

2748 @copy(str_get) 

2749 def get(self, i): 

2750 result = str_get(self._parent, i) 

2751 return self._wrap_result(result) 

2752 

2753 @copy(str_join) 

2754 @forbid_nonstring_types(["bytes"]) 

2755 def join(self, sep): 

2756 result = str_join(self._parent, sep) 

2757 return self._wrap_result(result) 

2758 

2759 @copy(str_contains) 

2760 @forbid_nonstring_types(["bytes"]) 

2761 def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): 

2762 result = str_contains( 

2763 self._parent, pat, case=case, flags=flags, na=na, regex=regex 

2764 ) 

2765 return self._wrap_result(result, fill_value=na, returns_string=False) 

2766 

2767 @copy(str_match) 

2768 @forbid_nonstring_types(["bytes"]) 

2769 def match(self, pat, case=True, flags=0, na=np.nan): 

2770 result = str_match(self._parent, pat, case=case, flags=flags, na=na) 

2771 return self._wrap_result(result, fill_value=na, returns_string=False) 

2772 

2773 @copy(str_replace) 

2774 @forbid_nonstring_types(["bytes"]) 

2775 def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): 

2776 result = str_replace( 

2777 self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex 

2778 ) 

2779 return self._wrap_result(result) 

2780 

2781 @copy(str_repeat) 

2782 @forbid_nonstring_types(["bytes"]) 

2783 def repeat(self, repeats): 

2784 result = str_repeat(self._parent, repeats) 

2785 return self._wrap_result(result) 

2786 

2787 @copy(str_pad) 

2788 @forbid_nonstring_types(["bytes"]) 

2789 def pad(self, width, side="left", fillchar=" "): 

2790 result = str_pad(self._parent, width, side=side, fillchar=fillchar) 

2791 return self._wrap_result(result) 

2792 

    # Shared docstring template for ``center``/``ljust``/``rjust``; %(side)s
    # and %(method)s are filled in by the @Appender decorators below.
    _shared_docs[
        "str_pad"
    ] = """
    Filling %(side)s side of strings in the Series/Index with an
    additional character. Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    filled : Series/Index of objects.
    """

2811 

2812 @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) 

2813 @forbid_nonstring_types(["bytes"]) 

2814 def center(self, width, fillchar=" "): 

2815 return self.pad(width, side="both", fillchar=fillchar) 

2816 

2817 @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) 

2818 @forbid_nonstring_types(["bytes"]) 

2819 def ljust(self, width, fillchar=" "): 

2820 return self.pad(width, side="right", fillchar=fillchar) 

2821 

2822 @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) 

2823 @forbid_nonstring_types(["bytes"]) 

2824 def rjust(self, width, fillchar=" "): 

2825 return self.pad(width, side="left", fillchar=fillchar) 

2826 

2827 @forbid_nonstring_types(["bytes"]) 

2828 def zfill(self, width): 

2829 """ 

2830 Pad strings in the Series/Index by prepending '0' characters. 

2831 

2832 Strings in the Series/Index are padded with '0' characters on the 

2833 left of the string to reach a total string length `width`. Strings 

2834 in the Series/Index with length greater or equal to `width` are 

2835 unchanged. 

2836 

2837 Parameters 

2838 ---------- 

2839 width : int 

2840 Minimum length of resulting string; strings with length less 

2841 than `width` be prepended with '0' characters. 

2842 

2843 Returns 

2844 ------- 

2845 Series/Index of objects. 

2846 

2847 See Also 

2848 -------- 

2849 Series.str.rjust : Fills the left side of strings with an arbitrary 

2850 character. 

2851 Series.str.ljust : Fills the right side of strings with an arbitrary 

2852 character. 

2853 Series.str.pad : Fills the specified sides of strings with an arbitrary 

2854 character. 

2855 Series.str.center : Fills boths sides of strings with an arbitrary 

2856 character. 

2857 

2858 Notes 

2859 ----- 

2860 Differs from :meth:`str.zfill` which has special handling 

2861 for '+'/'-' in the string. 

2862 

2863 Examples 

2864 -------- 

2865 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) 

2866 >>> s 

2867 0 -1 

2868 1 1 

2869 2 1000 

2870 3 10 

2871 4 NaN 

2872 dtype: object 

2873 

2874 Note that ``10`` and ``NaN`` are not strings, therefore they are 

2875 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a 

2876 regular character and the zero is added to the left of it 

2877 (:meth:`str.zfill` would have moved it to the left). ``1000`` 

2878 remains unchanged as it is longer than `width`. 

2879 

2880 >>> s.str.zfill(3) 

2881 0 0-1 

2882 1 001 

2883 2 1000 

2884 3 NaN 

2885 4 NaN 

2886 dtype: object 

2887 """ 

2888 result = str_pad(self._parent, width, side="left", fillchar="0") 

2889 return self._wrap_result(result) 

2890 

2891 @copy(str_slice) 

2892 def slice(self, start=None, stop=None, step=None): 

2893 result = str_slice(self._parent, start, stop, step) 

2894 return self._wrap_result(result) 

2895 

2896 @copy(str_slice_replace) 

2897 @forbid_nonstring_types(["bytes"]) 

2898 def slice_replace(self, start=None, stop=None, repl=None): 

2899 result = str_slice_replace(self._parent, start, stop, repl) 

2900 return self._wrap_result(result) 

2901 

2902 @copy(str_decode) 

2903 def decode(self, encoding, errors="strict"): 

2904 # need to allow bytes here 

2905 result = str_decode(self._parent, encoding, errors) 

2906 # TODO: Not sure how to handle this. 

2907 return self._wrap_result(result, returns_string=False) 

2908 

2909 @copy(str_encode) 

2910 @forbid_nonstring_types(["bytes"]) 

2911 def encode(self, encoding, errors="strict"): 

2912 result = str_encode(self._parent, encoding, errors) 

2913 return self._wrap_result(result, returns_string=False) 

2914 

    # Shared docstring template for ``strip``/``lstrip``/``rstrip``; %(side)s
    # and %(method)s are filled in by the @Appender decorators below.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove leading and trailing characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    dtype: object
    """

2979 

2980 @Appender( 

2981 _shared_docs["str_strip"] % dict(side="left and right sides", method="strip") 

2982 ) 

2983 @forbid_nonstring_types(["bytes"]) 

2984 def strip(self, to_strip=None): 

2985 result = str_strip(self._parent, to_strip, side="both") 

2986 return self._wrap_result(result) 

2987 

2988 @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip")) 

2989 @forbid_nonstring_types(["bytes"]) 

2990 def lstrip(self, to_strip=None): 

2991 result = str_strip(self._parent, to_strip, side="left") 

2992 return self._wrap_result(result) 

2993 

2994 @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip")) 

2995 @forbid_nonstring_types(["bytes"]) 

2996 def rstrip(self, to_strip=None): 

2997 result = str_strip(self._parent, to_strip, side="right") 

2998 return self._wrap_result(result) 

2999 

3000 @copy(str_wrap) 

3001 @forbid_nonstring_types(["bytes"]) 

3002 def wrap(self, width, **kwargs): 

3003 result = str_wrap(self._parent, width, **kwargs) 

3004 return self._wrap_result(result) 

3005 

3006 @copy(str_get_dummies) 

3007 @forbid_nonstring_types(["bytes"]) 

3008 def get_dummies(self, sep="|"): 

3009 # we need to cast to Series of strings as only that has all 

3010 # methods available for making the dummies... 

3011 data = self._orig.astype(str) if self._is_categorical else self._parent 

3012 result, name = str_get_dummies(data, sep) 

3013 return self._wrap_result( 

3014 result, 

3015 use_codes=(not self._is_categorical), 

3016 name=name, 

3017 expand=True, 

3018 returns_string=False, 

3019 ) 

3020 

3021 @copy(str_translate) 

3022 @forbid_nonstring_types(["bytes"]) 

3023 def translate(self, table): 

3024 result = str_translate(self._parent, table) 

3025 return self._wrap_result(result) 

3026 

    # Pattern-based accessor methods built from the shared _pat_wrapper
    # machinery; all of them return non-string results (counts, booleans
    # or lists), hence returns_string=False.
    count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False)
    startswith = _pat_wrapper(
        str_startswith, na=True, name="startswith", returns_string=False
    )
    endswith = _pat_wrapper(
        str_endswith, na=True, name="endswith", returns_string=False
    )
    findall = _pat_wrapper(
        str_findall, flags=True, name="findall", returns_string=False
    )

3037 

3038 @copy(str_extract) 

3039 @forbid_nonstring_types(["bytes"]) 

3040 def extract(self, pat, flags=0, expand=True): 

3041 return str_extract(self, pat, flags=flags, expand=expand) 

3042 

3043 @copy(str_extractall) 

3044 @forbid_nonstring_types(["bytes"]) 

3045 def extractall(self, pat, flags=0): 

3046 return str_extractall(self._orig, pat, flags=flags) 

3047 

    # Shared docstring template for ``find``/``rfind``; %(side)s, %(method)s
    # and %(also)s are filled in by the @Appender decorators below.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index
    where the substring is fully contained between [start:end].
    Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """

3072 

3073 @Appender( 

3074 _shared_docs["find"] 

3075 % dict( 

3076 side="lowest", 

3077 method="find", 

3078 also="rfind : Return highest indexes in each strings.", 

3079 ) 

3080 ) 

3081 @forbid_nonstring_types(["bytes"]) 

3082 def find(self, sub, start=0, end=None): 

3083 result = str_find(self._parent, sub, start=start, end=end, side="left") 

3084 return self._wrap_result(result, returns_string=False) 

3085 

3086 @Appender( 

3087 _shared_docs["find"] 

3088 % dict( 

3089 side="highest", 

3090 method="rfind", 

3091 also="find : Return lowest indexes in each strings.", 

3092 ) 

3093 ) 

3094 @forbid_nonstring_types(["bytes"]) 

3095 def rfind(self, sub, start=0, end=None): 

3096 result = str_find(self._parent, sub, start=start, end=end, side="right") 

3097 return self._wrap_result(result, returns_string=False) 

3098 

3099 @forbid_nonstring_types(["bytes"]) 

3100 def normalize(self, form): 

3101 """ 

3102 Return the Unicode normal form for the strings in the Series/Index. 

3103 For more information on the forms, see the 

3104 :func:`unicodedata.normalize`. 

3105 

3106 Parameters 

3107 ---------- 

3108 form : {'NFC', 'NFKC', 'NFD', 'NFKD'} 

3109 Unicode form. 

3110 

3111 Returns 

3112 ------- 

3113 normalized : Series/Index of objects 

3114 """ 

3115 import unicodedata 

3116 

3117 f = lambda x: unicodedata.normalize(form, x) 

3118 result = _na_map(f, self._parent, dtype=str) 

3119 return self._wrap_result(result) 

3120 

    # Shared docstring template for ``index``/``rindex``; %-formatted below
    # with ``side``/``similar``/``method``/``also`` substitutions.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each strings where the substring is
    fully contained between [start:end]. This is the same as
    ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
    when the substring is not found. Equivalent to standard ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """

3146 

3147 @Appender( 

3148 _shared_docs["index"] 

3149 % dict( 

3150 side="lowest", 

3151 similar="find", 

3152 method="index", 

3153 also="rindex : Return highest indexes in each strings.", 

3154 ) 

3155 ) 

3156 @forbid_nonstring_types(["bytes"]) 

3157 def index(self, sub, start=0, end=None): 

3158 result = str_index(self._parent, sub, start=start, end=end, side="left") 

3159 return self._wrap_result(result, returns_string=False) 

3160 

3161 @Appender( 

3162 _shared_docs["index"] 

3163 % dict( 

3164 side="highest", 

3165 similar="rfind", 

3166 method="rindex", 

3167 also="index : Return lowest indexes in each strings.", 

3168 ) 

3169 ) 

3170 @forbid_nonstring_types(["bytes"]) 

3171 def rindex(self, sub, start=0, end=None): 

3172 result = str_index(self._parent, sub, start=start, end=end, side="right") 

3173 return self._wrap_result(result, returns_string=False) 

3174 

    # Docstring for the ``len`` accessor method generated just below.
    _shared_docs[
        "len"
    ] = """
    Compute the length of each element in the Series/Index. The element may be
    a sequence (such as a string, tuple or list) or a collection
    (such as a dictionary).

    Returns
    -------
    Series or Index of int
        A Series or Index of integer values indicating the length of each
        element in the Series or Index.

    See Also
    --------
    str.len : Python built-in function returning the length of an object.
    Series.size : Returns the length of the Series.

    Examples
    --------
    Returns the length (number of characters) in a string. Returns the
    number of entries for dictionaries, lists or tuples.

    >>> s = pd.Series(['dog',
    ...                 '',
    ...                 5,
    ...                 {'foo' : 'bar'},
    ...                 [2, 3, 5, 7],
    ...                 ('one', 'two', 'three')])
    >>> s
    0                  dog
    1
    2                    5
    3       {'foo': 'bar'}
    4         [2, 3, 5, 7]
    5    (one, two, three)
    dtype: object
    >>> s.str.len()
    0    3.0
    1    0.0
    2    NaN
    3    1.0
    4    4.0
    5    3.0
    dtype: float64
    """
    # ``len`` on the right-hand side still refers to the *builtin* when this
    # class body executes; afterwards the name is shadowed by this class
    # attribute. forbidden_types=None: len is allowed on bytes too.
    len = _noarg_wrapper(
        len,
        docstring=_shared_docs["len"],
        forbidden_types=None,
        dtype="int64",
        returns_string=False,
    )

3228 

    # Shared docstring template for the case-conversion methods
    # (lower/upper/title/capitalize/swapcase/casefold); %-formatted with
    # the ``_doc_args`` entries defined below.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """

3297 

    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: Dict[str, Dict[str, str]] = {}
    _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
    _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
    _doc_args["title"] = dict(type="titlecase", method="title", version="")
    _doc_args["capitalize"] = dict(
        type="be capitalized", method="capitalize", version=""
    )
    _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
    _doc_args["casefold"] = dict(
        type="be casefolded",
        method="casefold",
        version="\n    .. versionadded:: 0.25.0\n",
    )
    # Case-conversion accessor methods, each generated by _noarg_wrapper from
    # the corresponding ``str`` method; dtype=str so results stay strings.
    lower = _noarg_wrapper(
        lambda x: x.lower(),
        name="lower",
        docstring=_shared_docs["casemethods"] % _doc_args["lower"],
        dtype=str,
    )
    upper = _noarg_wrapper(
        lambda x: x.upper(),
        name="upper",
        docstring=_shared_docs["casemethods"] % _doc_args["upper"],
        dtype=str,
    )
    title = _noarg_wrapper(
        lambda x: x.title(),
        name="title",
        docstring=_shared_docs["casemethods"] % _doc_args["title"],
        dtype=str,
    )
    capitalize = _noarg_wrapper(
        lambda x: x.capitalize(),
        name="capitalize",
        docstring=_shared_docs["casemethods"] % _doc_args["capitalize"],
        dtype=str,
    )
    swapcase = _noarg_wrapper(
        lambda x: x.swapcase(),
        name="swapcase",
        docstring=_shared_docs["casemethods"] % _doc_args["swapcase"],
        dtype=str,
    )
    casefold = _noarg_wrapper(
        lambda x: x.casefold(),
        name="casefold",
        docstring=_shared_docs["casemethods"] % _doc_args["casefold"],
        dtype=str,
    )

3348 

    # Shared docstring template for the is* predicate methods
    # (isalnum/isalpha/isdigit/...); %-formatted with the ``_doc_args``
    # entries defined below.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """

    # Substitution entries for the ``ismethods`` template; these predicates
    # take no version note, so only ``type``/``method`` are supplied.
    _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
    _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
    _doc_args["isdigit"] = dict(type="digits", method="isdigit")
    _doc_args["isspace"] = dict(type="whitespace", method="isspace")
    _doc_args["islower"] = dict(type="lowercase", method="islower")
    _doc_args["isupper"] = dict(type="uppercase", method="isupper")
    _doc_args["istitle"] = dict(type="titlecase", method="istitle")
    _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
    _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
    # force _noarg_wrapper return type with dtype=bool (GH 29624)
    # Predicate accessor methods generated via _noarg_wrapper; each returns
    # booleans, hence returns_string=False on all of them.
    isalnum = _noarg_wrapper(
        lambda x: x.isalnum(),
        name="isalnum",
        docstring=_shared_docs["ismethods"] % _doc_args["isalnum"],
        returns_string=False,
        dtype=bool,
    )
    isalpha = _noarg_wrapper(
        lambda x: x.isalpha(),
        name="isalpha",
        docstring=_shared_docs["ismethods"] % _doc_args["isalpha"],
        returns_string=False,
        dtype=bool,
    )
    isdigit = _noarg_wrapper(
        lambda x: x.isdigit(),
        name="isdigit",
        docstring=_shared_docs["ismethods"] % _doc_args["isdigit"],
        returns_string=False,
        dtype=bool,
    )
    isspace = _noarg_wrapper(
        lambda x: x.isspace(),
        name="isspace",
        docstring=_shared_docs["ismethods"] % _doc_args["isspace"],
        returns_string=False,
        dtype=bool,
    )
    islower = _noarg_wrapper(
        lambda x: x.islower(),
        name="islower",
        docstring=_shared_docs["ismethods"] % _doc_args["islower"],
        returns_string=False,
        dtype=bool,
    )
    isupper = _noarg_wrapper(
        lambda x: x.isupper(),
        name="isupper",
        docstring=_shared_docs["ismethods"] % _doc_args["isupper"],
        returns_string=False,
        dtype=bool,
    )
    istitle = _noarg_wrapper(
        lambda x: x.istitle(),
        name="istitle",
        docstring=_shared_docs["ismethods"] % _doc_args["istitle"],
        returns_string=False,
        dtype=bool,
    )
    isnumeric = _noarg_wrapper(
        lambda x: x.isnumeric(),
        name="isnumeric",
        docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"],
        returns_string=False,
        dtype=bool,
    )
    isdecimal = _noarg_wrapper(
        lambda x: x.isdecimal(),
        name="isdecimal",
        docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"],
        returns_string=False,
        dtype=bool,
    )

3564 

    @classmethod
    def _make_accessor(cls, data):
        # Construct the accessor instance for ``data`` after validating it.
        # ``_validate`` is defined elsewhere in this class — presumably it
        # raises for unsupported dtypes; confirm against its implementation.
        cls._validate(data)
        return cls(data)