Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/strings.py : 29%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import codecs
2from functools import wraps
3import re
4import textwrap
5from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
6import warnings
8import numpy as np
10import pandas._libs.lib as lib
11import pandas._libs.missing as libmissing
12import pandas._libs.ops as libops
13from pandas._typing import ArrayLike, Dtype
14from pandas.util._decorators import Appender
16from pandas.core.dtypes.common import (
17 ensure_object,
18 is_bool_dtype,
19 is_categorical_dtype,
20 is_extension_array_dtype,
21 is_integer,
22 is_integer_dtype,
23 is_list_like,
24 is_object_dtype,
25 is_re,
26 is_scalar,
27 is_string_dtype,
28)
29from pandas.core.dtypes.generic import (
30 ABCDataFrame,
31 ABCIndexClass,
32 ABCMultiIndex,
33 ABCSeries,
34)
35from pandas.core.dtypes.missing import isna
37from pandas.core.algorithms import take_1d
38from pandas.core.base import NoNewAttributesMixin
39import pandas.core.common as com
40from pandas.core.construction import extract_array
42if TYPE_CHECKING:
43 from pandas.arrays import StringArray
# Encodings whose codecs are specially optimized in CPython (per the name);
# presumably consulted by the encode/decode wrappers to pick the fast
# ``str.encode``/``bytes.decode`` path over ``codecs`` — consumers are not
# visible in this chunk.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has optimized paths for UTF-16 and UTF-32.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")

# Shared docstring fragments, filled in elsewhere via Appender/substitution.
_shared_docs: Dict[str, str] = dict()
def cat_core(list_of_columns: List, sep: str):
    """
    Concatenation helper for :meth:`str.cat`.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        Arrays to concatenate element-wise with ``sep``;
        these arrays may not contain NaNs!
    sep : string
        Separator placed between the columns.

    Returns
    -------
    nd.array
        The element-wise concatenation of list_of_columns with sep.
    """
    if sep == "":
        # fast path: nothing to interleave, just reduce with string addition
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # build [col0, sep, col1, sep, ..., colN] and reduce with string addition
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)
def cat_safe(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If a column contains non-string (and non-missing) values.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and return with better message
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # BUGFIX: previously this fell through to `return result` with
        # `result` unbound (NameError) when no offending column was found
        # (e.g. a non-string `sep`); surface the original TypeError instead.
        raise
    return result
def _na_map(f, arr, na_result=None, dtype=object):
    # Dispatch to the StringArray mapper or the object-dtype mapper,
    # defaulting the missing-value fill to the backing store's NA sentinel.
    if is_extension_array_dtype(arr.dtype):
        # just StringDtype
        return _map_stringarray(
            f,
            extract_array(arr),
            na_value=libmissing.NA if na_result is None else na_result,
            dtype=dtype,
        )
    return _map_object(
        f,
        arr,
        na_mask=True,
        na_value=np.nan if na_result is None else na_result,
        dtype=dtype,
    )
def _map_stringarray(
    func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype
) -> ArrayLike:
    """
    Map a callable over valid elements of a StringArrray.

    Parameters
    ----------
    func : Callable[[str], Any]
        Apply to each valid element.
    arr : StringArray
    na_value : Any
        The value to use for missing values. By default, this is
        the original value (NA).
    dtype : Dtype
        The result dtype to use. Specifying this avoids an intermediate
        object-dtype allocation.

    Returns
    -------
    ArrayLike
        An ExtensionArray for integer or string dtypes, otherwise
        an ndarray.

    """
    from pandas.arrays import IntegerArray, StringArray, BooleanArray

    # mask marks the positions that hold missing values; func is never
    # applied to those positions.
    mask = isna(arr)

    assert isinstance(arr, StringArray)
    arr = np.asarray(arr)

    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        # integer/boolean results are returned as masked extension arrays
        constructor: Union[Type[IntegerArray], Type[BooleanArray]]
        if is_integer_dtype(dtype):
            constructor = IntegerArray
        else:
            constructor = BooleanArray

        na_value_is_na = isna(na_value)
        if na_value_is_na:
            # 1 is an arbitrary but dtype-valid filler written into masked
            # slots; the mask (not the value) marks them as NA in the result.
            na_value = 1
        result = lib.map_infer_mask(
            arr,
            func,
            mask.view("uint8"),
            convert=False,
            na_value=na_value,
            dtype=np.dtype(dtype),
        )

        if not na_value_is_na:
            # caller supplied a concrete fill value, so nothing in the
            # output is missing anymore
            mask[:] = False

        return constructor(result, mask)

    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        result = lib.map_infer_mask(
            arr, func, mask.view("uint8"), convert=False, na_value=na_value
        )
        return StringArray(result)
    else:
        # This is when the result type is object. We reach this when
        # -> We know the result type is truly object (e.g. .encode returns bytes
        # or .findall returns a list).
        # -> We don't know the result type. E.g. `.get` can return anything.
        return lib.map_infer_mask(arr, func, mask.view("uint8"))
def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    """
    Map ``f`` over an object-dtype array-like.

    When ``na_mask`` is True, missing values are skipped and filled with
    ``na_value``; element-level TypeErrors/AttributeErrors raised by ``f``
    are converted to ``na_value`` as well (see below).
    """
    if not len(arr):
        # empty input: nothing to map, return an empty array of `dtype`
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        # skip the C-level type inference if everything is missing
        convert = not np.all(mask)
        try:
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            # p_err matches CPython's "takes ... positional arguments" /
            # "missing N required positional arguments" messages, using
            # conditional groups (?(id)...) keyed on which alternative hit.
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                # FIXME: this should be totally avoidable
                raise e

            # Any other TypeError/AttributeError is treated as a per-element
            # failure: retry with a wrapper that maps failures to na_value.
            def g(x):
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map_object(g, arr, dtype=dtype)
        if na_value is not np.nan:
            # overwrite the masked (missing) slots with the caller's fill
            np.putmask(result, mask, na_value)
            if result.dtype == object:
                result = lib.maybe_convert_objects(result)

        return result
    else:
        return lib.map_infer(arr, f)
def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.
        Missing values stay missing (the result is float64 in that case).

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`; e.g. ``'$'``
    has a special meaning in regex and must be escaped (``'\\$'``) to match
    the literal dollar sign.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64
    """
    # compile once; applied per element below
    compiled = re.compile(pat, flags=flags)

    def count_matches(x):
        return len(compiled.findall(x))

    return _na_map(count_matches, arr, dtype="int64")
def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.
        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index. If the input contains missing values the
        result dtype is object, otherwise bool.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object
    """
    if regex:
        if not case:
            flags |= re.IGNORECASE

        compiled = re.compile(pat, flags=flags)

        if compiled.groups > 0:
            warnings.warn(
                "This pattern has match groups. To actually get the "
                "groups, use str.extract.",
                UserWarning,
                stacklevel=3,
            )

        return _na_map(lambda x: bool(compiled.search(x)), arr, na, dtype=bool)

    if case:
        # literal, case-sensitive: plain substring test
        return _na_map(lambda x: pat in x, arr, na, dtype=bool)

    # literal, case-insensitive: compare upper-cased needle and haystack
    upper_pat = pat.upper()
    uppered = _na_map(lambda x: x.upper(), arr)
    return _na_map(lambda x: upper_pat in x, uppered, na, dtype=bool)
def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    return _na_map(lambda x: x.startswith(pat), arr, na, dtype=bool)
def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    return _na_map(lambda x: x.endswith(pat), arr, na, dtype=bool)
def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.
    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.
    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        Determines if replace is case sensitive:

        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex.

    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a
        compiled regex.
    regex : bool, default True
        Determines if assumes the passed-in pattern is a regular expression:

        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object
    """
    # Check whether repl is valid (GH 13438, GH 15055)
    if not (isinstance(repl, str) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)

    if not regex:
        # literal replacement: neither compiled patterns nor callables apply
        if is_compiled_re:
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when regex=False")
        return _na_map(lambda x: x.replace(pat, repl, n), arr, dtype=str)

    if is_compiled_re:
        # flags/case must already be baked into the compiled pattern
        if (case is not None) or (flags != 0):
            raise ValueError(
                "case and flags cannot be set when pat is a compiled regex"
            )
    else:
        # not a compiled regex: default to case-sensitive matching
        if case is None:
            case = True

        # add case flag, if provided
        if case is False:
            flags |= re.IGNORECASE

    if is_compiled_re or len(pat) > 1 or flags or callable(repl):
        # genuine regex substitution via re.sub
        count = n if n >= 0 else 0
        compiled = re.compile(pat, flags=flags)
        return _na_map(
            lambda x: compiled.sub(repl=repl, string=x, count=count),
            arr,
            dtype=str,
        )

    # single-character literal pattern, no flags: plain str.replace suffices
    return _na_map(lambda x: x.replace(pat, repl, n), arr, dtype=str)
def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """

    def _mul(x, r):
        # Use the unbound dunders so both bytes and str elements repeat
        # correctly (try bytes first, fall back to str).
        try:
            return bytes.__mul__(x, r)
        except TypeError:
            return str.__mul__(x, r)

    if is_scalar(repeats):
        return _na_map(lambda x: _mul(x, repeats), arr, dtype=str)

    def rep(x, r):
        # element-wise repeat; pass NA through untouched
        if x is libmissing.NA:
            return x
        return _mul(x, r)

    repeats = np.asarray(repeats, dtype=object)
    return libops.vec_binop(com.values_from_object(arr), repeats, rep)
def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    compiled = re.compile(pat, flags=flags)

    # anchored at the start of each element, per re.match semantics
    return _na_map(lambda x: bool(compiled.match(x)), arr, na, dtype=bool)
829def _get_single_group_name(rx):
830 try:
831 return list(rx.groupindex.keys()).pop()
832 except IndexError:
833 return None
836def _groups_or_na_fun(regex):
837 """Used in both extract_noexpand and extract_frame"""
838 if regex.groups == 0:
839 raise ValueError("pattern contains no capture groups")
840 empty_row = [np.nan] * regex.groups
842 def f(x):
843 if not isinstance(x, str):
844 return empty_row
845 m = regex.search(x)
846 if m:
847 return [np.nan if item is None else item for item in m.groups()]
848 else:
849 return empty_row
851 return f
854def _result_dtype(arr):
855 # workaround #27953
856 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
857 # when the list of values is empty.
858 if arr.dtype.name == "string":
859 return "string"
860 else:
861 return object
def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.

    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        # single group -> 1-D result, named after the group if it has a name
        values = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        return values, _get_single_group_name(regex)

    if isinstance(arr, ABCIndexClass):
        raise ValueError("only one regex group is supported with Index")
    # map group numbers to names where available; fall back to 0-based ints
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    if arr.empty:
        result = DataFrame(columns=columns, dtype=object)
    else:
        result = DataFrame(
            [groups_or_na(val) for val in arr],
            columns=columns,
            index=arr.index,
            dtype=_result_dtype(arr),
        )
    return result, None
def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.

    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)
    # map group numbers to names where available; fall back to 0-based ints
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)
    # plain sequences (no .index) get a default RangeIndex
    result_index = getattr(arr, "index", None)
    return DataFrame(
        [groups_or_na(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=_result_dtype(arr),
    )
def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    # expand=False: may come back as Series/Index; wrap via the accessor
    result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
    return arr._wrap_result(result, name=name, expand=expand)
def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1
    """

    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    # map group numbers to names where available; fall back to 0-based ints
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    # whether the subject index is already a MultiIndex (its keys are tuples)
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # non-string subjects contribute no rows at all
        if isinstance(subject, str):

            if not is_mi:
                # normalize to a tuple so the (key..., match_i) concat works
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                # findall returns plain strings for single-group patterns
                if isinstance(match_tuple, str):
                    match_tuple = (match_tuple,)
                # unmatched optional groups come back as "" from findall
                na_tuple = [np.NaN if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                result_key = tuple(subject_key + (match_i,))
                index_list.append(result_key)

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result
def str_get_dummies(arr, sep="|"):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    # Pad each value with sep so that "sep + tag + sep" substring tests
    # below become exact-token membership tests.
    filled = arr.fillna("")
    try:
        padded = sep + filled + sep
    except TypeError:
        padded = sep + filled.astype(str) + sep

    tag_set = set()
    for parts in padded.str.split(sep):
        tag_set.update(parts)
    # drop the empty token produced by the padding, keep sorted column order
    tags = sorted(tag_set - {""})

    values = padded.to_numpy()
    dummies = np.empty((len(padded), len(tags)), dtype=np.int64)
    for col, tag in enumerate(tags):
        token = sep + tag + sep
        dummies[:, col] = lib.map_infer(values, lambda x: token in x)
    return dummies, tags
def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog']])
    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    dtype: object
    """
    # sep.join applied element-wise; non-string list items make the join
    # raise TypeError, which _na_map converts to NaN for that element
    return _na_map(sep.join, arr, dtype=str)
def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the
    Series/Index, equivalent to applying :func:`re.findall` element-wise.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from the ``re`` module, e.g. ``re.IGNORECASE`` (default is 0,
        which means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object
    """
    # Compile once so every element reuses the same pattern object.
    compiled = re.compile(pat, flags=flags)
    return _na_map(compiled.findall, arr)
def str_find(arr, sub, start=0, end=None, side="left"):
    """
    Return indexes in each string in the Series/Index where the substring
    is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """
    if not isinstance(sub, str):
        raise TypeError(f"expected a string object, not {type(sub).__name__}")

    method_by_side = {"left": "find", "right": "rfind"}
    if side not in method_by_side:  # pragma: no cover
        raise ValueError("Invalid side")
    searcher = method_by_side[side]

    if end is None:

        def locate(x):
            return getattr(x, searcher)(sub, start)

    else:

        def locate(x):
            return getattr(x, searcher)(sub, start, end)

    return _na_map(locate, arr, dtype="int64")
def str_index(arr, sub, start=0, end=None, side="left"):
    # Like str_find, but uses str.index/str.rindex, which raise ValueError
    # when the substring is not found.
    if not isinstance(sub, str):
        raise TypeError(f"expected a string object, not {type(sub).__name__}")

    method_by_side = {"left": "index", "right": "rindex"}
    if side not in method_by_side:  # pragma: no cover
        raise ValueError("Invalid side")
    searcher = method_by_side[side]

    if end is None:

        def locate(x):
            return getattr(x, searcher)(sub, start)

    else:

        def locate(x):
            return getattr(x, searcher)(sub, start, end)

    return _na_map(locate, arr, dtype="int64")
def str_pad(arr, width, side="left", fillchar=" "):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be
        filled with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills boths sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object
    """
    if not isinstance(fillchar, str):
        raise TypeError(f"fillchar must be a character, not {type(fillchar).__name__}")
    if len(fillchar) != 1:
        raise TypeError("fillchar must be a character, not str")
    if not is_integer(width):
        raise TypeError(f"width must be of integer type, not {type(width).__name__}")

    # Map each side onto the str method that pads on the opposite edge.
    method_by_side = {"left": "rjust", "right": "ljust", "both": "center"}
    if side not in method_by_side:  # pragma: no cover
        raise ValueError("Invalid side")
    padder = method_by_side[side]

    return _na_map(lambda x: getattr(x, padder)(width, fillchar), arr, dtype=str)
def str_split(arr, pat=None, n=None):
    # A missing pattern (whitespace split) and a single-character pattern
    # both use the fast str.split path; longer patterns go through re.
    if pat is None or len(pat) == 1:
        maxsplit = -1 if n is None or n == 0 else n
        splitter = lambda x: x.split(pat, maxsplit)
    else:
        maxsplit = 0 if n is None or n == -1 else n
        regex = re.compile(pat)
        splitter = lambda x: regex.split(x, maxsplit=maxsplit)
    return _na_map(splitter, arr)
def str_rsplit(arr, pat=None, n=None):
    # str.rsplit has no regex path; normalize "no limit" to -1.
    maxsplit = -1 if n is None or n == 0 else n
    return _na_map(lambda x: x.rsplit(pat, maxsplit), arr)
def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to ``s.str[0:5:3]``.
    """
    # Build the slice object once and apply it to every element.
    slicer = slice(start, stop, step)
    return _na_map(lambda x: x[slicer], arr, dtype=str)
def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    replacement = "" if repl is None else repl

    def splice(x):
        # When the slice selects nothing, resume copying at `start` so the
        # replacement is inserted instead of swallowing trailing text.
        resume_at = start if x[start:stop] == "" else stop
        pieces = []
        if start is not None:
            pieces.append(x[:start])
        pieces.append(replacement)
        if stop is not None:
            pieces.append(x[resume_at:])
        return "".join(pieces)

    return _na_map(splice, arr, dtype=str)
def str_strip(arr, to_strip=None, side="both"):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
        Characters to remove; None strips whitespace.
    side : {'left', 'right', 'both'}, default 'both'
        Which end(s) of each string to strip.

    Returns
    -------
    Series or Index
    """
    method_by_side = {"both": "strip", "left": "lstrip", "right": "rstrip"}
    if side not in method_by_side:  # pragma: no cover
        raise ValueError("Invalid side")
    stripper = method_by_side[side]
    return _na_map(lambda x: getattr(x, stripper)(to_strip), arr, dtype=str)
def str_wrap(arr, width, **kwargs):
    r"""
    Wrap long strings in the Series/Index to be formatted in paragraphs
    with length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line width.
    expand_tabs : bool, optional
        If True, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If True, each whitespace character (as defined by string.whitespace)
        remaining after tab expansion will be replaced by a single space
        (default: True).
    drop_whitespace : bool, optional
        If True, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If True, then words longer than width will be broken in order to
        ensure that no lines are longer than width (default: True).
    break_on_hyphens : bool, optional
        If True, wrapping will occur preferably on whitespace and right after
        hyphens in compound words (default: True).

    Returns
    -------
    Series or Index

    Notes
    -----
    To match R's stringr ``str_wrap``, use ``expand_tabs=False``,
    ``replace_whitespace=True``, ``drop_whitespace=True``,
    ``break_long_words=False``, ``break_on_hyphens=False``.

    Examples
    --------
    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\nwrapped
    1    another line\nto be\nwrapped
    dtype: object
    """
    kwargs["width"] = width
    # One TextWrapper instance is reused for every element.
    wrapper = textwrap.TextWrapper(**kwargs)

    def wrap_one(s):
        return "\n".join(wrapper.wrap(s))

    return _na_map(wrap_one, arr, dtype=str)
def str_translate(arr, table):
    """
    Map all characters in the string through the given mapping table,
    equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings,
        or None. Unmapped characters are left untouched; characters mapped to
        None are deleted. :meth:`str.maketrans` is a helper function for
        making translation tables.

    Returns
    -------
    Series or Index
    """

    def translate_one(x):
        return x.translate(table)

    return _na_map(translate_one, arr, dtype=str)
def str_get(arr, i):
    """
    Extract element from each component at specified position.

    Extract element from lists, tuples, or strings in each element in the
    Series/Index.

    Parameters
    ----------
    i : int
        Position of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String", (1, 2, 3), ["a", "b", "c"], 123])
    >>> s.str.get(1)
    0      t
    1      2
    2      b
    3    NaN
    dtype: object
    """

    def pick(x):
        # dicts use key lookup; sequences use bounds-checked indexing.
        if isinstance(x, dict):
            return x.get(i)
        if -len(x) <= i < len(x):
            return x[i]
        return np.nan

    return _na_map(pick, arr)
def str_decode(arr, encoding, errors="strict"):
    """
    Decode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`bytes.decode` in python3.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    Series or Index
    """
    if encoding in _cpython_optimized_decoders:
        # bytes.decode is CPython-optimized for these codecs.
        def decode_one(x):
            return x.decode(encoding, errors)

    else:
        decoder = codecs.getdecoder(encoding)

        def decode_one(x):
            return decoder(x, errors)[0]

    return _na_map(decode_one, arr)
def str_encode(arr, encoding, errors="strict"):
    """
    Encode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.encode`.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    encoded : Series/Index of objects
    """
    if encoding in _cpython_optimized_encoders:
        # str.encode is CPython-optimized for these codecs.
        def encode_one(x):
            return x.encode(encoding, errors)

    else:
        encoder = codecs.getencoder(encoding)

        def encode_one(x):
            return encoder(x, errors)[0]

    return _na_map(encode_one, arr)
def forbid_nonstring_types(forbidden, name=None):
    """
    Decorator to forbid specific types for a method of StringMethods.

    :meth:`StringMethods.__init__` allows the *union* of types its different
    methods allow (after skipping NaNs), namely:
    ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. The default
    string types ['string', 'empty'] are allowed for all methods; each
    method then forbids the additional types it is not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. By default the name
        of the wrapped method is used; wrappers like _pat_wrapper and
        _noarg_wrapper must specify it explicitly.

    Returns
    -------
    func : wrapper
        The method with an added check that the inferred dtype of the data
        is not among the forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    forbidden = set() if forbidden is None else set(forbidden)
    allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - forbidden

    def _forbid_nonstring_types(func):
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if self._inferred_dtype not in allowed_types:
                raise TypeError(
                    f"Cannot use .str.{func_name} with values of "
                    f"inferred dtype '{self._inferred_dtype}'."
                )
            return func(self, *args, **kwargs)

        wrapper.__name__ = func_name
        return wrapper

    return _forbid_nonstring_types
def _noarg_wrapper(
    f,
    name=None,
    docstring=None,
    forbidden_types=("bytes",),
    returns_string=True,
    **kargs,
):
    """
    Build a StringMethods method for an element-wise function taking no
    extra arguments.

    Parameters
    ----------
    f : callable
        Function applied to each non-NA element via _na_map.
    name : str, optional
        Name for the generated method; defaults to ``f.__name__``.
    docstring : str
        Docstring for the generated method. Required.
    forbidden_types : sequence of str, default ("bytes",)
        Inferred dtypes rejected by @forbid_nonstring_types.
        (Tuple default: a mutable list default would be shared state.)
    returns_string : bool, default True
        Passed through to _wrap_result.
    **kargs
        Extra keyword arguments forwarded to _na_map.

    Returns
    -------
    callable
        Method suitable for attachment to StringMethods.

    Raises
    ------
    ValueError
        If `docstring` is None.
    """
    # Validate up front: a generated method without documentation is a bug,
    # so fail before building the wrapper at all.
    if docstring is None:
        raise ValueError("Provide docstring")

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper(self):
        result = _na_map(f, self._parent, **kargs)
        return self._wrap_result(result, returns_string=returns_string)

    wrapper.__name__ = f.__name__ if name is None else name
    wrapper.__doc__ = docstring

    return wrapper
def _pat_wrapper(
    f,
    flags=False,
    na=False,
    name=None,
    forbidden_types=("bytes",),
    returns_string=True,
    **kwargs,
):
    """
    Build a StringMethods method for a pattern-based function.

    Parameters
    ----------
    f : callable
        Function of ``(values, pat, ...)`` implementing the operation.
    flags : bool, default False
        If True, the generated method accepts a ``flags`` keyword.
    na : bool, default False
        If True, the generated method accepts an ``na`` keyword instead
        (takes precedence over `flags`).
    name : str, optional
        Name for the generated method; defaults to ``f.__name__``.
    forbidden_types : sequence of str, default ("bytes",)
        Inferred dtypes rejected by @forbid_nonstring_types.
        (Tuple default: a mutable list default would be shared state.)
    returns_string : bool, default True
        Passed through to _wrap_result.
    **kwargs
        Accepted for signature compatibility; not used here.

    Returns
    -------
    callable
        Method suitable for attachment to StringMethods; inherits
        ``f.__doc__`` when present.
    """

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper1(self, pat):
        result = f(self._parent, pat)
        return self._wrap_result(result, returns_string=returns_string)

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper2(self, pat, flags=0, **kwargs):
        result = f(self._parent, pat, flags=flags, **kwargs)
        return self._wrap_result(result, returns_string=returns_string)

    @forbid_nonstring_types(forbidden_types, name=name)
    def wrapper3(self, pat, na=np.nan):
        result = f(self._parent, pat, na=na)
        return self._wrap_result(result, returns_string=returns_string)

    # na takes precedence over flags; otherwise the plain wrapper is used.
    wrapper = wrapper3 if na else wrapper2 if flags else wrapper1

    wrapper.__name__ = f.__name__ if name is None else name
    if f.__doc__:
        wrapper.__doc__ = f.__doc__

    return wrapper
def copy(source):
    "Copy a docstring from another source function (if present)"

    def _apply(target):
        # Only overwrite when the source actually has a docstring.
        if source.__doc__:
            target.__doc__ = source.__doc__
        return target

    return _apply
2028class StringMethods(NoNewAttributesMixin):
2029 """
2030 Vectorized string functions for Series and Index. NAs stay NA unless
2031 handled otherwise by a particular method. Patterned after Python's string
2032 methods, with some inspiration from R's stringr package.
2034 Examples
2035 --------
2036 >>> s.str.split('_')
2037 >>> s.str.replace('_', '')
2038 """
2040 def __init__(self, data):
2041 self._inferred_dtype = self._validate(data)
2042 self._is_categorical = is_categorical_dtype(data)
2043 self._is_string = data.dtype.name == "string"
2045 # .values.categories works for both Series/Index
2046 self._parent = data.values.categories if self._is_categorical else data
2047 # save orig to blow up categoricals to the right type
2048 self._orig = data
2049 self._freeze()
2051 @staticmethod
2052 def _validate(data):
2053 """
2054 Auxiliary function for StringMethods, infers and checks dtype of data.
2056 This is a "first line of defence" at the creation of the StringMethods-
2057 object (see _make_accessor), and just checks that the dtype is in the
2058 *union* of the allowed types over all string methods below; this
2059 restriction is then refined on a per-method basis using the decorator
2060 @forbid_nonstring_types (more info in the corresponding docstring).
2062 This really should exclude all series/index with any non-string values,
2063 but that isn't practical for performance reasons until we have a str
2064 dtype (GH 9343 / 13877)
2066 Parameters
2067 ----------
2068 data : The content of the Series
2070 Returns
2071 -------
2072 dtype : inferred dtype of data
2073 """
2074 from pandas import StringDtype
2076 if isinstance(data, ABCMultiIndex):
2077 raise AttributeError(
2078 "Can only use .str accessor with Index, not MultiIndex"
2079 )
2081 # see _libs/lib.pyx for list of inferred types
2082 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
2084 values = getattr(data, "values", data) # Series / Index
2085 values = getattr(values, "categories", values) # categorical / normal
2087 # explicitly allow StringDtype
2088 if isinstance(values.dtype, StringDtype):
2089 return "string"
2091 try:
2092 inferred_dtype = lib.infer_dtype(values, skipna=True)
2093 except ValueError:
2094 # GH#27571 mostly occurs with ExtensionArray
2095 inferred_dtype = None
2097 if inferred_dtype not in allowed_types:
2098 raise AttributeError("Can only use .str accessor with string values!")
2099 return inferred_dtype
2101 def __getitem__(self, key):
2102 if isinstance(key, slice):
2103 return self.slice(start=key.start, stop=key.stop, step=key.step)
2104 else:
2105 return self.get(key)
2107 def __iter__(self):
2108 warnings.warn(
2109 "Columnar iteration over characters will be deprecated in future releases.",
2110 FutureWarning,
2111 stacklevel=2,
2112 )
2113 i = 0
2114 g = self.get(i)
2115 while g.notna().any():
2116 yield g
2117 i += 1
2118 g = self.get(i)
    def _wrap_result(
        self,
        result,
        use_codes=True,
        name=None,
        expand=None,
        fill_value=np.nan,
        returns_string=True,
    ):
        """
        Wrap the raw result of a string operation back into a Series, Index,
        DataFrame or MultiIndex matching the original object.

        Parameters
        ----------
        result : array-like or other
            Raw output of the string operation; returned unchanged if it has
            no ndim/dtype (e.g. a scalar from ``.str.cat``).
        use_codes : bool, default True
            If the original was categorical, re-expand the per-category
            result to the full length via the category codes. Methods that
            already operated on the full values pass False.
        name : optional
            Name (or column labels, when expanding) for the wrapped result.
        expand : bool or None, default None
            Whether to expand into multiple columns/levels; inferred from
            ``result.ndim`` when None.
        fill_value : default np.nan
            Fill value used when re-expanding categorical codes (-1 codes).
        returns_string : bool, default True
            Whether a string-dtype input should produce a string-dtype
            result (False for numeric/boolean outputs).
        """

        from pandas import Index, Series, MultiIndex

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        # But for some operations, we have to do the stuff on the full values,
        # so make it possible to skip this step as the method already did this
        # before the transformation...
        if use_codes and self._is_categorical:
            # if self._orig is a CategoricalIndex, there is no .cat-accessor
            result = take_1d(
                result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value
            )

        # scalar results (e.g. from str.cat with others=None) pass through
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        if self._is_string and returns_string:
            dtype = "string"
        else:
            dtype = None

        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1

        elif expand is True and not isinstance(self._orig, ABCIndexClass):
            # required when expand=True is explicitly specified
            # not needed when inferred

            def cons_row(x):
                # wrap scalars so every row is list-like before expanding
                if is_list_like(x):
                    return x
                else:
                    return [x]

            result = [cons_row(x) for x in result]
            if result:
                # propagate nan values to match longest sequence (GH 18450)
                max_len = max(len(x) for x in result)
                result = [
                    x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
                ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndexClass):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            return result
2220 def _get_series_list(self, others):
2221 """
2222 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
2223 into a list of Series (elements without an index must match the length
2224 of the calling Series/Index).
2226 Parameters
2227 ----------
2228 others : Series, DataFrame, np.ndarray, list-like or list-like of
2229 Objects that are either Series, Index or np.ndarray (1-dim).
2231 Returns
2232 -------
2233 list of Series
2234 Others transformed into list of Series.
2235 """
2236 from pandas import Series, DataFrame
2238 # self._orig is either Series or Index
2239 idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index
2241 # Generally speaking, all objects without an index inherit the index
2242 # `idx` of the calling Series/Index - i.e. must have matching length.
2243 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
2244 if isinstance(others, ABCSeries):
2245 return [others]
2246 elif isinstance(others, ABCIndexClass):
2247 return [Series(others.values, index=others)]
2248 elif isinstance(others, ABCDataFrame):
2249 return [others[x] for x in others]
2250 elif isinstance(others, np.ndarray) and others.ndim == 2:
2251 others = DataFrame(others, index=idx)
2252 return [others[x] for x in others]
2253 elif is_list_like(others, allow_sets=False):
2254 others = list(others) # ensure iterators do not get read twice etc
2256 # in case of list-like `others`, all elements must be
2257 # either Series/Index/np.ndarray (1-dim)...
2258 if all(
2259 isinstance(x, (ABCSeries, ABCIndexClass))
2260 or (isinstance(x, np.ndarray) and x.ndim == 1)
2261 for x in others
2262 ):
2263 los = []
2264 while others: # iterate through list and append each element
2265 los = los + self._get_series_list(others.pop(0))
2266 return los
2267 # ... or just strings
2268 elif all(not is_list_like(x) for x in others):
2269 return [Series(others, index=idx)]
2270 raise TypeError(
2271 "others must be Series, Index, DataFrame, np.ndarrary "
2272 "or list-like (either containing only strings or "
2273 "containing only objects of type Series/Index/"
2274 "np.ndarray[1-dim])"
2275 )
2277 @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(self, others=None, sep=None, na_rep=None, join="left"):
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

            .. versionadded:: 0.23.0
            .. versionchanged:: 1.0.0
                Changed default of `join` from None to `'left'`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        from pandas import Index, Series, concat

        # A plain string for `others` is almost always a mistaken `sep`.
        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndexClass):
            # Work on a Series view of the Index so alignment below applies.
            data = Series(self._orig, index=self._orig)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # Scalar path: collapse the whole column into one Python string.
            data = ensure_object(data)
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # No replacement given -> drop missing values entirely.
                data = data[~na_mask]
            elif na_rep is not None and na_mask.any():
                data = np.where(na_mask, na_rep, data)
            return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            )

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            # (concat with the widest join first, then align to `join`).
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        # Column-wise NA masks and their row-wise union drive the three
        # result-construction paths below.
        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        if isinstance(self._orig, ABCIndexClass):
            # add dtype for case that result is all-NA
            result = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            result = Series(result, dtype=dtype, index=data.index, name=self._orig.name)
        return result
2492 _shared_docs[
2493 "str_split"
2494 ] = r"""
2495 Split strings around given separator/delimiter.
2497 Splits the string in the Series/Index from the %(side)s,
2498 at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
2500 Parameters
2501 ----------
2502 pat : str, optional
2503 String or regular expression to split on.
2504 If not specified, split on whitespace.
2505 n : int, default -1 (all)
2506 Limit number of splits in output.
2507 ``None``, 0 and -1 will be interpreted as return all splits.
2508 expand : bool, default False
2509 Expand the splitted strings into separate columns.
2511 * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
2512 * If ``False``, return Series/Index, containing lists of strings.
2514 Returns
2515 -------
2516 Series, Index, DataFrame or MultiIndex
2517 Type matches caller unless ``expand=True`` (see Notes).
2519 See Also
2520 --------
2521 Series.str.split : Split strings around given separator/delimiter.
2522 Series.str.rsplit : Splits string around given separator/delimiter,
2523 starting from the right.
2524 Series.str.join : Join lists contained as elements in the Series/Index
2525 with passed delimiter.
2526 str.split : Standard library version for split.
2527 str.rsplit : Standard library version for rsplit.
2529 Notes
2530 -----
2531 The handling of the `n` keyword depends on the number of found splits:
2533 - If found splits > `n`, make first `n` splits only
2534 - If found splits <= `n`, make all splits
2535 - If for a certain row the number of found splits < `n`,
2536 append `None` for padding up to `n` if ``expand=True``
2538 If using ``expand=True``, Series and Index callers return DataFrame and
2539 MultiIndex objects, respectively.
2541 Examples
2542 --------
2543 >>> s = pd.Series(["this is a regular sentence",
2544 ... "https://docs.python.org/3/tutorial/index.html",
2545 ... np.nan])
2546 0 this is a regular sentence
2547 1 https://docs.python.org/3/tutorial/index.html
2548 2 NaN
2549 dtype: object
2551 In the default setting, the string is split by whitespace.
2553 >>> s.str.split()
2554 0 [this, is, a, regular, sentence]
2555 1 [https://docs.python.org/3/tutorial/index.html]
2556 2 NaN
2557 dtype: object
2559 Without the `n` parameter, the outputs of `rsplit` and `split`
2560 are identical.
2562 >>> s.str.rsplit()
2563 0 [this, is, a, regular, sentence]
2564 1 [https://docs.python.org/3/tutorial/index.html]
2565 2 NaN
2566 dtype: object
2568 The `n` parameter can be used to limit the number of splits on the
2569 delimiter. The outputs of `split` and `rsplit` are different.
2571 >>> s.str.split(n=2)
2572 0 [this, is, a regular sentence]
2573 1 [https://docs.python.org/3/tutorial/index.html]
2574 2 NaN
2575 dtype: object
2577 >>> s.str.rsplit(n=2)
2578 0 [this is a, regular, sentence]
2579 1 [https://docs.python.org/3/tutorial/index.html]
2580 2 NaN
2581 dtype: object
2583 The `pat` parameter can be used to split by other characters.
2585 >>> s.str.split(pat = "/")
2586 0 [this is a regular sentence]
2587 1 [https:, , docs.python.org, 3, tutorial, index...
2588 2 NaN
2589 dtype: object
2591 When using ``expand=True``, the split elements will expand out into
2592 separate columns. If NaN is present, it is propagated throughout
2593 the columns during the split.
2595 >>> s.str.split(expand=True)
2596 0 1 2 3
2597 0 this is a regular
2598 1 https://docs.python.org/3/tutorial/index.html None None None
2599 2 NaN NaN NaN NaN \
2600 4
2601 0 sentence
2602 1 None
2603 2 NaN
2605 For slightly more complex use cases like splitting the html document name
2606 from a url, a combination of parameter settings can be used.
2608 >>> s.str.rsplit("/", n=1, expand=True)
2609 0 1
2610 0 this is a regular sentence None
2611 1 https://docs.python.org/3/tutorial index.html
2612 2 NaN NaN
2614 Remember to escape special characters when explicitly using regular
2615 expressions.
2617 >>> s = pd.Series(["1+1=2"])
2619 >>> s.str.split(r"\+|=", expand=True)
2620 0 1 2
2621 0 1 1 2
2622 """
2624 @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
2625 @forbid_nonstring_types(["bytes"])
2626 def split(self, pat=None, n=-1, expand=False):
2627 result = str_split(self._parent, pat, n=n)
2628 return self._wrap_result(result, expand=expand, returns_string=expand)
2630 @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
2631 @forbid_nonstring_types(["bytes"])
2632 def rsplit(self, pat=None, n=-1, expand=False):
2633 result = str_rsplit(self._parent, pat, n=n)
2634 return self._wrap_result(result, expand=expand, returns_string=expand)
2636 _shared_docs[
2637 "str_partition"
2638 ] = """
2639 Split the string at the %(side)s occurrence of `sep`.
2641 This method splits the string at the %(side)s occurrence of `sep`,
2642 and returns 3 elements containing the part before the separator,
2643 the separator itself, and the part after the separator.
2644 If the separator is not found, return %(return)s.
2646 Parameters
2647 ----------
2648 sep : str, default whitespace
2649 String to split on.
2650 expand : bool, default True
2651 If True, return DataFrame/MultiIndex expanding dimensionality.
2652 If False, return Series/Index.
2654 Returns
2655 -------
2656 DataFrame/MultiIndex or Series/Index of objects
2658 See Also
2659 --------
2660 %(also)s
2661 Series.str.split : Split strings around given separators.
2662 str.partition : Standard library version.
2664 Examples
2665 --------
2667 >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
2668 >>> s
2669 0 Linda van der Berg
2670 1 George Pitt-Rivers
2671 dtype: object
2673 >>> s.str.partition()
2674 0 1 2
2675 0 Linda van der Berg
2676 1 George Pitt-Rivers
2678 To partition by the last space instead of the first one:
2680 >>> s.str.rpartition()
2681 0 1 2
2682 0 Linda van der Berg
2683 1 George Pitt-Rivers
2685 To partition by something different than a space:
2687 >>> s.str.partition('-')
2688 0 1 2
2689 0 Linda van der Berg
2690 1 George Pitt - Rivers
2692 To return a Series containing tuples instead of a DataFrame:
2694 >>> s.str.partition('-', expand=False)
2695 0 (Linda van der Berg, , )
2696 1 (George Pitt, -, Rivers)
2697 dtype: object
2699 Also available on indices:
2701 >>> idx = pd.Index(['X 123', 'Y 999'])
2702 >>> idx
2703 Index(['X 123', 'Y 999'], dtype='object')
2705 Which will create a MultiIndex:
2707 >>> idx.str.partition()
2708 MultiIndex([('X', ' ', '123'),
2709 ('Y', ' ', '999')],
2710 dtype='object')
2712 Or an index with tuples with ``expand=False``:
2714 >>> idx.str.partition(expand=False)
2715 Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
2716 """
2718 @Appender(
2719 _shared_docs["str_partition"]
2720 % {
2721 "side": "first",
2722 "return": "3 elements containing the string itself, followed by two "
2723 "empty strings",
2724 "also": "rpartition : Split the string at the last occurrence of `sep`.",
2725 }
2726 )
2727 @forbid_nonstring_types(["bytes"])
2728 def partition(self, sep=" ", expand=True):
2729 f = lambda x: x.partition(sep)
2730 result = _na_map(f, self._parent)
2731 return self._wrap_result(result, expand=expand, returns_string=expand)
2733 @Appender(
2734 _shared_docs["str_partition"]
2735 % {
2736 "side": "last",
2737 "return": "3 elements containing two empty strings, followed by the "
2738 "string itself",
2739 "also": "partition : Split the string at the first occurrence of `sep`.",
2740 }
2741 )
2742 @forbid_nonstring_types(["bytes"])
2743 def rpartition(self, sep=" ", expand=True):
2744 f = lambda x: x.rpartition(sep)
2745 result = _na_map(f, self._parent)
2746 return self._wrap_result(result, expand=expand, returns_string=expand)
2748 @copy(str_get)
2749 def get(self, i):
2750 result = str_get(self._parent, i)
2751 return self._wrap_result(result)
2753 @copy(str_join)
2754 @forbid_nonstring_types(["bytes"])
2755 def join(self, sep):
2756 result = str_join(self._parent, sep)
2757 return self._wrap_result(result)
2759 @copy(str_contains)
2760 @forbid_nonstring_types(["bytes"])
2761 def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
2762 result = str_contains(
2763 self._parent, pat, case=case, flags=flags, na=na, regex=regex
2764 )
2765 return self._wrap_result(result, fill_value=na, returns_string=False)
2767 @copy(str_match)
2768 @forbid_nonstring_types(["bytes"])
2769 def match(self, pat, case=True, flags=0, na=np.nan):
2770 result = str_match(self._parent, pat, case=case, flags=flags, na=na)
2771 return self._wrap_result(result, fill_value=na, returns_string=False)
2773 @copy(str_replace)
2774 @forbid_nonstring_types(["bytes"])
2775 def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
2776 result = str_replace(
2777 self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex
2778 )
2779 return self._wrap_result(result)
2781 @copy(str_repeat)
2782 @forbid_nonstring_types(["bytes"])
2783 def repeat(self, repeats):
2784 result = str_repeat(self._parent, repeats)
2785 return self._wrap_result(result)
2787 @copy(str_pad)
2788 @forbid_nonstring_types(["bytes"])
2789 def pad(self, width, side="left", fillchar=" "):
2790 result = str_pad(self._parent, width, side=side, fillchar=fillchar)
2791 return self._wrap_result(result)
    # Shared docstring template for the pad-alias accessors (center/ljust/
    # rjust); %-substituted with side/method in the @Appender calls below.
    _shared_docs[
        "str_pad"
    ] = """
    Filling %(side)s side of strings in the Series/Index with an
    additional character. Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    filled : Series/Index of objects.
    """
2812 @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
2813 @forbid_nonstring_types(["bytes"])
2814 def center(self, width, fillchar=" "):
2815 return self.pad(width, side="both", fillchar=fillchar)
2817 @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
2818 @forbid_nonstring_types(["bytes"])
2819 def ljust(self, width, fillchar=" "):
2820 return self.pad(width, side="right", fillchar=fillchar)
2822 @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
2823 @forbid_nonstring_types(["bytes"])
2824 def rjust(self, width, fillchar=" "):
2825 return self.pad(width, side="left", fillchar=fillchar)
2827 @forbid_nonstring_types(["bytes"])
2828 def zfill(self, width):
2829 """
2830 Pad strings in the Series/Index by prepending '0' characters.
2832 Strings in the Series/Index are padded with '0' characters on the
2833 left of the string to reach a total string length `width`. Strings
2834 in the Series/Index with length greater or equal to `width` are
2835 unchanged.
2837 Parameters
2838 ----------
2839 width : int
2840 Minimum length of resulting string; strings with length less
2841 than `width` be prepended with '0' characters.
2843 Returns
2844 -------
2845 Series/Index of objects.
2847 See Also
2848 --------
2849 Series.str.rjust : Fills the left side of strings with an arbitrary
2850 character.
2851 Series.str.ljust : Fills the right side of strings with an arbitrary
2852 character.
2853 Series.str.pad : Fills the specified sides of strings with an arbitrary
2854 character.
2855 Series.str.center : Fills boths sides of strings with an arbitrary
2856 character.
2858 Notes
2859 -----
2860 Differs from :meth:`str.zfill` which has special handling
2861 for '+'/'-' in the string.
2863 Examples
2864 --------
2865 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
2866 >>> s
2867 0 -1
2868 1 1
2869 2 1000
2870 3 10
2871 4 NaN
2872 dtype: object
2874 Note that ``10`` and ``NaN`` are not strings, therefore they are
2875 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
2876 regular character and the zero is added to the left of it
2877 (:meth:`str.zfill` would have moved it to the left). ``1000``
2878 remains unchanged as it is longer than `width`.
2880 >>> s.str.zfill(3)
2881 0 0-1
2882 1 001
2883 2 1000
2884 3 NaN
2885 4 NaN
2886 dtype: object
2887 """
2888 result = str_pad(self._parent, width, side="left", fillchar="0")
2889 return self._wrap_result(result)
2891 @copy(str_slice)
2892 def slice(self, start=None, stop=None, step=None):
2893 result = str_slice(self._parent, start, stop, step)
2894 return self._wrap_result(result)
2896 @copy(str_slice_replace)
2897 @forbid_nonstring_types(["bytes"])
2898 def slice_replace(self, start=None, stop=None, repl=None):
2899 result = str_slice_replace(self._parent, start, stop, repl)
2900 return self._wrap_result(result)
2902 @copy(str_decode)
2903 def decode(self, encoding, errors="strict"):
2904 # need to allow bytes here
2905 result = str_decode(self._parent, encoding, errors)
2906 # TODO: Not sure how to handle this.
2907 return self._wrap_result(result, returns_string=False)
2909 @copy(str_encode)
2910 @forbid_nonstring_types(["bytes"])
2911 def encode(self, encoding, errors="strict"):
2912 result = str_encode(self._parent, encoding, errors)
2913 return self._wrap_result(result, returns_string=False)
2915 _shared_docs[
2916 "str_strip"
2917 ] = r"""
2918 Remove leading and trailing characters.
2920 Strip whitespaces (including newlines) or a set of specified characters
2921 from each string in the Series/Index from %(side)s.
2922 Equivalent to :meth:`str.%(method)s`.
2924 Parameters
2925 ----------
2926 to_strip : str or None, default None
2927 Specifying the set of characters to be removed.
2928 All combinations of this set of characters will be stripped.
2929 If None then whitespaces are removed.
2931 Returns
2932 -------
2933 Series or Index of object
2935 See Also
2936 --------
2937 Series.str.strip : Remove leading and trailing characters in Series/Index.
2938 Series.str.lstrip : Remove leading characters in Series/Index.
2939 Series.str.rstrip : Remove trailing characters in Series/Index.
2941 Examples
2942 --------
2943 >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
2944 >>> s
2945 0 1. Ant.
2946 1 2. Bee!\n
2947 2 3. Cat?\t
2948 3 NaN
2949 dtype: object
2951 >>> s.str.strip()
2952 0 1. Ant.
2953 1 2. Bee!
2954 2 3. Cat?
2955 3 NaN
2956 dtype: object
2958 >>> s.str.lstrip('123.')
2959 0 Ant.
2960 1 Bee!\n
2961 2 Cat?\t
2962 3 NaN
2963 dtype: object
2965 >>> s.str.rstrip('.!? \n\t')
2966 0 1. Ant
2967 1 2. Bee
2968 2 3. Cat
2969 3 NaN
2970 dtype: object
2972 >>> s.str.strip('123.!? \n\t')
2973 0 Ant
2974 1 Bee
2975 2 Cat
2976 3 NaN
2977 dtype: object
2978 """
2980 @Appender(
2981 _shared_docs["str_strip"] % dict(side="left and right sides", method="strip")
2982 )
2983 @forbid_nonstring_types(["bytes"])
2984 def strip(self, to_strip=None):
2985 result = str_strip(self._parent, to_strip, side="both")
2986 return self._wrap_result(result)
2988 @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip"))
2989 @forbid_nonstring_types(["bytes"])
2990 def lstrip(self, to_strip=None):
2991 result = str_strip(self._parent, to_strip, side="left")
2992 return self._wrap_result(result)
2994 @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip"))
2995 @forbid_nonstring_types(["bytes"])
2996 def rstrip(self, to_strip=None):
2997 result = str_strip(self._parent, to_strip, side="right")
2998 return self._wrap_result(result)
3000 @copy(str_wrap)
3001 @forbid_nonstring_types(["bytes"])
3002 def wrap(self, width, **kwargs):
3003 result = str_wrap(self._parent, width, **kwargs)
3004 return self._wrap_result(result)
3006 @copy(str_get_dummies)
3007 @forbid_nonstring_types(["bytes"])
3008 def get_dummies(self, sep="|"):
3009 # we need to cast to Series of strings as only that has all
3010 # methods available for making the dummies...
3011 data = self._orig.astype(str) if self._is_categorical else self._parent
3012 result, name = str_get_dummies(data, sep)
3013 return self._wrap_result(
3014 result,
3015 use_codes=(not self._is_categorical),
3016 name=name,
3017 expand=True,
3018 returns_string=False,
3019 )
3021 @copy(str_translate)
3022 @forbid_nonstring_types(["bytes"])
3023 def translate(self, table):
3024 result = str_translate(self._parent, table)
3025 return self._wrap_result(result)
    # Pattern-based accessors generated from the module-level implementations
    # via _pat_wrapper; none of them produce string dtype results.
    count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False)
    startswith = _pat_wrapper(
        str_startswith, na=True, name="startswith", returns_string=False
    )
    endswith = _pat_wrapper(
        str_endswith, na=True, name="endswith", returns_string=False
    )
    findall = _pat_wrapper(
        str_findall, flags=True, name="findall", returns_string=False
    )
3038 @copy(str_extract)
3039 @forbid_nonstring_types(["bytes"])
3040 def extract(self, pat, flags=0, expand=True):
3041 return str_extract(self, pat, flags=flags, expand=expand)
3043 @copy(str_extractall)
3044 @forbid_nonstring_types(["bytes"])
3045 def extractall(self, pat, flags=0):
3046 return str_extractall(self._orig, pat, flags=flags)
    # Shared docstring template for find/rfind; %-substituted below.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index
    where the substring is fully contained between [start:end].
    Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """
3073 @Appender(
3074 _shared_docs["find"]
3075 % dict(
3076 side="lowest",
3077 method="find",
3078 also="rfind : Return highest indexes in each strings.",
3079 )
3080 )
3081 @forbid_nonstring_types(["bytes"])
3082 def find(self, sub, start=0, end=None):
3083 result = str_find(self._parent, sub, start=start, end=end, side="left")
3084 return self._wrap_result(result, returns_string=False)
3086 @Appender(
3087 _shared_docs["find"]
3088 % dict(
3089 side="highest",
3090 method="rfind",
3091 also="find : Return lowest indexes in each strings.",
3092 )
3093 )
3094 @forbid_nonstring_types(["bytes"])
3095 def rfind(self, sub, start=0, end=None):
3096 result = str_find(self._parent, sub, start=start, end=end, side="right")
3097 return self._wrap_result(result, returns_string=False)
3099 @forbid_nonstring_types(["bytes"])
3100 def normalize(self, form):
3101 """
3102 Return the Unicode normal form for the strings in the Series/Index.
3103 For more information on the forms, see the
3104 :func:`unicodedata.normalize`.
3106 Parameters
3107 ----------
3108 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
3109 Unicode form.
3111 Returns
3112 -------
3113 normalized : Series/Index of objects
3114 """
3115 import unicodedata
3117 f = lambda x: unicodedata.normalize(form, x)
3118 result = _na_map(f, self._parent, dtype=str)
3119 return self._wrap_result(result)
    # Shared docstring template for index/rindex; %-substituted below.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each strings where the substring is
    fully contained between [start:end]. This is the same as
    ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
    when the substring is not found. Equivalent to standard ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """
3147 @Appender(
3148 _shared_docs["index"]
3149 % dict(
3150 side="lowest",
3151 similar="find",
3152 method="index",
3153 also="rindex : Return highest indexes in each strings.",
3154 )
3155 )
3156 @forbid_nonstring_types(["bytes"])
3157 def index(self, sub, start=0, end=None):
3158 result = str_index(self._parent, sub, start=start, end=end, side="left")
3159 return self._wrap_result(result, returns_string=False)
3161 @Appender(
3162 _shared_docs["index"]
3163 % dict(
3164 side="highest",
3165 similar="rfind",
3166 method="rindex",
3167 also="index : Return lowest indexes in each strings.",
3168 )
3169 )
3170 @forbid_nonstring_types(["bytes"])
3171 def rindex(self, sub, start=0, end=None):
3172 result = str_index(self._parent, sub, start=start, end=end, side="right")
3173 return self._wrap_result(result, returns_string=False)
3175 _shared_docs[
3176 "len"
3177 ] = """
3178 Compute the length of each element in the Series/Index. The element may be
3179 a sequence (such as a string, tuple or list) or a collection
3180 (such as a dictionary).
3182 Returns
3183 -------
3184 Series or Index of int
3185 A Series or Index of integer values indicating the length of each
3186 element in the Series or Index.
3188 See Also
3189 --------
3190 str.len : Python built-in function returning the length of an object.
3191 Series.size : Returns the length of the Series.
3193 Examples
3194 --------
3195 Returns the length (number of characters) in a string. Returns the
3196 number of entries for dictionaries, lists or tuples.
3198 >>> s = pd.Series(['dog',
3199 ... '',
3200 ... 5,
3201 ... {'foo' : 'bar'},
3202 ... [2, 3, 5, 7],
3203 ... ('one', 'two', 'three')])
3204 >>> s
3205 0 dog
3206 1
3207 2 5
3208 3 {'foo': 'bar'}
3209 4 [2, 3, 5, 7]
3210 5 (one, two, three)
3211 dtype: object
3212 >>> s.str.len()
3213 0 3.0
3214 1 0.0
3215 2 NaN
3216 3 1.0
3217 4 4.0
3218 5 3.0
3219 dtype: float64
3220 """
    # Element-wise length accessor; works on any sized element (str, list,
    # tuple, dict, ...), hence forbidden_types=None (bytes are allowed too).
    len = _noarg_wrapper(
        len,
        docstring=_shared_docs["len"],
        forbidden_types=None,
        dtype="int64",
        returns_string=False,
    )
3229 _shared_docs[
3230 "casemethods"
3231 ] = """
3232 Convert strings in the Series/Index to %(type)s.
3233 %(version)s
3234 Equivalent to :meth:`str.%(method)s`.
3236 Returns
3237 -------
3238 Series or Index of object
3240 See Also
3241 --------
3242 Series.str.lower : Converts all characters to lowercase.
3243 Series.str.upper : Converts all characters to uppercase.
3244 Series.str.title : Converts first character of each word to uppercase and
3245 remaining to lowercase.
3246 Series.str.capitalize : Converts first character to uppercase and
3247 remaining to lowercase.
3248 Series.str.swapcase : Converts uppercase to lowercase and lowercase to
3249 uppercase.
3250 Series.str.casefold: Removes all case distinctions in the string.
3252 Examples
3253 --------
3254 >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
3255 >>> s
3256 0 lower
3257 1 CAPITALS
3258 2 this is a sentence
3259 3 SwApCaSe
3260 dtype: object
3262 >>> s.str.lower()
3263 0 lower
3264 1 capitals
3265 2 this is a sentence
3266 3 swapcase
3267 dtype: object
3269 >>> s.str.upper()
3270 0 LOWER
3271 1 CAPITALS
3272 2 THIS IS A SENTENCE
3273 3 SWAPCASE
3274 dtype: object
3276 >>> s.str.title()
3277 0 Lower
3278 1 Capitals
3279 2 This Is A Sentence
3280 3 Swapcase
3281 dtype: object
3283 >>> s.str.capitalize()
3284 0 Lower
3285 1 Capitals
3286 2 This is a sentence
3287 3 Swapcase
3288 dtype: object
3290 >>> s.str.swapcase()
3291 0 LOWER
3292 1 capitals
3293 2 THIS IS A SENTENCE
3294 3 sWaPcAsE
3295 dtype: object
3296 """
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: Dict[str, Dict[str, str]] = {}
    _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
    _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
    _doc_args["title"] = dict(type="titlecase", method="title", version="")
    _doc_args["capitalize"] = dict(
        type="be capitalized", method="capitalize", version=""
    )
    _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
    _doc_args["casefold"] = dict(
        type="be casefolded",
        method="casefold",
        version="\n .. versionadded:: 0.25.0\n",
    )
    # Case-conversion accessors: each wraps the matching str method via
    # _noarg_wrapper and shares the "casemethods" docstring template.
    lower = _noarg_wrapper(
        lambda x: x.lower(),
        name="lower",
        docstring=_shared_docs["casemethods"] % _doc_args["lower"],
        dtype=str,
    )
    upper = _noarg_wrapper(
        lambda x: x.upper(),
        name="upper",
        docstring=_shared_docs["casemethods"] % _doc_args["upper"],
        dtype=str,
    )
    title = _noarg_wrapper(
        lambda x: x.title(),
        name="title",
        docstring=_shared_docs["casemethods"] % _doc_args["title"],
        dtype=str,
    )
    capitalize = _noarg_wrapper(
        lambda x: x.capitalize(),
        name="capitalize",
        docstring=_shared_docs["casemethods"] % _doc_args["capitalize"],
        dtype=str,
    )
    swapcase = _noarg_wrapper(
        lambda x: x.swapcase(),
        name="swapcase",
        docstring=_shared_docs["casemethods"] % _doc_args["swapcase"],
        dtype=str,
    )
    casefold = _noarg_wrapper(
        lambda x: x.casefold(),
        name="casefold",
        docstring=_shared_docs["casemethods"] % _doc_args["casefold"],
        dtype=str,
    )
3349 _shared_docs[
3350 "ismethods"
3351 ] = """
3352 Check whether all characters in each string are %(type)s.
3354 This is equivalent to running the Python string method
3355 :meth:`str.%(method)s` for each element of the Series/Index. If a string
3356 has zero characters, ``False`` is returned for that check.
3358 Returns
3359 -------
3360 Series or Index of bool
3361 Series or Index of boolean values with the same length as the original
3362 Series/Index.
3364 See Also
3365 --------
3366 Series.str.isalpha : Check whether all characters are alphabetic.
3367 Series.str.isnumeric : Check whether all characters are numeric.
3368 Series.str.isalnum : Check whether all characters are alphanumeric.
3369 Series.str.isdigit : Check whether all characters are digits.
3370 Series.str.isdecimal : Check whether all characters are decimal.
3371 Series.str.isspace : Check whether all characters are whitespace.
3372 Series.str.islower : Check whether all characters are lowercase.
3373 Series.str.isupper : Check whether all characters are uppercase.
3374 Series.str.istitle : Check whether all characters are titlecase.
3376 Examples
3377 --------
3378 **Checks for Alphabetic and Numeric Characters**
3380 >>> s1 = pd.Series(['one', 'one1', '1', ''])
3382 >>> s1.str.isalpha()
3383 0 True
3384 1 False
3385 2 False
3386 3 False
3387 dtype: bool
3389 >>> s1.str.isnumeric()
3390 0 False
3391 1 False
3392 2 True
3393 3 False
3394 dtype: bool
3396 >>> s1.str.isalnum()
3397 0 True
3398 1 True
3399 2 True
3400 3 False
3401 dtype: bool
3403 Note that checks against characters mixed with any additional punctuation
3404 or whitespace will evaluate to false for an alphanumeric check.
3406 >>> s2 = pd.Series(['A B', '1.5', '3,000'])
3407 >>> s2.str.isalnum()
3408 0 False
3409 1 False
3410 2 False
3411 dtype: bool
3413 **More Detailed Checks for Numeric Characters**
3415 There are several different but overlapping sets of numeric characters that
3416 can be checked for.
3418 >>> s3 = pd.Series(['23', '³', '⅕', ''])
3420 The ``s3.str.isdecimal`` method checks for characters used to form numbers
3421 in base 10.
3423 >>> s3.str.isdecimal()
3424 0 True
3425 1 False
3426 2 False
3427 3 False
3428 dtype: bool
3430 The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
3431 includes special digits, like superscripted and subscripted digits in
3432 unicode.
3434 >>> s3.str.isdigit()
3435 0 True
3436 1 True
3437 2 False
3438 3 False
3439 dtype: bool
3441 The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
3442 includes other characters that can represent quantities such as unicode
3443 fractions.
3445 >>> s3.str.isnumeric()
3446 0 True
3447 1 True
3448 2 True
3449 3 False
3450 dtype: bool
3452 **Checks for Whitespace**
3454 >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
3455 >>> s4.str.isspace()
3456 0 True
3457 1 True
3458 2 False
3459 dtype: bool
3461 **Checks for Character Case**
3463 >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
3465 >>> s5.str.islower()
3466 0 True
3467 1 False
3468 2 False
3469 3 False
3470 dtype: bool
3472 >>> s5.str.isupper()
3473 0 False
3474 1 False
3475 2 True
3476 3 False
3477 dtype: bool
3479 The ``s5.str.istitle`` method checks for whether all words are in title
3480 case (whether only the first letter of each word is capitalized). Words are
3481 assumed to be as any sequence of non-numeric characters separated by
3482 whitespace characters.
3484 >>> s5.str.istitle()
3485 0 False
3486 1 True
3487 2 False
3488 3 False
3489 dtype: bool
3490 """
3491 _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
3492 _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
3493 _doc_args["isdigit"] = dict(type="digits", method="isdigit")
3494 _doc_args["isspace"] = dict(type="whitespace", method="isspace")
3495 _doc_args["islower"] = dict(type="lowercase", method="islower")
3496 _doc_args["isupper"] = dict(type="uppercase", method="isupper")
3497 _doc_args["istitle"] = dict(type="titlecase", method="istitle")
3498 _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
3499 _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
    # force _noarg_wrapper return type with dtype=bool (GH 29624)
    # Each attribute below exposes one per-element ``str.is*`` predicate as an
    # accessor method.  ``_noarg_wrapper`` (defined earlier in this module)
    # presumably vectorizes the lambda over the elements; ``returns_string=False``
    # and ``dtype=bool`` mark the result as boolean rather than string data.
    # The docstring for each is the shared "ismethods" template, interpolated
    # with the per-method args set up in ``_doc_args`` above.
    isalnum = _noarg_wrapper(
        lambda x: x.isalnum(),
        name="isalnum",
        docstring=_shared_docs["ismethods"] % _doc_args["isalnum"],
        returns_string=False,
        dtype=bool,
    )
    isalpha = _noarg_wrapper(
        lambda x: x.isalpha(),
        name="isalpha",
        docstring=_shared_docs["ismethods"] % _doc_args["isalpha"],
        returns_string=False,
        dtype=bool,
    )
    isdigit = _noarg_wrapper(
        lambda x: x.isdigit(),
        name="isdigit",
        docstring=_shared_docs["ismethods"] % _doc_args["isdigit"],
        returns_string=False,
        dtype=bool,
    )
    isspace = _noarg_wrapper(
        lambda x: x.isspace(),
        name="isspace",
        docstring=_shared_docs["ismethods"] % _doc_args["isspace"],
        returns_string=False,
        dtype=bool,
    )
    islower = _noarg_wrapper(
        lambda x: x.islower(),
        name="islower",
        docstring=_shared_docs["ismethods"] % _doc_args["islower"],
        returns_string=False,
        dtype=bool,
    )
    isupper = _noarg_wrapper(
        lambda x: x.isupper(),
        name="isupper",
        docstring=_shared_docs["ismethods"] % _doc_args["isupper"],
        returns_string=False,
        dtype=bool,
    )
    istitle = _noarg_wrapper(
        lambda x: x.istitle(),
        name="istitle",
        docstring=_shared_docs["ismethods"] % _doc_args["istitle"],
        returns_string=False,
        dtype=bool,
    )
    isnumeric = _noarg_wrapper(
        lambda x: x.isnumeric(),
        name="isnumeric",
        docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"],
        returns_string=False,
        dtype=bool,
    )
    isdecimal = _noarg_wrapper(
        lambda x: x.isdecimal(),
        name="isdecimal",
        docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"],
        returns_string=False,
        dtype=bool,
    )
3565 @classmethod
3566 def _make_accessor(cls, data):
3567 cls._validate(data)
3568 return cls(data)