diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 072871f89bdae..032d51c5a388f 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -74,6 +74,7 @@ These are places where the behavior of ``StringDtype`` objects differ from l. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of NA values. + Methods returning **boolean** output will return a nullable boolean dtype. .. ipython:: python @@ -89,7 +90,13 @@ l. For ``StringDtype``, :ref:`string accessor methods` s.astype(object).str.count("a") s.astype(object).dropna().str.count("a") - When NA values are present, the output dtype is float64. + When NA values are present, the output dtype is float64. Similarly, for + ``StringDtype``, methods returning boolean values return a nullable boolean dtype. + + .. ipython:: python + + s.str.isdigit() + s.str.match("a") 2. Some string methods, like :meth:`Series.str.decode` are not available on ``StringArray`` because ``StringArray`` only holds strings, not diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d4d8be90402b7..6ef42eb185e49 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union import warnings import numpy as np @@ -140,7 +140,7 @@ def _map_stringarray( The value to use for missing values. By default, this is the original value (NA). dtype : Dtype - The result dtype to use. Specifying this aviods an intermediate + The result dtype to use. Specifying this avoids an intermediate object-dtype allocation. Returns @@ -150,14 +150,20 @@ def _map_stringarray( an ndarray.
""" - from pandas.arrays import IntegerArray, StringArray + from pandas.arrays import IntegerArray, StringArray, BooleanArray mask = isna(arr) assert isinstance(arr, StringArray) arr = np.asarray(arr) - if is_integer_dtype(dtype): + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 @@ -167,13 +173,13 @@ def _map_stringarray( mask.view("uint8"), convert=False, na_value=na_value, - dtype=np.dtype("int64"), + dtype=np.dtype(dtype), ) if not na_value_is_na: mask[:] = False - return IntegerArray(result, mask) + return constructor(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype @@ -181,7 +187,6 @@ def _map_stringarray( arr, func, mask.view("uint8"), convert=False, na_value=na_value ) return StringArray(result) - # TODO: BooleanArray else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. 
.encode returns bytes @@ -297,7 +302,7 @@ def str_count(arr, pat, flags=0): """ regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): @@ -1363,7 +1368,7 @@ def str_find(arr, sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_index(arr, sub, start=0, end=None, side="left"): @@ -1383,7 +1388,7 @@ def str_index(arr, sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_pad(arr, width, side="left", fillchar=" "): @@ -3208,7 +3213,7 @@ def rindex(self, sub, start=0, end=None): len, docstring=_shared_docs["len"], forbidden_types=None, - dtype=int, + dtype="int64", returns_string=False, ) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 584550d562b0d..d8b9c5983618e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1825,7 +1825,7 @@ def test_extractall_same_as_extract_subject_index(self): def test_empty_str_methods(self): empty_str = empty = Series(dtype=object) - empty_int = Series(dtype=int) + empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) empty_bytes = Series(dtype=object) @@ -3524,6 +3524,12 @@ def test_string_array(any_string_method): assert result.dtype == "string" result = result.astype(object) + elif expected.dtype == "object" and lib.is_bool_array( + expected.values, skipna=True + ): + assert result.dtype == "boolean" + result = result.astype(object) + elif expected.dtype == "float" and expected.isna().any(): assert result.dtype == "Int64" result = result.astype("float") @@ -3549,3 +3555,19 @@ def test_string_array_numeric_integer_array(method, expected): result = 
getattr(s.str, method)("a")
     expected = Series(expected, dtype="Int64")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        ("isdigit", [False, None, True]),
+        ("isalpha", [True, None, False]),
+        ("isalnum", [True, None, True]),
+        ("isnumeric", [False, None, True]),
+    ],
+)
+def test_string_array_boolean_array(method, expected):
+    s = Series(["a", None, "1"], dtype="string")  # letter, missing value, digit
+    result = getattr(s.str, method)()
+    expected = Series(expected, dtype="boolean")  # missing input -> missing output
+    tm.assert_series_equal(result, expected)