Skip to content
4 changes: 4 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,10 @@ def any_string_dtype(request):
return pd.StringDtype(storage, na_value)


# Second name for the ``any_string_dtype`` fixture. A test that requests both
# ``any_string_dtype`` and ``any_string_dtype2`` is intended to be run over the
# cartesian product of the fixture's parameters (one value per name).
# NOTE(review): relies on pytest registering the fixture under the alias name
# as well -- confirm against the pytest fixture-collection behaviour.
any_string_dtype2 = any_string_dtype


@pytest.fixture(params=tm.DATETIME64_DTYPES)
def datetime64_dtype(request):
"""
Expand Down
26 changes: 12 additions & 14 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.numpy import np_version_gte1p25

import pandas as pd
Expand Down Expand Up @@ -2664,46 +2662,46 @@ def test_pivot_columns_not_given(self):
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
df.pivot()

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_columns_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we do need to figure out how this should work with the new string data type; the larger discussion about that is taking place in #60329 (comment)

result = df.pivot(columns=None)
expected = DataFrame({("b", 1): [2], ("c", 1): 3})
expected.columns = expected.columns.set_levels(
expected.columns.levels[0].astype(object), level=0
)
tm.assert_frame_equal(result, expected)

result = df.pivot(columns=None, index="b")
expected = DataFrame({("c", 1): 3}, index=Index([2], name="b"))
expected.columns = expected.columns.set_levels(
expected.columns.levels[0].astype(object), level=0
)
tm.assert_frame_equal(result, expected)

result = df.pivot(columns=None, index="b", values="c")
expected = DataFrame({1: 3}, index=Index([2], name="b"))
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_index_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))

result = df.pivot(columns="b", index=None)
expected = DataFrame({("c", 2): 3}, index=[1])
expected.columns = expected.columns.set_levels(
expected.columns.levels[0].astype(object), level=0
)
expected.columns.names = [None, "b"]
tm.assert_frame_equal(result, expected)

result = df.pivot(columns="b", index=None, values="c")
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_values_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))

result = df.pivot(columns="b", index="c", values=None)
expected = DataFrame(
Expand Down
23 changes: 6 additions & 17 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype):
dtype=any_string_dtype,
)

dtype = ser.dtype
if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"):
msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser.str.startswith("kapow", na="baz")
msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser.str.endswith("bar", na="baz")
else:
# TODO(infer_string): don't surface pyarrow errors
import pyarrow as pa

msg = "Could not convert 'baz' with type str: tried to convert to boolean"
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
ser.str.startswith("kapow", na="baz")
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
ser.str.endswith("kapow", na="baz")
msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser.str.startswith("kapow", na="baz")
msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser.str.endswith("bar", na="baz")


@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
Expand Down
99 changes: 72 additions & 27 deletions pandas/tests/strings/test_get_dummies.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -98,30 +96,77 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):


# GH#47872
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_get_dummies_with_str_dtype(any_string_dtype):
@pytest.mark.parametrize("use_string_repr", [True, False])
def test_get_dummies_with_any_string_dtype(
request, any_string_dtype, any_string_dtype2, use_string_repr, using_infer_string
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a new feature for 3.0.0

(It doesn't appear to be fully tested or consistent, hence the additional parameterisation added.)

So using using_infer_string is probably not needed, as we won't need backwards compatibility once inference is enabled by default for 3.0.0

also any changes won't need backporting.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I commented about this on the PR adding this test, see #59577 (comment), because I don't think it makes much sense to allow this. I see there is a PR following up on it to disallow dtype=str in get_dummies (#59786), but we should follow up on it to ensure it gets into 3.0

(So, in conclusion: I would leave this alone for this PR, and focus on fixing it in #59786)

):
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
result = s.str.get_dummies("|", dtype=str)
expected = DataFrame(
[["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]],
columns=list("abc"),
dtype=str,
)
tm.assert_frame_equal(result, expected)


# GH#47872
@td.skip_if_no("pyarrow")
def test_get_dummies_with_pa_str_dtype(any_string_dtype):
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
result = s.str.get_dummies("|", dtype="str[pyarrow]")
expected = DataFrame(
[
["true", "true", "false"],
["true", "false", "true"],
["false", "false", "false"],
],
columns=list("abc"),
dtype="str[pyarrow]",
)
test_ids = request.node.callspec.id.split("-")
series_dtype_id = test_ids[0][7:]
expected_dtype_id = test_ids[1][7:]
if expected_dtype_id == "object":
if "pyarrow" in series_dtype_id:
request.applymarker(
pytest.mark.xfail(
reason=("pyarrow.lib.ArrowTypeError: Expected integer, got bool"),
strict=True,
)
)
expected = DataFrame(
[
[True, True, False],
[True, False, True],
[False, False, False],
],
columns=list("abc"),
dtype=np.bool_,
)
elif expected_dtype_id == "str[pyarrow]" and use_string_repr:
# data type 'str[pyarrow]' uses pandas.ArrowDtype instead
expected = DataFrame(
[
["true", "true", "false"],
["true", "false", "true"],
["false", "false", "false"],
],
columns=list("abc"),
dtype="str[pyarrow]",
)
elif expected_dtype_id == "str[python]" and use_string_repr:
# data type 'str[python]' not understood
expected_dtype_id = str
if using_infer_string:
expected = DataFrame(
[
["True", "True", "False"],
["True", "False", "True"],
["False", "False", "False"],
],
columns=list("abc"),
dtype=expected_dtype_id,
)
else:
expected = DataFrame(
[
["T", "T", "F"],
["T", "F", "T"],
["F", "F", "F"],
],
columns=list("abc"),
dtype=expected_dtype_id,
)
else:
expected = DataFrame(
[
["True", "True", "False"],
["True", "False", "True"],
["False", "False", "False"],
],
columns=list("abc"),
dtype=any_string_dtype2,
)
if use_string_repr:
result = s.str.get_dummies("|", dtype=expected_dtype_id)
else:
result = s.str.get_dummies("|", dtype=any_string_dtype2)
tm.assert_frame_equal(result, expected)