Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,15 @@ def nselect_method(request):
return request.param


@pytest.fixture(params=["first", "last", False])
def keep(request):
    """
    Fixture yielding each accepted value of the 'keep' argument of
    .duplicated / .drop_duplicates: "first", "last" and False.
    """
    keep_value = request.param
    return keep_value


@pytest.fixture(params=["left", "right", "both", "neither"])
def closed(request):
"""
Expand Down
102 changes: 0 additions & 102 deletions pandas/tests/base/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,108 +594,6 @@ def test_factorize_repeated(self):
expected = o[5:10].append(o[:5])
tm.assert_index_equal(uniques, expected, check_names=False)

def test_duplicated_drop_duplicates_index(self):
# GH 4060
for original in self.objs:
if isinstance(original, Index):

# special case
if original.is_boolean():
result = original.drop_duplicates()
expected = Index([False, True], name="a")
tm.assert_index_equal(result, expected)
continue

# original doesn't have duplicates
expected = np.array([False] * len(original), dtype=bool)
duplicated = original.duplicated()
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = original.drop_duplicates()
tm.assert_index_equal(result, original)
assert result is not original
Comment on lines -609 to -616
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_no_duplicates in tests/indexes/test_common.py, see below


# has_duplicates
assert not original.has_duplicates

# create repeated values, 3rd and 5th values are duplicated
idx = original[list(range(len(original))) + [5, 3]]
expected = np.array([False] * len(original) + [True, True], dtype=bool)
duplicated = idx.duplicated()
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
tm.assert_index_equal(idx.drop_duplicates(), original)

base = [False] * len(idx)
base[3] = True
base[5] = True
expected = np.array(base)

duplicated = idx.duplicated(keep="last")
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = idx.drop_duplicates(keep="last")
tm.assert_index_equal(result, idx[~expected])

base = [False] * len(original) + [True, True]
base[3] = True
base[5] = True
expected = np.array(base)

duplicated = idx.duplicated(keep=False)
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = idx.drop_duplicates(keep=False)
tm.assert_index_equal(result, idx[~expected])
Comment on lines -618 to -649
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested in test_drop_duplicates in tests/indexes/test_common.py which was extended and refactored, see below


with pytest.raises(
TypeError,
match=r"drop_duplicates\(\) got an unexpected keyword argument",
):
idx.drop_duplicates(inplace=True)
Comment on lines -651 to -655
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_inplace in tests/indexes/test_common.py, see below


else:
expected = Series(
[False] * len(original), index=original.index, name="a"
)
tm.assert_series_equal(original.duplicated(), expected)
result = original.drop_duplicates()
tm.assert_series_equal(result, original)
assert result is not original
Comment on lines -658 to -664
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_no_duplicates in tests/series/methods/test_drop_duplicates.py, see below


idx = original.index[list(range(len(original))) + [5, 3]]
values = original._values[list(range(len(original))) + [5, 3]]
s = Series(values, index=idx, name="a")

expected = Series(
[False] * len(original) + [True, True], index=idx, name="a"
)
tm.assert_series_equal(s.duplicated(), expected)
tm.assert_series_equal(s.drop_duplicates(), original)

base = [False] * len(idx)
base[3] = True
base[5] = True
expected = Series(base, index=idx, name="a")

tm.assert_series_equal(s.duplicated(keep="last"), expected)
tm.assert_series_equal(
s.drop_duplicates(keep="last"), s[~np.array(base)]
)

base = [False] * len(original) + [True, True]
base[3] = True
base[5] = True
expected = Series(base, index=idx, name="a")

tm.assert_series_equal(s.duplicated(keep=False), expected)
tm.assert_series_equal(
s.drop_duplicates(keep=False), s[~np.array(base)]
)

s.drop_duplicates(inplace=True)
tm.assert_series_equal(s, original)
Comment on lines -666 to -697
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


def test_drop_duplicates_series_vs_dataframe(self):
# GH 14192
df = pd.DataFrame(
Expand Down
76 changes: 52 additions & 24 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,32 +302,60 @@ def test_pickle(self, indices):
assert indices.equals(unpickled)
indices.name = original_name

@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, indices, keep):
if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
# MultiIndex tested separately in:
# tests/indexes/multi/test_unique_and_duplicates
pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex")

def test_drop_duplicates(self, indices, keep):
    """
    Check Index.duplicated and Index.drop_duplicates on an index with
    randomly injected duplicates, for the given value of ``keep``.

    Expected results are derived from Series.duplicated and
    Series.drop_duplicates, which are tested separately.
    """
    if isinstance(indices, MultiIndex):
        pytest.skip("MultiIndex is tested separately")
    if isinstance(indices, RangeIndex):
        pytest.skip(
            "RangeIndex is tested in test_drop_duplicates_no_duplicates"
            " as it cannot hold duplicates"
        )
    if len(indices) == 0:
        # Nothing can be sampled from an empty index, so no duplicates can
        # be injected and the checks below would not exercise anything.
        pytest.skip(
            "empty index is tested in test_drop_duplicates_no_duplicates"
            " as it cannot hold duplicates"
        )

    # make unique index
    holder = type(indices)
    unique_values = list(set(indices))
    unique_idx = holder(unique_values)

    # make duplicated index: draw positions with replacement so that some
    # unique values are repeated
    n = len(unique_idx)
    duplicated_selection = np.random.choice(n, int(n * 1.5))
    idx = holder(unique_idx.values[duplicated_selection])

    # Series.duplicated is tested separately; duplicates among the drawn
    # positions correspond to duplicates among the values of idx
    expected_duplicated = (
        pd.Series(duplicated_selection).duplicated(keep=keep).values
    )
    tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated)

    # Series.drop_duplicates is tested separately
    expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep))
    tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped)

def test_drop_duplicates_no_duplicates(self, indices):
if isinstance(indices, MultiIndex):
pytest.skip("MultiIndex is tested separately")

idx = holder(indices)
if idx.has_duplicates:
# We are testing the duplicated-method here, so we need to know
# exactly which indices are duplicate and how (for the result).
# This is not possible if "idx" has duplicates already, which we
# therefore remove. This is seemingly circular, as drop_duplicates
# invokes duplicated, but in the end, it all works out because we
# cross-check with Series.duplicated, which is tested separately.
idx = idx.drop_duplicates()

n, k = len(idx), 10
duplicated_selection = np.random.choice(n, k * n)
expected = pd.Series(duplicated_selection).duplicated(keep=keep).values
idx = holder(idx.values[duplicated_selection])

result = idx.duplicated(keep=keep)
tm.assert_numpy_array_equal(result, expected)
# make unique index
if isinstance(indices, RangeIndex):
# RangeIndex cannot have duplicates
unique_idx = indices
else:
holder = type(indices)
unique_values = list(set(indices))
unique_idx = holder(unique_values)

# check on unique index
expected_duplicated = np.array([False] * len(unique_idx), dtype="bool")
tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated)
result_dropped = unique_idx.drop_duplicates()
tm.assert_index_equal(result_dropped, unique_idx)
# validate shallow copy
assert result_dropped is not unique_idx

def test_drop_duplicates_inplace(self, indices):
    """Index.drop_duplicates must reject the 'inplace' keyword."""
    expected_msg = r"drop_duplicates\(\) got an unexpected keyword argument"
    with pytest.raises(TypeError, match=expected_msg):
        indices.drop_duplicates(inplace=True)

def test_has_duplicates(self, indices):
holder = type(indices)
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/series/methods/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,26 @@ def test_drop_duplicates_bool(keep, expected):
tm.assert_series_equal(sc, tc[~expected])


@pytest.mark.parametrize("values", [[], list(range(5))])
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
    """
    For a Series without duplicates, ``duplicated`` is all False and
    ``drop_duplicates`` returns an equal but distinct object.
    """
    ser = Series(values, dtype=np.dtype(any_numpy_dtype))
    no_dups_mask = Series([False] * len(ser), dtype="bool")

    if ser.dtype == "bool":
        # Casting to bool maps 0 -> False and 1 -> True; every further
        # value would repeat one of those, so keep the first two only.
        ser, no_dups_mask = ser[:2], no_dups_mask[:2]

    tm.assert_series_equal(ser.duplicated(keep=keep), no_dups_mask)

    deduplicated = ser.drop_duplicates(keep=keep)
    tm.assert_series_equal(deduplicated, ser)

    # the result must be a new object, not the Series it was called on
    assert deduplicated is not ser


class TestSeriesDropDuplicates:
@pytest.mark.parametrize(
"dtype",
Expand Down