Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is backed by an extension dtype (:issue:`51574`)
- Performance improvement in :meth:`read_orc` when reading a remote URI file path. (:issue:`51609`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`)
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)

.. ---------------------------------------------------------------------------
.. _whatsnew_210.bug_fixes:
Expand Down Expand Up @@ -195,7 +196,7 @@ Groupby/resample/rolling

Reshaping
^^^^^^^^^
-
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
-

Sparse
Expand Down
18 changes: 14 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3264,13 +3264,23 @@ def combine_first(self, other) -> Series:
falcon NaN
dtype: float64
"""
from pandas.core.reshape.concat import concat

new_index = self.index.union(other.index)
this = self.reindex(new_index, copy=False)
other = other.reindex(new_index, copy=False)

this = self
null_mask = isna(this)
if null_mask.any():
drop = this.index[null_mask].intersection(other.index[notna(other)])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a quick comment what this is doing? Took me a while to figure it out, don't want to do this again in 6 months time :)

Otherwise lgtm

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've simplified this logic a bit. should be easier to follow now

if len(drop):
this = this.drop(drop)

other = other.reindex(other.index.difference(this.index), copy=False)
if this.dtype.kind == "M" and other.dtype.kind != "M":
other = to_datetime(other)

return this.where(notna(this), other)
combined = concat([this, other])
combined = combined.reindex(new_index, copy=False)
return combined.__finalize__(self, method="combine_first")

def update(self, other: Series | Sequence | Mapping) -> None:
"""
Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,13 +327,6 @@ def test_where_series(self, data, na_value):
self.assert_series_equal(result, expected)

def test_combine_first(self, data, request):
if data.dtype.subtype == "int":
# Right now this is upcasted to float, just like combine_first
# for Series[int]
mark = pytest.mark.xfail(
reason="TODO(SparseArray.__setitem__) will preserve dtype."
)
request.node.add_marker(mark)
super().test_combine_first(data)

def test_searchsorted(self, data_for_sorting, as_series):
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/series/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,11 @@ def test_combine_first_timezone_series_with_empty_series(self):
s2 = Series(index=time_index)
result = s1.combine_first(s2)
tm.assert_series_equal(result, s1)

def test_combine_first_preserves_dtype(self):
# GH51764
s1 = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max])
s2 = Series([1, 2, 3])
result = s1.combine_first(s2)
expected = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max, 3])
tm.assert_series_equal(result, expected)