Skip to content

Commit 4013ffa

Browse files
authored
Avoid deep-copy when constructing groupby codes (#9429)
Closes #9426
1 parent 2783255 commit 4013ffa

File tree

3 files changed

+26
-5
lines changed

3 files changed

+26
-5
lines changed

asv_bench/benchmarks/groupby.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# import flox to avoid the cost of first import
2+
import cftime
23
import flox.xarray # noqa
34
import numpy as np
45
import pandas as pd
@@ -96,7 +97,7 @@ def setup(self, *args, **kwargs):
9697

9798
requires_dask()
9899
super().setup(**kwargs)
99-
self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
100+
self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dask_dataframe()
100101
self.ds1d_mean = self.ds1d.groupby("b").mean().compute()
101102

102103
def time_binary_op_2d(self):
@@ -169,7 +170,21 @@ class GroupByLongTime:
169170
def setup(self, use_cftime, use_flox):
170171
arr = np.random.randn(10, 10, 365 * 30)
171172
time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime)
172-
self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time})
173+
174+
# GH9426 - deep-copying CFTime object arrays is weirdly slow
175+
asda = xr.DataArray(time)
176+
labeled_time = []
177+
for year, month in zip(asda.dt.year, asda.dt.month):
178+
labeled_time.append(cftime.datetime(year, month, 1))
179+
180+
self.da = xr.DataArray(
181+
arr,
182+
dims=("y", "x", "time"),
183+
coords={"time": time, "time2": ("time", labeled_time)},
184+
)
185+
186+
def time_setup(self, use_cftime, use_flox):
187+
self.da.groupby("time.month")
173188

174189
def time_mean(self, use_cftime, use_flox):
175190
with xr.set_options(use_flox=use_flox):

doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ Bug fixes
6161
in NumPy 2.0 (:issue:`9312`, :pull:`9393`)
6262
By `Andrew Scherer <https://github.com/andrew-s28>`_.
6363

64+
Performance
65+
~~~~~~~~~~~
66+
67+
- Speed up grouping by avoiding deep-copy of non-dimension coordinates (:issue:`9426`, :pull:`9429`)
68+
By `Deepak Cherian <https://github.com/dcherian>`_.
69+
6470
Documentation
6571
~~~~~~~~~~~~~
6672

xarray/groupers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def _factorize_unique(self) -> EncodedGroups:
184184
raise ValueError(
185185
"Failed to group data. Are you grouping by a variable that is all NaN?"
186186
)
187-
codes = self.group.copy(data=codes_.reshape(self.group.shape))
187+
codes = self.group.copy(data=codes_.reshape(self.group.shape), deep=False)
188188
unique_coord = Variable(
189189
dims=codes.name, data=unique_values, attrs=self.group.attrs
190190
)
@@ -212,7 +212,7 @@ def _factorize_dummy(self) -> EncodedGroups:
212212
full_index = pd.RangeIndex(self.group.size)
213213
coords = Coordinates()
214214
else:
215-
codes = self.group.copy(data=size_range)
215+
codes = self.group.copy(data=size_range, deep=False)
216216
unique_coord = self.group.variable.to_base_variable()
217217
full_index = self.group_as_index
218218
if isinstance(full_index, pd.MultiIndex):
@@ -438,7 +438,7 @@ def factorize(self, group: T_Group) -> EncodedGroups:
438438
unique_coord = Variable(
439439
dims=group.name, data=first_items.index, attrs=group.attrs
440440
)
441-
codes = group.copy(data=codes_.reshape(group.shape))
441+
codes = group.copy(data=codes_.reshape(group.shape), deep=False)
442442

443443
return EncodedGroups(
444444
codes=codes,

0 commit comments

Comments
 (0)