diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index dea110b5e46..ca94eb09025 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -51,15 +51,14 @@ Bug fixes
 ~~~~~~~~~
 - Fix regression introduced in v0.14.0 that would cause a crash if dask is installed
   but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle`_
-
-- Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4.
+- Fix grouping over variables with NaNs (:issue:`2383`, :pull:`3406`).
+  By `Deepak Cherian`_.
+- Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4. By
   `Anderson Banihirwe`_.
-
 - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and
   :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple
   dimensions. (:issue:`3402`). By `Deepak Cherian`_
-
 Documentation
 ~~~~~~~~~~~~~
 
 
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index 62c055fed51..c3f712b31ac 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -361,6 +361,13 @@ def __init__(
                 group_indices = [slice(i, i + 1) for i in group_indices]
             unique_coord = group
         else:
+            if group.isnull().any():
+                # Drop any NaN-valued groups, and also drop the obj values
+                # where the group variable was NaN. Use `where` instead of
+                # `reindex` to account for duplicate coordinate labels.
+                obj = obj.where(group.notnull(), drop=True)
+                group = group.dropna(group_dim)
+
             # look through group to find the unique values
             unique_values, group_indices = unique_value_groups(
                 safe_cast_to_index(group), sort=(bins is None)
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index d74d684dc54..e2216547ac8 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -5,7 +5,7 @@
 import xarray as xr
 from xarray.core.groupby import _consolidate_slices
 
-from . import assert_allclose, assert_identical, raises_regex
+from . import assert_allclose, assert_equal, assert_identical, raises_regex
 
 
 @pytest.fixture
@@ -48,14 +48,14 @@ def test_groupby_dims_property(dataset):
 def test_multi_index_groupby_apply(dataset):
     # regression test for GH873
     ds = dataset.isel(z=1, drop=True)[["foo"]]
-    doubled = 2 * ds
-    group_doubled = (
+    expected = 2 * ds
+    actual = (
         ds.stack(space=["x", "y"])
         .groupby("space")
         .apply(lambda x: 2 * x)
         .unstack("space")
     )
-    assert doubled.equals(group_doubled)
+    assert_equal(expected, actual)
 
 
 def test_multi_index_groupby_sum():
@@ -66,7 +66,7 @@ def test_multi_index_groupby_sum():
     )
     expected = ds.sum("z")
     actual = ds.stack(space=["x", "y"]).groupby("space").sum("z").unstack("space")
-    assert expected.equals(actual)
+    assert_equal(expected, actual)
 
 
 def test_groupby_da_datetime():
@@ -86,7 +86,7 @@ def test_groupby_da_datetime():
     expected = xr.DataArray(
         [3, 7], coords=dict(reference_date=reference_dates), dims="reference_date"
     )
-    assert actual.equals(expected)
+    assert_equal(expected, actual)
 
 
 def test_groupby_duplicate_coordinate_labels():
@@ -94,7 +94,7 @@ def test_groupby_duplicate_coordinate_labels():
     array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])])
     expected = xr.DataArray([3, 3], [("x", [1, 2])])
     actual = array.groupby("x").sum()
-    assert expected.equals(actual)
+    assert_equal(expected, actual)
 
 
 def test_groupby_input_mutation():
@@ -263,6 +263,72 @@ def test_groupby_repr_datetime(obj):
     assert actual == expected
 
 
+def test_groupby_drops_nans():
+    # GH2383
+    # nan in 2D data variable (requires stacking)
+    ds = xr.Dataset(
+        {
+            "variable": (("lat", "lon", "time"), np.arange(60.0).reshape((4, 3, 5))),
+            "id": (("lat", "lon"), np.arange(12.0).reshape((4, 3))),
+        },
+        coords={"lat": np.arange(4), "lon": np.arange(3), "time": np.arange(5)},
+    )
+
+    ds["id"].values[0, 0] = np.nan
+    ds["id"].values[3, 0] = np.nan
+    ds["id"].values[-1, -1] = np.nan
+
+    grouped = ds.groupby(ds.id)
+
+    # non reduction operation
+    expected = ds.copy()
+    expected.variable.values[0, 0, :] = np.nan
+    expected.variable.values[-1, -1, :] = np.nan
+    expected.variable.values[3, 0, :] = np.nan
+    actual = grouped.apply(lambda x: x).transpose(*ds.variable.dims)
+    assert_identical(actual, expected)
+
+    # reduction along grouped dimension
+    actual = grouped.mean()
+    stacked = ds.stack({"xy": ["lat", "lon"]})
+    expected = (
+        stacked.variable.where(stacked.id.notnull()).rename({"xy": "id"}).to_dataset()
+    )
+    expected["id"] = stacked.id.values
+    assert_identical(actual, expected.dropna("id").transpose(*actual.dims))
+
+    # reduction operation along a different dimension
+    actual = grouped.mean("time")
+    expected = ds.mean("time").where(ds.id.notnull())
+    assert_identical(actual, expected)
+
+    # NaN in non-dimensional coordinate
+    array = xr.DataArray([1, 2, 3], [("x", [1, 2, 3])])
+    array["x1"] = ("x", [1, 1, np.nan])
+    expected = xr.DataArray(3, [("x1", [1])])
+    actual = array.groupby("x1").sum()
+    assert_equal(expected, actual)
+
+    # NaT in non-dimensional coordinate
+    array["t"] = (
+        "x",
+        [
+            np.datetime64("2001-01-01"),
+            np.datetime64("2001-01-01"),
+            np.datetime64("NaT"),
+        ],
+    )
+    expected = xr.DataArray(3, [("t", [np.datetime64("2001-01-01")])])
+    actual = array.groupby("t").sum()
+    assert_equal(expected, actual)
+
+    # test for repeated coordinate labels
+    array = xr.DataArray([0, 1, 2, 4, 3, 4], [("x", [np.nan, 1, 1, np.nan, 2, np.nan])])
+    expected = xr.DataArray([3, 3], [("x", [1, 2])])
+    actual = array.groupby("x").sum()
+    assert_equal(expected, actual)
+
+
 def test_groupby_grouping_errors():
     dataset = xr.Dataset({"foo": ("x", [1, 1, 1])}, {"x": [1, 2, 3]})
     with raises_regex(ValueError, "None of the data falls within bins with edges"):
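
Note: a minimal sketch of the behavior the groupby.py change enables, distilled from the new test above (assumes a build with this patch applied; the variable names are illustrative):

    import numpy as np
    import xarray as xr

    # A non-dimensional coordinate with a NaN label, mirroring the
    # "NaN in non-dimensional coordinate" test case.
    array = xr.DataArray([1, 2, 3], [("x", [1, 2, 3])])
    array["x1"] = ("x", [1, 1, np.nan])

    # The NaN-labelled element is dropped before grouping, so only the
    # x1 == 1 group remains and its sum is 1 + 2 == 3.
    result = array.groupby("x1").sum()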
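On the `where` versus `reindex` comment in the groupby.py hunk, a small illustration of why `where(..., drop=True)` is used (a sketch, assuming xarray's usual alignment rules, under which reindexing along an index with duplicate values raises an error):

    import numpy as np
    import xarray as xr

    # Duplicate coordinate labels plus a NaN, as in the repeated-labels test.
    da = xr.DataArray([0, 1, 2], [("x", [np.nan, 1.0, 1.0])])

    # `reindex` needs a unique index, so it cannot drop the NaN-labelled
    # element here; `where(..., drop=True)` keeps both elements labelled 1.
    dropped = da.where(da.x.notnull(), drop=True)
    # dropped: values [1., 2.] with coordinate x = [1., 1.]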