diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7e006e875e2..799a84410fa 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -62,9 +62,10 @@ Bug fixes - Fixed performance bug where ``cftime`` import attempted within various core operations if ``cftime`` not installed (:pull:`5640`). By `Luke Sewell `_ - - Numbers are properly formatted in a plot's title (:issue:`5788`, :pull:`5789`). By `Maxime Liquet `_. +- Fixed bug when combining named DataArrays using :py:meth:`combine_by_coords`. (:pull:`5834`). + By `Tom Nicholas `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 56956a57e02..ebf316ee6ba 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -356,6 +356,14 @@ def _nested_combine( # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) + # Promote any DataArrays to Datasets + for id, obj in combined_ids.items(): + if isinstance(obj, DataArray): + if obj.name is None: + combined_ids[id] = obj._to_temp_dataset() + else: + combined_ids[id] = obj.to_dataset() + # Apply series of concatenate or merge operations along each dimension combined = _combine_nd( combined_ids, @@ -372,11 +380,11 @@ def _nested_combine( # Define type for arbitrarily-nested list of lists recursively # Currently mypy cannot handle this but other linters can (https://stackoverflow.com/a/53845083/3154101) -DATASET_HYPERCUBE = Union[Dataset, Iterable["DATASET_HYPERCUBE"]] # type: ignore +DATA_HYPERCUBE = Union[Dataset, DataArray, Iterable["DATA_HYPERCUBE"]] # type: ignore def combine_nested( - datasets: DATASET_HYPERCUBE, + datasets: DATA_HYPERCUBE, concat_dim: Union[ str, DataArray, None, Sequence[Union[str, "DataArray", pd.Index, None]] ], @@ -386,9 +394,9 @@ def combine_nested( fill_value: object = dtypes.NA, join: str = "outer", combine_attrs: str = "drop", -) -> Dataset: +) -> Union[Dataset, DataArray]: """ - Explicitly combine an N-dimensional grid of datasets into one by using a + Explicitly combine an N-dimensional grid of datasets (or dataarrays) into one by using a succession of concat and merge operations along each dimension of the grid. Does not sort the supplied datasets under any circumstances, so the @@ -474,7 +482,8 @@ def combine_nested( Returns ------- - combined : xarray.Dataset + combined : xarray.Dataset or xarray.DataArray + Will only return a DataArray in the case that all the inputs are unnamed xarray.DataArrays. Examples -------- @@ -567,31 +576,61 @@ def combine_nested( -------- concat merge + combine_by_coords """ - mixed_datasets_and_arrays = any( - isinstance(obj, Dataset) for obj in iterate_nested(datasets) - ) and any( - isinstance(obj, DataArray) and obj.name is None - for obj in iterate_nested(datasets) - ) - if mixed_datasets_and_arrays: - raise ValueError("Can't combine datasets with unnamed arrays.") + + # TODO deprecation cycle to change the name of this argument... + data_objects = datasets if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] - # The IDs argument tells _nested_combine that datasets aren't yet sorted - return _nested_combine( - datasets, - concat_dims=concat_dim, - compat=compat, - data_vars=data_vars, - coords=coords, - ids=False, - fill_value=fill_value, - join=join, - combine_attrs=combine_attrs, - ) + objs_are_unnamed_dataarrays = [ + isinstance(data_object, DataArray) and data_object.name is None + for data_object in iterate_nested(data_objects) + ] + if any(objs_are_unnamed_dataarrays): + if all(objs_are_unnamed_dataarrays): + # Combine into a single larger DataArray + unnamed_arrays = data_objects + + combined_temp_dataset = _nested_combine( + unnamed_arrays, + concat_dims=concat_dim, + compat=compat, + data_vars=data_vars, + coords=coords, + ids=False, + fill_value=fill_value, + join=join, + combine_attrs=combine_attrs, + ) + return DataArray()._from_temp_dataset(combined_temp_dataset) + else: + # Must be a mix of unnamed dataarrays with either named dataarrays or with datasets + # Can't combine these as we wouldn't know whether to merge or concatenate the arrays + raise ValueError( + "Can't automatically combine unnamed dataarrays with either named dataarrays or datasets." + ) + else: + # Promote any named DataArrays to single-variable Datasets to simplify combining + # data_objects = [ + # obj.to_dataset() if isinstance(obj, DataArray) else obj + # for obj in data_objects + # ] + + # The IDs argument tells _nested_combine that datasets aren't yet sorted + return _nested_combine( + data_objects, + concat_dims=concat_dim, + compat=compat, + data_vars=data_vars, + coords=coords, + ids=False, + fill_value=fill_value, + join=join, + combine_attrs=combine_attrs, + ) def vars_as_keys(ds): @@ -697,7 +736,6 @@ def combine_by_coords( ---------- data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray Data objects to combine. - compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -765,6 +803,8 @@ def combine_by_coords( Returns ------- combined : xarray.Dataset or xarray.DataArray + Will only return a DataArray in the case that all the inputs are unnamed xarray.DataArrays. + See also -------- @@ -883,33 +923,41 @@ def combine_by_coords( if not data_objects: return Dataset() - mixed_arrays_and_datasets = any( - isinstance(data_object, DataArray) and data_object.name is None - for data_object in data_objects - ) and any(isinstance(data_object, Dataset) for data_object in data_objects) - if mixed_arrays_and_datasets: - raise ValueError("Can't automatically combine datasets with unnamed arrays.") - - all_unnamed_data_arrays = all( + objs_are_unnamed_dataarrays = [ isinstance(data_object, DataArray) and data_object.name is None for data_object in data_objects - ) - if all_unnamed_data_arrays: - unnamed_arrays = data_objects - temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays] - - combined_temp_dataset = _combine_single_variable_hypercube( - temp_datasets, - fill_value=fill_value, - data_vars=data_vars, - coords=coords, - compat=compat, - join=join, - combine_attrs=combine_attrs, - ) - return DataArray()._from_temp_dataset(combined_temp_dataset) - + ] + if any(objs_are_unnamed_dataarrays): + if all(objs_are_unnamed_dataarrays): + # Combine into a single larger DataArray + unnamed_arrays = data_objects + temp_datasets = [ + data_array._to_temp_dataset() for data_array in unnamed_arrays + ] + + combined_temp_dataset = _combine_single_variable_hypercube( + temp_datasets, + fill_value=fill_value, + data_vars=data_vars, + coords=coords, + compat=compat, + join=join, + combine_attrs=combine_attrs, + ) + return DataArray()._from_temp_dataset(combined_temp_dataset) + else: + # Must be a mix of unnamed dataarrays with either named dataarrays or with datasets + # Can't combine these as we wouldn't know whether to merge or concatenate the arrays + raise ValueError( + "Can't automatically combine unnamed dataarrays with either named dataarrays or datasets." + ) else: + # Promote any named DataArrays to single-variable Datasets to simplify combining + data_objects = [ + obj.to_dataset() if isinstance(obj, DataArray) else obj + for obj in data_objects + ] + # Group by data vars sorted_datasets = sorted(data_objects, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cbe09aab815..6c708ddf691 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -683,12 +683,12 @@ def test_nested_combine_mixed_datasets_arrays(self): Dataset({"x": [2, 3]}), ] with pytest.raises( - ValueError, match=r"Can't combine datasets with unnamed arrays." + ValueError, match="Can't automatically combine unnamed dataarrays with" ): combine_nested(objs, "x") -class TestCombineAuto: +class TestCombineDatasetsbyCoords: def test_combine_by_coords(self): objs = [Dataset({"x": [0]}), Dataset({"x": [1]})] actual = combine_by_coords(objs) @@ -730,17 +730,6 @@ def test_combine_by_coords(self): def test_empty_input(self): assert_identical(Dataset(), combine_by_coords([])) - def test_combine_coords_mixed_datasets_arrays(self): - objs = [ - DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})), - Dataset({"x": [2, 3]}), - ] - with pytest.raises( - ValueError, - match=r"Can't automatically combine datasets with unnamed arrays.", - ): - combine_by_coords(objs) - @pytest.mark.parametrize( "join, expected", [ @@ -1044,20 +1033,92 @@ def test_combine_by_coords_incomplete_hypercube(self): with pytest.raises(ValueError): combine_by_coords([x1, x2, x3], fill_value=None) - def test_combine_by_coords_unnamed_arrays(self): + +class TestCombineMixedObjects: + def test_combine_unnamed_named_dataarrays(self): + named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") + unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x") + + with pytest.raises( + ValueError, match="Can't automatically combine unnamed dataarrays with" + ): + combine_by_coords([named_da, unnamed_da]) + with pytest.raises( + ValueError, match="Can't automatically combine unnamed dataarrays with" + ): + combine_nested([named_da, unnamed_da], concat_dim="x") + + da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]})) + ds = Dataset({"x": [2, 3]}) + with pytest.raises( + ValueError, + match="Can't automatically combine unnamed dataarrays with", + ): + combine_by_coords([da, ds]) + with pytest.raises( + ValueError, + match="Can't automatically combine unnamed dataarrays with", + ): + combine_nested([da, ds], concat_dim="x") + + def test_combine_mixed_datasets_named_dataarrays(self): + da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]})) + ds = Dataset({"b": ("x", [2, 3])}) + expected = Dataset( + {"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])} + ) + + actual = combine_by_coords([da, ds]) + assert_identical(expected, actual) + + actual = combine_nested([da, ds], concat_dim="x") + assert_identical(expected, actual) + + def test_combine_all_unnamed_dataarrays(self): unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") + expected = unnamed_array actual = combine_by_coords([unnamed_array]) - expected = unnamed_array + assert_identical(expected, actual) + + actual = combine_nested([unnamed_array], concat_dim=None) assert_identical(expected, actual) unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x") - - actual = combine_by_coords([unnamed_array1, unnamed_array2]) expected = DataArray( data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x" ) + + actual = combine_by_coords([unnamed_array1, unnamed_array2]) + assert_identical(expected, actual) + + actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x") + assert_identical(expected, actual) + + def test_combine_all_named_dataarrays(self): + named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") + expected = named_da.to_dataset() + + actual = combine_by_coords([named_da]) + assert_identical(expected, actual) + + actual = combine_nested([named_da], concat_dim=None) + assert_identical(expected, actual) + + named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") + named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x") + expected = Dataset( + { + "a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"), + "b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"), + } + ) + + actual = combine_by_coords([named_da1, named_da2]) + assert_identical(expected, actual) + + actual = combine_nested([named_da1, named_da2], concat_dim="x") assert_identical(expected, actual)