From 3f6f6374765922fdb99cfc456831bc913fd342a5 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 16 Jul 2022 15:05:06 +0200 Subject: [PATCH 01/26] temporary API to set custom indexes --- xarray/core/dataset.py | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c3717190df6..eb9f9e0e187 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4149,6 +4149,84 @@ def reset_index( return self._replace(variables, coord_names=coord_names, indexes=indexes) + def set_xindex(self, coord_names, index_cls, **kwargs): + """Temporary API for creating and setting a new, custom index from + existing coordinate(s). + + Parameters + ---------- + coord_names : str or list + Name(s) of the coordinate(s) used to build the index. + If several names are given, their order matters. + index_cls : class + Xarray index subclass. + **kwargs + Options passed to the index constructor. Not working for now + (not sure yet how to do it). 
+ + """ + warnings.warn("This is temporary API to experiment with custom indexes") + + if not issubclass(index_cls, Index): + raise TypeError( + f"{index_cls} is not a subclass of xarray.core.indexes.Index" + ) + + if isinstance(coord_names, str): + coord_names = [coord_names] + + invalid_coords = set(coord_names) - self._coord_names + + if invalid_coords: + raise ValueError( + f"those coordinates don't exist in Dataset: {invalid_coords}" + ) + + # we could be more clever here (e.g., drop-in index replacement if index + # coordinates do not conflict), but let's not allow this for now + indexed_coords = set(coord_names) & set(self._indexes) + + if indexed_coords: + raise ValueError( + f"those coordinates already have an index: {indexed_coords}" + ) + + coord_vars = {k: self._variables[k] for k in coord_names} + + # note: extra checks (e.g., all coordinates must have the same dimension(s)) + # should be done in the implementation of Index.from_variables + index = index_cls.from_variables(coord_vars) + + # in case there are index coordinate variable wrappers + # (e.g., for PandasIndex we create coordinate variables that wrap pd.Index). 
+ # `coord_vars` passed as argument is for propagating coordinate metadata + new_coord_vars = index.create_variables(coord_vars) + + # reorder variables and indexes so that coordinates having the same index + # are next to each other + variables = {} + for k, v in self._variables.items(): + if k not in coord_names: + variables[k] = v + + for k in coord_names: + variables[k] = new_coord_vars.get(k, self._variables[k]) + + indexes = {} + for k, v in self._indexes.items(): + if k not in coord_names: + indexes[k] = v + + for k in coord_names: + indexes[k] = index + + return self._construct_direct( + variables=variables, + coord_names=self._coord_names, + dims=self._dims, + indexes=indexes, + ) + def reorder_levels( self: T_Dataset, dim_order: Mapping[Any, Sequence[int | Hashable]] | None = None, From bf30d5424b57c02fe5fb94fda27ca325ce545953 Mon Sep 17 00:00:00 2001 From: Keewis Date: Sat, 16 Jul 2022 17:53:40 +0200 Subject: [PATCH 02/26] add the temporary index API to DataArray --- xarray/core/dataarray.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9dfdb6603e6..362bfabe2c7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2248,6 +2248,24 @@ def reset_index( ds = self._to_temp_dataset().reset_index(dims_or_levels, drop=drop) return self._from_temp_dataset(ds) + def set_xindex(self, coord_names, index_cls, **kwargs): + """Temporary API for creating and setting a new, custom index from + existing coordinate(s). + + Parameters + ---------- + coord_names : str or list + Name(s) of the coordinate(s) used to build the index. + If several names are given, their order matters. + index_cls : class + Xarray index subclass. + **kwargs + Options passed to the index constructor. Not working for now + (not sure yet how to do it). 
+ """ + ds = self._to_temp_dataset().set_xindex(coord_names, index_cls, **kwargs) + return self._from_temp_dataset(ds) + def reorder_levels( self: T_DataArray, dim_order: Mapping[Any, Sequence[int | Hashable]] | None = None, From 9de9c46e2f899f64c62d4ddcd1d0ee8e08836784 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 17 Jul 2022 21:40:58 +0200 Subject: [PATCH 03/26] add options argument to Index.from_variables() It allows passing options to the constructor of a custom index class (if any). The **options arguments of Dataset.set_xindex() are passed through. Also add type annotations to set_xindex(). --- xarray/core/dataarray.py | 15 ++++++++++----- xarray/core/dataset.py | 29 ++++++++++++++++------------- xarray/core/indexes.py | 20 ++++++++++++++++---- xarray/tests/test_indexes.py | 18 ++++++++++-------- 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 362bfabe2c7..5a7fb51e629 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2248,7 +2248,12 @@ def reset_index( ds = self._to_temp_dataset().reset_index(dims_or_levels, drop=drop) return self._from_temp_dataset(ds) - def set_xindex(self, coord_names, index_cls, **kwargs): + def set_xindex( + self: T_DataArray, + coord_names: Hashable | Sequence[Hashable], + index_cls: type[Index], + **options, + ) -> T_DataArray: """Temporary API for creating and setting a new, custom index from existing coordinate(s). @@ -2259,11 +2264,11 @@ def set_xindex(self, coord_names, index_cls, **kwargs): If several names are given, their order matters. index_cls : class Xarray index subclass. - **kwargs - Options passed to the index constructor. Not working for now - (not sure yet how to do it). + **options + Options passed to the index constructor. 
+ """ - ds = self._to_temp_dataset().set_xindex(coord_names, index_cls, **kwargs) + ds = self._to_temp_dataset().set_xindex(coord_names, index_cls, **options) return self._from_temp_dataset(ds) def reorder_levels( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index eb9f9e0e187..d88a04bc6e3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4031,7 +4031,7 @@ def set_index( f"dimension mismatch: try setting an index for dimension {dim!r} with " f"variable {var_name!r} that has dimensions {var.dims}" ) - idx = PandasIndex.from_variables({dim: var}) + idx = PandasIndex.from_variables({dim: var}, {}) idx_vars = idx.create_variables({var_name: var}) else: if append: @@ -4149,7 +4149,12 @@ def reset_index( return self._replace(variables, coord_names=coord_names, indexes=indexes) - def set_xindex(self, coord_names, index_cls, **kwargs): + def set_xindex( + self: T_Dataset, + coord_names: Hashable | Sequence[Hashable], + index_cls: type[Index], + **options, + ) -> T_Dataset: """Temporary API for creating and setting a new, custom index from existing coordinate(s). @@ -4160,9 +4165,8 @@ def set_xindex(self, coord_names, index_cls, **kwargs): If several names are given, their order matters. index_cls : class Xarray index subclass. - **kwargs - Options passed to the index constructor. Not working for now - (not sure yet how to do it). + **options + Options passed to the index constructor. 
""" warnings.warn("This is temporary API to experiment with custom indexes") @@ -4172,7 +4176,8 @@ def set_xindex(self, coord_names, index_cls, **kwargs): f"{index_cls} is not a subclass of xarray.core.indexes.Index" ) - if isinstance(coord_names, str): + # the Sequence check is required for mypy + if is_scalar(coord_names) or not isinstance(coord_names, Sequence): coord_names = [coord_names] invalid_coords = set(coord_names) - self._coord_names @@ -4195,7 +4200,7 @@ def set_xindex(self, coord_names, index_cls, **kwargs): # note: extra checks (e.g., all coordinates must have the same dimension(s)) # should be done in the implementation of Index.from_variables - index = index_cls.from_variables(coord_vars) + index = index_cls.from_variables(coord_vars, options) # in case there are index coordinate variable wrappers # (e.g., for PandasIndex we create coordinate variables that wrap pd.Index). @@ -4204,20 +4209,18 @@ def set_xindex(self, coord_names, index_cls, **kwargs): # reorder variables and indexes so that coordinates having the same index # are next to each other - variables = {} + variables: dict[Hashable, Variable] = {} for k, v in self._variables.items(): if k not in coord_names: variables[k] = v - for k in coord_names: - variables[k] = new_coord_vars.get(k, self._variables[k]) - - indexes = {} + indexes: dict[Hashable, Index] = {} for k, v in self._indexes.items(): if k not in coord_names: indexes[k] = v for k in coord_names: + variables[k] = new_coord_vars.get(k, self._variables[k]) indexes[k] = index return self._construct_direct( @@ -7856,7 +7859,7 @@ def pad( # reset default index of dimension coordinates if (name,) == var.dims: dim_var = {name: variables[name]} - index = PandasIndex.from_variables(dim_var) + index = PandasIndex.from_variables(dim_var, {}) index_vars = index.create_variables(dim_var) indexes[name] = index variables[name] = index_vars[name] diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 8ff0d40ff07..3bdf11612db 100644 
--- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -35,7 +35,11 @@ class Index: """Base class inherited by all xarray-compatible indexes.""" @classmethod - def from_variables(cls, variables: Mapping[Any, Variable]) -> Index: + def from_variables( + cls, + variables: Mapping[Any, Variable], + options: Mapping[str, Any], + ) -> Index: raise NotImplementedError() @classmethod @@ -247,7 +251,11 @@ def _replace(self, index, dim=None, coord_dtype=None): return type(self)(index, dim, coord_dtype) @classmethod - def from_variables(cls, variables: Mapping[Any, Variable]) -> PandasIndex: + def from_variables( + cls, + variables: Mapping[Any, Variable], + options: Mapping[str, Any], + ) -> PandasIndex: if len(variables) != 1: raise ValueError( f"PandasIndex only accepts one variable, found {len(variables)} variables" @@ -570,7 +578,11 @@ def _replace(self, index, dim=None, level_coords_dtype=None) -> PandasMultiIndex return type(self)(index, dim, level_coords_dtype) @classmethod - def from_variables(cls, variables: Mapping[Any, Variable]) -> PandasMultiIndex: + def from_variables( + cls, + variables: Mapping[Any, Variable], + options: Mapping[str, Any], + ) -> PandasMultiIndex: _check_dim_compat(variables) dim = next(iter(variables.values())).dims[0] @@ -995,7 +1007,7 @@ def create_default_index_implicit( ) else: dim_var = {name: dim_variable} - index = PandasIndex.from_variables(dim_var) + index = PandasIndex.from_variables(dim_var, {}) index_vars = index.create_variables(dim_var) return index, index_vars diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 302a68ab552..07a4bdafafc 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -44,7 +44,7 @@ def index(self) -> CustomIndex: def test_from_variables(self) -> None: with pytest.raises(NotImplementedError): - Index.from_variables({}) + Index.from_variables({}, {}) def test_concat(self) -> None: with pytest.raises(NotImplementedError): @@ -132,19 +132,19 @@ def 
test_from_variables(self) -> None: "x", data, attrs={"unit": "m"}, encoding={"dtype": np.float64} ) - index = PandasIndex.from_variables({"x": var}) + index = PandasIndex.from_variables({"x": var}, {}) assert index.dim == "x" assert index.index.equals(pd.Index(data)) assert index.coord_dtype == data.dtype var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]]) with pytest.raises(ValueError, match=r".*only accepts one variable.*"): - PandasIndex.from_variables({"x": var, "foo": var2}) + PandasIndex.from_variables({"x": var, "foo": var2}, {}) with pytest.raises( ValueError, match=r".*only accepts a 1-dimensional variable.*" ): - PandasIndex.from_variables({"foo": var2}) + PandasIndex.from_variables({"foo": var2}, {}) def test_from_variables_index_adapter(self) -> None: # test index type is preserved when variable wraps a pd.Index @@ -152,7 +152,7 @@ def test_from_variables_index_adapter(self) -> None: pd_idx = pd.Index(data) var = xr.Variable("x", pd_idx) - index = PandasIndex.from_variables({"x": var}) + index = PandasIndex.from_variables({"x": var}, {}) assert isinstance(index.index, pd.CategoricalIndex) def test_concat_periods(self): @@ -355,7 +355,7 @@ def test_from_variables(self) -> None: ) index = PandasMultiIndex.from_variables( - {"level1": v_level1, "level2": v_level2} + {"level1": v_level1, "level2": v_level2}, {} ) expected_idx = pd.MultiIndex.from_arrays([v_level1.data, v_level2.data]) @@ -368,13 +368,15 @@ def test_from_variables(self) -> None: with pytest.raises( ValueError, match=r".*only accepts 1-dimensional variables.*" ): - PandasMultiIndex.from_variables({"var": var}) + PandasMultiIndex.from_variables({"var": var}, {}) v_level3 = xr.Variable("y", [4, 5, 6]) with pytest.raises( ValueError, match=r"unmatched dimensions for multi-index variables.*" ): - PandasMultiIndex.from_variables({"level1": v_level1, "level3": v_level3}) + PandasMultiIndex.from_variables( + {"level1": v_level1, "level3": v_level3}, {} + ) def test_concat(self) -> None: pd_midx = 
pd.MultiIndex.from_product( From aa403a42de23b608930bfd994fe7f110d6db282b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 17 Jul 2022 22:08:24 +0200 Subject: [PATCH 04/26] fix mypy --- xarray/core/dataset.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d88a04bc6e3..5bfd905f589 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4196,7 +4196,7 @@ def set_xindex( f"those coordinates already have an index: {indexed_coords}" ) - coord_vars = {k: self._variables[k] for k in coord_names} + coord_vars = {name: self._variables[name] for name in coord_names} # note: extra checks (e.g., all coordinates must have the same dimension(s)) # should be done in the implementation of Index.from_variables @@ -4210,18 +4210,18 @@ def set_xindex( # reorder variables and indexes so that coordinates having the same index # are next to each other variables: dict[Hashable, Variable] = {} - for k, v in self._variables.items(): - if k not in coord_names: - variables[k] = v + for name, var in self._variables.items(): + if name not in coord_names: + variables[name] = var indexes: dict[Hashable, Index] = {} - for k, v in self._indexes.items(): - if k not in coord_names: - indexes[k] = v + for name, idx in self._indexes.items(): + if name not in coord_names: + indexes[name] = idx - for k in coord_names: - variables[k] = new_coord_vars.get(k, self._variables[k]) - indexes[k] = index + for name in coord_names: + variables[name] = new_coord_vars.get(name, self._variables[name]) + indexes[name] = index return self._construct_direct( variables=variables, From 210a59a49297326124aa7867339bf0d6f001c04c Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 09:36:09 +0200 Subject: [PATCH 05/26] remove temporary API warning --- xarray/core/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 
5bfd905f589..5edee8d9b9c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4169,8 +4169,6 @@ def set_xindex( Options passed to the index constructor. """ - warnings.warn("This is temporary API to experiment with custom indexes") - if not issubclass(index_cls, Index): raise TypeError( f"{index_cls} is not a subclass of xarray.core.indexes.Index" From d8c39859fc74a129abc2a97357d150a8b7d0b452 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 10:22:16 +0200 Subject: [PATCH 06/26] add the Index class in Xarray's root namespace --- doc/api.rst | 1 + xarray/__init__.py | 2 ++ xarray/core/indexes.py | 6 +++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index 11ae5de8531..536f4f09203 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1080,6 +1080,7 @@ Advanced API Variable IndexVariable as_variable + Index Context register_dataset_accessor register_dataarray_accessor diff --git a/xarray/__init__.py b/xarray/__init__.py index 46dcf0e9b32..8ea955e7210 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -30,6 +30,7 @@ from .core.dataarray import DataArray from .core.dataset import Dataset from .core.extensions import register_dataarray_accessor, register_dataset_accessor +from .core.indexes import Index from .core.merge import Context, MergeError, merge from .core.options import get_options, set_options from .core.parallel import map_blocks @@ -99,6 +100,7 @@ "Coordinate", "DataArray", "Dataset", + "Index", "IndexVariable", "Variable", # Exceptions diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 3bdf11612db..8c83589feae 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -32,7 +32,11 @@ class Index: - """Base class inherited by all xarray-compatible indexes.""" + """Base class inherited by all xarray-compatible indexes. + + Do not use this class directly for creating index objects. 
+ + """ @classmethod def from_variables( From c4afabfdf94ed952f54c92f80359ab0516c5dd1c Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 10:25:58 +0200 Subject: [PATCH 07/26] improve set_xindex docstrings and add to api.rst --- doc/api.rst | 2 ++ xarray/core/dataarray.py | 19 +++++++++++++++---- xarray/core/dataset.py | 19 +++++++++++++++---- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 536f4f09203..81bd2d22b08 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -146,6 +146,7 @@ Indexing Dataset.reindex_like Dataset.set_index Dataset.reset_index + Dataset.set_xindex Dataset.reorder_levels Dataset.query @@ -330,6 +331,7 @@ Indexing DataArray.reindex_like DataArray.set_index DataArray.reset_index + DataArray.set_xindex DataArray.reorder_levels DataArray.query diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 5a7fb51e629..9cc56b23a83 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2169,6 +2169,11 @@ def set_index( """Set DataArray (multi-)indexes using one or more existing coordinates. + This legacy method is limited to pandas (multi-)indexes and + 1-dimensional "dimension" coordinates. See + :py:meth:`~DataArray.set_xindex` for setting a pandas or a custom + Xarray-compatible index from one or more arbitrary coordinates. + Parameters ---------- indexes : {dim: index, ...} @@ -2213,6 +2218,7 @@ def set_index( See Also -------- DataArray.reset_index + DataArray.set_xindex """ ds = self._to_temp_dataset().set_index(indexes, append=append, **indexes_kwargs) return self._from_temp_dataset(ds) @@ -2254,19 +2260,24 @@ def set_xindex( index_cls: type[Index], **options, ) -> T_DataArray: - """Temporary API for creating and setting a new, custom index from - existing coordinate(s). + """Set a new, Xarray-compatible index from one or more existing + coordinate(s). Parameters ---------- coord_names : str or list Name(s) of the coordinate(s) used to build the index. 
If several names are given, their order matters. - index_cls : class - Xarray index subclass. + index_cls : subclass of :class:`~xarray.Index` + The type of index to create. **options Options passed to the index constructor. + Returns + ------- + obj : Dataset + Another dataarray, with this dataarray's data and with a new index. + """ ds = self._to_temp_dataset().set_xindex(coord_names, index_cls, **options) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5edee8d9b9c..aa9a07affda 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3942,6 +3942,11 @@ def set_index( """Set Dataset (multi-)indexes using one or more existing coordinates or variables. + This legacy method is limited to pandas (multi-)indexes and + 1-dimensional "dimension" coordinates. See + :py:meth:`~Dataset.set_xindex` for setting a pandas or a custom + Xarray-compatible index from one or more arbitrary coordinates. + Parameters ---------- indexes : {dim: index, ...} @@ -3989,6 +3994,7 @@ def set_index( See Also -------- Dataset.reset_index + Dataset.set_xindex Dataset.swap_dims """ dim_coords = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") @@ -4155,19 +4161,24 @@ def set_xindex( index_cls: type[Index], **options, ) -> T_Dataset: - """Temporary API for creating and setting a new, custom index from - existing coordinate(s). + """Set a new, Xarray-compatible index from one or more existing + coordinate(s). Parameters ---------- coord_names : str or list Name(s) of the coordinate(s) used to build the index. If several names are given, their order matters. - index_cls : class - Xarray index subclass. + index_cls : subclass of :class:`~xarray.Index` + The type of index to create. **options Options passed to the index constructor. + Returns + ------- + obj : Dataset + Another dataset, with this dataset's data and with a new index. 
+ """ if not issubclass(index_cls, Index): raise TypeError( From fe723ce1a470194448a242ec01a5b01921a874cc Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 10:26:41 +0200 Subject: [PATCH 08/26] remove temp comments --- xarray/core/dataset.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index aa9a07affda..8807e00fc33 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4207,13 +4207,8 @@ def set_xindex( coord_vars = {name: self._variables[name] for name in coord_names} - # note: extra checks (e.g., all coordinates must have the same dimension(s)) - # should be done in the implementation of Index.from_variables index = index_cls.from_variables(coord_vars, options) - # in case there are index coordinate variable wrappers - # (e.g., for PandasIndex we create coordinate variables that wrap pd.Index). - # `coord_vars` passed as argument is for propagating coordinate metadata new_coord_vars = index.create_variables(coord_vars) # reorder variables and indexes so that coordinates having the same index From a48c8531e63d4725f77e3864d99dd750c3d05c5b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 11:39:50 +0200 Subject: [PATCH 09/26] special case for pandas multi-index dim coord --- xarray/core/dataset.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8807e00fc33..69e3af3864b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4211,6 +4211,12 @@ def set_xindex( new_coord_vars = index.create_variables(coord_vars) + # special case for setting a pandas multi-index from level coordinates + # TODO: remove it once we depreciate pandas multi-index dimension (tuple + # elements) coordinate + if isinstance(index, PandasMultiIndex): + coord_names = [index.dim] + list(coord_names) + # reorder variables and indexes so that coordinates having the same index # are next to each other 
variables: dict[Hashable, Variable] = {} @@ -4224,12 +4230,15 @@ def set_xindex( indexes[name] = idx for name in coord_names: - variables[name] = new_coord_vars.get(name, self._variables[name]) + try: + variables[name] = new_coord_vars[name] + except KeyError: + variables[name] = self._variables[name] indexes[name] = index return self._construct_direct( variables=variables, - coord_names=self._coord_names, + coord_names=self._coord_names | set(coord_names), dims=self._dims, indexes=indexes, ) From 01de6bd49259e943b962b492f2b9cdff1e463907 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 11:41:07 +0200 Subject: [PATCH 10/26] add tests for set_xindex --- xarray/tests/test_dataarray.py | 17 ++++++++++++++ xarray/tests/test_dataset.py | 43 +++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 298840f3f66..11e145bf2c1 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2046,6 +2046,23 @@ def test_reorder_levels(self) -> None: with pytest.raises(ValueError, match=r"has no MultiIndex"): array.reorder_levels(x=["level_1", "level_2"]) + def test_set_xindex(self) -> None: + da = DataArray( + [1, 2, 3, 4], coords={"foo": ("x", ["a", "a", "b", "b"])}, dims="x" + ) + + class IndexWithOptions(Index): + def __init__(self, opt): + self.opt = opt + + @classmethod + def from_variables(cls, variables, options): + return cls(options["opt"]) + + indexed = da.set_xindex("foo", IndexWithOptions, opt=1) + assert "foo" in indexed.xindexes + assert getattr(indexed.xindexes["foo"], "opt") == 1 + def test_dataset_getitem(self) -> None: dv = self.ds["foo"] assert_identical(dv, self.dv) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e0bc73ec044..7b5e262ab5e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -30,7 +30,7 @@ from xarray.core import dtypes, indexing, utils from 
xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import DatasetCoordinates -from xarray.core.indexes import Index +from xarray.core.indexes import Index, PandasIndex, PandasMultiIndex from xarray.core.pycompat import integer_types, sparse_array_type from xarray.core.utils import is_scalar @@ -3260,6 +3260,47 @@ def test_reorder_levels(self) -> None: with pytest.raises(ValueError, match=r"has no MultiIndex"): ds.reorder_levels(x=["level_1", "level_2"]) + def test_set_xindex(self) -> None: + ds = Dataset( + coords={"foo": ("x", ["a", "a", "b", "b"]), "bar": ("x", [0, 1, 2, 3])} + ) + + actual = ds.set_xindex("foo", PandasIndex) + expected = ds.set_index(x="foo").rename_vars(x="foo") + assert_identical(actual, expected, check_default_indexes=False) + + actual_mindex = ds.set_xindex(["foo", "bar"], PandasMultiIndex) + expected_mindex = ds.set_index(x=["foo", "bar"]) + assert_identical(actual_mindex, expected_mindex) + + class NotAnIndex: + ... + + with pytest.raises(TypeError, match=".*not a subclass of xarray.Index"): + ds.set_xindex("foo", NotAnIndex) # type: ignore + + with pytest.raises(ValueError, match="those coordinates don't exist"): + ds.set_xindex("not_a_coordinate", PandasIndex) + + ds2 = Dataset(coords={"x": ("x", [0, 1, 2, 3])}) + + with pytest.raises(ValueError, match="those coordinates already have an index"): + ds2.set_xindex("x", PandasIndex) + + def test_set_xindex_options(self) -> None: + ds = Dataset(coords={"foo": ("x", ["a", "a", "b", "b"])}) + + class IndexWithOptions(Index): + def __init__(self, opt): + self.opt = opt + + @classmethod + def from_variables(cls, variables, options): + return cls(options["opt"]) + + indexed = ds.set_xindex("foo", IndexWithOptions, opt=1) + assert getattr(indexed.xindexes["foo"], "opt") == 1 + def test_stack(self) -> None: ds = Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3]])}, From 201bd05ada1d4969e620e376cd9102fbd7a72de2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 
31 Aug 2022 11:42:01 +0200 Subject: [PATCH 11/26] error message tweaks --- xarray/core/dataset.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 69e3af3864b..b044f9f9d1e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4181,9 +4181,7 @@ def set_xindex( """ if not issubclass(index_cls, Index): - raise TypeError( - f"{index_cls} is not a subclass of xarray.core.indexes.Index" - ) + raise TypeError(f"{index_cls} is not a subclass of xarray.Index") # the Sequence check is required for mypy if is_scalar(coord_names) or not isinstance(coord_names, Sequence): @@ -4192,9 +4190,7 @@ def set_xindex( invalid_coords = set(coord_names) - self._coord_names if invalid_coords: - raise ValueError( - f"those coordinates don't exist in Dataset: {invalid_coords}" - ) + raise ValueError(f"those coordinates don't exist: {invalid_coords}") # we could be more clever here (e.g., drop-in index replacement if index # coordinates do not conflict), but let's not allow this for now From 41c896f9bdfc1dcdce432ff588b8013cca7f96d6 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 13:06:06 +0200 Subject: [PATCH 12/26] set_xindex with 1 coord: avoid reordering coords --- xarray/core/dataset.py | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b044f9f9d1e..701a1edb542 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4213,29 +4213,39 @@ def set_xindex( if isinstance(index, PandasMultiIndex): coord_names = [index.dim] + list(coord_names) - # reorder variables and indexes so that coordinates having the same index - # are next to each other - variables: dict[Hashable, Variable] = {} - for name, var in self._variables.items(): - if name not in coord_names: - variables[name] = var + variables: dict[Hashable, Variable] - indexes: dict[Hashable, Index] = {} - for 
name, idx in self._indexes.items(): - if name not in coord_names: - indexes[name] = idx + if len(coord_names) == 1: + variables = self._variables.copy() + indexes = self._indexes.copy() - for name in coord_names: - try: + name = set(coord_names).pop() + if name in new_coord_vars: variables[name] = new_coord_vars[name] - except KeyError: - variables[name] = self._variables[name] indexes[name] = index + else: + # reorder variables and indexes so that coordinates having the same + # index are next to each other + variables = {} + for name, var in self._variables.items(): + if name not in coord_names: + variables[name] = var - return self._construct_direct( + indexes: dict[Hashable, Index] = {} + for name, idx in self._indexes.items(): + if name not in coord_names: + indexes[name] = idx + + for name in coord_names: + try: + variables[name] = new_coord_vars[name] + except KeyError: + variables[name] = self._variables[name] + indexes[name] = index + + return self._replace( variables=variables, coord_names=self._coord_names | set(coord_names), - dims=self._dims, indexes=indexes, ) From 1ec5ca683ac8eeb38dbf7fbdd1feaaf52b56b997 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 13:21:03 +0200 Subject: [PATCH 13/26] mypy fixes --- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9cc56b23a83..e8bf2ae439f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2255,11 +2255,11 @@ def reset_index( return self._from_temp_dataset(ds) def set_xindex( - self: T_DataArray, + self, coord_names: Hashable | Sequence[Hashable], index_cls: type[Index], **options, - ) -> T_DataArray: + ) -> DataArray: """Set a new, Xarray-compatible index from one or more existing coordinate(s). 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 701a1edb542..b3fdf9de8f4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4156,11 +4156,11 @@ def reset_index( return self._replace(variables, coord_names=coord_names, indexes=indexes) def set_xindex( - self: T_Dataset, + self, coord_names: Hashable | Sequence[Hashable], index_cls: type[Index], **options, - ) -> T_Dataset: + ) -> Dataset: """Set a new, Xarray-compatible index from one or more existing coordinate(s). @@ -4214,12 +4214,13 @@ def set_xindex( coord_names = [index.dim] + list(coord_names) variables: dict[Hashable, Variable] + indexes: dict[Hashable, Index] if len(coord_names) == 1: variables = self._variables.copy() indexes = self._indexes.copy() - name = set(coord_names).pop() + name = list(coord_names).pop() if name in new_coord_vars: variables[name] = new_coord_vars[name] indexes[name] = index @@ -4231,7 +4232,7 @@ def set_xindex( if name not in coord_names: variables[name] = var - indexes: dict[Hashable, Index] = {} + indexes = {} for name, idx in self._indexes.items(): if name not in coord_names: indexes[name] = idx From a6caa7ada3a6add031e0f9181815698d30d06ed2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 13:21:34 +0200 Subject: [PATCH 14/26] add Dataset and DataArray drop_indexes methods --- xarray/core/dataarray.py | 25 +++++++++++++++++++ xarray/core/dataset.py | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e8bf2ae439f..e3101d494c8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2592,6 +2592,31 @@ def drop_vars( ds = self._to_temp_dataset().drop_vars(names, errors=errors) return self._from_temp_dataset(ds) + def drop_indexes( + self, + coord_names: Hashable | Iterable[Hashable], + *, + errors: ErrorOptions = "raise", + ) -> DataArray: + """Drop the indexes assigned to the given coordinates. 
+ + Parameters + ---------- + coord_names : hashable or iterable of hashable + Name(s) of the coordinate(s) for which to drop the index. + errors : {"raise", "ignore"}, default: "raise" + If 'raise', raises a ValueError error if any of the coordinates + passed have no index or are not in the dataset. + If 'ignore', no error is raised. + + Returns + ------- + dropped : DataArray + A new dataarray with dropped indexes. + """ + ds = self._to_temp_dataset().drop_indexes(coord_names, errors=errors) + return self._from_temp_dataset(ds) + def drop( self: T_DataArray, labels: Mapping[Any, Any] | None = None, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b3fdf9de8f4..fbc869d2f07 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4971,6 +4971,59 @@ def drop_vars( variables, coord_names=coord_names, indexes=indexes ) + def drop_indexes( + self: T_Dataset, + coord_names: Hashable | Iterable[Hashable], + *, + errors: ErrorOptions = "raise", + ) -> T_Dataset: + """Drop the indexes assigned to the given coordinates. + + Parameters + ---------- + coord_names : hashable or iterable of hashable + Name(s) of the coordinate(s) for which to drop the index. + errors : {"raise", "ignore"}, default: "raise" + If 'raise', raises a ValueError error if any of the coordinates + passed have no index or are not in the dataset. + If 'ignore', no error is raised. + + Returns + ------- + dropped : Dataset + A new dataset with dropped indexes. 
+ + """ + # the Iterable check is required for mypy + if is_scalar(coord_names) or not isinstance(coord_names, Iterable): + coord_names = {coord_names} + else: + coord_names = set(coord_names) + + if errors == "raise": + invalid_coords = coord_names - self._coord_names + if invalid_coords: + raise ValueError(f"those coordinates don't exist: {invalid_coords}") + + unindexed_coords = set(coord_names) - set(self._indexes) + if unindexed_coords: + raise ValueError( + f"those coordinates do not have an index: {unindexed_coords}" + ) + + assert_no_index_corrupted(self.xindexes, coord_names) + + variables = {} + for name, var in self._variables.items(): + if name in coord_names: + variables[name] = var.to_base_variable() + else: + variables[name] = var + + indexes = {k: v for k, v in self._indexes.items() if k not in coord_names} + + return self._replace(variables=variables, indexes=indexes) + def drop( self: T_Dataset, labels=None, From bb07d5a57786ca4bfacd11bc33aad45999e6a8ba Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 14:23:56 +0200 Subject: [PATCH 15/26] improve assert_no_index_corrupted error msg --- xarray/core/dataset.py | 2 +- xarray/core/indexes.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fbc869d2f07..34aa9cc5e05 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5011,7 +5011,7 @@ def drop_indexes( f"those coordinates do not have an index: {unindexed_coords}" ) - assert_no_index_corrupted(self.xindexes, coord_names) + assert_no_index_corrupted(self.xindexes, coord_names, action="remove index(es)") variables = {} for name, var in self._variables.items(): diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 8c83589feae..bb29153288f 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1407,8 +1407,9 @@ def filter_indexes_from_coords( def assert_no_index_corrupted( indexes: Indexes[Index], coord_names: 
set[Hashable], + action: str = "remove coordinate(s)", ) -> None: - """Assert removing coordinates will not corrupt indexes.""" + """Assert removing coordinates or indexes will not corrupt indexes.""" # An index may be corrupted when the set of its corresponding coordinate name(s) # partially overlaps the set of coordinate names to remove @@ -1418,7 +1419,7 @@ def assert_no_index_corrupted( common_names_str = ", ".join(f"{k!r}" for k in common_names) index_names_str = ", ".join(f"{k!r}" for k in index_coords) raise ValueError( - f"cannot remove coordinate(s) {common_names_str}, which would corrupt " + f"cannot {action} {common_names_str}, which would corrupt " f"the following index built from coordinates {index_names_str}:\n" f"{index}" ) From ec2f8fca562380fc9696248da3b4c1b448e1cc90 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 14:24:49 +0200 Subject: [PATCH 16/26] drop_indexes: add tests --- xarray/tests/test_dataarray.py | 8 ++++++++ xarray/tests/test_dataset.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 11e145bf2c1..f6627ff2d60 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2522,6 +2522,14 @@ def test_drop_index_positions(self) -> None: expected = arr[:, 2:] assert_identical(actual, expected) + def test_drop_indexes(self) -> None: + arr = DataArray([1, 2, 3], coords={"x": ("x", [1, 2, 3])}, dims="x") + actual = arr.drop_indexes("x") + assert "x" not in actual.xindexes + + actual = arr.drop_indexes("not_a_coord", errors="ignore") + assert_identical(actual, arr) + def test_dropna(self) -> None: x = np.random.randn(4, 4) x[::2, 0] = np.nan diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7b5e262ab5e..33108ea238a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2648,6 +2648,41 @@ def test_drop_labels_by_position(self) -> None: with 
pytest.raises(KeyError): data.drop_isel(z=1) + def test_drop_indexes(self) -> None: + ds = Dataset( + coords={ + "x": ("x", [0, 1, 2]), + "y": ("y", [3, 4, 5]), + "foo": ("x", ["a", "a", "b"]), + } + ) + + actual = ds.drop_indexes("x") + assert "x" not in actual.xindexes + assert type(actual.x.variable) is Variable + + actual = ds.drop_indexes(["x", "y"]) + assert "x" not in actual.xindexes + assert "y" not in actual.xindexes + assert type(actual.x.variable) is Variable + assert type(actual.y.variable) is Variable + + with pytest.raises(ValueError, match="those coordinates don't exist"): + ds.drop_indexes("not_a_coord") + + with pytest.raises(ValueError, match="those coordinates do not have an index"): + ds.drop_indexes("foo") + + actual = ds.drop_indexes(["foo", "not_a_coord"], errors="ignore") + assert_identical(actual, ds) + + # test index corrupted + mindex = pd.MultiIndex.from_tuples([([1, 2]), ([3, 4])], names=["a", "b"]) + ds = Dataset(coords={"x": mindex}) + + with pytest.raises(ValueError, match=".*would corrupt the following index.*"): + ds.drop_indexes("a") + def test_drop_dims(self) -> None: data = xr.Dataset( { From f9601b9406ff480c5125bf2365226d4e5858a401 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 14:26:20 +0200 Subject: [PATCH 17/26] add drop_indexes to api.rst --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 81bd2d22b08..bdea3ef486c 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -107,6 +107,7 @@ Dataset contents Dataset.swap_dims Dataset.expand_dims Dataset.drop_vars + Dataset.drop_indexes Dataset.drop_duplicates Dataset.drop_dims Dataset.set_coords @@ -299,6 +300,7 @@ DataArray contents DataArray.swap_dims DataArray.expand_dims DataArray.drop_vars + DataArray.drop_indexes DataArray.drop_duplicates DataArray.reset_coords DataArray.copy From 1a555bcc03adae36f4f4e72331b09172770c4223 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 15:47:29 +0200 Subject: [PATCH 
18/26] improve docstrings of legacy methods --- xarray/core/dataarray.py | 8 ++++++++ xarray/core/dataset.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e3101d494c8..be142b89150 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2232,6 +2232,12 @@ def reset_index( ) -> DataArray: """Reset the specified index(es) or multi-index level(s). + This legacy method is specific to pandas (multi-)indexes and + 1-dimensional "dimension" coordinates. See the more generic + :py:meth:`~DataArray.drop_indexes` and :py:meth:`~DataArray.set_xindex` + method to respectively drop and set pandas or custom indexes for + arbitrary coordinates. + Parameters ---------- dims_or_levels : Hashable or sequence of Hashable @@ -2250,6 +2256,8 @@ def reset_index( See Also -------- DataArray.set_index + DataArray.set_xindex + DataArray.drop_indexes """ ds = self._to_temp_dataset().reset_index(dims_or_levels, drop=drop) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 34aa9cc5e05..9caf1d979f0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4086,6 +4086,12 @@ def reset_index( ) -> T_Dataset: """Reset the specified index(es) or multi-index level(s). + This legacy method is specific to pandas (multi-)indexes and + 1-dimensional "dimension" coordinates. See the more generic + :py:meth:`~Dataset.drop_indexes` and :py:meth:`~Dataset.set_xindex` + method to respectively drop and set pandas or custom indexes for + arbitrary coordinates. 
+ Parameters ---------- dims_or_levels : Hashable or Sequence of Hashable @@ -4103,6 +4109,8 @@ def reset_index( See Also -------- Dataset.set_index + Dataset.set_xindex + Dataset.drop_indexes """ if isinstance(dims_or_levels, str) or not isinstance(dims_or_levels, Sequence): dims_or_levels = [dims_or_levels] From 0b7d582710efc4a7fac3de73e93c270a13d397f6 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2022 15:48:02 +0200 Subject: [PATCH 19/26] add what's new entry --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9ce51e48983..6c780522298 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,10 @@ v2022.07.0 (unreleased) New Features ~~~~~~~~~~~~ +- Add :py:meth:`Dataset.set_xindex` and :py:meth:`Dataset.drop_indexes` and + their DataArray counterpart for setting and dropping pandas or custom indexes + given a set of arbitrary coordinates. (:pull:`6971`) + By `Benoît Bovy `_ and `Justus Magin `_. Breaking changes ~~~~~~~~~~~~~~~~ From 3ab0bc934d393664ff872c6dd71539ff547ddd00 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 1 Sep 2022 09:32:57 +0200 Subject: [PATCH 20/26] try using correct typing w/o mypy complaining --- xarray/core/dataarray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index be142b89150..dead0d44f0a 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2263,11 +2263,11 @@ def reset_index( return self._from_temp_dataset(ds) def set_xindex( - self, + self: T_DataArray, coord_names: Hashable | Sequence[Hashable], index_cls: type[Index], **options, - ) -> DataArray: + ) -> T_DataArray: """Set a new, Xarray-compatible index from one or more existing coordinate(s). 
@@ -2601,11 +2601,11 @@ def drop_vars( return self._from_temp_dataset(ds) def drop_indexes( - self, + self: T_DataArray, coord_names: Hashable | Iterable[Hashable], *, errors: ErrorOptions = "raise", - ) -> DataArray: + ) -> T_DataArray: """Drop the indexes assigned to the given coordinates. Parameters From 9e75f959ebca1838506a7161021bc93237ab24cf Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 Sep 2022 10:51:39 +0200 Subject: [PATCH 21/26] make index_cls arg optional Try setting a pandas (multi-)index by default. --- xarray/core/dataarray.py | 5 +++-- xarray/core/dataset.py | 19 +++++++++++++------ xarray/tests/test_dataset.py | 6 +++--- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index dead0d44f0a..bf0e56d0a05 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2265,7 +2265,7 @@ def reset_index( def set_xindex( self: T_DataArray, coord_names: Hashable | Sequence[Hashable], - index_cls: type[Index], + index_cls: type[Index] | None = None, **options, ) -> T_DataArray: """Set a new, Xarray-compatible index from one or more existing @@ -2277,7 +2277,8 @@ def set_xindex( Name(s) of the coordinate(s) used to build the index. If several names are given, their order matters. index_cls : subclass of :class:`~xarray.Index` - The type of index to create. + The type of index to create. By default, try setting + a pandas (multi-)index from the supplied coordinates. **options Options passed to the index constructor. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9caf1d979f0..0708d44fd81 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4166,7 +4166,7 @@ def reset_index( def set_xindex( self, coord_names: Hashable | Sequence[Hashable], - index_cls: type[Index], + index_cls: type[Index] | None = None, **options, ) -> Dataset: """Set a new, Xarray-compatible index from one or more existing @@ -4177,8 +4177,9 @@ def set_xindex( coord_names : str or list Name(s) of the coordinate(s) used to build the index. If several names are given, their order matters. - index_cls : subclass of :class:`~xarray.Index` - The type of index to create. + index_cls : subclass of :class:`~xarray.Index`, optional + The type of index to create. By default, try setting + a pandas (multi-)index from the supplied coordinates. **options Options passed to the index constructor. @@ -4188,13 +4189,19 @@ def set_xindex( Another dataset, with this dataset's data and with a new index. """ - if not issubclass(index_cls, Index): - raise TypeError(f"{index_cls} is not a subclass of xarray.Index") - # the Sequence check is required for mypy if is_scalar(coord_names) or not isinstance(coord_names, Sequence): coord_names = [coord_names] + if index_cls is None: + if len(coord_names) == 1: + index_cls = PandasIndex + else: + index_cls = PandasMultiIndex + else: + if not issubclass(index_cls, Index): + raise TypeError(f"{index_cls} is not a subclass of xarray.Index") + invalid_coords = set(coord_names) - self._coord_names if invalid_coords: diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 33108ea238a..78ac02b4346 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -30,7 +30,7 @@ from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import DatasetCoordinates -from xarray.core.indexes import Index, PandasIndex, PandasMultiIndex +from 
xarray.core.indexes import Index, PandasIndex from xarray.core.pycompat import integer_types, sparse_array_type from xarray.core.utils import is_scalar @@ -3300,11 +3300,11 @@ def test_set_xindex(self) -> None: coords={"foo": ("x", ["a", "a", "b", "b"]), "bar": ("x", [0, 1, 2, 3])} ) - actual = ds.set_xindex("foo", PandasIndex) + actual = ds.set_xindex("foo") expected = ds.set_index(x="foo").rename_vars(x="foo") assert_identical(actual, expected, check_default_indexes=False) - actual_mindex = ds.set_xindex(["foo", "bar"], PandasMultiIndex) + actual_mindex = ds.set_xindex(["foo", "bar"]) expected_mindex = ds.set_index(x=["foo", "bar"]) assert_identical(actual_mindex, expected_mindex) From 00c271125d3a5d3ce3c805f5ce59a379b7d9eb62 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 23 Sep 2022 09:45:17 +0200 Subject: [PATCH 22/26] docstrings fixes and tweaks --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bf0e56d0a05..e3d859964bd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2284,7 +2284,7 @@ def set_xindex( Returns ------- - obj : Dataset + obj : DataArray Another dataarray, with this dataarray's data and with a new index. """ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0708d44fd81..9f6a75421ac 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4179,7 +4179,8 @@ def set_xindex( If several names are given, their order matters. index_cls : subclass of :class:`~xarray.Index`, optional The type of index to create. By default, try setting - a pandas (multi-)index from the supplied coordinates. + a ``PandasIndex`` if ``len(coord_names) == 1``, + otherwise a ``PandasMultiIndex``. **options Options passed to the index constructor. 
From cb6761254d5525aa091375d2b8006c2036eee982 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 23 Sep 2022 10:01:09 +0200 Subject: [PATCH 23/26] make Index.from_variables options arg keyword only --- xarray/core/dataset.py | 6 +++--- xarray/core/indexes.py | 5 ++++- xarray/tests/test_indexes.py | 16 ++++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9f6a75421ac..53201e203e5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4037,7 +4037,7 @@ def set_index( f"dimension mismatch: try setting an index for dimension {dim!r} with " f"variable {var_name!r} that has dimensions {var.dims}" ) - idx = PandasIndex.from_variables({dim: var}, {}) + idx = PandasIndex.from_variables({dim: var}, options={}) idx_vars = idx.create_variables({var_name: var}) else: if append: @@ -4219,7 +4219,7 @@ def set_xindex( coord_vars = {name: self._variables[name] for name in coord_names} - index = index_cls.from_variables(coord_vars, options) + index = index_cls.from_variables(coord_vars, options=options) new_coord_vars = index.create_variables(coord_vars) @@ -7948,7 +7948,7 @@ def pad( # reset default index of dimension coordinates if (name,) == var.dims: dim_var = {name: variables[name]} - index = PandasIndex.from_variables(dim_var, {}) + index = PandasIndex.from_variables(dim_var, options={}) index_vars = index.create_variables(dim_var) indexes[name] = index variables[name] = index_vars[name] diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index bb29153288f..9ec32cb1b0e 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -42,6 +42,7 @@ class Index: def from_variables( cls, variables: Mapping[Any, Variable], + *, options: Mapping[str, Any], ) -> Index: raise NotImplementedError() @@ -258,6 +259,7 @@ def _replace(self, index, dim=None, coord_dtype=None): def from_variables( cls, variables: Mapping[Any, Variable], + *, options: Mapping[str, Any], ) -> 
PandasIndex: if len(variables) != 1: @@ -585,6 +587,7 @@ def _replace(self, index, dim=None, level_coords_dtype=None) -> PandasMultiIndex def from_variables( cls, variables: Mapping[Any, Variable], + *, options: Mapping[str, Any], ) -> PandasMultiIndex: _check_dim_compat(variables) @@ -1011,7 +1014,7 @@ def create_default_index_implicit( ) else: dim_var = {name: dim_variable} - index = PandasIndex.from_variables(dim_var, {}) + index = PandasIndex.from_variables(dim_var, options={}) index_vars = index.create_variables(dim_var) return index, index_vars diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 07a4bdafafc..9abe9178296 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -44,7 +44,7 @@ def index(self) -> CustomIndex: def test_from_variables(self) -> None: with pytest.raises(NotImplementedError): - Index.from_variables({}, {}) + Index.from_variables({}, options={}) def test_concat(self) -> None: with pytest.raises(NotImplementedError): @@ -132,19 +132,19 @@ def test_from_variables(self) -> None: "x", data, attrs={"unit": "m"}, encoding={"dtype": np.float64} ) - index = PandasIndex.from_variables({"x": var}, {}) + index = PandasIndex.from_variables({"x": var}, options={}) assert index.dim == "x" assert index.index.equals(pd.Index(data)) assert index.coord_dtype == data.dtype var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]]) with pytest.raises(ValueError, match=r".*only accepts one variable.*"): - PandasIndex.from_variables({"x": var, "foo": var2}, {}) + PandasIndex.from_variables({"x": var, "foo": var2}, options={}) with pytest.raises( ValueError, match=r".*only accepts a 1-dimensional variable.*" ): - PandasIndex.from_variables({"foo": var2}, {}) + PandasIndex.from_variables({"foo": var2}, options={}) def test_from_variables_index_adapter(self) -> None: # test index type is preserved when variable wraps a pd.Index @@ -152,7 +152,7 @@ def test_from_variables_index_adapter(self) -> None: pd_idx = 
pd.Index(data) var = xr.Variable("x", pd_idx) - index = PandasIndex.from_variables({"x": var}, {}) + index = PandasIndex.from_variables({"x": var}, options={}) assert isinstance(index.index, pd.CategoricalIndex) def test_concat_periods(self): @@ -355,7 +355,7 @@ def test_from_variables(self) -> None: ) index = PandasMultiIndex.from_variables( - {"level1": v_level1, "level2": v_level2}, {} + {"level1": v_level1, "level2": v_level2}, options={} ) expected_idx = pd.MultiIndex.from_arrays([v_level1.data, v_level2.data]) @@ -368,14 +368,14 @@ def test_from_variables(self) -> None: with pytest.raises( ValueError, match=r".*only accepts 1-dimensional variables.*" ): - PandasMultiIndex.from_variables({"var": var}, {}) + PandasMultiIndex.from_variables({"var": var}, options={}) v_level3 = xr.Variable("y", [4, 5, 6]) with pytest.raises( ValueError, match=r"unmatched dimensions for multi-index variables.*" ): PandasMultiIndex.from_variables( - {"level1": v_level1, "level3": v_level3}, {} + {"level1": v_level1, "level3": v_level3}, options={} ) def test_concat(self) -> None: From 2cd0aa848c374af77c3d50b4f4e03dc36deb8f42 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 23 Sep 2022 11:59:43 +0200 Subject: [PATCH 24/26] improve set_xindex invalid coordinates error msg --- xarray/core/dataset.py | 11 ++++++++++- xarray/tests/test_dataset.py | 7 ++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2691ea28ae8..a759135f616 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4207,7 +4207,16 @@ def set_xindex( invalid_coords = set(coord_names) - self._coord_names if invalid_coords: - raise ValueError(f"those coordinates don't exist: {invalid_coords}") + msg = ["invalid coordinate(s)"] + no_vars = invalid_coords - set(self._variables) + data_vars = invalid_coords - no_vars + if no_vars: + msg.append(f"those variables don't exist: {no_vars}") + if data_vars: + msg.append( + f"those variables 
are data variables: {data_vars}, use `set_coords` first" + ) + raise ValueError("\n".join(msg)) # we could be more clever here (e.g., drop-in index replacement if index # coordinates do not conflict), but let's not allow this for now diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 78ac02b4346..9a5d2e23760 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3314,9 +3314,14 @@ class NotAnIndex: with pytest.raises(TypeError, match=".*not a subclass of xarray.Index"): ds.set_xindex("foo", NotAnIndex) # type: ignore - with pytest.raises(ValueError, match="those coordinates don't exist"): + with pytest.raises(ValueError, match="those variables don't exist"): ds.set_xindex("not_a_coordinate", PandasIndex) + ds["data_var"] = ("x", [1, 2, 3, 4]) + + with pytest.raises(ValueError, match="those variables are data variables"): + ds.set_xindex("data_var", PandasIndex) + ds2 = Dataset(coords={"x": ("x", [0, 1, 2, 3])}) with pytest.raises(ValueError, match="those coordinates already have an index"): From 61d6e28193fdc89b4db68e051228d75f5e00b7ba Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 27 Sep 2022 11:56:32 +0200 Subject: [PATCH 25/26] add xarray.indexes namespace --- doc/api.rst | 7 ++++++- xarray/__init__.py | 2 -- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/indexes/__init__.py | 7 +++++++ 5 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 xarray/indexes/__init__.py diff --git a/doc/api.rst b/doc/api.rst index bdea3ef486c..c3488389d4c 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1084,7 +1084,7 @@ Advanced API Variable IndexVariable as_variable - Index + indexes.Index Context register_dataset_accessor register_dataarray_accessor @@ -1092,6 +1092,11 @@ Advanced API backends.BackendArray backends.BackendEntrypoint +Default, pandas-backed indexes built-in Xarray: + + indexes.PandasIndex + indexes.PandasMultiIndex + These backends provide a low-level interface for 
lazily loading data from external file-formats or protocols, and can be manually invoked to create arguments for the ``load_store`` and ``dump_to_store`` Dataset methods: diff --git a/xarray/__init__.py b/xarray/__init__.py index 8ea955e7210..46dcf0e9b32 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -30,7 +30,6 @@ from .core.dataarray import DataArray from .core.dataset import Dataset from .core.extensions import register_dataarray_accessor, register_dataset_accessor -from .core.indexes import Index from .core.merge import Context, MergeError, merge from .core.options import get_options, set_options from .core.parallel import map_blocks @@ -100,7 +99,6 @@ "Coordinate", "DataArray", "Dataset", - "Index", "IndexVariable", "Variable", # Exceptions diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e6922f60ef3..7d1509d4483 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2277,7 +2277,7 @@ def set_xindex( coord_names : str or list Name(s) of the coordinate(s) used to build the index. If several names are given, their order matters. - index_cls : subclass of :class:`~xarray.Index` + index_cls : subclass of :class:`~xarray.indexes.Index` The type of index to create. By default, try setting a pandas (multi-)index from the supplied coordinates. **options diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a759135f616..a5d0c5cfe5f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4178,7 +4178,7 @@ def set_xindex( coord_names : str or list Name(s) of the coordinate(s) used to build the index. If several names are given, their order matters. - index_cls : subclass of :class:`~xarray.Index`, optional + index_cls : subclass of :class:`~xarray.indexes.Index`, optional The type of index to create. By default, try setting a ``PandasIndex`` if ``len(coord_names) == 1``, otherwise a ``PandasMultiIndex``. 
diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py new file mode 100644 index 00000000000..41321c9a0ff --- /dev/null +++ b/xarray/indexes/__init__.py @@ -0,0 +1,7 @@ +"""Xarray index objects for label-based selection and alignment of Dataset / +DataArray objects. + +""" +from ..core.indexes import Index, PandasIndex, PandasMultiIndex + +__all__ = ["Index", "PandasIndex", "PandasMultiIndex"] From b598447ba2e9c98bb1186719dc9bc6be95e13042 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 27 Sep 2022 13:47:08 +0200 Subject: [PATCH 26/26] type tweaks --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 94f9e2990e7..f98879b689c 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2296,7 +2296,7 @@ def reset_index( def set_xindex( self: T_DataArray, - coord_names: Hashable | Sequence[Hashable], + coord_names: str | Sequence[Hashable], index_cls: type[Index] | None = None, **options, ) -> T_DataArray: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1be3c69b45b..7a73979cef9 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4240,11 +4240,11 @@ def drop_or_convert(var_names): ) def set_xindex( - self, - coord_names: Hashable | Sequence[Hashable], + self: T_Dataset, + coord_names: str | Sequence[Hashable], index_cls: type[Index] | None = None, **options, - ) -> Dataset: + ) -> T_Dataset: """Set a new, Xarray-compatible index from one or more existing coordinate(s).