Skip to content

Allow setting (or skipping) new indexes in open_dataset #8051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Jul 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c703ebc
add set_indexes parameter to open_dataset
benbovy Aug 7, 2023
6f54cd5
implement set_indexes in (zarr) backend store
benbovy Aug 7, 2023
f77aac7
Merge branch 'main' into backend-set-indexes
dcherian Nov 13, 2023
eae983b
Merge branch 'main' into backend-set-indexes
TomNicholas Feb 3, 2024
dfe6496
Merge branch 'main' into backend-set-indexes
keewis Jun 29, 2025
145ae1c
replace `set_indexes` with `create_default_indexes`
keewis Jun 30, 2025
192c367
make sure indexes set by the backend survive
keewis Jun 30, 2025
f5823a7
also add the parameter to `open_datatree`
keewis Jun 30, 2025
2ff8402
share the implementation of the default indexes creation
keewis Jun 30, 2025
82d629b
Merge branch 'main' into backend-set-indexes
keewis Jun 30, 2025
de5ce26
Merge branch 'main' into backend-set-indexes
keewis Jul 1, 2025
294b2f7
check that the store backend entrypoint does not create default indexes
keewis Jul 1, 2025
5c3a843
actually do not create default indexes in the backends
keewis Jul 1, 2025
08939de
rename the helper
keewis Jul 1, 2025
0f281b1
Merge branch 'main' into backend-set-indexes
dcherian Jul 2, 2025
4620490
Merge branch 'main' into backend-set-indexes
dcherian Jul 3, 2025
95dbf8e
move the handling of `create_default_indexes` up the call stack
keewis Jul 3, 2025
eb4f866
Merge branch 'main' into backend-set-indexes
keewis Jul 3, 2025
d7e6daa
what's new
keewis Jul 3, 2025
687f0c2
Merge branch 'main' into backend-set-indexes
benbovy Jul 8, 2025
3d483d3
Fix
dcherian Jul 8, 2025
741564e
fix again
dcherian Jul 8, 2025
8889eda
also create default indexes without chunks
keewis Jul 8, 2025
804db4c
also copy `_close`
keewis Jul 8, 2025
75c1dd6
reuse the code for copying `_close`
keewis Jul 8, 2025
a0d94fb
refactor
dcherian Jul 8, 2025
bbc263b
Merge branch 'main' into backend-set-indexes
dcherian Jul 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ v2025.07.1 (unreleased)

New Features
~~~~~~~~~~~~

- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`).
By `Benoit Bovy <https://github.com/benbovy>`_ and `Justus Magin <https://github.com/keewis>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
69 changes: 62 additions & 7 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from xarray.backends.locks import _get_scheduler
from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
from xarray.core import indexing
from xarray.core.coordinates import Coordinates
from xarray.core.dataarray import DataArray
from xarray.core.dataset import Dataset
from xarray.core.datatree import DataTree
Expand Down Expand Up @@ -379,6 +380,15 @@ def _chunk_ds(
return backend_ds._replace(variables)


def _maybe_create_default_indexes(ds):
    """Re-assign the dimension coordinates of ``ds`` that have no index yet.

    A coordinate is selected when its dimensions are exactly ``(name,)`` and
    ``name`` is not already a key of ``ds.xindexes``; coordinates that already
    carry an index are therefore never selected again.

    Parameters
    ----------
    ds
        Object exposing ``coords``, ``xindexes`` and ``assign_coords``.

    Returns
    -------
    The result of ``ds.assign_coords(...)`` called with the selected
    coordinate variables wrapped in a ``Coordinates`` object.
    """
    candidates = {}
    for name, coord in ds.coords.items():
        # Skip anything that is not a 1-D coordinate named after its own
        # dimension, as well as coordinates that are already indexed.
        if coord.dims != (name,) or name in ds.xindexes:
            continue
        candidates[name] = coord.variable

    # NOTE(review): ``Coordinates`` is built without an ``indexes`` argument,
    # which is presumably what makes it create the default indexes; confirm
    # against the ``Coordinates`` constructor documentation.
    return ds.assign_coords(Coordinates(candidates))


def _dataset_from_backend_dataset(
backend_ds,
filename_or_obj,
Expand All @@ -389,6 +399,7 @@ def _dataset_from_backend_dataset(
inline_array,
chunked_array_type,
from_array_kwargs,
create_default_indexes,
**extra_tokens,
):
if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
Expand All @@ -397,11 +408,15 @@ def _dataset_from_backend_dataset(
)

_protect_dataset_variables_inplace(backend_ds, cache)
if chunks is None:
ds = backend_ds

if create_default_indexes:
ds = _maybe_create_default_indexes(backend_ds)
else:
ds = backend_ds

if chunks is not None:
ds = _chunk_ds(
backend_ds,
ds,
filename_or_obj,
engine,
chunks,
Expand Down Expand Up @@ -434,6 +449,7 @@ def _datatree_from_backend_datatree(
inline_array,
chunked_array_type,
from_array_kwargs,
create_default_indexes,
**extra_tokens,
):
if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
Expand All @@ -442,9 +458,11 @@ def _datatree_from_backend_datatree(
)

_protect_datatree_variables_inplace(backend_tree, cache)
if chunks is None:
tree = backend_tree
if create_default_indexes:
tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
else:
tree = backend_tree
if chunks is not None:
tree = DataTree.from_dict(
{
path: _chunk_ds(
Expand All @@ -459,11 +477,12 @@ def _datatree_from_backend_datatree(
node=path,
**extra_tokens,
)
for path, [node] in group_subtrees(backend_tree)
for path, [node] in group_subtrees(tree)
},
name=backend_tree.name,
name=tree.name,
)

if create_default_indexes or chunks is not None:
for path, [node] in group_subtrees(backend_tree):
tree[path].set_close(node._close)

Expand Down Expand Up @@ -497,6 +516,7 @@ def open_dataset(
concat_characters: bool | Mapping[str, bool] | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
create_default_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
Expand Down Expand Up @@ -610,6 +630,13 @@ def open_dataset(
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
create_default_indexes : bool, default: True
If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
which loads the coordinate data into memory. Set it to False if you want to avoid loading
data into memory.

Note that backends can still choose to create other indexes. If you want to control that,
please refer to the backend's documentation.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default(``inline_array=False``) the array is included in a task by
Expand Down Expand Up @@ -702,6 +729,7 @@ def open_dataset(
chunked_array_type,
from_array_kwargs,
drop_variables=drop_variables,
create_default_indexes=create_default_indexes,
**decoders,
**kwargs,
)
Expand All @@ -725,6 +753,7 @@ def open_dataarray(
concat_characters: bool | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
create_default_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
Expand Down Expand Up @@ -833,6 +862,13 @@ def open_dataarray(
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
create_default_indexes : bool, default: True
If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
which loads the coordinate data into memory. Set it to False if you want to avoid loading
data into memory.

Note that backends can still choose to create other indexes. If you want to control that,
please refer to the backend's documentation.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default(``inline_array=False``) the array is included in a task by
Expand Down Expand Up @@ -890,6 +926,7 @@ def open_dataarray(
chunks=chunks,
cache=cache,
drop_variables=drop_variables,
create_default_indexes=create_default_indexes,
inline_array=inline_array,
chunked_array_type=chunked_array_type,
from_array_kwargs=from_array_kwargs,
Expand Down Expand Up @@ -946,6 +983,7 @@ def open_datatree(
concat_characters: bool | Mapping[str, bool] | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
create_default_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
Expand Down Expand Up @@ -1055,6 +1093,13 @@ def open_datatree(
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
create_default_indexes : bool, default: True
If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
which loads the coordinate data into memory. Set it to False if you want to avoid loading
data into memory.

Note that backends can still choose to create other indexes. If you want to control that,
please refer to the backend's documentation.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default(``inline_array=False``) the array is included in a task by
Expand Down Expand Up @@ -1148,6 +1193,7 @@ def open_datatree(
chunked_array_type,
from_array_kwargs,
drop_variables=drop_variables,
create_default_indexes=create_default_indexes,
**decoders,
**kwargs,
)
Expand Down Expand Up @@ -1175,6 +1221,7 @@ def open_groups(
concat_characters: bool | Mapping[str, bool] | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
create_default_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
Expand Down Expand Up @@ -1286,6 +1333,13 @@ def open_groups(
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
create_default_indexes : bool, default: True
If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
which loads the coordinate data into memory. Set it to False if you want to avoid loading
data into memory.

Note that backends can still choose to create other indexes. If you want to control that,
please refer to the backend's documentation.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default(``inline_array=False``) the array is included in a task by
Expand Down Expand Up @@ -1381,6 +1435,7 @@ def open_groups(
chunked_array_type,
from_array_kwargs,
drop_variables=drop_variables,
create_default_indexes=create_default_indexes,
**decoders,
**kwargs,
)
Expand Down
17 changes: 15 additions & 2 deletions xarray/backends/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
AbstractDataStore,
BackendEntrypoint,
)
from xarray.core.coordinates import Coordinates
from xarray.core.dataset import Dataset

if TYPE_CHECKING:
Expand Down Expand Up @@ -36,6 +37,7 @@ def open_dataset(
concat_characters=True,
decode_coords=True,
drop_variables: str | Iterable[str] | None = None,
set_indexes: bool = True,
use_cftime=None,
decode_timedelta=None,
) -> Dataset:
Expand All @@ -56,8 +58,19 @@ def open_dataset(
decode_timedelta=decode_timedelta,
)

ds = Dataset(vars, attrs=attrs)
ds = ds.set_coords(coord_names.intersection(vars))
# split data and coordinate variables (promote dimension coordinates)
data_vars = {}
coord_vars = {}
for name, var in vars.items():
if name in coord_names or var.dims == (name,):
coord_vars[name] = var
else:
data_vars[name] = var

# explicit Coordinates object with no index passed
coords = Coordinates(coord_vars, indexes={})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is fine given our rules for propagating coordinate variables when extracting DataArrays, but it is potentially confusing with create_default_indexes=False, decode_coords=False.


ds = Dataset(data_vars, coords=coords, attrs=attrs)
ds.set_close(filename_or_obj.close)
ds.encoding = encoding

Expand Down
9 changes: 9 additions & 0 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,7 @@ def open_zarr(
use_zarr_fill_value_as_mask=None,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
create_default_indexes=True,
**kwargs,
):
"""Load and decode a dataset from a Zarr store.
Expand Down Expand Up @@ -1457,6 +1458,13 @@ def open_zarr(
chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg.
Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to
:py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
create_default_indexes : bool, default: True
If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
which loads the coordinate data into memory. Set it to False if you want to avoid loading
data into memory.

Note that backends can still choose to create other indexes. If you want to control that,
please refer to the backend's documentation.

Returns
-------
Expand Down Expand Up @@ -1513,6 +1521,7 @@ def open_zarr(
engine="zarr",
chunks=chunks,
drop_variables=drop_variables,
create_default_indexes=create_default_indexes,
chunked_array_type=chunked_array_type,
from_array_kwargs=from_array_kwargs,
backend_kwargs=backend_kwargs,
Expand Down
61 changes: 61 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from xarray.coding.variables import SerializationWarning
from xarray.conventions import encode_dataset_coordinates
from xarray.core import indexing
from xarray.core.indexes import PandasIndex
from xarray.core.options import set_options
from xarray.core.types import PDDatetimeUnitOptions
from xarray.core.utils import module_available
Expand Down Expand Up @@ -2066,6 +2067,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
with self.roundtrip(original):
pass

@pytest.mark.parametrize("create_default_indexes", [True, False])
def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
    """Write a tiny dataset, reopen it and check which indexes were built."""
    path = tmp_path / "tmp.nc"
    data_vars = {"data": ("x", np.arange(3))}
    coords = {"x": [-1, 0, 1]}
    xr.Dataset(data_vars, coords=coords).to_netcdf(
        path, engine=self.engine, mode="w"
    )

    open_kwargs = {
        "engine": self.engine,
        "create_default_indexes": create_default_indexes,
    }
    with open_dataset(path, **open_kwargs) as loaded_ds:
        if not create_default_indexes:
            # Opting out must leave the dataset without any index at all.
            assert len(loaded_ds.xindexes) == 0
        else:
            # The only index is the default one for dimension coordinate "x".
            assert list(loaded_ds.xindexes) == ["x"]
            assert isinstance(loaded_ds.xindexes["x"], PandasIndex)


@requires_netCDF4
class TestNetCDF4Data(NetCDF4Base):
Expand Down Expand Up @@ -4063,6 +4084,26 @@ def test_pickle(self) -> None:
def test_pickle_dataarray(self) -> None:
pass

@pytest.mark.parametrize("create_default_indexes", [True, False])
def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
    """Write a tiny dataset, reopen it and check which indexes were built."""
    path = tmp_path / "tmp.nc"
    data_vars = {"data": ("x", np.arange(3))}
    coords = {"x": [-1, 0, 1]}
    xr.Dataset(data_vars, coords=coords).to_netcdf(
        path, engine=self.engine, mode="w"
    )

    open_kwargs = {
        "engine": self.engine,
        "create_default_indexes": create_default_indexes,
    }
    with open_dataset(path, **open_kwargs) as loaded_ds:
        if not create_default_indexes:
            # Opting out must leave the dataset without any index at all.
            assert len(loaded_ds.xindexes) == 0
        else:
            # The only index is the default one for dimension coordinate "x".
            assert list(loaded_ds.xindexes) == ["x"]
            assert isinstance(loaded_ds.xindexes["x"], PandasIndex)


@requires_scipy
class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
Expand Down Expand Up @@ -6434,6 +6475,26 @@ def test_zarr_closing_internal_zip_store():
assert_identical(original_da, loaded_da)


@requires_zarr
@pytest.mark.parametrize("create_default_indexes", [True, False])
def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
    """Check that ``create_default_indexes`` is honoured by the zarr engine.

    A small dataset with one dimension coordinate ``x`` is written to a zarr
    store and reopened. With ``create_default_indexes=True`` the reopened
    dataset must hold exactly one ``PandasIndex`` on ``x``; with ``False`` it
    must hold no index at all.
    """
    # ``PandasIndex`` comes from the module-level import, so no function-local
    # re-import is needed here.
    store_path = tmp_path / "tmp.zarr"
    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
    original_ds.to_zarr(store_path, mode="w")

    with open_dataset(
        store_path, engine="zarr", create_default_indexes=create_default_indexes
    ) as loaded_ds:
        if create_default_indexes:
            # Separate asserts so a failure reports which condition broke.
            assert list(loaded_ds.xindexes) == ["x"]
            assert isinstance(loaded_ds.xindexes["x"], PandasIndex)
        else:
            assert len(loaded_ds.xindexes) == 0


@requires_zarr
@pytest.mark.usefixtures("default_zarr_format")
def test_raises_key_error_on_invalid_zarr_store(tmp_path):
Expand Down
Loading
Loading