
Commit f8f5e82

Merge branch 'main' into async.load
2 parents a074a25 + ef180b8

File tree

11 files changed: +238 -12 lines changed


doc/user-guide/hierarchical-data.rst

Lines changed: 2 additions & 0 deletions

@@ -453,6 +453,8 @@ The result is a new tree, containing only the nodes matching the condition.
 
 (Yes, under the hood :py:meth:`~xarray.DataTree.filter` is just syntactic sugar for the pattern we showed you in :ref:`iterating over trees` !)
 
+If you want to filter out empty nodes you can use :py:meth:`~xarray.DataTree.prune`.
+
 .. _Tree Contents:
 
 Tree Contents
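A minimal sketch of the method this doc line points to, mirroring the doctest added in xarray/core/datatree.py further down in this commit (node names are illustrative):

    import xarray as xr

    dt = xr.DataTree.from_dict(
        {
            "/a": xr.Dataset({"foo": ("x", [1, 2])}),
            "/c": xr.Dataset(),  # an empty node
        }
    )

    pruned = dt.prune()  # drops the empty "/c"; the root node is always kept
    assert "/c" not in {node.path for node in pruned.subtree}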

doc/whats-new.rst

Lines changed: 14 additions & 0 deletions

@@ -12,6 +12,9 @@ v2025.07.2 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
+- Added :py:meth:`DataTree.prune` method to remove empty nodes while preserving tree structure.
+  Useful for cleaning up a DataTree after time-based filtering operations (:issue:`10590`, :pull:`10598`).
+  By `Alfonso Ladino <https://github.com/aladinor>`_.
 - Added new asynchronous loading methods :py:meth:`Dataset.load_async`, :py:meth:`DataArray.load_async`, :py:meth:`Variable.load_async`.
   Note that users are expected to limit concurrency themselves - xarray does not internally limit concurrency in any way.
@@ -24,6 +27,13 @@ New Features
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- When writing to NetCDF files with groups, Xarray no longer redefines dimensions
+  that have the same size in parent groups (:issue:`10241`). This conforms with
+  `CF Conventions for group scope <https://cfconventions.org/cf-conventions/cf-conventions.html#_scope>`_
+  but may require adjustments for code that consumes NetCDF files produced by
+  Xarray.
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -58,6 +68,10 @@ Bug fixes
 
 Documentation
 ~~~~~~~~~~~~~
 
+- Clarify lazy behaviour and eager loading for ``chunks=None`` in :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_dataarray`, :py:func:`~xarray.open_datatree`, :py:func:`~xarray.open_groups` and :py:func:`~xarray.open_zarr` (:issue:`10612`, :pull:`10627`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
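A sketch of what the breaking change above means on disk, adapted from the regression test added in this commit (test_no_redundant_dimensions); the file name "tree.nc" and the inline inspection are illustrative:

    import netCDF4 as nc4
    import xarray as xr

    dt = xr.DataTree.from_dict(
        {
            "/": xr.Dataset(coords={"x": [1, 2, 3]}),
            "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}),
        }
    )
    dt.to_netcdf("tree.nc", engine="netcdf4")

    root = nc4.Dataset("tree.nc")
    assert list(root.dimensions) == ["x"]
    assert list(root.groups["child"].dimensions) == []  # "x" is inherited, not redefined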

xarray/backends/api.py

Lines changed: 16 additions & 8 deletions

@@ -578,8 +578,10 @@ def open_dataset(
     - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
       engine preferred chunks.
-    - ``chunks=None`` skips using dask, which is generally faster for
-      small arrays.
+    - ``chunks=None`` skips using dask. This uses xarray's internally private
+      :ref:`lazy indexing classes <internal design.lazy indexing>`,
+      but data is eagerly loaded into memory as numpy arrays when accessed.
+      This can be more efficient for smaller arrays or when large arrays are sliced before computation.
     - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
     - ``chunks={}`` loads the data with dask using the engine's preferred chunk
       size, generally identical to the format's chunk size. If not available, a
@@ -819,8 +821,10 @@ def open_dataarray(
     - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
       engine preferred chunks.
-    - ``chunks=None`` skips using dask, which is generally faster for
-      small arrays.
+    - ``chunks=None`` skips using dask. This uses xarray's internally private
+      :ref:`lazy indexing classes <internal design.lazy indexing>`,
+      but data is eagerly loaded into memory as numpy arrays when accessed.
+      This can be more efficient for smaller arrays, though results may vary.
     - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
     - ``chunks={}`` loads the data with dask using engine preferred chunks if
       exposed by the backend, otherwise with a single chunk for all arrays.
@@ -1044,8 +1048,10 @@ def open_datatree(
     - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
       engine preferred chunks.
-    - ``chunks=None`` skips using dask, which is generally faster for
-      small arrays.
+    - ``chunks=None`` skips using dask. This uses xarray's internally private
+      :ref:`lazy indexing classes <internal design.lazy indexing>`,
+      but data is eagerly loaded into memory as numpy arrays when accessed.
+      This can be more efficient for smaller arrays, though results may vary.
     - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
     - ``chunks={}`` loads the data with dask using the engine's preferred chunk
       size, generally identical to the format's chunk size. If not available, a
@@ -1288,8 +1294,10 @@ def open_groups(
     - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
       engine preferred chunks.
-    - ``chunks=None`` skips using dask, which is generally faster for
-      small arrays.
+    - ``chunks=None`` skips using dask. This uses xarray's internally private
+      :ref:`lazy indexing classes <internal design.lazy indexing>`,
+      but data is eagerly loaded into memory as numpy arrays when accessed.
+      This can be more efficient for smaller arrays, though results may vary.
     - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
     - ``chunks={}`` loads the data with dask using the engine's preferred chunk
       size, generally identical to the format's chunk size. If not available, a
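A usage sketch of the distinction these docstrings draw between ``chunks=None`` and the dask-based options (file and variable names are illustrative):

    import xarray as xr

    # chunks=None: no dask; variables are wrapped in xarray's lazy indexing
    # classes and become in-memory numpy arrays when their values are accessed.
    ds_lazy = xr.open_dataset("data.nc", chunks=None)
    subset = ds_lazy["t"][:10]  # slicing first means only this slice is loaded on access

    # chunks={}: dask-backed, using the engine's preferred chunk sizes.
    ds_dask = xr.open_dataset("data.nc", chunks={})
    result = ds_dask["t"].mean().compute()  # computation is deferred until .compute()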

xarray/backends/common.py

Lines changed: 20 additions & 2 deletions
@@ -256,6 +256,20 @@ def find_root_and_group(ds):
     return ds, group
 
 
+def collect_ancestor_dimensions(group) -> dict[str, int]:
+    """Returns dimensions defined in parent groups.
+
+    If dimensions are defined in multiple ancestors, use the size of the closest
+    ancestor.
+    """
+    dims = {}
+    while (group := group.parent) is not None:
+        for k, v in group.dimensions.items():
+            if k not in dims:
+                dims[k] = len(v)
+    return dims
+
+
 def datatree_from_dict_with_io_cleanup(groups_dict: Mapping[str, Dataset]) -> DataTree:
     """DataTree.from_dict with file clean-up."""
     try:
@@ -315,6 +329,9 @@ class AbstractDataStore:
     def get_dimensions(self):  # pragma: no cover
         raise NotImplementedError()
 
+    def get_parent_dimensions(self):  # pragma: no cover
+        return {}
+
     def get_attrs(self):  # pragma: no cover
         raise NotImplementedError()
 
@@ -570,21 +587,22 @@ def set_dimensions(self, variables, unlimited_dims=None):
         if unlimited_dims is None:
             unlimited_dims = set()
 
+        parent_dims = self.get_parent_dimensions()
         existing_dims = self.get_dimensions()
 
         dims = {}
        for v in unlimited_dims:  # put unlimited_dims first
             dims[v] = None
         for v in variables.values():
-            dims.update(dict(zip(v.dims, v.shape, strict=True)))
+            dims |= v.sizes
 
         for dim, length in dims.items():
             if dim in existing_dims and length != existing_dims[dim]:
                 raise ValueError(
                     "Unable to update size for existing dimension"
                     f"{dim!r} ({length} != {existing_dims[dim]})"
                 )
-            elif dim not in existing_dims:
+            elif dim not in existing_dims and length != parent_dims.get(dim):
                 is_unlimited = dim in unlimited_dims
                 self.set_dimension(dim, length, is_unlimited)
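The closest-ancestor rule in collect_ancestor_dimensions can be checked with a stand-in group type; the Group class below is a hypothetical mock (not part of xarray), and the import assumes this commit is installed:

    from dataclasses import dataclass

    from xarray.backends.common import collect_ancestor_dimensions


    @dataclass
    class Group:
        # minimal stand-in exposing .parent and .dimensions like a netCDF4 group;
        # dimension sizes are taken via len(), so plain lists work here
        dimensions: dict
        parent: "Group | None" = None


    root = Group(dimensions={"x": [0, 0, 0], "y": [0]})  # x: 3, y: 1
    mid = Group(dimensions={"x": [0, 0]}, parent=root)  # x: 2 (closest ancestor wins)
    leaf = Group(dimensions={}, parent=mid)

    assert collect_ancestor_dimensions(leaf) == {"x": 2, "y": 1}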

xarray/backends/h5netcdf_.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@
     WritableCFDataStore,
     _normalize_path,
     _open_remote_file,
+    collect_ancestor_dimensions,
     datatree_from_dict_with_io_cleanup,
     find_root_and_group,
 )
@@ -287,6 +288,9 @@ def get_attrs(self):
     def get_dimensions(self):
         return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items())
 
+    def get_parent_dimensions(self):
+        return FrozenDict(collect_ancestor_dimensions(self.ds))
+
     def get_encoding(self):
         return {
             "unlimited_dims": {

xarray/backends/netCDF4_.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@
     T_PathFileOrDataStore,
     WritableCFDataStore,
     _normalize_path,
+    collect_ancestor_dimensions,
     datatree_from_dict_with_io_cleanup,
     find_root_and_group,
     robust_getitem,
@@ -518,6 +519,9 @@ def get_attrs(self):
     def get_dimensions(self):
         return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items())
 
+    def get_parent_dimensions(self):
+        return FrozenDict(collect_ancestor_dimensions(self.ds))
+
     def get_encoding(self):
         return {
             "unlimited_dims": {

xarray/backends/zarr.py

Lines changed: 4 additions & 2 deletions
@@ -1420,8 +1420,10 @@ def open_zarr(
     - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
       engine preferred chunks.
-    - ``chunks=None`` skips using dask, which is generally faster for
-      small arrays.
+    - ``chunks=None`` skips using dask. This uses xarray's internally private
+      :ref:`lazy indexing classes <internal design.lazy indexing>`,
+      but data is eagerly loaded into memory as numpy arrays when accessed.
+      This can be more efficient for smaller arrays, though results may vary.
     - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
     - ``chunks={}`` loads the data with dask using engine preferred chunks if
       exposed by the backend, otherwise with a single chunk for all arrays.

xarray/core/datatree.py

Lines changed: 67 additions & 0 deletions
@@ -1450,6 +1450,73 @@ def filter_like(self, other: DataTree) -> DataTree:
         other_keys = {key for key, _ in other.subtree_with_keys}
         return self.filter(lambda node: node.relative_to(self) in other_keys)
 
+    def prune(self, drop_size_zero_vars: bool = False) -> DataTree:
+        """
+        Remove empty nodes from the tree.
+
+        Returns a new tree containing only nodes that contain data variables with actual data.
+        Intermediate nodes are kept if they are required to support non-empty children.
+
+        Parameters
+        ----------
+        drop_size_zero_vars : bool, default False
+            If True, also considers variables with zero size as empty.
+            If False, keeps nodes with data variables even if they have zero size.
+
+        Returns
+        -------
+        DataTree
+            A new tree with empty nodes removed.
+
+        See Also
+        --------
+        filter
+
+        Examples
+        --------
+        >>> dt = xr.DataTree.from_dict(
+        ...     {
+        ...         "/a": xr.Dataset({"foo": ("x", [1, 2])}),
+        ...         "/b": xr.Dataset({"bar": ("x", [])}),
+        ...         "/c": xr.Dataset(),
+        ...     }
+        ... )
+        >>> dt.prune()  # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
+        <xarray.DataTree>
+        Group: /
+        ├── Group: /a
+        │       Dimensions:  (x: 2)
+        │       Dimensions without coordinates: x
+        │       Data variables:
+        │           foo      (x) int64 16B 1 2
+        └── Group: /b
+                Dimensions:  (x: 0)
+                Dimensions without coordinates: x
+                Data variables:
+                    bar      (x) float64 0B ...
+
+        The ``drop_size_zero_vars`` parameter controls whether variables
+        with zero size are considered empty:
+
+        >>> dt.prune(drop_size_zero_vars=True)
+        <xarray.DataTree>
+        Group: /
+        └── Group: /a
+                Dimensions:  (x: 2)
+                Dimensions without coordinates: x
+                Data variables:
+                    foo      (x) int64 16B 1 2
+        """
+        non_empty_cond: Callable[[DataTree], bool]
+        if drop_size_zero_vars:
+            non_empty_cond = lambda node: len(node.data_vars) > 0 and any(
+                var.size > 0 for var in node.data_vars.values()
+            )
+        else:
+            non_empty_cond = lambda node: len(node.data_vars) > 0
+
+        return self.filter(non_empty_cond)
+
     def match(self, pattern: str) -> DataTree:
         """
         Return nodes with paths matching pattern.
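One motivating workflow from the changelog entry above, sketched here (paths and variable names are illustrative): time-based filtering can leave nodes whose variables have zero length, and prune(drop_size_zero_vars=True) removes them while keeping the ancestors of non-empty nodes.

    import xarray as xr

    dt = xr.DataTree.from_dict(
        {
            "/obs/2020": xr.Dataset({"t": ("time", [1.0, 2.0])}),
            "/obs/2021": xr.Dataset({"t": ("time", [])}),  # emptied by an earlier time filter
        }
    )

    cleaned = dt.prune(drop_size_zero_vars=True)
    assert {node.path for node in cleaned.subtree} == {"/", "/obs", "/obs/2020"}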

xarray/tests/test_backends.py

Lines changed: 11 additions & 0 deletions
@@ -1704,6 +1704,17 @@ def test_write_groups(self) -> None:
         with self.open(tmp_file, group="data/2") as actual2:
             assert_identical(data2, actual2)
 
+    def test_child_group_with_inconsistent_dimensions(self) -> None:
+        base = Dataset(coords={"x": [1, 2]})
+        child = Dataset(coords={"x": [1, 2, 3]})
+        with create_tmp_file() as tmp_file:
+            self.save(base, tmp_file)
+            self.save(child, tmp_file, group="child", mode="a")
+            with self.open(tmp_file) as actual_base:
+                assert_identical(base, actual_base)
+            with self.open(tmp_file, group="child") as actual_child:
+                assert_identical(child, actual_child)
+
     @pytest.mark.parametrize(
         "input_strings, is_bytes",
         [

xarray/tests/test_backends_datatree.py

Lines changed: 17 additions & 0 deletions
@@ -265,6 +265,23 @@ def test_write_subgroup(self, tmpdir):
         assert_equal(original_dt, roundtrip_dt)
         assert_identical(expected_dt, roundtrip_dt)
 
+    @requires_netCDF4
+    def test_no_redundant_dimensions(self, tmpdir):
+        # regression test for https://github.com/pydata/xarray/issues/10241
+        original_dt = DataTree.from_dict(
+            {
+                "/": xr.Dataset(coords={"x": [1, 2, 3]}),
+                "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}),
+            }
+        )
+        filepath = tmpdir / "test.zarr"
+        original_dt.to_netcdf(filepath, engine=self.engine)
+
+        root = nc4.Dataset(str(filepath))
+        child = root.groups["child"]
+        assert list(root.dimensions) == ["x"]
+        assert list(child.dimensions) == []
+
 
 @requires_netCDF4
 class TestNetCDF4DatatreeIO(DatatreeIOBase):
