Skip to content

Commit 3132f6a

Browse files
committed
2 parents d377780 + 1bb867d commit 3132f6a

File tree

9 files changed

+210
-161
lines changed

9 files changed

+210
-161
lines changed

DATATREE_MIGRATION_GUIDE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ A number of other API changes have been made, which should only require minor mo
4545
- The `DataTree.parent` property is now read-only. To assign ancestral relationships directly you must instead use the `.children` property on the parent node, which remains settable.
4646
- Similarly, the `parent` kwarg has been removed from the `DataTree.__init__` constructor.
4747
- DataTree objects passed to the `children` kwarg in `DataTree.__init__` are now shallow-copied.
48+
- `DataTree.map_over_subtree` has been renamed to `DataTree.map_over_datasets`, and changed to no longer work like a decorator. Instead you use it to apply the function and arguments directly, more like how `xarray.apply_ufunc` works.
4849
- `DataTree.as_array` has been replaced by `DataTree.to_dataarray`.
4950
- A number of methods which were not well tested have been (temporarily) disabled. In general we have tried to only keep things that are known to work, with the plan to increase API surface incrementally after release.
5051

doc/whats-new.rst

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,13 @@ What's New
1616
1717
.. _whats-new.2024.10.1:
1818

19-
v.2024.10.1 (unreleased)
20-
------------------------
21-
22-
23-
Breaking Changes
24-
~~~~~~~~~~~~~~~~
25-
- The minimum versions of some dependencies were changed
19+
v.2024.11.0 (Nov 22, 2024)
20+
--------------------------
2621

27-
===================== ========= =======
28-
Package Old New
29-
===================== ========= =======
30-
boto3 1.28 1.29
31-
dask-core 2023.9 2023.11
32-
distributed 2023.9 2023.11
33-
h5netcdf 1.2 1.3
34-
numbagg 0.2.1 0.6
35-
typing_extensions 4.7 4.8
36-
===================== ========= =======
22+
This release brings better support for wrapping JAX arrays and Astropy Quantity objects, :py:meth:`DataTree.persist`, algorithmic improvements
23+
to many methods with dask (:py:meth:`Dataset.polyfit`, :py:meth:`Dataset.ffill`, :py:meth:`Dataset.bfill`, rolling reductions), and bug fixes.
24+
Thanks to the 22 contributors to this release:
25+
Benoit Bovy, Deepak Cherian, Dimitri Papadopoulos Orfanos, Holly Mandel, James Bourbeau, Joe Hamman, Justus Magin, Kai Mühlbauer, Lukas Trippe, Mathias Hauser, Maximilian Roos, Michael Niklas, Pascal Bourgault, Patrick Hoefler, Sam Levang, Sarah Charlotte Johnson, Scott Huberty, Stephan Hoyer, Tom Nicholas, Virgile Andreani, joseph nowak and tvo
3726

3827
New Features
3928
~~~~~~~~~~~~
@@ -64,10 +53,23 @@ New Features
6453
underlying array's backend. Provides better support for certain wrapped array types
6554
like ``jax.numpy.ndarray``. (:issue:`7848`, :pull:`9776`).
6655
By `Sam Levang <https://github.com/slevang>`_.
56+
- Speed up loading of large zarr stores using dask arrays. (:issue:`8902`)
57+
By `Deepak Cherian <https://github.com/dcherian>`_.
6758

68-
Breaking changes
59+
Breaking Changes
6960
~~~~~~~~~~~~~~~~
61+
- The minimum versions of some dependencies were changed
7062

63+
===================== ========= =======
64+
Package Old New
65+
===================== ========= =======
66+
boto3 1.28 1.29
67+
dask-core 2023.9 2023.11
68+
distributed 2023.9 2023.11
69+
h5netcdf 1.2 1.3
70+
numbagg 0.2.1 0.6
71+
typing_extensions 4.7 4.8
72+
===================== ========= =======
7173

7274
Deprecations
7375
~~~~~~~~~~~~

xarray/backends/plugins.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def get_backend(engine: str | type[BackendEntrypoint]) -> BackendEntrypoint:
200200
engines = list_engines()
201201
if engine not in engines:
202202
raise ValueError(
203-
f"unrecognized engine {engine} must be one of your download engines: {list(engines)}"
203+
f"unrecognized engine '{engine}' must be one of your download engines: {list(engines)}. "
204204
"To install additional dependencies, see:\n"
205205
"https://docs.xarray.dev/en/stable/user-guide/io.html \n"
206206
"https://docs.xarray.dev/en/stable/getting-started-guide/installing.html"

xarray/backends/zarr.py

Lines changed: 102 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from xarray.namedarray.utils import module_available
3838

3939
if TYPE_CHECKING:
40+
from zarr import Array as ZarrArray
4041
from zarr import Group as ZarrGroup
4142

4243
from xarray.backends.common import AbstractDataStore
@@ -443,7 +444,7 @@ def extract_zarr_variable_encoding(
443444
shape = shape if shape else variable.shape
444445
encoding = variable.encoding.copy()
445446

446-
safe_to_drop = {"source", "original_shape"}
447+
safe_to_drop = {"source", "original_shape", "preferred_chunks"}
447448
valid_encodings = {
448449
"codecs",
449450
"chunks",
@@ -871,16 +872,27 @@ def store(
871872
else:
872873
zarr = attempt_import("zarr")
873874

874-
existing_keys = tuple(self.zarr_group.array_keys())
875+
if self._mode == "w":
876+
# always overwrite, so we don't care about existing names,
877+
# and consistency of encoding
878+
new_variable_names = set(variables)
879+
existing_keys = {}
880+
existing_variable_names = {}
881+
else:
882+
existing_keys = tuple(self.zarr_group.array_keys())
883+
existing_variable_names = {
884+
vn for vn in variables if _encode_variable_name(vn) in existing_keys
885+
}
886+
new_variable_names = set(variables) - existing_variable_names
875887

876-
if self._mode == "r+":
877-
new_names = [k for k in variables if k not in existing_keys]
878-
if new_names:
879-
raise ValueError(
880-
f"dataset contains non-pre-existing variables {new_names}, "
881-
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
882-
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
883-
)
888+
if self._mode == "r+" and (
889+
new_names := [k for k in variables if k not in existing_keys]
890+
):
891+
raise ValueError(
892+
f"dataset contains non-pre-existing variables {new_names!r}, "
893+
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
894+
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
895+
)
884896

885897
if self._append_dim is not None and self._append_dim not in existing_keys:
886898
# For dimensions without coordinate values, we must parse
@@ -895,10 +907,6 @@ def store(
895907
f"dataset dimensions {existing_dims}"
896908
)
897909

898-
existing_variable_names = {
899-
vn for vn in variables if _encode_variable_name(vn) in existing_keys
900-
}
901-
new_variable_names = set(variables) - existing_variable_names
902910
variables_encoded, attributes = self.encode(
903911
{vn: variables[vn] for vn in new_variable_names}, attributes
904912
)
@@ -920,10 +928,9 @@ def store(
920928
# Modified variables must use the same encoding as the store.
921929
vars_with_encoding = {}
922930
for vn in existing_variable_names:
923-
if self._mode in ["a", "a-", "r+"]:
924-
_validate_datatypes_for_zarr_append(
925-
vn, existing_vars[vn], variables[vn]
926-
)
931+
_validate_datatypes_for_zarr_append(
932+
vn, existing_vars[vn], variables[vn]
933+
)
927934
vars_with_encoding[vn] = variables[vn].copy(deep=False)
928935
vars_with_encoding[vn].encoding = existing_vars[vn].encoding
929936
vars_with_encoding, _ = self.encode(vars_with_encoding, {})
@@ -968,6 +975,69 @@ def store(
968975
def sync(self):
969976
pass
970977

978+
def _open_existing_array(self, *, name) -> ZarrArray:
979+
import zarr
980+
981+
# TODO: if mode="a", consider overriding the existing variable
982+
# metadata. This would need some case work properly with region
983+
# and append_dim.
984+
if self._write_empty is not None:
985+
# Write to zarr_group.chunk_store instead of zarr_group.store
986+
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
987+
# The open_consolidated() enforces a mode of r or r+
988+
# (and to_zarr with region provided enforces a read mode of r+),
989+
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
990+
# and a 'normal Store subtype for chunk_store.
991+
# The exact type depends on if a local path was used, or a URL of some sort,
992+
# but the point is that it's not a read-only ConsolidatedMetadataStore.
993+
# It is safe to write chunk data to the chunk_store because no metadata would be changed by
994+
# to_zarr with the region parameter:
995+
# - Because the write mode is enforced to be r+, no new variables can be added to the store
996+
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
997+
# - Existing variables already have their attrs included in the consolidated metadata file.
998+
# - The size of dimensions can not be expanded, that would require a call using `append_dim`
999+
# which is mutually exclusive with `region`
1000+
zarr_array = zarr.open(
1001+
store=(
1002+
self.zarr_group.store if _zarr_v3() else self.zarr_group.chunk_store
1003+
),
1004+
# TODO: see if zarr should normalize these strings.
1005+
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip("/"),
1006+
write_empty_chunks=self._write_empty,
1007+
)
1008+
else:
1009+
zarr_array = self.zarr_group[name]
1010+
1011+
return zarr_array
1012+
1013+
def _create_new_array(
1014+
self, *, name, shape, dtype, fill_value, encoding, attrs
1015+
) -> ZarrArray:
1016+
if coding.strings.check_vlen_dtype(dtype) is str:
1017+
dtype = str
1018+
1019+
if self._write_empty is not None:
1020+
if (
1021+
"write_empty_chunks" in encoding
1022+
and encoding["write_empty_chunks"] != self._write_empty
1023+
):
1024+
raise ValueError(
1025+
'Differing "write_empty_chunks" values in encoding and parameters'
1026+
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
1027+
)
1028+
else:
1029+
encoding["write_empty_chunks"] = self._write_empty
1030+
1031+
zarr_array = self.zarr_group.create(
1032+
name,
1033+
shape=shape,
1034+
dtype=dtype,
1035+
fill_value=fill_value,
1036+
**encoding,
1037+
)
1038+
zarr_array = _put_attrs(zarr_array, attrs)
1039+
return zarr_array
1040+
9711041
def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
9721042
"""
9731043
This provides a centralized method to set the variables on the data
@@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
9861056
dimensions.
9871057
"""
9881058

989-
import zarr
990-
9911059
existing_keys = tuple(self.zarr_group.array_keys())
9921060
is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3
9931061

@@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
10161084
else:
10171085
del v.encoding["_FillValue"]
10181086

1019-
zarr_array = None
10201087
zarr_shape = None
10211088
write_region = self._write_region if self._write_region is not None else {}
10221089
write_region = {dim: write_region.get(dim, slice(None)) for dim in dims}
10231090

1024-
if name in existing_keys:
1091+
if self._mode != "w" and name in existing_keys:
10251092
# existing variable
1026-
# TODO: if mode="a", consider overriding the existing variable
1027-
# metadata. This would need some case work properly with region
1028-
# and append_dim.
1029-
if self._write_empty is not None:
1030-
# Write to zarr_group.chunk_store instead of zarr_group.store
1031-
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
1032-
# The open_consolidated() enforces a mode of r or r+
1033-
# (and to_zarr with region provided enforces a read mode of r+),
1034-
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
1035-
# and a 'normal Store subtype for chunk_store.
1036-
# The exact type depends on if a local path was used, or a URL of some sort,
1037-
# but the point is that it's not a read-only ConsolidatedMetadataStore.
1038-
# It is safe to write chunk data to the chunk_store because no metadata would be changed by
1039-
# to_zarr with the region parameter:
1040-
# - Because the write mode is enforced to be r+, no new variables can be added to the store
1041-
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
1042-
# - Existing variables already have their attrs included in the consolidated metadata file.
1043-
# - The size of dimensions can not be expanded, that would require a call using `append_dim`
1044-
# which is mutually exclusive with `region`
1045-
zarr_array = zarr.open(
1046-
store=(
1047-
self.zarr_group.store
1048-
if _zarr_v3()
1049-
else self.zarr_group.chunk_store
1050-
),
1051-
# TODO: see if zarr should normalize these strings.
1052-
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip(
1053-
"/"
1054-
),
1055-
write_empty_chunks=self._write_empty,
1056-
)
1057-
else:
1058-
zarr_array = self.zarr_group[name]
1059-
1093+
zarr_array = self._open_existing_array(name=name)
10601094
if self._append_dim is not None and self._append_dim in dims:
10611095
# resize existing variable
10621096
append_axis = dims.index(self._append_dim)
@@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
10891123
shape=zarr_shape,
10901124
)
10911125

1092-
if name not in existing_keys:
1126+
if self._mode == "w" or name not in existing_keys:
10931127
# new variable
1094-
encoded_attrs = {}
1128+
encoded_attrs = {k: self.encode_attribute(v) for k, v in attrs.items()}
10951129
# the magic for storing the hidden dimension data
10961130
if is_zarr_v3_format:
10971131
encoding["dimension_names"] = dims
10981132
else:
10991133
encoded_attrs[DIMENSION_KEY] = dims
1100-
for k2, v2 in attrs.items():
1101-
encoded_attrs[k2] = self.encode_attribute(v2)
1102-
1103-
if coding.strings.check_vlen_dtype(dtype) is str:
1104-
dtype = str
1105-
1106-
if self._write_empty is not None:
1107-
if (
1108-
"write_empty_chunks" in encoding
1109-
and encoding["write_empty_chunks"] != self._write_empty
1110-
):
1111-
raise ValueError(
1112-
'Differing "write_empty_chunks" values in encoding and parameters'
1113-
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
1114-
)
1115-
else:
1116-
encoding["write_empty_chunks"] = self._write_empty
1117-
1118-
zarr_array = self.zarr_group.create(
1119-
name,
1120-
shape=shape,
1134+
1135+
encoding["exists_ok" if _zarr_v3() else "overwrite"] = (
1136+
True if self._mode == "w" else False
1137+
)
1138+
1139+
zarr_array = self._create_new_array(
1140+
name=name,
11211141
dtype=dtype,
1142+
shape=shape,
11221143
fill_value=fill_value,
1123-
**encoding,
1144+
encoding=encoding,
1145+
attrs=encoded_attrs,
11241146
)
1125-
zarr_array = _put_attrs(zarr_array, encoded_attrs)
11261147

11271148
writer.add(v.data, zarr_array, region)
11281149

0 commit comments

Comments
 (0)