Skip to content

Commit ffb30a8

Browse files
max-sixty, dcherian, and pre-commit-ci[bot]
authored
Check for aligned chunks when writing to existing variables (#8459)
* Check for aligned chunks when writing to existing variables
* Update doc/whats-new.rst (co-authored by Deepak Cherian <[email protected]>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci
* Add regression test for #8459
* Update whats-new
* Address Ryan's comment
* Update region typing
* Update test

---------

Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Deepak Cherian <[email protected]>
1 parent 852b7e6 commit ffb30a8

File tree

5 files changed

+111
-13
lines changed

5 files changed

+111
-13
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ v2024.03.0 (unreleased)
2222

2323
New Features
2424
~~~~~~~~~~~~
25-
25+
- Partial writes to existing chunks with ``region`` or ``append_dim`` will now raise an error
26+
(unless ``safe_chunks=False``); previously an error would only be raised on
27+
new variables. (:pull:`8459`, :issue:`8371`, :issue:`8882`)
28+
By `Maximilian Roos <https://github.com/max-sixty>`_.
2629
- Grouped and resampling quantile calculations now use the vectorized algorithm in ``flox>=0.9.4`` if present.
2730
By `Deepak Cherian <https://github.com/dcherian>`_.
2831
- Do not broadcast in arithmetic operations when global option ``arithmetic_broadcast=False``

xarray/backends/zarr.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
195195
f"Writing this array in parallel with dask could lead to corrupted data."
196196
)
197197
if safe_chunks:
198-
raise NotImplementedError(
198+
raise ValueError(
199199
base_error
200200
+ " Consider either rechunking using `chunk()`, deleting "
201201
"or modifying `encoding['chunks']`, or specify `safe_chunks=False`."
@@ -707,6 +707,17 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
707707
if v.encoding == {"_FillValue": None} and fill_value is None:
708708
v.encoding = {}
709709

710+
# We need to do this for both new and existing variables to ensure we're not
711+
# writing to a partial chunk, even though we don't use the `encoding` value
712+
# when writing to an existing variable. See
713+
# https://github.com/pydata/xarray/issues/8371 for details.
714+
encoding = extract_zarr_variable_encoding(
715+
v,
716+
raise_on_invalid=check,
717+
name=vn,
718+
safe_chunks=self._safe_chunks,
719+
)
720+
710721
if name in existing_keys:
711722
# existing variable
712723
# TODO: if mode="a", consider overriding the existing variable
@@ -737,9 +748,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
737748
zarr_array = self.zarr_group[name]
738749
else:
739750
# new variable
740-
encoding = extract_zarr_variable_encoding(
741-
v, raise_on_invalid=check, name=vn, safe_chunks=self._safe_chunks
742-
)
743751
encoded_attrs = {}
744752
# the magic for storing the hidden dimension data
745753
encoded_attrs[DIMENSION_KEY] = dims

xarray/core/dataarray.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4120,7 +4120,7 @@ def to_zarr(
41204120
compute: Literal[True] = True,
41214121
consolidated: bool | None = None,
41224122
append_dim: Hashable | None = None,
4123-
region: Mapping[str, slice] | None = None,
4123+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
41244124
safe_chunks: bool = True,
41254125
storage_options: dict[str, str] | None = None,
41264126
zarr_version: int | None = None,
@@ -4140,7 +4140,7 @@ def to_zarr(
41404140
compute: Literal[False],
41414141
consolidated: bool | None = None,
41424142
append_dim: Hashable | None = None,
4143-
region: Mapping[str, slice] | None = None,
4143+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
41444144
safe_chunks: bool = True,
41454145
storage_options: dict[str, str] | None = None,
41464146
zarr_version: int | None = None,
@@ -4158,7 +4158,7 @@ def to_zarr(
41584158
compute: bool = True,
41594159
consolidated: bool | None = None,
41604160
append_dim: Hashable | None = None,
4161-
region: Mapping[str, slice] | None = None,
4161+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
41624162
safe_chunks: bool = True,
41634163
storage_options: dict[str, str] | None = None,
41644164
zarr_version: int | None = None,
@@ -4237,6 +4237,12 @@ def to_zarr(
42374237
in with ``region``, use a separate call to ``to_zarr()`` with
42384238
``compute=False``. See "Appending to existing Zarr stores" in
42394239
the reference documentation for full details.
4240+
4241+
Users are expected to ensure that the specified region aligns with
4242+
Zarr chunk boundaries, and that dask chunks are also aligned.
4243+
Xarray makes limited checks that these multiple chunk boundaries line up.
4244+
It is possible to write incomplete chunks and corrupt the data with this
4245+
option if you are not careful.
42404246
safe_chunks : bool, default: True
42414247
If True, only allow writes to when there is a many-to-one relationship
42424248
between Zarr chunks (specified in encoding) and Dask chunks.

xarray/core/dataset.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2452,6 +2452,12 @@ def to_zarr(
24522452
in with ``region``, use a separate call to ``to_zarr()`` with
24532453
``compute=False``. See "Appending to existing Zarr stores" in
24542454
the reference documentation for full details.
2455+
2456+
Users are expected to ensure that the specified region aligns with
2457+
Zarr chunk boundaries, and that dask chunks are also aligned.
2458+
Xarray makes limited checks that these multiple chunk boundaries line up.
2459+
It is possible to write incomplete chunks and corrupt the data with this
2460+
option if you are not careful.
24552461
safe_chunks : bool, default: True
24562462
If True, only allow writes to when there is a many-to-one relationship
24572463
between Zarr chunks (specified in encoding) and Dask chunks.

xarray/tests/test_backends.py

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2304,7 +2304,7 @@ def test_chunk_encoding_with_dask(self) -> None:
23042304
# should fail if encoding["chunks"] clashes with dask_chunks
23052305
badenc = ds.chunk({"x": 4})
23062306
badenc.var1.encoding["chunks"] = (6,)
2307-
with pytest.raises(NotImplementedError, match=r"named 'var1' would overlap"):
2307+
with pytest.raises(ValueError, match=r"named 'var1' would overlap"):
23082308
with self.roundtrip(badenc) as actual:
23092309
pass
23102310

@@ -2342,9 +2342,7 @@ def test_chunk_encoding_with_dask(self) -> None:
23422342
# but itermediate unaligned chunks are bad
23432343
badenc = ds.chunk({"x": (3, 5, 3, 1)})
23442344
badenc.var1.encoding["chunks"] = (3,)
2345-
with pytest.raises(
2346-
NotImplementedError, match=r"would overlap multiple dask chunks"
2347-
):
2345+
with pytest.raises(ValueError, match=r"would overlap multiple dask chunks"):
23482346
with self.roundtrip(badenc) as actual:
23492347
pass
23502348

@@ -2358,7 +2356,7 @@ def test_chunk_encoding_with_dask(self) -> None:
23582356
# TODO: remove this failure once synchronized overlapping writes are
23592357
# supported by xarray
23602358
ds_chunk4["var1"].encoding.update({"chunks": 5})
2361-
with pytest.raises(NotImplementedError, match=r"named 'var1' would overlap"):
2359+
with pytest.raises(ValueError, match=r"named 'var1' would overlap"):
23622360
with self.roundtrip(ds_chunk4) as actual:
23632361
pass
23642362
# override option
@@ -5753,3 +5751,80 @@ def test_zarr_region(tmp_path):
57535751

57545752
# Write without region
57555753
ds_transposed.to_zarr(tmp_path / "test.zarr", mode="r+")
5754+
5755+
5756+
@requires_zarr
5757+
@requires_dask
5758+
def test_zarr_region_chunk_partial(tmp_path):
5759+
"""
5760+
Check that writing to partial chunks with `region` fails, assuming `safe_chunks=False`.
5761+
"""
5762+
ds = (
5763+
xr.DataArray(np.arange(120).reshape(4, 3, -1), dims=list("abc"))
5764+
.rename("var1")
5765+
.to_dataset()
5766+
)
5767+
5768+
ds.chunk(5).to_zarr(tmp_path / "foo.zarr", compute=False, mode="w")
5769+
with pytest.raises(ValueError):
5770+
for r in range(ds.sizes["a"]):
5771+
ds.chunk(3).isel(a=[r]).to_zarr(
5772+
tmp_path / "foo.zarr", region=dict(a=slice(r, r + 1))
5773+
)
5774+
5775+
5776+
@requires_zarr
5777+
@requires_dask
5778+
def test_zarr_append_chunk_partial(tmp_path):
5779+
t_coords = np.array([np.datetime64("2020-01-01").astype("datetime64[ns]")])
5780+
data = np.ones((10, 10))
5781+
5782+
da = xr.DataArray(
5783+
data.reshape((-1, 10, 10)),
5784+
dims=["time", "x", "y"],
5785+
coords={"time": t_coords},
5786+
name="foo",
5787+
)
5788+
da.to_zarr(tmp_path / "foo.zarr", mode="w", encoding={"foo": {"chunks": (5, 5, 1)}})
5789+
5790+
new_time = np.array([np.datetime64("2021-01-01").astype("datetime64[ns]")])
5791+
5792+
da2 = xr.DataArray(
5793+
data.reshape((-1, 10, 10)),
5794+
dims=["time", "x", "y"],
5795+
coords={"time": new_time},
5796+
name="foo",
5797+
)
5798+
with pytest.raises(ValueError, match="encoding was provided"):
5799+
da2.to_zarr(
5800+
tmp_path / "foo.zarr",
5801+
append_dim="time",
5802+
mode="a",
5803+
encoding={"foo": {"chunks": (1, 1, 1)}},
5804+
)
5805+
5806+
# chunking with dask sidesteps the encoding check, so we need a different check
5807+
with pytest.raises(ValueError, match="Specified zarr chunks"):
5808+
da2.chunk({"x": 1, "y": 1, "time": 1}).to_zarr(
5809+
tmp_path / "foo.zarr", append_dim="time", mode="a"
5810+
)
5811+
5812+
5813+
@requires_zarr
5814+
@requires_dask
5815+
def test_zarr_region_chunk_partial_offset(tmp_path):
5816+
# https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545
5817+
store = tmp_path / "foo.zarr"
5818+
data = np.ones((30,))
5819+
da = xr.DataArray(data, dims=["x"], coords={"x": range(30)}, name="foo").chunk(x=10)
5820+
da.to_zarr(store, compute=False)
5821+
5822+
da.isel(x=slice(10)).chunk(x=(10,)).to_zarr(store, region="auto")
5823+
5824+
da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(
5825+
store, safe_chunks=False, region="auto"
5826+
)
5827+
5828+
# This write is unsafe, and should raise an error, but does not.
5829+
# with pytest.raises(ValueError):
5830+
# da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto")

0 commit comments

Comments
 (0)