From c1b8d4a0c5b486c41d0ec52c823cc1e83de075f7 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:20:11 +0000
Subject: [PATCH 1/7] recursive tokenize

---
 xarray/core/dataarray.py  |  4 +++-
 xarray/core/dataset.py    |  6 +++++-
 xarray/core/variable.py   |  8 ++++++--
 xarray/tests/test_dask.py | 14 ++++++++++++++
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 5e164f420c8..a192fe08cee 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -755,7 +755,9 @@ def reset_coords(
         return dataset
 
     def __dask_tokenize__(self):
-        return (type(self), self._variable, self._coords, self._name)
+        from dask.base import normalize_token
+
+        return normalize_token((type(self), self._variable, self._coords, self._name))
 
     def __dask_graph__(self):
         return self._to_temp_dataset().__dask_graph__()
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index dc5a315e72a..fd4bcfb1ebb 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -652,7 +652,11 @@ def load(self, **kwargs) -> "Dataset":
         return self
 
     def __dask_tokenize__(self):
-        return (type(self), self._variables, self._coord_names, self._attrs)
+        from dask.base import normalize_token
+
+        return normalize_token((
+            type(self), self._variables, self._coord_names, self._attrs
+        ))
 
     def __dask_graph__(self):
         graphs = {k: v.__dask_graph__() for k, v in self.variables.items()}
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 916df75b3e0..f842a4a9428 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -393,7 +393,9 @@ def compute(self, **kwargs):
     def __dask_tokenize__(self):
         # Use v.data, instead of v._data, in order to cope with the wrappers
         # around NetCDF and the like
-        return type(self), self._dims, self.data, self._attrs
+        from dask.base import normalize_token
+
+        return normalize_token((type(self), self._dims, self.data, self._attrs))
 
     def __dask_graph__(self):
         if isinstance(self._data, dask_array_type):
@@ -1973,8 +1975,10 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             self._data = PandasIndexAdapter(self._data)
 
     def __dask_tokenize__(self):
+        from dask.base import normalize_token
+
         # Don't waste time converting pd.Index to np.ndarray
-        return (type(self), self._dims, self._data.array, self._attrs)
+        return normalize_token((type(self), self._dims, self._data.array, self._attrs))
 
     def load(self):
         # data is already loaded into memory for IndexVariable
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index fa8ae9991d7..aa6c2b80477 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1283,6 +1283,20 @@ def test_token_identical(obj, transform):
     )
 
 
+def test_recursive_token():
+    """Test that tokenization is invoked recursively, and doesn't just rely on the
+    output of str()
+    """
+    a = np.ones(10000)
+    b = np.ones(10000)
+    b[5000] = 2
+    assert str(a) == str(b)
+    assert dask.base.tokenize(a) != dask.base.tokenize(b)
+    xa = DataArray(a)
+    xb = DataArray(b)
+    assert dask.base.tokenize(xa) != dask.base.tokenize(xb)
+
+
 @requires_scipy_or_netCDF4
 def test_normalize_token_with_backend(map_ds):
     with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
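
Note (not part of the patch series): the reason every __dask_tokenize__ above
now routes through normalize_token is that dask (as of 2.x) embeds the return
value of __dask_tokenize__ in the token as-is, without normalizing its
elements, and the final hash is taken over str() of that structure. str() of a
large numpy array is truncated, so two distinct arrays wrapped in xarray
objects could collide. A minimal standalone sketch of the failure mode,
assuming only numpy and dask are installed:

    import dask.base
    import numpy as np

    a = np.ones(10000)
    b = np.ones(10000)
    b[5000] = 2

    # str() truncates large arrays, so a and b are indistinguishable as text...
    assert str(a) == str(b)
    # ...but dask's registered numpy normalizer hashes the underlying buffer:
    assert dask.base.tokenize(a) != dask.base.tokenize(b)

Calling normalize_token explicitly makes the normalization recurse into the
Variable, coords and attrs, so the raw buffers are hashed rather than str()'d.
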
From 25b655d0bccc30954a0158093e46f791beb903c7 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:20:48 +0000
Subject: [PATCH 2/7] black

---
 xarray/core/dataset.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index fd4bcfb1ebb..fe8abdc4b95 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -654,9 +654,9 @@ def load(self, **kwargs) -> "Dataset":
     def __dask_tokenize__(self):
         from dask.base import normalize_token
 
-        return normalize_token((
-            type(self), self._variables, self._coord_names, self._attrs
-        ))
+        return normalize_token(
+            (type(self), self._variables, self._coord_names, self._attrs)
+        )
 
     def __dask_graph__(self):
         graphs = {k: v.__dask_graph__() for k, v in self.variables.items()}

From 4f395b6ad13621b2c21599ef4ce96995eae14e44 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:35:35 +0000
Subject: [PATCH 3/7] What's New

---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 96f0ba9a4a6..620617c127a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -73,7 +73,7 @@ New Features
   for xarray objects. Note that xarray objects with a dask.array backend already used
   deterministic hashing in previous releases; this change implements it when whole
   xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is
-  invoked. (:issue:`3378`, :pull:`3446`)
+  invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
   By `Deepak Cherian `_ and
   `Guido Imperiale `_.
 

From d2f8b5d02b039e2b3cebbd890e934dd4c02155b5 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:41:14 +0000
Subject: [PATCH 4/7] Also test Dataset

---
 xarray/tests/test_dask.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index aa6c2b80477..62f0af1b1ba 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1292,9 +1292,12 @@ def test_recursive_token():
     b[5000] = 2
     assert str(a) == str(b)
     assert dask.base.tokenize(a) != dask.base.tokenize(b)
-    xa = DataArray(a)
-    xb = DataArray(b)
-    assert dask.base.tokenize(xa) != dask.base.tokenize(xb)
+    da_a = DataArray(a)
+    da_b = DataArray(b)
+    assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)
+    ds_a = da_a.to_dataset(name="x")
+    ds_b = da_b.to_dataset(name="x")
+    assert dask.base.tokenize(ds_a) != dask.base.tokenize(ds_b)
 
 
 @requires_scipy_or_netCDF4

From 7f47b459b994a8c607dcaad6a9c853420415e3f3 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:43:56 +0000
Subject: [PATCH 5/7] Also test IndexVariable

---
 xarray/tests/test_dask.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 62f0af1b1ba..1d40ba970ff 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1292,13 +1292,24 @@ def test_recursive_token():
     b[5000] = 2
     assert str(a) == str(b)
     assert dask.base.tokenize(a) != dask.base.tokenize(b)
+
+    # Test DataArray and Variable
     da_a = DataArray(a)
     da_b = DataArray(b)
+    assert str(da_a) == str(da_b)
     assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)
+
+    # Test Dataset
     ds_a = da_a.to_dataset(name="x")
     ds_b = da_b.to_dataset(name="x")
     assert dask.base.tokenize(ds_a) != dask.base.tokenize(ds_b)
 
+    # Test IndexVariable
+    da_a = DataArray(a, dims=["x"], coords={"x": a})
+    da_b = DataArray(a, dims=["x"], coords={"x": b})
+    assert str(da_a) == str(da_b)
+    assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)
+
 
 @requires_scipy_or_netCDF4
 def test_normalize_token_with_backend(map_ds):
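
Note on the IndexVariable change carried through this series: its
__dask_tokenize__ hands self._data.array -- a pd.Index -- straight to
normalize_token instead of converting it to np.ndarray first. This relies on
dask shipping normalizers for pandas objects; a small sketch, assuming pandas
and dask are installed:

    import dask.base
    import pandas as pd

    idx_a = pd.Index([1, 2, 3])
    idx_b = pd.Index([1, 2, 4])
    # Tokenized directly via dask's pandas normalizer, with no intermediate
    # np.ndarray copy:
    assert dask.base.tokenize(idx_a) != dask.base.tokenize(idx_b)
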
From c31172576a655c5bc94b2cfad3b5fb4b43d86e89 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 22:49:33 +0000
Subject: [PATCH 6/7] Cleanup

---
 xarray/tests/test_dask.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 1d40ba970ff..43b788153bc 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1296,7 +1296,6 @@ def test_recursive_token():
     # Test DataArray and Variable
     da_a = DataArray(a)
     da_b = DataArray(b)
-    assert str(da_a) == str(da_b)
     assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)
 
     # Test Dataset
@@ -1307,7 +1306,6 @@ def test_recursive_token():
     # Test IndexVariable
     da_a = DataArray(a, dims=["x"], coords={"x": a})
     da_b = DataArray(a, dims=["x"], coords={"x": b})
-    assert str(da_a) == str(da_b)
     assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)
 
 

From 36ad4f7d4d2f238cccb20d48c83d604ad431c49d Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Tue, 12 Nov 2019 23:23:31 +0000
Subject: [PATCH 7/7] tokenize sparse objects

---
 xarray/tests/test_sparse.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
index a31da162487..a02fef2faeb 100644
--- a/xarray/tests/test_sparse.py
+++ b/xarray/tests/test_sparse.py
@@ -856,6 +856,10 @@ def test_dask_token():
     import dask
 
     s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
+
+    # https://github.com/pydata/sparse/issues/300
+    s.__dask_tokenize__ = lambda: dask.base.normalize_token(s.__dict__)
+
     a = DataArray(s)
     t1 = dask.base.tokenize(a)
     t2 = dask.base.tokenize(a)
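
Note (not part of the patch series): the test above monkeypatches a single COO
instance to work around pydata/sparse#300 until sparse implements
__dask_tokenize__ itself. A hypothetical alternative, sketched here under the
assumption that sparse.COO exposes .coords, .data and .fill_value (true for
sparse >= 0.8), is to register a normalizer for the whole class from user
code, recursing for the same reason as the xarray patches above:

    import dask.base
    import numpy as np
    import sparse

    @dask.base.normalize_token.register(sparse.COO)
    def normalize_coo(x):
        # Recurse so the coordinate/data buffers are hashed, not str()'d:
        return (type(x), dask.base.normalize_token((x.coords, x.data, x.fill_value)))

    s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
    print(dask.base.tokenize(s))  # deterministic across processes
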