From c1b5c1c6406e7cc76cfed38d221c458c654f29a7 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Sep 2024 19:33:21 -0600 Subject: [PATCH 1/9] Avoid rechunking when preferred_method="blockwise" --- flox/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index c419f7465..7cab9b394 100644 --- a/flox/core.py +++ b/flox/core.py @@ -642,6 +642,7 @@ def rechunk_for_blockwise(array: DaskArray, axis: T_Axis, labels: np.ndarray) -> DaskArray Rechunked array """ + # TODO: this should be unnecessary? labels = factorize_((labels,), axes=())[0] chunks = array.chunks[axis] newchunks = _get_optimal_chunks_for_groups(chunks, labels) @@ -2623,7 +2624,8 @@ def groupby_reduce( partial_agg = partial(dask_groupby_agg, **kwargs) - if method == "blockwise" and by_.ndim == 1: + # if preferred method is already blockwise, no need to rechunk + if preferred_method != "blockwise" and method == "blockwise" and by_.ndim == 1: array = rechunk_for_blockwise(array, axis=-1, labels=by_) result, groups = partial_agg( From e7c603b8b7645333f33ae73f0c79e8b7e477ac2a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Sep 2024 21:17:20 -0600 Subject: [PATCH 2/9] Add new numpy1 environment --- .github/workflows/ci.yaml | 3 +++ ci/env-numpy1.yml | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 ci/env-numpy1.yml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bbee15060..2932271b0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -37,6 +37,9 @@ jobs: - os: "ubuntu-latest" env: "minimal-requirements" python-version: "3.10" + - os: "windows-latest" + env: "env-numpy1" + python-version: "3.10" steps: - uses: actions/checkout@v4 with: diff --git a/ci/env-numpy1.yml b/ci/env-numpy1.yml new file mode 100644 index 000000000..30dccdc0e --- /dev/null +++ b/ci/env-numpy1.yml @@ -0,0 +1,30 @@ +name: flox-tests +channels: + - conda-forge +dependencies: + - asv + - cachey + - cftime + - codecov + - cubed>=0.14.3 + - dask-core + - pandas + - numpy<2 + - scipy + - lxml # for mypy coverage report + - matplotlib + - pip + - pytest + - pytest-cov + - pytest-pretty + - pytest-xdist + - syrupy + - pre-commit + - numpy_groupies>=0.9.19 + - pooch + - toolz + - numba + - numbagg>=0.3 + - hypothesis + - pip: + - git+https://github.com/dcherian/xarray.git@flox-preserve-dtype From 722dfc76c1d9067986845d3c05939ad5b7c2ee23 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2024 11:57:13 -0600 Subject: [PATCH 3/9] try int_ instead of intp --- flox/core.py | 2 +- flox/xrdtypes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flox/core.py b/flox/core.py index c419f7465..2c5fa0ddd 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2418,7 +2418,7 @@ def groupby_reduce( ) is_bool_array = np.issubdtype(array.dtype, bool) - array = array.astype(np.intp) if is_bool_array else array + array = array.astype(np.int_) if is_bool_array else array isbins = _atleast_1d(isbin, nby) diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index 34d0d2a52..e781caf32 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -179,9 +179,9 @@ def _maybe_promote_int(dtype) -> np.dtype: if not isinstance(dtype, np.dtype): dtype = np.dtype(dtype) if dtype.kind == "i": - dtype = np.result_type(dtype, np.intp) + dtype = np.result_type(dtype, np.int_) elif dtype.kind == "u": - dtype = np.result_type(dtype, np.uintp) + dtype = np.result_type(dtype, np.uint_) return dtype From 6800b8ba2e5528f0e36d65fffd587010f68a46c0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2024 12:02:51 -0600 Subject: [PATCH 4/9] Use uintp instead --- flox/xrdtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index e781caf32..b5fc8108e 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -181,7 +181,7 @@ def _maybe_promote_int(dtype) -> np.dtype: if dtype.kind == "i": dtype = np.result_type(dtype, np.int_) elif dtype.kind == "u": - dtype = np.result_type(dtype, np.uint_) + dtype = np.result_type(dtype, np.uintp) return dtype From 7db06a2a387a318fe217f8242d0efd02d36c6816 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2024 12:44:27 -0600 Subject: [PATCH 5/9] Use np.uint instead --- flox/xrdtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index b5fc8108e..e1b9bccec 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -181,7 +181,7 @@ def _maybe_promote_int(dtype) -> np.dtype: if dtype.kind == "i": dtype = np.result_type(dtype, np.int_) elif dtype.kind == "u": - dtype = np.result_type(dtype, np.uintp) + dtype = np.result_type(dtype, np.uint) return dtype From 38a556fe2bad788977f837c6b11d96edb3973421 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2024 12:50:47 -0600 Subject: [PATCH 6/9] more fixes --- flox/core.py | 6 +++--- tests/conftest.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/flox/core.py b/flox/core.py index 2c5fa0ddd..5515f254c 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2776,7 +2776,7 @@ def groupby_scan( return array is_bool_array = np.issubdtype(array.dtype, bool) - array = array.astype(np.intp) if is_bool_array else array + array = array.astype(np.int_) if is_bool_array else array if expected_groups is not None: raise NotImplementedError("Setting `expected_groups` and binning is not supported yet.") @@ -2810,9 +2810,9 @@ def groupby_scan( # it defaults to the dtype of a, unless a # has an integer dtype with a precision less than that of the default platform integer. if array.dtype.kind == "i": - agg.dtype = np.result_type(array.dtype, np.intp) + agg.dtype = np.result_type(array.dtype, np.int_) elif array.dtype.kind == "u": - agg.dtype = np.result_type(array.dtype, np.uintp) + agg.dtype = np.result_type(array.dtype, np.uint) else: agg.dtype = array.dtype if dtype is None else dtype diff --git a/tests/conftest.py b/tests/conftest.py index 4413ea1e8..3158936da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ settings.register_profile( "default", max_examples=300, + deadline=500, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.too_slow], verbosity=Verbosity.verbose, ) From 40efff208f2c0b435438f1e31ec9dc6dcc8b4fc9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2024 14:32:40 -0600 Subject: [PATCH 7/9] Add test --- tests/test_core.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index cef9ad8a1..4ead7bfb5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1997,3 +1997,11 @@ def test_agg_dtypes(func, engine): ) expected = _get_array_func(func)(counts, dtype="uint8") assert actual.dtype == np.uint8 == expected.dtype + + +def test_blockwise_avoid_rechunk(): + array = dask.array.zeros((6,), chunks=(2, 4), dtype=np.int64) + by = np.array(["1", "1", "0", "", "0", ""], dtype=" Date: Mon, 16 Sep 2024 14:50:42 -0600 Subject: [PATCH 8/9] fix --- tests/test_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_core.py b/tests/test_core.py index 4ead7bfb5..94e32a6dc 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1999,6 +1999,7 @@ def test_agg_dtypes(func, engine): assert actual.dtype == np.uint8 == expected.dtype +@requires_dask def test_blockwise_avoid_rechunk(): array = dask.array.zeros((6,), chunks=(2, 4), dtype=np.int64) by = np.array(["1", "1", "0", "", "0", ""], dtype=" Date: Mon, 16 Sep 2024 16:55:55 -0600 Subject: [PATCH 9/9] fix again --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 94e32a6dc..164f87b3c 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2005,4 +2005,4 @@ def test_blockwise_avoid_rechunk(): by = np.array(["1", "1", "0", "", "0", ""], dtype="