From 52270a46e38a06c218a4e06f05b09f26a03fdee8 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 29 Sep 2022 17:04:25 +0200 Subject: [PATCH 01/15] Add a dtype check for numpy arrays --- tests/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/__init__.py b/tests/__init__.py index fef0a778e..fa95ea49d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -98,6 +98,9 @@ def assert_equal(a, b): # does some validation of the dask graph da.utils.assert_eq(a, b, equal_nan=True) else: + if a.dtype != b.dtype: + raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})") + np.testing.assert_allclose(a, b, equal_nan=True) From e8f62ce814c0fb7766e1e44752322812709710f5 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 20:58:11 -0600 Subject: [PATCH 02/15] speciify group dtype --- tests/test_core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index c70ae83c6..25e93f1dd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -126,7 +126,9 @@ def test_groupby_reduce( split_out=split_out, engine=engine, ) - assert_equal(groups, [0, 1, 2]) + g_dtype = by.dtype if expected_groups is None else np.asarray(expected_groups).dtype + + assert_equal(groups, np.array([0, 1, 2], g_dtype)) assert_equal(expected, result) From e42690c2af4b9ff5dd189b94b87fcd92d0fd2d41 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 20:58:30 -0600 Subject: [PATCH 03/15] Fix minmax reduction dtype --- flox/core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/flox/core.py b/flox/core.py index fe6bbe475..66a9c868e 100644 --- a/flox/core.py +++ b/flox/core.py @@ -55,6 +55,12 @@ def _is_arg_reduction(func: str | Aggregation) -> bool: return False +def _is_minmax_reduction(func: str | Aggregation) -> bool: + return not _is_arg_reduction(func) and ( + isinstance(func, str) and ("max" in func or "min" in func) + ) + + def _get_expected_groups(by, sort: bool) -> pd.Index: if is_duck_dask_array(by): raise ValueError("Please provide expected_groups if not grouping by a numpy array.") @@ -1657,4 +1663,7 @@ def groupby_reduce( result, from_=groups[0], to=expected_groups, fill_value=fill_value ).reshape(result.shape[:-1] + grp_shape) groups = final_groups + + if _is_minmax_reduction(func): + result = result.astype(bool) return (result, *groups) From 4b22512cb5fadbfec067f7ef01cd66e099f60144 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 21:05:48 -0600 Subject: [PATCH 04/15] small fix --- flox/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index 66a9c868e..5e23f96fd 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1664,6 +1664,6 @@ def groupby_reduce( ).reshape(result.shape[:-1] + grp_shape) groups = final_groups - if _is_minmax_reduction(func): + if _is_minmax_reduction(func) and np.issubdtype(array.dtype, bool): result = result.astype(bool) return (result, *groups) From e412a66263d8bcc688e569ec6666b018fd486bea Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 21:10:54 -0600 Subject: [PATCH 05/15] specify eye dtype --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 25e93f1dd..be98ba71e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -923,7 +923,7 @@ def test_multiple_groupers(): reindex=True, func="count", ) - expected = np.eye(5, 5) + expected = np.eye(5, 5, dtype=int) assert_equal(expected, actual) From c04ce3be5d4cefc895fb6cc13f8c9ba1a8880351 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 21:13:13 -0600 Subject: [PATCH 06/15] moar fix --- flox/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flox/core.py b/flox/core.py index 5e23f96fd..37c3ddd57 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1484,7 +1484,8 @@ def groupby_reduce( if not is_duck_array(array): array = np.asarray(array) - array = array.astype(int) if np.issubdtype(array.dtype, bool) else array + is_bool_array = np.issubdtype(array.dtype, bool) + array = array.astype(int) if is_bool_array else array if isinstance(isbin, bool): isbin = (isbin,) * len(by) @@ -1664,6 +1665,6 @@ def groupby_reduce( ).reshape(result.shape[:-1] + grp_shape) groups = final_groups - if _is_minmax_reduction(func) and np.issubdtype(array.dtype, bool): + if _is_minmax_reduction(func) and is_bool_array: result = result.astype(bool) return (result, *groups) From 104393511ad6d390efe92e8d83d669759e1e4bfc Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 7 Oct 2022 21:18:55 -0600 Subject: [PATCH 07/15] Fixed map_reduce_blockwise mixed --- tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index be98ba71e..ad79a9ea0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -848,13 +848,13 @@ def test_bool_reductions(func, engine): def test_map_reduce_blockwise_mixed(): t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series() data = t.dt.dayofyear - actual = groupby_reduce( + actual, _ = groupby_reduce( dask.array.from_array(data.values, chunks=365), t.dt.month, func="mean", method="split-reduce", ) - expected = groupby_reduce(data, t.dt.month, func="mean") + expected, _ = groupby_reduce(data, t.dt.month, func="mean") assert_equal(expected, actual) From ad8ca0d600c222f142bd7442ba2d5740341523fd Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Oct 2022 09:18:25 +0200 Subject: [PATCH 08/15] Add ignores to failed imports --- tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index fa95ea49d..9917b41fc 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -14,7 +14,7 @@ dask_array_type = da.Array except ImportError: - dask_array_type = () + dask_array_type = () # type: ignore try: @@ -22,7 +22,7 @@ xr_types = (xr.DataArray, xr.Dataset) except ImportError: - xr_types = () + xr_types = () # type: ignore def _importorskip(modname, minversion=None): From fba418b95a1b0120bde59f931f6c8b0c1ea40bb6 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Oct 2022 09:20:16 +0200 Subject: [PATCH 09/15] align more types --- flox/core.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/flox/core.py b/flox/core.py index a39883417..8c2ef1bfb 100644 --- a/flox/core.py +++ b/flox/core.py @@ -37,8 +37,12 @@ if TYPE_CHECKING: import dask.array.Array as DaskArray + T_ExpectedGroups = Union[Sequence, np.ndarray] + T_ExpectedGroupsOpt = Union[T_ExpectedGroups, None] + T_ExpectedGroupsIndex = tuple[pd.Index | None, ...] T_Func = Union[str, Callable] T_Funcs = Union[T_Func, Sequence[T_Func]] + T_Func2 = Union[str, Aggregation] T_Axis = int T_Axes = tuple[T_Axis, ...] T_AxesOpt = Union[T_Axis, T_Axes, None] @@ -60,7 +64,7 @@ DUMMY_AXIS = -2 -def _is_arg_reduction(func: str | Aggregation) -> bool: +def _is_arg_reduction(func: T_Func2) -> bool: if isinstance(func, str) and func in ["argmin", "argmax", "nanargmax", "nanargmin"]: return True if isinstance(func, Aggregation) and func.reduction_type == "argreduce": @@ -68,7 +72,7 @@ def _is_arg_reduction(func: str | Aggregation) -> bool: return False -def _is_minmax_reduction(func: str | Aggregation) -> bool: +def _is_minmax_reduction(func: T_Func2) -> bool: return not _is_arg_reduction(func) and ( isinstance(func, str) and ("max" in func or "min" in func) ) @@ -1033,7 +1037,16 @@ def split_blocks(applied, split_out, expected_groups, split_name): def _reduce_blockwise( - array, by, agg, *, axis: T_Axes, expected_groups, fill_value, engine: T_Engine, sort, reindex + array, + by, + agg: Aggregation, + *, + axis: T_Axes, + expected_groups, + fill_value, + engine: T_Engine, + sort, + reindex, ) -> FinalResultsDict: """ Blockwise groupby reduction that produces the final result. This code path is @@ -1341,8 +1354,8 @@ def _assert_by_is_aligned(shape, by): def _convert_expected_groups_to_index( - expected_groups: Iterable, isbin: Sequence[bool], sort: bool -) -> tuple[pd.Index | None, ...]: + expected_groups: T_ExpectedGroups, isbin: Sequence[bool], sort: bool +) -> T_ExpectedGroupsIndex: out: list[pd.Index | None] = [] for ex, isbin_ in zip(expected_groups, isbin): if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin): @@ -1403,8 +1416,8 @@ def _factorize_multiple(by, expected_groups, by_is_dask, reindex): def groupby_reduce( array: np.ndarray | DaskArray, *by: np.ndarray | DaskArray, - func: str | Aggregation, - expected_groups: Sequence | np.ndarray | None = None, + func: T_Func2, + expected_groups: T_ExpectedGroupsOpt = None, sort: bool = True, isbin: T_IsBins = False, axis: T_AxesOpt = None, From 18722baaa2b10ddb1a5d703abadba4dd66799eeb Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Oct 2022 09:20:36 +0200 Subject: [PATCH 10/15] Add typing to some tests --- tests/test_core.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index ad79a9ea0..eab8df0f2 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,4 +1,5 @@ from functools import reduce +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -63,6 +64,9 @@ def dask_array_ones(*args): pytest.param("nanmedian", marks=(pytest.mark.skip,)), ) +if TYPE_CHECKING: + from flox.core import T_Engine, T_Func2, T_ExpectedGroupsOpt + def test_alignment_error(): da = np.ones((12,)) @@ -101,8 +105,16 @@ def test_alignment_error(): ], ) def test_groupby_reduce( - array, by, expected, func, expected_groups, chunk, split_out, dtype, engine -): + engine: T_Engine, + func: T_Func2, + array: np.ndarray, + by: np.ndarray, + expected: list[float], + expected_groups: T_ExpectedGroupsOpt, + chunk: bool, + split_out: int, + dtype: np.typing.DTypeLike, +) -> None: array = array.astype(dtype) if chunk: if not has_dask or expected_groups is None: @@ -110,12 +122,12 @@ def test_groupby_reduce( array = da.from_array(array, chunks=(3,) if array.ndim == 1 else (1, 3)) by = da.from_array(by, chunks=(3,) if by.ndim == 1 else (1, 3)) - if "mean" in func: - expected = np.array(expected, dtype=float) + if func == "mean" or func == "nanmean": + expected_result = np.array(expected, dtype=float) elif func == "sum": - expected = np.array(expected, dtype=dtype) + expected_result = np.array(expected, dtype=dtype) elif func == "count": - expected = np.array(expected, dtype=int) + expected_result = np.array(expected, dtype=int) result, groups, = groupby_reduce( array, @@ -129,7 +141,7 @@ def test_groupby_reduce( g_dtype = by.dtype if expected_groups is None else np.asarray(expected_groups).dtype assert_equal(groups, np.array([0, 1, 2], g_dtype)) - assert_equal(expected, result) + assert_equal(expected_result, result) def gen_array_by(size, func): @@ -845,7 +857,7 @@ def test_bool_reductions(func, engine): @requires_dask -def test_map_reduce_blockwise_mixed(): +def test_map_reduce_blockwise_mixed() -> None: t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series() data = t.dt.dayofyear actual, _ = groupby_reduce( @@ -910,7 +922,7 @@ def test_factorize_values_outside_bins(): assert_equal(expected, actual) -def test_multiple_groupers(): +def test_multiple_groupers() -> None: actual, *_ = groupby_reduce( np.ones((5, 2)), np.arange(10).reshape(5, 2), From 27abdda48d70e3e4a4763cf7c2fadbb326ba84c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 8 Oct 2022 07:21:01 +0000 Subject: [PATCH 11/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index eab8df0f2..ea2388490 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -65,7 +65,7 @@ def dask_array_ones(*args): ) if TYPE_CHECKING: - from flox.core import T_Engine, T_Func2, T_ExpectedGroupsOpt + from flox.core import T_Engine, T_ExpectedGroupsOpt, T_Func2 def test_alignment_error(): From 6d57fcf5f9bc4eb24f887a2a63dfd9d02ab8bc59 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Oct 2022 09:22:07 +0200 Subject: [PATCH 12/15] Update core.py --- flox/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index 8c2ef1bfb..4fbab63d6 100644 --- a/flox/core.py +++ b/flox/core.py @@ -11,7 +11,6 @@ Any, Callable, Dict, - Iterable, Literal, Mapping, Sequence, From 7a2bd669464a43593b787cc17c5b3f170b669d25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 8 Oct 2022 07:22:46 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flox/core.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/flox/core.py b/flox/core.py index 4fbab63d6..3ab4f428a 100644 --- a/flox/core.py +++ b/flox/core.py @@ -6,16 +6,7 @@ import operator from collections import namedtuple from functools import partial, reduce -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Literal, - Mapping, - Sequence, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Mapping, Sequence, Union import numpy as np import numpy_groupies as npg From 8feb56fd41bfcbf35210075611e877778ad34f1a Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Oct 2022 09:24:18 +0200 Subject: [PATCH 14/15] Update test_core.py --- tests/test_core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index ea2388490..25660e734 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from functools import reduce from typing import TYPE_CHECKING From bdb8f78167ee3f2ea64fc45050acd69504f685d9 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sun, 9 Oct 2022 10:53:32 -0600 Subject: [PATCH 15/15] Small typing improvements --- flox/core.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/flox/core.py b/flox/core.py index 3ab4f428a..15577d35c 100644 --- a/flox/core.py +++ b/flox/core.py @@ -27,12 +27,11 @@ if TYPE_CHECKING: import dask.array.Array as DaskArray - T_ExpectedGroups = Union[Sequence, np.ndarray] + T_ExpectedGroups = Union[Sequence, np.ndarray, pd.Index] T_ExpectedGroupsOpt = Union[T_ExpectedGroups, None] - T_ExpectedGroupsIndex = tuple[pd.Index | None, ...] T_Func = Union[str, Callable] T_Funcs = Union[T_Func, Sequence[T_Func]] - T_Func2 = Union[str, Aggregation] + T_Agg = Union[str, Aggregation] T_Axis = int T_Axes = tuple[T_Axis, ...] T_AxesOpt = Union[T_Axis, T_Axes, None] @@ -54,7 +53,7 @@ DUMMY_AXIS = -2 -def _is_arg_reduction(func: T_Func2) -> bool: +def _is_arg_reduction(func: T_Agg) -> bool: if isinstance(func, str) and func in ["argmin", "argmax", "nanargmax", "nanargmin"]: return True if isinstance(func, Aggregation) and func.reduction_type == "argreduce": @@ -62,7 +61,7 @@ def _is_arg_reduction(func: T_Func2) -> bool: return False -def _is_minmax_reduction(func: T_Func2) -> bool: +def _is_minmax_reduction(func: T_Agg) -> bool: return not _is_arg_reduction(func) and ( isinstance(func, str) and ("max" in func or "min" in func) ) @@ -1345,7 +1344,7 @@ def _assert_by_is_aligned(shape, by): def _convert_expected_groups_to_index( expected_groups: T_ExpectedGroups, isbin: Sequence[bool], sort: bool -) -> T_ExpectedGroupsIndex: +) -> tuple[pd.Index | None, ...]: out: list[pd.Index | None] = [] for ex, isbin_ in zip(expected_groups, isbin): if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin): @@ -1406,7 +1405,7 @@ def _factorize_multiple(by, expected_groups, by_is_dask, reindex): def groupby_reduce( array: np.ndarray | DaskArray, *by: np.ndarray | DaskArray, - func: T_Func2, + func: T_Agg, expected_groups: T_ExpectedGroupsOpt = None, sort: bool = True, isbin: T_IsBins = False,