Preserve dtype better when specified.

dcherian · dcherian · commit 24be2d191b80 · 2024-09-04T10:38:49.000-06:00
diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -292,8 +292,8 @@ def __repr__(self) -> str:
     combine="sum",
     fill_value=0,
     final_fill_value=0,
-    dtypes=np.intp,
-    final_dtype=np.intp,
+    dtypes=np.integer,
+    final_dtype=np.integer,
 )
 
 # note that the fill values are the result of np.func([np.nan, np.nan])
@@ -521,20 +521,23 @@ def quantile_new_dims_func(q) -> tuple[Dim]:
     return (Dim(name="quantile", values=q),)
 
 
+# If the input contains integers or floats smaller than float64, the output
+# dtype is float64. Otherwise, the output dtype is the same as that of the
+# input.
 quantile = Aggregation(
     name="quantile",
     fill_value=dtypes.NA,
     chunk=None,
     combine=None,
-    final_dtype=np.floating,
+    final_dtype=np.float64,
     new_dims_func=quantile_new_dims_func,
 )
 nanquantile = Aggregation(
     name="nanquantile",
     fill_value=dtypes.NA,
     chunk=None,
     combine=None,
-    final_dtype=np.floating,
+    final_dtype=np.float64,
     new_dims_func=quantile_new_dims_func,
 )
 mode = Aggregation(
@@ -780,10 +783,8 @@ def _initialize_aggregation(
         np.dtype(dtype) if dtype is not None and not isinstance(dtype, np.dtype) else dtype
     )
     final_dtype = dtypes._normalize_dtype(
-        dtype_ or agg.dtype_init["final"], array_dtype, fill_value
+        dtype_ or agg.dtype_init["final"], array_dtype, agg.preserves_dtype, fill_value
     )
-    if not agg.preserves_dtype:
-        final_dtype = dtypes._maybe_promote_int(final_dtype)
     agg.dtype = {
         "user": dtype,  # Save to automatically choose an engine
         "final": final_dtype,
diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py
@@ -150,9 +150,14 @@ def is_datetime_like(dtype):
     return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)
 
 
-def _normalize_dtype(dtype: DTypeLike, array_dtype: np.dtype, fill_value=None) -> np.dtype:
+def _normalize_dtype(
+    dtype: DTypeLike, array_dtype: np.dtype, preserves_dtype: bool, fill_value=None
+) -> np.dtype:
     if dtype is None:
-        dtype = array_dtype
+        if not preserves_dtype:
+            dtype = _maybe_promote_int(array_dtype)
+        else:
+            dtype = array_dtype
     if dtype is np.floating:
         # mean, std, var always result in floating
         # but we preserve the array's dtype if it is floating
diff --git a/tests/strategies.py b/tests/strategies.py
@@ -26,7 +26,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]:
 
 
 # TODO: stop excluding everything but U
-array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU")
+array_dtypes = supported_dtypes().filter(lambda x: x.kind not in "cmMU")
 by_dtype_st = supported_dtypes()
 
 NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list(
@@ -38,7 +38,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]:
     [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS]
 )
 numeric_arrays = npst.arrays(
-    elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st
+    elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtypes
 )
 all_arrays = npst.arrays(
     elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes()
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1929,3 +1929,13 @@ def test_ffill_bfill(chunks, size, add_nan_by, func):
     expected = flox.groupby_scan(array.compute(), by, func=func)
     actual = flox.groupby_scan(array, by, func=func)
     assert_equal(expected, actual)
+
+
+def test_agg_dtypes():
+    # Regression test for GH388: an explicitly requested dtype must be kept on the result.
+    counts = np.array([0, 2, 1, 0, 1])
+    group = np.array([1, 1, 1, 2, 2])
+    actual, _ = groupby_reduce(
+        counts, group, expected_groups=(np.array([1, 2]),), func="sum", dtype="uint8"
+    )
+    assert actual.dtype == np.uint8
diff --git a/tests/test_properties.py b/tests/test_properties.py
@@ -19,7 +19,7 @@
 from flox.xrutils import notnull
 
 from . import assert_equal
-from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays
+from .strategies import array_dtypes, by_arrays, chunked_arrays, func_st, numeric_arrays
 from .strategies import chunks as chunks_strategy
 
 dask.config.set(scheduler="sync")
@@ -223,3 +223,25 @@ def test_first_last_useless(data, func):
     actual, groups = groupby_reduce(array, by, axis=-1, func=func, engine="numpy")
     expected = np.zeros(shape[:-1] + (len(groups),), dtype=array.dtype)
     assert_equal(actual, expected)
+
+
+@given(
+    func=st.sampled_from(["sum", "prod", "nansum", "nanprod"]),
+    engine=st.sampled_from(["numpy", "flox"]),
+    array_dtype=st.none() | array_dtypes,
+    dtype=st.none() | array_dtypes,
+)
+def test_agg_dtype_specified(func, array_dtype, dtype, engine):
+    # Regression test for GH388: the result dtype must match NumPy's for the same `dtype` argument.
+    counts = np.array([0, 2, 1, 0, 1], dtype=array_dtype)
+    group = np.array([1, 1, 1, 2, 2])
+    actual, _ = groupby_reduce(
+        counts,
+        group,
+        expected_groups=(np.array([1, 2]),),
+        func=func,
+        dtype=dtype,
+        engine=engine,
+    )
+    expected = getattr(np, func)(counts, keepdims=True, dtype=dtype)
+    assert actual.dtype == expected.dtype

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]:`
`26`	`26`
`27`	`27`
`28`	`28`	`# TODO: stop excluding everything but U`
`29`		`-array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU")`
	`29`	`+array_dtypes = supported_dtypes().filter(lambda x: x.kind not in "cmMU")`
`30`	`30`	`by_dtype_st = supported_dtypes()`
`31`	`31`
`32`	`32`	`NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list(`
`@@ -38,7 +38,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]:`
`38`	`38`	`[f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS]`
`39`	`39`	`)`
`40`	`40`	`numeric_arrays = npst.arrays(`
`41`		`- elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st`
	`41`	`+ elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtypes`
`42`	`42`	`)`
`43`	`43`	`all_arrays = npst.arrays(`
`44`	`44`	`elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes()`