From 8a3a93fc9245077e6907ded658eabaee904308f2 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 May 2023 09:07:18 -0700 Subject: [PATCH 1/4] REF: split out dtype-finding in concat_compat --- pandas/core/dtypes/concat.py | 126 ++++++++++++----------------------- 1 file changed, 44 insertions(+), 82 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 1a02f9a8f0211..eafc0633f1d93 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -18,14 +18,9 @@ common_dtype_categorical_compat, find_common_type, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - DatetimeTZDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCCategoricalIndex, - ABCExtensionArray, ABCSeries, ) @@ -33,6 +28,7 @@ from pandas._typing import ( ArrayLike, AxisInt, + DtypeObj, ) from pandas.core.arrays import ( @@ -100,45 +96,54 @@ def concat_compat( # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. + orig = to_concat non_empties = [x for x in to_concat if _is_nonempty(x, axis)] if non_empties and axis == 0 and not ea_compat_axis: # ea_compat_axis see GH#39574 to_concat = non_empties - dtypes = {obj.dtype for obj in to_concat} - kinds = {obj.dtype.kind for obj in to_concat} - contains_datetime = any( - isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM" - for dtype in dtypes - ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat) + any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties) + + if len(to_concat) < len(orig): + _, _, alt_dtype = _get_result_dtype(orig, non_empties) + + if target_dtype is not None: + to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat] + + if not isinstance(to_concat[0], np.ndarray): + # i.e. isinstance(to_concat[0], ExtensionArray) + to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat_eas) + else: + to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) + result = np.concatenate(to_concat_arrs, axis=axis) + + if not any_ea and "b" in kinds and result.dtype.kind in "iuf": + # GH#39817 cast to object instead of casting bools to numeric + result = result.astype(object, copy=False) + return result - all_empty = not len(non_empties) - single_dtype = len(dtypes) == 1 - any_ea = any(isinstance(x, ExtensionDtype) for x in dtypes) - if contains_datetime: - return _concat_datetime(to_concat, axis=axis) +def _get_result_dtype( + to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike] +) -> tuple[bool, set[str], DtypeObj]: + target_dtype = None + dtypes = {obj.dtype for obj in to_concat} + kinds = {obj.dtype.kind for obj in to_concat} + + any_ea = any(not isinstance(x, np.ndarray) for x in to_concat) if any_ea: + # i.e. any ExtensionArrays + # we ignore axis here, as internally concatting with EAs is always # for axis=0 - if not single_dtype: + if len(dtypes) != 1: target_dtype = find_common_type([x.dtype for x in to_concat]) target_dtype = common_dtype_categorical_compat(to_concat, target_dtype) - to_concat = [ - astype_array(arr, target_dtype, copy=False) for arr in to_concat - ] - - if isinstance(to_concat[0], ABCExtensionArray): - # TODO: what about EA-backed Index? - to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat_eas) - else: - to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) - return np.concatenate(to_concat_arrs) - elif all_empty: + elif not len(non_empties): # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) @@ -148,17 +153,16 @@ def concat_compat( pass else: # coerce to object - to_concat = [x.astype("object") for x in to_concat] + target_dtype = np.dtype(object) kinds = {"o"} + else: + # Argument 1 to "list" has incompatible type "Set[Union[ExtensionDtype, + # Any]]"; expected "Iterable[Union[dtype[Any], None, Type[Any], + # _SupportsDType[dtype[Any]], str, Tuple[Any, Union[SupportsIndex, + # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + target_dtype = np.find_common_type(list(dtypes), []) # type: ignore[arg-type] - # error: Argument 1 to "concatenate" has incompatible type - # "Sequence[Union[ExtensionArray, ndarray[Any, Any]]]"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[_SupportsArray[dtype[Any]]]]" - result: np.ndarray = np.concatenate(to_concat, axis=axis) # type: ignore[arg-type] - if "b" in kinds and result.dtype.kind in "iuf": - # GH#39817 cast to object instead of casting bools to numeric - result = result.astype(object, copy=False) - return result + return any_ea, kinds, target_dtype def union_categoricals( @@ -320,45 +324,3 @@ def _maybe_unwrap(x): dtype = CategoricalDtype(categories=categories, ordered=ordered) return Categorical._simple_new(new_codes, dtype=dtype) - - -def _concatenate_2d(to_concat: Sequence[np.ndarray], axis: AxisInt) -> np.ndarray: - # coerce to 2d if needed & concatenate - if axis == 1: - to_concat = [np.atleast_2d(x) for x in to_concat] - return np.concatenate(to_concat, axis=axis) - - -def _concat_datetime(to_concat: Sequence[ArrayLike], axis: AxisInt = 0) -> ArrayLike: - """ - provide concatenation of an datetimelike array of arrays each of which is a - single M8[ns], datetime64[ns, tz] or m8[ns] dtype - - Parameters - ---------- - to_concat : sequence of arrays - axis : axis to provide concatenation - - Returns - ------- - a single array, preserving the combined dtypes - """ - from pandas.core.construction import ensure_wrapped_if_datetimelike - - to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - - single_dtype = lib.dtypes_all_equal([x.dtype for x in to_concat]) - - # multiple types, need to coerce to object - if not single_dtype: - # ensure_wrapped_if_datetimelike ensures that astype(object) wraps - # in Timestamp/Timedelta - return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) - - # error: Unexpected keyword argument "axis" for "_concat_same_type" of - # "ExtensionArray" - to_concat_eas = cast("list[ExtensionArray]", to_concat) - result = type(to_concat_eas[0])._concat_same_type( # type: ignore[call-arg] - to_concat_eas, axis=axis - ) - return result From 1f69ef280c3af9615c0a28197d4afb159a9dc191 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 May 2023 10:39:42 -0700 Subject: [PATCH 2/4] mypy fixup --- pandas/core/dtypes/concat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index eafc0633f1d93..e5b80a8b371fa 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -162,7 +162,10 @@ def _get_result_dtype( # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" target_dtype = np.find_common_type(list(dtypes), []) # type: ignore[arg-type] - return any_ea, kinds, target_dtype + # error: Incompatible return value type (got "Tuple[bool, Set[Union[str, Any]], + # Union[dtype[Any], ExtensionDtype, None]]", expected "Tuple[bool, + # Set[str], Union[dtype[Any], ExtensionDtype]]") + return any_ea, kinds, target_dtype # type: ignore[return-value] def union_categoricals( From 783b7642b2b56f01fd498bda05828aa0bfd0bc88 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 May 2023 12:40:59 -0700 Subject: [PATCH 3/4] fix annotation --- pandas/core/dtypes/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index e5b80a8b371fa..d5a99d812f3b5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -127,7 +127,7 @@ def concat_compat( def _get_result_dtype( to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike] -) -> tuple[bool, set[str], DtypeObj]: +) -> tuple[bool, set[str], DtypeObj | None]: target_dtype = None dtypes = {obj.dtype for obj in to_concat} From fd891de2db109c87c8a3531dafd4e09a09abce90 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 May 2023 12:43:40 -0700 Subject: [PATCH 4/4] remove unused ignore --- pandas/core/dtypes/concat.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d5a99d812f3b5..4a25c3541a398 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -162,10 +162,7 @@ def _get_result_dtype( # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" target_dtype = np.find_common_type(list(dtypes), []) # type: ignore[arg-type] - # error: Incompatible return value type (got "Tuple[bool, Set[Union[str, Any]], - # Union[dtype[Any], ExtensionDtype, None]]", expected "Tuple[bool, - # Set[str], Union[dtype[Any], ExtensionDtype]]") - return any_ea, kinds, target_dtype # type: ignore[return-value] + return any_ea, kinds, target_dtype def union_categoricals(