Skip to content
Closed
79 changes: 76 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ensure_platform_int,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
Expand Down Expand Up @@ -2348,10 +2349,82 @@ def _can_hold_na(self):
return True

@classmethod
def _concat_same_type(self, to_concat):
from pandas.core.dtypes.concat import concat_categorical
def _concat_same_type(cls, to_concat):
return cls._concat_arrays(to_concat)
# TODO: lock down stricter behavior?

return concat_categorical(to_concat)
@classmethod
def _concat_same_dtype(
cls,
to_concat,
axis: int = 0,
sort_categories: bool = False,
ignore_order: bool = False,
):
"""
Like _concat_same_type, but with the added restriction of matching dtypes.
"""
ordered = False

first = to_concat[0]

# identical categories - fastpath
categories = first.categories
ordered = first.ordered

if all(first.categories.equals(other.categories) for other in to_concat[1:]):
new_codes = np.concatenate([c.codes for c in to_concat])
else:
codes = [first.codes] + [
recode_for_categories(other.codes, other.categories, first.categories)
for other in to_concat[1:]
]
new_codes = np.concatenate(codes)

if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)

new_codes = take_1d(indexer, new_codes, fill_value=-1)

if ignore_order:
ordered = False

return cls(new_codes, categories=categories, ordered=ordered, fastpath=True)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
from pandas.core.dtypes.concat import concat_compat, union_categoricals

categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

# validate the categories
if len(categoricals) != len(to_concat):
pass
else:
# when all categories are identical
first = to_concat[0]
if all(first.is_dtype_equal(other) for other in to_concat[1:]):
return union_categoricals(categoricals)

# extract the categoricals & coerce to object if needed
to_concat = [
x._internal_get_values()
if is_categorical_dtype(x.dtype)
else np.asarray(x).ravel()
if not is_datetime64tz_dtype(x)
else np.asarray(x.astype(object))
for x in to_concat
]

result = concat_compat(to_concat)
if axis == 1:
# TODO(EA2D): this is a kludge for 1D EAs
result = result.reshape(1, len(result))
return result

def isin(self, values):
"""
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -751,6 +752,30 @@ def _concat_same_type(cls, to_concat, axis: int = 0):

return cls._simple_new(values, dtype=dtype, freq=new_freq)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate datetime-like arrays, upcasting raw datetime64/timedelta64
    ndarrays first and falling back to object dtype when the inputs do not
    share a single dtype.
    """
    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array

    arrays = [maybe_upcast_datetimelike_array(x) for x in to_concat]

    if len({arr.dtype for arr in arrays}) != 1:
        # mixed dtypes -> box everything as objects and let numpy concatenate
        as_object = [arr.astype(object) for arr in arrays]
        if axis == 1:
            # TODO(EA2D): not necessary with 2D EAs
            as_object = [np.atleast_2d(arr) for arr in as_object]
        return np.concatenate(as_object, axis=axis)

    concat_axis = axis
    if axis == 1 and is_extension_array_dtype(arrays[0].dtype):
        # TODO(EA2D): not necessary with 2D EAs
        concat_axis = 0

    result = cls._concat_same_type(arrays, axis=concat_axis)

    # NOTE: intentionally checks the possibly-downgraded concat_axis, so the
    # EA case above returns a 1-D result without reshaping (as the original).
    if concat_axis == 1 and result.ndim == 1:
        # TODO(EA2D): not necessary with 2D EAs
        result = result.reshape(1, -1)
    return result

def copy(self):
values = self.asi8.copy()
return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,19 @@ def _concat_same_type(cls, to_concat):

return cls(data, sparse_index=sp_index, fill_value=fill_value)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
fill_values = [x.fill_value for x in to_concat if isinstance(x, cls)]
fill_value = fill_values[0]

# TODO: Fix join unit generation so we aren't passed this.
to_concat = [
x if isinstance(x, cls) else cls(x.squeeze(), fill_value=fill_value)
for x in to_concat
]

return cls._concat_same_type(to_concat)

def astype(self, dtype=None, copy=True):
"""
Change the dtype of a SparseArray.
Expand Down
173 changes: 12 additions & 161 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ def is_nonempty(x) -> bool:
_contains_datetime = any(typ.startswith("datetime") for typ in typs)
_contains_period = any(typ.startswith("period") for typ in typs)

from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And to make my suggestion more concrete: instead of importing Categorical here, it would be from pandas.core.arrays.categorical import _concat_arrays as concat_categorical (or whatever name we give it)

from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array

to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]

all_empty = not len(non_empties)
single_dtype = len({x.dtype for x in to_concat}) == 1
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
Expand All @@ -106,14 +111,15 @@ def is_nonempty(x) -> bool:
elif "category" in typs:
# this must be prior to concat_datetime,
# to support Categorical + datetime-like
return concat_categorical(to_concat, axis=axis)
return Categorical._concat_arrays(to_concat, axis=axis)

elif _contains_datetime or "timedelta" in typs or _contains_period:
return concat_datetime(to_concat, axis=axis, typs=typs)
obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
return type(obj)._concat_arrays(to_concat, axis=axis)

# these are mandated to handle empties as well
elif "sparse" in typs:
return _concat_sparse(to_concat, axis=axis, typs=typs)
return SparseArray._concat_arrays(to_concat, axis=axis)

elif any_ea and axis == 1:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
Expand All @@ -136,52 +142,6 @@ def is_nonempty(x) -> bool:
return np.concatenate(to_concat, axis=axis)


def concat_categorical(to_concat, axis: int = 0):
"""
Concatenate an object/categorical array of arrays, each of which is a
single dtype

Parameters
----------
to_concat : array of arrays
axis : int
Axis to provide concatenation in the current implementation this is
always 0, e.g. we only have 1D categoricals

Returns
-------
Categorical
A single array, preserving the combined dtypes
"""
# we could have object blocks and categoricals here
# if we only have a single categoricals then combine everything
# else its a non-compat categorical
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

# validate the categories
if len(categoricals) != len(to_concat):
pass
else:
# when all categories are identical
first = to_concat[0]
if all(first.is_dtype_equal(other) for other in to_concat[1:]):
return union_categoricals(categoricals)

# extract the categoricals & coerce to object if needed
to_concat = [
x._internal_get_values()
if is_categorical_dtype(x.dtype)
else np.asarray(x).ravel()
if not is_datetime64tz_dtype(x)
else np.asarray(x.astype(object))
for x in to_concat
]
result = concat_compat(to_concat)
if axis == 1:
result = result.reshape(1, len(result))
return result


def union_categoricals(
to_union, sort_categories: bool = False, ignore_order: bool = False
):
Expand Down Expand Up @@ -309,28 +269,10 @@ def _maybe_unwrap(x):
ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered

if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
codes = [first.codes] + [
recode_for_categories(other.codes, other.categories, first.categories)
for other in to_union[1:]
]
new_codes = np.concatenate(codes)
return Categorical._concat_same_dtype(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also move the full of union_categoricals do the categorical array module?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would be my preference too, but trying to keep the already-broad scope/diff limited

to_union, sort_categories=sort_categories, ignore_order=ignore_order,
)

if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)

from pandas.core.algorithms import take_1d

new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
Expand All @@ -354,94 +296,3 @@ def _maybe_unwrap(x):
ordered = False

return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)


def _concatenate_2d(to_concat, axis: int):
# coerce to 2d if needed & concatenate
if axis == 1:
to_concat = [np.atleast_2d(x) for x in to_concat]
return np.concatenate(to_concat, axis=axis)


def concat_datetime(to_concat, axis=0, typs=None):
    """
    Provide concatenation of a datetime-like array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if typs is None:
        typs = get_dtype_kinds(to_concat)

    # wrap raw datetime64/timedelta64 ndarrays in DTA/TDA so that
    # astype(object) below boxes values as Timestamp/Timedelta
    to_concat = [_wrap_datetimelike(x) for x in to_concat]
    single_dtype = len({x.dtype for x in to_concat}) == 1

    # multiple types, need to coerce to object
    if not single_dtype:
        # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta
        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)

    if axis == 1:
        # TODO(EA2D): kludge not necessary with 2D EAs
        to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat]

    result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)

    if result.ndim == 2 and is_extension_array_dtype(result.dtype):
        # TODO(EA2D): kludge not necessary with 2D EAs
        # 1D EAs concatenated at axis=1 come back as a single 2D "row";
        # unwrap back to the underlying 1D extension array
        assert result.shape[0] == 1
        result = result[0]
    return result


def _wrap_datetimelike(arr):
"""
Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.

DTA/TDA handle .astype(object) correctly.
"""
from pandas.core.construction import array as pd_array, extract_array

arr = extract_array(arr, extract_numpy=True)
if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
arr = pd_array(arr)
return arr


def _concat_sparse(to_concat, axis=0, typs=None):
"""
provide concatenation of an sparse/dense array of arrays each of which is a
single dtype

Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
typs : set of to_concat dtypes

Returns
-------
a single array, preserving the combined dtypes
"""
from pandas.core.arrays import SparseArray

fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
fill_value = fill_values[0]

# TODO: Fix join unit generation so we aren't passed this.
to_concat = [
x
if isinstance(x, SparseArray)
else SparseArray(x.squeeze(), fill_value=fill_value)
for x in to_concat
]

return SparseArray._concat_same_type(to_concat)