From ae51cffe16ceceb38f870c0e744412af0faeef83 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Jan 2021 18:53:45 -0800 Subject: [PATCH 01/18] ENH: 2D support for MaskedArray --- pandas/core/array_algos/masked_reductions.py | 63 ++++++--- pandas/core/arrays/_mixins.py | 21 --- pandas/core/arrays/base.py | 19 +++ pandas/core/arrays/boolean.py | 23 ++-- pandas/core/arrays/floating.py | 16 +-- pandas/core/arrays/integer.py | 18 +-- pandas/core/arrays/masked.py | 89 +++++++++---- pandas/core/arrays/string_.py | 14 +- pandas/core/ops/mask_ops.py | 2 +- .../tests/arrays/boolean/test_construction.py | 8 +- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/dim2.py | 123 ++++++++++++++++++ pandas/tests/extension/test_boolean.py | 4 + pandas/tests/extension/test_datetime.py | 4 + pandas/tests/extension/test_floating.py | 4 + pandas/tests/extension/test_integer.py | 4 + pandas/tests/extension/test_numpy.py | 4 + pandas/tests/extension/test_period.py | 4 + pandas/tests/extension/test_string.py | 4 + 19 files changed, 327 insertions(+), 98 deletions(-) create mode 100644 pandas/tests/extension/base/dim2.py diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index ec0f2c61e0a29..d5163cc8f93c2 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -3,7 +3,7 @@ for missing values. """ -from typing import Callable +from typing import Callable, Optional import numpy as np @@ -20,6 +20,7 @@ def _sumprod( *, skipna: bool = True, min_count: int = 0, + axis: Optional[int] = None, ): """ Sum or product for 1D masked array. @@ -37,40 +38,58 @@ def _sumprod( min_count : int, default 0 The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + axis : int, optional, default None """ if not skipna: - if mask.any() or check_below_min_count(values.shape, None, min_count): + if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - return func(values) + return func(values, axis=axis) else: - if check_below_min_count(values.shape, mask, min_count): + if check_below_min_count(values.shape, mask, min_count) and ( + axis is None or values.ndim == 1 + ): return libmissing.NA if np_version_under1p17: - return func(values[~mask]) + return func(values[~mask], axis=axis) else: - return func(values, where=~mask) + return func(values, where=~mask, axis=axis) def sum( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + min_count: int = 0, + axis: Optional[int] = None, ): return _sumprod( - np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) def prod( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + min_count: int = 0, + axis: Optional[int] = None, ): return _sumprod( - np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) def _minmax( - func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True + func: Callable, + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + axis: Optional[int] = None, ): """ Reduction for 1D masked array. @@ -85,6 +104,7 @@ def _minmax( Boolean numpy array (True values indicate missing values). skipna : bool, default True Whether to skip NA. + axis : int, optional, default None """ if not skipna: if mask.any() or not values.size: @@ -101,14 +121,27 @@ def _minmax( return libmissing.NA -def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): - return _minmax(np.min, values=values, mask=mask, skipna=skipna) +def min( + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + axis: Optional[int] = None, +): + return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis) -def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): - return _minmax(np.max, values=values, mask=mask, skipna=skipna) +def max( + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + axis: Optional[int] = None, +): + return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis) +# TODO: axis kwarg def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): if not values.size or mask.all(): return libmissing.NA diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 62c594d73a227..f6d462fc9e67d 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -301,27 +301,6 @@ def _wrap_reduction_result(self, axis: Optional[int], result): return self._box_func(result) return self._from_backing_data(result) - # ------------------------------------------------------------------------ - - def __repr__(self) -> str: - if self.ndim == 1: - return super().__repr__() - - from pandas.io.formats.printing import format_object_summary - - # the short repr has no trailing newline, while the truncated - # repr does. So we include a newline in our template, and strip - # any trailing newlines from format_object_summary - lines = [ - format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( - ", \n" - ) - for x in self - ] - data = ",\n".join(lines) - class_name = f"<{type(self).__name__}>" - return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" - # ------------------------------------------------------------------------ # __array_function__ methods diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9a8b37e0785e0..693898b5b6d6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1097,6 +1097,9 @@ def view(self, dtype: Optional[Dtype] = None) -> ArrayLike: # ------------------------------------------------------------------------ def __repr__(self) -> str: + if self.ndim > 1: + return self._repr_2d() + from pandas.io.formats.printing import format_object_summary # the short repr has no trailing newline, while the truncated @@ -1108,6 +1111,22 @@ def __repr__(self) -> str: class_name = f"<{type(self).__name__}>\n" return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" + def _repr_2d(self) -> str: + from pandas.io.formats.printing import format_object_summary + + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + lines = [ + format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( + ", \n" + ) + for x in self + ] + data = ",\n".join(lines) + class_name = f"<{type(self).__name__}>" + return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: """ Formatting function for scalar values. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index bbbc0911b4846..b4915d0db89f5 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -193,10 +193,8 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if values.ndim != 1: - raise ValueError("values must be a 1D list-like") - if mask.ndim != 1: - raise ValueError("mask must be a 1D list-like") + if values.shape != mask.shape: + raise ValueError("values.shape and mask.shape must match") return values, mask @@ -411,9 +409,9 @@ def _values_for_argsort(self) -> np.ndarray: """ data = self._data.copy() data[self._mask] = -1 - return data + return data.ravel("K") - def any(self, *, skipna: bool = True, **kwargs): + def any(self, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs): """ Return whether any element is True. @@ -430,6 +428,7 @@ def any(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be True if there is at least one element that is True, otherwise NA will be returned if there are NA's present. + axis : int or None, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. @@ -472,16 +471,17 @@ def any(self, *, skipna: bool = True, **kwargs): values = self._data.copy() np.putmask(values, self._mask, False) - result = values.any() + result = values.any(axis=axis) + if skipna: return result else: - if result or len(self) == 0 or not self._mask.any(): + if result or self.size == 0 or not self._mask.any(): return result else: return self.dtype.na_value - def all(self, *, skipna: bool = True, **kwargs): + def all(self, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs): """ Return whether all elements are True. @@ -498,6 +498,7 @@ def all(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be False if there is at least one element that is False, otherwise NA will be returned if there are NA's present. + axis : int or None, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. @@ -538,12 +539,12 @@ def all(self, *, skipna: bool = True, **kwargs): values = self._data.copy() np.putmask(values, self._mask, True) - result = values.all() + result = values.all(axis=axis) if skipna: return result else: - if not result or len(self) == 0 or not self._mask.any(): + if not result or self.size == 0 or not self._mask.any(): return result else: return self.dtype.na_value diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1ac23d7893fbf..12d751a155e8e 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -394,21 +394,21 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) - def sum(self, *, skipna=True, min_count=0, **kwargs): + def sum(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs): nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count) + return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis) - def prod(self, *, skipna=True, min_count=0, **kwargs): + def prod(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs): nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count) + return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis) - def min(self, *, skipna=True, **kwargs): + def min(self, *, skipna=True, axis: Optional[int] = 0, **kwargs): nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna) + return super()._reduce("min", skipna=skipna, axis=axis) - def max(self, *, skipna=True, **kwargs): + def max(self, *, skipna=True, axis: Optional[int] = 0, **kwargs): nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna) + return super()._reduce("max", skipna=skipna, axis=axis) def _maybe_mask_result(self, result, mask, other, op_name: str): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f8378fb7d1500..d8bbe2a0ccc3e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -423,7 +423,7 @@ def _values_for_argsort(self) -> np.ndarray: data = self._data.copy() if self._mask.any(): data[self._mask] = data.min() - 1 - return data + return data.ravel("K") def _cmp_method(self, other, op): from pandas.core.arrays import BooleanArray @@ -470,21 +470,21 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) - def sum(self, *, skipna=True, min_count=0, **kwargs): + def sum(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs): nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count) + return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis) - def prod(self, *, skipna=True, min_count=0, **kwargs): + def prod(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs): nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count) + return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis) - def min(self, *, skipna=True, **kwargs): + def min(self, *, skipna=True, axis: Optional[int] = 0, **kwargs): nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna) + return super()._reduce("min", skipna=skipna, axis=axis) - def max(self, *, skipna=True, **kwargs): + def max(self, *, skipna=True, axis: Optional[int] = 0, **kwargs): nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna) + return super()._reduce("max", skipna=skipna, axis=axis) def _maybe_mask_result(self, result, mask, other, op_name: str): """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e4a98a54ee94c..7d252c465afec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,14 +5,14 @@ import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike, Dtype, NpDtype, Scalar +from pandas._typing import ArrayLike, Dtype, NpDtype, Scalar, Shape from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( + is_bool, is_dtype_equal, - is_integer, is_object_dtype, is_scalar, is_string_dtype, @@ -80,6 +80,8 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar + _data: np.ndarray + _mask: np.ndarray[Any, bool] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): # values is supposed to already be validated in the subclass @@ -88,10 +90,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "mask should be boolean numpy array. Use " "the 'pd.array' function instead" ) - if values.ndim != 1: - raise ValueError("values must be a 1D array") - if mask.ndim != 1: - raise ValueError("mask must be a 1D array") + if values.shape != mask.shape: + raise ValueError("values.shape must match mask.shape") if copy: values = values.copy() @@ -107,14 +107,16 @@ def dtype(self) -> BaseMaskedDtype: def __getitem__( self, item: Union[int, slice, np.ndarray] ) -> Union[BaseMaskedArray, Any]: - if is_integer(item): - if self._mask[item]: + item = check_array_indexer(self, item) + + newmask = self._mask[item] + if is_bool(newmask): + # This is a scalar indexing + if newmask: return self.dtype.na_value return self._data[item] - item = check_array_indexer(self, item) - - return type(self)(self._data[item], self._mask[item]) + return type(self)(self._data[item], newmask) def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]: raise AbstractMethodError(self) @@ -134,15 +136,42 @@ def __setitem__(self, key, value) -> None: self._mask[key] = mask def __iter__(self): - for i in range(len(self)): - if self._mask[i]: - yield self.dtype.na_value - else: - yield self._data[i] + if self.ndim == 1: + for i in range(len(self)): + if self._mask[i]: + yield self.dtype.na_value + else: + yield self._data[i] + else: + for i in range(len(self)): + yield self[i] def __len__(self) -> int: return len(self._data) + @property + def shape(self) -> Shape: + return self._data.shape + + @property + def ndim(self) -> int: + return self._data.ndim + + def reshape(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT: + data = self._data.reshape(*args, **kwargs) + mask = self._mask.reshape(*args, **kwargs) + return type(self)(data, mask) + + def ravel(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT: + # TODO: need to make sure we have the same order for data/mask + data = self._data.ravel(*args, **kwargs) + mask = self._mask.ravel(*args, **kwargs) + return type(self)(data, mask) + + @property + def T(self: BaseMaskedArrayT) -> BaseMaskedArrayT: + return type(self)(self._data.T, self._mask.T) + def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: return type(self)(~self._data, self._mask) @@ -295,10 +324,12 @@ def nbytes(self) -> int: @classmethod def _concat_same_type( - cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT] + cls: Type[BaseMaskedArrayT], + to_concat: Sequence[BaseMaskedArrayT], + axis: int = 0, ) -> BaseMaskedArrayT: - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) + data = np.concatenate([x._data for x in to_concat], axis=axis) + mask = np.concatenate([x._mask for x in to_concat], axis=axis) return cls(data, mask) def take( @@ -307,15 +338,22 @@ def take( *, allow_fill: bool = False, fill_value: Optional[Scalar] = None, + axis: int = 0, ) -> BaseMaskedArrayT: # we always fill with 1 internally # to avoid upcasting data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value result = take( - self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + self._data, + indexer, + fill_value=data_fill_value, + allow_fill=allow_fill, + axis=axis, ) - mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + mask = take( + self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis + ) # if we are filling # we only fill where the indexer is null @@ -339,7 +377,9 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array( + arr.ravel(), na_sentinel=na_sentinel, mask=mask.ravel() + ) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) @@ -368,7 +408,7 @@ def value_counts(self, dropna: bool = True) -> "Series": # compute counts on the data with no nans data = self._data[~self._mask] - value_counts = Index(data).value_counts() + value_counts = Index(data.ravel("K")).value_counts() # TODO(extension) # if we have allow Index to hold an ExtensionArray @@ -399,7 +439,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"sum", "prod", "min", "max", "mean"}: op = getattr(masked_reductions, name) - return op(data, mask, skipna=skipna, **kwargs) + result = op(data, mask, skipna=skipna, **kwargs) + return result # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3d0ac3380ec39..55b741e1ce919 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -197,7 +197,9 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if len(self._ndarray) and not lib.is_string_array( + self._ndarray.ravel("K"), skipna=True + ): raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( @@ -256,7 +258,7 @@ def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() arr[mask] = -1 - return arr, -1 + return arr.ravel("K"), -1 def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) @@ -316,9 +318,11 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs + ): if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) + return getattr(self, name)(skipna=skipna, axis=axis) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") @@ -339,7 +343,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna).astype("Int64") + return value_counts(self._ndarray.ravel("K"), dropna=dropna).astype("Int64") def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 8fb81faf313d7..0e7fe755d8916 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -173,6 +173,6 @@ def kleene_and( return result, mask -def raise_for_nan(value, method): +def raise_for_nan(value, method: str): if lib.is_float(value) and np.isnan(value): raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index c9e96c437964f..f080bf7e03412 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -27,10 +27,10 @@ def test_boolean_array_constructor(): with pytest.raises(TypeError, match="mask should be boolean numpy array"): BooleanArray(values, None) - with pytest.raises(ValueError, match="values must be a 1D array"): + with pytest.raises(ValueError, match="values.shape must match mask.shape"): BooleanArray(values.reshape(1, -1), mask) - with pytest.raises(ValueError, match="mask must be a 1D array"): + with pytest.raises(ValueError, match="values.shape must match mask.shape"): BooleanArray(values, mask.reshape(1, -1)) @@ -183,10 +183,10 @@ def test_coerce_to_array(): values = np.array([True, False, True, False], dtype="bool") mask = np.array([False, False, False, True], dtype="bool") - with pytest.raises(ValueError, match="values must be a 1D list-like"): + with pytest.raises(ValueError, match="values.shape and mask.shape must match"): coerce_to_array(values.reshape(1, -1)) - with pytest.raises(ValueError, match="mask must be a 1D list-like"): + with pytest.raises(ValueError, match="values.shape and mask.shape must match"): coerce_to_array(values, mask=mask.reshape(1, -1)) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 323cb843b2d74..51f9d05f697d1 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -43,6 +43,7 @@ class TestMyDtype(BaseDtypeTests): """ from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa +from .dim2 import Dim2CompatTests # noqa from .dtype import BaseDtypeTests # noqa from .getitem import BaseGetitemTests # noqa from .groupby import BaseGroupbyTests # noqa diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py new file mode 100644 index 0000000000000..647ae23a0d07b --- /dev/null +++ b/pandas/tests/extension/base/dim2.py @@ -0,0 +1,123 @@ +""" +Tests for 2D compatibility. +""" +import numpy as np +import pytest + +from .base import BaseExtensionTests + + +class Dim2CompatTests(BaseExtensionTests): + def test_take_2d(self, data): + arr2d = data.reshape(-1, 1) + + result = arr2d.take([0, 0, -1], axis=0) + + expected = data.take([0, 0, -1]).reshape(-1, 1) + self.assert_extension_array_equal(result, expected) + + def test_repr_2d(self, data): + # this could fail in a corner case where an element contained the name + res = repr(data.reshape(1, -1)) + assert res.count(f"<{type(data).__name__}") == 1 + + res = repr(data.reshape(-1, 1)) + assert res.count(f"<{type(data).__name__}") == 1 + + def test_reshape(self, data): + arr2d = data.reshape(-1, 1) + assert arr2d.shape == (data.size, 1) + assert len(arr2d) == len(data) + + arr2d = data.reshape((-1, 1)) + assert arr2d.shape == (data.size, 1) + assert len(arr2d) == len(data) + + with pytest.raises(ValueError): + data.reshape((data.size, 2)) + with pytest.raises(ValueError): + data.reshape(data.size, 2) + + def test_getitem_2d(self, data): + arr2d = data.reshape(1, -1) + + result = arr2d[0] + self.assert_extension_array_equal(result, data) + + with pytest.raises(IndexError): + arr2d[1] + + with pytest.raises(IndexError): + arr2d[-2] + + result = arr2d[:] + self.assert_extension_array_equal(result, arr2d) + + result = arr2d[:, :] + self.assert_extension_array_equal(result, arr2d) + + result = arr2d[:, 0] + expected = data[[0]] + self.assert_extension_array_equal(result, expected) + + # dimension-expanding getitem on 1D + result = data[:, np.newaxis] + self.assert_extension_array_equal(result, arr2d.T) + + def test_iter_2d(self, data): + arr2d = data.reshape(1, -1) + + objs = list(iter(arr2d)) + assert len(objs) == arr2d.shape[0] + + for obj in objs: + assert isinstance(obj, type(data)) + assert obj.dtype == data.dtype + assert obj.ndim == 1 + assert len(obj) == arr2d.shape[1] + + def test_concat_2d(self, data): + left = data.reshape(-1, 1) + right = left.copy() + + # axis=0 + result = left._concat_same_type([left, right], axis=0) + expected = data._concat_same_type([data, data]).reshape(-1, 1) + self.assert_extension_array_equal(result, expected) + + # axis=1 + result = left._concat_same_type([left, right], axis=1) + expected = data.repeat(2).reshape(-1, 2) + self.assert_extension_array_equal(result, expected) + + # axis > 1 -> invalid + with pytest.raises(ValueError): + left._concat_same_type([left, right], axis=2) + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis_none(self, data, method): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + + err_expected = None + err_result = None + try: + expected = getattr(data, method)() + except Exception as err: + # if the 1D reduction is invalid, the 2D reduction should be as well + err_expected = err + try: + result = getattr(arr2d, method)(axis=None) + except Exception as err2: + err_result = err2 + + else: + result = getattr(arr2d, method)(axis=None) + + if err_result is not None or err_expected is not None: + assert type(err_result) == type(err_expected) + return + + assert result == expected # TODO: or matching NA diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index ced7ea9261310..33b9471d651ff 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -391,3 +391,7 @@ class TestUnaryOps(base.BaseUnaryOpsTests): class TestParsing(base.BaseParsingTests): pass + + +class Test2DCompat(base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 0fde1e8a2fdb8..2d4f1a807b681 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -223,3 +223,7 @@ class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): pass + + +class Test2DCompat(BaseDatetimeTests, base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index c08c31e90fecc..7e08ad22bfe31 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -221,3 +221,7 @@ class TestPrinting(base.BasePrintingTests): class TestParsing(base.BaseParsingTests): pass + + +class Test2DCompat(base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 99a32203053c6..c249d066c0be6 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -255,3 +255,7 @@ class TestPrinting(base.BasePrintingTests): class TestParsing(base.BaseParsingTests): pass + + +class Test2DCompat(base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 29790d14f93cc..46691ff803bf6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -490,3 +490,7 @@ def test_setitem_loc_iloc_slice(self, data): @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): pass + + +class Test2DCompat(BaseNumPyTests, base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 817881e00fa99..86b53b08f7dcb 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -172,3 +172,7 @@ class TestParsing(BasePeriodTests, base.BaseParsingTests): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): super().test_EA_types(engine, data) + + +class Test2DCompat(BasePeriodTests, base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d49c4c5cf4889..53926fb00f472 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -149,3 +149,7 @@ class TestPrinting(base.BasePrintingTests): class TestGroupBy(base.BaseGroupbyTests): pass + + +class Test2DCompat(base.Dim2CompatTests): + pass From 125606bc6b61e27e74a0f8265857fb9485230eed Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 6 Jan 2021 08:15:38 -0800 Subject: [PATCH 02/18] remove Any part of _mask annotation --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7d252c465afec..dd14599452125 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -81,7 +81,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar _data: np.ndarray - _mask: np.ndarray[Any, bool] + _mask: np.ndarray def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): # values is supposed to already be validated in the subclass From dd5dbbec37194891dba1e66ddf7b970a2050ada9 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 6 Jan 2021 10:00:32 -0800 Subject: [PATCH 03/18] xfail for ArrowStringArray --- pandas/tests/extension/test_string.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 53926fb00f472..490a048532236 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -152,4 +152,7 @@ class TestGroupBy(base.BaseGroupbyTests): class Test2DCompat(base.Dim2CompatTests): - pass + @pytest.fixture(autouse=True) + def arrow_not_supported(self, data): + if isinstance(data.dtype, ArrowStringDtype): + pytest.xfail(reason="2D support not implemented for ArrowStringArray") From 17f63d46a7c62b681a59146c98a286a85e55f486 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Feb 2021 12:48:38 -0800 Subject: [PATCH 04/18] absolute import --- pandas/tests/extension/base/dim2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 647ae23a0d07b..48836a79fc038 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class Dim2CompatTests(BaseExtensionTests): From 3f14fa33e47477f891b753b20a3acbffa544a895 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Feb 2021 09:24:42 -0800 Subject: [PATCH 05/18] TST: reductions with axis --- pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/categorical.py | 31 +++++----- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/masked.py | 6 +- pandas/core/arrays/numeric.py | 12 ++++ pandas/core/arrays/string_.py | 4 +- pandas/core/indexes/extension.py | 4 +- pandas/tests/extension/base/dim2.py | 72 ++++++++++++++++++++++ pandas/tests/extension/test_categorical.py | 10 +++ 9 files changed, 119 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9b3bc6ff2dca8..1563e9c85dbee 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -421,7 +421,7 @@ def _values_for_argsort(self) -> np.ndarray: """ data = self._data.copy() data[self._mask] = -1 - return data.ravel("K") + return data def any(self, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs): """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index af78b84923a9c..24b083910961d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -60,11 +60,10 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray, ravel_compat from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array -from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -1299,6 +1298,7 @@ def _validate_fill_value(self, fill_value): # ------------------------------------------------------------- + @ravel_compat def __array__(self, dtype: Optional[NpDtype] = None) -> np.ndarray: """ The numpy array interface. @@ -1772,7 +1772,10 @@ def __iter__(self): """ Returns an Iterator over the values of this Categorical. """ - return iter(self._internal_get_values().tolist()) + if self.ndim == 1: + return iter(self._internal_get_values().tolist()) + else: + return (self[n] for n in range(len(self))) def __contains__(self, key) -> bool: """ @@ -1891,16 +1894,6 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - def __getitem__(self, key): - """ - Return an item. - """ - result = super().__getitem__(key) - if getattr(result, "ndim", 0) > 1: - result = result._ndarray - deprecate_ndim_indexing(result) - return result - def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) @@ -2158,7 +2151,17 @@ def _concat_same_type( ) -> CategoricalT: from pandas.core.dtypes.concat import union_categoricals - return union_categoricals(to_concat) + result = union_categoricals(to_concat) + first = to_concat[0] + if axis >= first.ndim: + raise ValueError + if axis == 1: + first = to_concat[0] + if not all(len(x) == len(first) for x in to_concat): + raise ValueError + # TODO: Will this get contiguity wrong? + result = result.reshape(-1, len(to_concat), order="F") + return result # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e89db0e83767a..6587f89d4e021 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -375,7 +375,7 @@ def _values_for_argsort(self) -> np.ndarray: data = self._data.copy() if self._mask.any(): data[self._mask] = data.min() - 1 - return data.ravel("K") + return data def _cmp_method(self, other, op): from pandas.core.arrays import BooleanArray diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6238f1a55038c..c084b4c08e596 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -391,9 +391,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array( - arr.ravel(), na_sentinel=na_sentinel, mask=mask.ravel() - ) + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) @@ -422,7 +420,7 @@ def value_counts(self, dropna: bool = True) -> Series: # compute counts on the data with no nans data = self._data[~self._mask] - value_counts = Index(data.ravel("K")).value_counts() + value_counts = Index(data).value_counts() # TODO(extension) # if we have allow Index to hold an ExtensionArray diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 69499bc7e4a77..d6c378de013ec 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -188,3 +188,15 @@ def reconstruct(x): return tuple(reconstruct(x) for x in result) else: return reconstruct(result) + + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + result = super()._reduce(name, skipna=skipna, **kwargs) + if isinstance(result, np.ndarray): + axis = kwargs["axis"] + if skipna: + # we only retain mask for all-NA rows/columns + mask = self._mask.all(axis=axis) + else: + mask = self._mask.any(axis=axis) + return type(self)(result, mask=mask) + return result diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5506de98058cc..0a53d707ee9cf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -260,7 +260,7 @@ def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() arr[mask] = -1 - return arr.ravel("K"), -1 + return arr, -1 def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) @@ -345,7 +345,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna: bool = True): from pandas import value_counts - return value_counts(self._ndarray.ravel("K"), dropna=dropna).astype("Int64") + return value_counts(self._ndarray, dropna=dropna).astype("Int64") def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index d17ac52e7cdd8..9c045adf29e3f 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -241,8 +241,8 @@ def __getitem__(self, key): return type(self)(result, name=self.name) # Unpack to ndarray for MPL compat # pandas\core\indexes\extension.py:220: error: "ExtensionArray" has - # no attribute "_data" [attr-defined] - result = result._data # type: ignore[attr-defined] + # no attribute "_ndarray" [attr-defined] + result = result._ndarray # type: ignore[attr-defined] # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 48836a79fc038..69c92b3f318bf 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import pandas as pd from pandas.tests.extension.base.base import BaseExtensionTests @@ -121,3 +122,74 @@ def test_reductions_2d_axis_none(self, data, method): return assert result == expected # TODO: or matching NA + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis0(self, data, method): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + + kwargs = {} + if method == "std": + # pass ddof=0 so we get all-zero std instead of all-NA std + kwargs["ddof"] = 0 + + try: + result = getattr(arr2d, method)(axis=0, **kwargs) + except Exception as err: + try: + getattr(data, method)() + except Exception as err2: + assert type(err) == type(err2) + return + else: + raise AssertionError("Both reductions should raise or neither") + + if method in ["mean", "median", "sum", "prod"]: + # std and var are not dtype-preserving + expected = data + if method in ["sum", "prod"] and data.dtype.kind in ["i", "u"]: + # FIXME: kludge + if data.dtype.kind == "i": + dtype = pd.Int64Dtype + else: + dtype = pd.UInt64Dtype + + expected = data.astype(dtype) + if type(expected) != type(data): + pytest.xfail(reason="IntegerArray.astype is broken GH#38983") + assert type(expected) == type(data), type(expected) + assert dtype == expected.dtype + + self.assert_extension_array_equal(result, expected) + elif method == "std": + self.assert_extension_array_equal(result, data - data) + # punt on method == "var" + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis1(self, data, method): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + + try: + result = getattr(arr2d, method)(axis=1) + except Exception as err: + try: + getattr(data, method)() + except Exception as err2: + assert type(err) == type(err2) + return + else: + raise AssertionError("Both reductions should raise or neither") + + # not necesarrily type/dtype-preserving, so weaker assertions + assert result.shape == (1,) + expected_scalar = getattr(data, method)() + if pd.isna(result[0]): + # TODO: require matching NA + assert pd.isna(expected_scalar), expected_scalar + else: + assert result[0] == expected_scalar diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 10e82a8c9bff1..83e8f958c88cd 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -292,3 +292,13 @@ def test_not_equal_with_na(self, categories): class TestParsing(base.BaseParsingTests): pass + + +class Test2DCompat(base.Dim2CompatTests): + def test_repr_2d(self, data): + # Categorical __repr__ doesnt include "Caegorical", so we need to special-case + res = repr(data.reshape(1, -1)) + assert res.count("\nCategories") == 1 + + res = repr(data.reshape(-1, 1)) + assert res.count("\nCategories") == 1 From 553038c58dc84543ece66cf2191e8c280a1d45b7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Feb 2021 12:36:01 -0800 Subject: [PATCH 06/18] np_version_under1p17 compat --- pandas/core/array_algos/masked_reductions.py | 5 +++- pandas/tests/extension/base/dim2.py | 27 +++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 9d6047b290280..5bd950bf52c46 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -52,7 +52,10 @@ def _sumprod( return libmissing.NA if np_version_under1p17: - return func(values[~mask], axis=axis) + if values.ndim == 1: + return func(values[~mask], axis=axis) + else: + raise NotImplementedError else: return func(values, where=~mask, axis=axis) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 69c92b3f318bf..9851df7ba29f5 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -4,10 +4,23 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p17 + import pandas as pd +from pandas.core.arrays import FloatingArray, IntegerArray from pandas.tests.extension.base.base import BaseExtensionTests +def maybe_xfail_masked_reductions(arr, request): + if ( + isinstance(arr, (FloatingArray, IntegerArray)) + and np_version_under1p17 + and arr.ndim == 2 + ): + mark = pytest.mark.xfail(reason="masked_reductions does not implement") + request.node.add_marker(mark) + + class Dim2CompatTests(BaseExtensionTests): def test_take_2d(self, data): arr2d = data.reshape(-1, 1) @@ -96,11 +109,12 @@ def test_concat_2d(self, data): left._concat_same_type([left, right], axis=2) @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis_none(self, data, method): + def test_reductions_2d_axis_none(self, data, method, request): if not hasattr(data, method): pytest.skip("test is not applicable for this type/dtype") arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) err_expected = None err_result = None @@ -124,11 +138,12 @@ def test_reductions_2d_axis_none(self, data, method): assert result == expected # TODO: or matching NA @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis0(self, data, method): + def test_reductions_2d_axis0(self, data, method, request): if not hasattr(data, method): pytest.skip("test is not applicable for this type/dtype") arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) kwargs = {} if method == "std": @@ -158,7 +173,10 @@ def test_reductions_2d_axis0(self, data, method): expected = data.astype(dtype) if type(expected) != type(data): - pytest.xfail(reason="IntegerArray.astype is broken GH#38983") + mark = pytest.mark.xfail( + reason="IntegerArray.astype is broken GH#38983" + ) + request.node.add_marker(mark) assert type(expected) == type(data), type(expected) assert dtype == expected.dtype @@ -168,11 +186,12 @@ def test_reductions_2d_axis0(self, data, method): # punt on method == "var" @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis1(self, data, method): + def test_reductions_2d_axis1(self, data, method, request): if not hasattr(data, method): pytest.skip("test is not applicable for this type/dtype") arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) try: result = getattr(arr2d, method)(axis=1) From 44999d1c09904412a191a62d6f872fe79a3ab3d5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Feb 2021 10:09:33 -0800 Subject: [PATCH 07/18] xfail syntax --- pandas/tests/extension/test_string.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 17595c78f742a..e57c0017c8ecf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -168,6 +168,9 @@ class TestGroupBy(base.BaseGroupbyTests): class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) - def arrow_not_supported(self, data): + def arrow_not_supported(self, data, request): if isinstance(data.dtype, ArrowStringDtype): - pytest.xfail(reason="2D support not implemented for ArrowStringArray") + mark = pytest.mark.xfail( + reason="2D support not implemented for ArrowStringArray" + ) + request.node.add_param(mark) From 7a6c22693d2283d059ed68f51f75a0d72bd94b00 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Feb 2021 13:06:08 -0800 Subject: [PATCH 08/18] typo fixup --- pandas/tests/extension/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e57c0017c8ecf..b8b11f24023fc 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -173,4 +173,4 @@ def arrow_not_supported(self, data, request): mark = pytest.mark.xfail( reason="2D support not implemented for ArrowStringArray" ) - request.node.add_param(mark) + request.node.add_marker(mark) From 6664d0d0d0fbd1cd055a50b7ef67690f07780c0a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Feb 2021 16:28:08 -0800 Subject: [PATCH 09/18] isort fixup --- pandas/core/array_algos/masked_reductions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 5bd950bf52c46..04f1672778ea4 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -3,7 +3,10 @@ for missing values. """ -from typing import Callable, Optional +from typing import ( + Callable, + Optional, +) import numpy as np From 6f26c4b78acaaab1ace094346edcc7e612d883cf Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 09:40:52 -0800 Subject: [PATCH 10/18] Fix pad/backfill 2d --- pandas/_libs/algos.pyx | 8 +++++--- pandas/core/arrays/categorical.py | 16 ++++++++-------- pandas/core/arrays/masked.py | 8 ++++---- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5783d3c2353aa..0b91a84e1d50a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -629,7 +629,7 @@ def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): +def pad_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K algos_t val @@ -648,10 +648,11 @@ def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): val = values[j, 0] for i in range(N): if mask[j, i]: - if fill_count >= lim: + if fill_count >= lim or i == 0: continue fill_count += 1 values[j, i] = val + mask[j, i] = False else: fill_count = 0 val = values[j, i] @@ -776,7 +777,7 @@ def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace(algos_t[:, :] values, - const uint8_t[:, :] mask, + uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K @@ -800,6 +801,7 @@ def backfill_2d_inplace(algos_t[:, :] values, continue fill_count += 1 values[j, i] = val + mask[j, i] = False else: fill_count = 0 val = values[j, i] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 10e38958c69bc..103dd25a8609f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -104,7 +104,6 @@ extract_array, sanitize_array, ) -from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -1776,13 +1775,9 @@ def fillna(self, value=None, method=None, limit=None): if method is not None: # pad / bfill - - # TODO: dispatch when self.categories is EA-dtype - values = np.asarray(self).reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None).astype( - self.categories.dtype - )[0] - codes = _get_codes_for_values(values, self.categories) + return NDArrayBackedExtensionArray.fillna( + self, None, method=method, limit=limit + ) else: # We copy even if there is nothing to fill @@ -2604,6 +2599,11 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray: """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) + if values.ndim > 1: + flat = values.ravel() + codes = _get_codes_for_values(flat, categories) + return codes.reshape(values.shape) + if is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ff0bdf352eb09..1476b5acf58ec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -170,13 +170,13 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) + func = missing.get_fill_func(method, ndim=self.ndim) new_values, new_mask = func( - self._data.copy(), + self._data.copy().T, limit=limit, - mask=mask.copy(), + mask=mask.copy().T, ) - return type(self)(new_values, new_mask.view(np.bool_)) + return type(self)(new_values.T, new_mask.view(np.bool_).T) else: # fill with value new_values = self.copy() From 34fda9782ad00ff2af7c1daf823c855a71b6f281 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 14:58:02 -0800 Subject: [PATCH 11/18] typo fixup --- pandas/tests/extension/test_categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index e77f7553f96f3..e70c3c7fa630e 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -300,7 +300,8 @@ class TestParsing(base.BaseParsingTests): class Test2DCompat(base.Dim2CompatTests): def test_repr_2d(self, data): - # Categorical __repr__ doesnt include "Caegorical", so we need to special-case + # Categorical __repr__ doesn't include "Categorical", so we need + # to special-case res = repr(data.reshape(1, -1)) assert res.count("\nCategories") == 1 From 4b751010653c355d1c3cec1e243d997d40e5476f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Apr 2021 09:26:18 -0700 Subject: [PATCH 12/18] comment --- pandas/core/arrays/categorical.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 807176cf33832..c0428e129d78c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2209,11 +2209,13 @@ def _concat_same_type( from pandas.core.dtypes.concat import union_categoricals result = union_categoricals(to_concat) + + # in case we are concatenating along axis != 0, we need to reshape + # the result from union_categoricals first = to_concat[0] if axis >= first.ndim: raise ValueError if axis == 1: - first = to_concat[0] if not all(len(x) == len(first) for x in to_concat): raise ValueError # TODO: Will this get contiguity wrong? From 21cf57845f80613352c35d20afa917a9c8612250 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 3 Aug 2021 21:56:25 -0700 Subject: [PATCH 13/18] fix broken tests --- pandas/core/arrays/categorical.py | 11 ----------- pandas/tests/extension/test_string.py | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4002e36578f3c..144ea76073cc1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -110,7 +110,6 @@ extract_array, sanitize_array, ) -from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -2010,16 +2009,6 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - def __getitem__(self, key): - """ - Return an item. - """ - result = super().__getitem__(key) - if getattr(result, "ndim", 0) > 1: - result = result._ndarray - deprecate_ndim_indexing(result) - return result - def _validate_listlike(self, value): # NB: here we assume scalar-like tuples have already been excluded value = extract_array(value, extract_numpy=True) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 909c91a5897a9..af86c359c4c00 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -19,9 +19,9 @@ import pytest import pandas as pd +from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base -from pandas.tests.extension.arrow.arrays import ArrowStringDtype def split_array(arr): @@ -192,7 +192,7 @@ class TestGroupBy(base.BaseGroupbyTests): class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) def arrow_not_supported(self, data, request): - if isinstance(data.dtype, ArrowStringDtype): + if isinstance(data, ArrowStringArray): mark = pytest.mark.xfail( reason="2D support not implemented for ArrowStringArray" ) From 3bfe60cca250a9c542ad67d258e4472a30d37583 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 6 Oct 2021 09:11:07 -0700 Subject: [PATCH 14/18] comment --- pandas/core/arrays/masked.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f5aca1c52fe44..0247cd717edec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -121,6 +121,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar + # our underlying data and mask are each ndarrays _data: np.ndarray _mask: np.ndarray From 5b014c19ea76474a24e940688d920d2c25d803ae Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Oct 2021 11:55:08 -0700 Subject: [PATCH 15/18] troubleshoot windows build --- pandas/tests/extension/base/dim2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index b80d2a3586b3b..2307fc3aeeab2 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -194,7 +194,10 @@ def test_reductions_2d_axis0(self, data, method, request): if method in ["sum", "prod"] and data.dtype.kind in ["i", "u"]: # FIXME: kludge if data.dtype.kind == "i": - dtype = pd.Int64Dtype() + if np.dtype(int).itemsize == 4: + dtype = pd.Int32Dtype() + else: + dtype = pd.Int64Dtype() else: dtype = pd.UInt64Dtype() From 15a533fd758f260ebd8113052f515092dbe2566b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Oct 2021 15:18:17 -0700 Subject: [PATCH 16/18] troubleshoot 32bit builds --- pandas/tests/extension/base/dim2.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 2307fc3aeeab2..ba0be3b3e4904 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -4,6 +4,11 @@ import numpy as np import pytest +from pandas.compat import ( + IS64, + is_platform_windows, +) + import pandas as pd from pandas.tests.extension.base.base import BaseExtensionTests @@ -194,12 +199,23 @@ def test_reductions_2d_axis0(self, data, method, request): if method in ["sum", "prod"] and data.dtype.kind in ["i", "u"]: # FIXME: kludge if data.dtype.kind == "i": - if np.dtype(int).itemsize == 4: - dtype = pd.Int32Dtype() + if is_platform_windows() or not IS64: + # FIXME: kludge for 32bit builds + if data.dtype == "i4": + dtype = pd.Int32Dtype() + else: + dtype = pd.Int64Dtype() else: dtype = pd.Int64Dtype() else: - dtype = pd.UInt64Dtype() + if is_platform_windows() or not IS64: + # FIXME: kludge for 32bit builds + if data.dtype == "u4": + dtype = pd.UInt32Dtype() + else: + dtype = pd.UInt64Dtype() + else: + dtype = pd.UInt64Dtype() expected = data.astype(dtype) assert type(expected) == type(data), type(expected) From 7a7601e418649ea402c306d798b3244a6b653fde Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Oct 2021 18:34:29 -0700 Subject: [PATCH 17/18] troubleshoot 32bit builds --- pandas/tests/extension/base/dim2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index ba0be3b3e4904..b7d1096aa2bf6 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -201,7 +201,7 @@ def test_reductions_2d_axis0(self, data, method, request): if data.dtype.kind == "i": if is_platform_windows() or not IS64: # FIXME: kludge for 32bit builds - if data.dtype == "i4": + if result.dtype == "i4": dtype = pd.Int32Dtype() else: dtype = pd.Int64Dtype() @@ -210,7 +210,7 @@ def test_reductions_2d_axis0(self, data, method, request): else: if is_platform_windows() or not IS64: # FIXME: kludge for 32bit builds - if data.dtype == "u4": + if result.dtype == "u4": dtype = pd.UInt32Dtype() else: dtype = pd.UInt64Dtype() From 7c6baaf0981faf3857d1fd2f458c0f6d3b855745 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 14 Oct 2021 07:25:00 -0700 Subject: [PATCH 18/18] troubleshoot 32 bit builds --- pandas/tests/extension/base/dim2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index b7d1096aa2bf6..b4a817cbc37ec 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -201,7 +201,7 @@ def test_reductions_2d_axis0(self, data, method, request): if data.dtype.kind == "i": if is_platform_windows() or not IS64: # FIXME: kludge for 32bit builds - if result.dtype == "i4": + if result.dtype.itemsize == 4: dtype = pd.Int32Dtype() else: dtype = pd.Int64Dtype() @@ -210,7 +210,7 @@ def test_reductions_2d_axis0(self, data, method, request): else: if is_platform_windows() or not IS64: # FIXME: kludge for 32bit builds - if result.dtype == "u4": + if result.dtype.itemsize == 4: dtype = pd.UInt32Dtype() else: dtype = pd.UInt64Dtype()