diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 16f0b9ee99909..351dc363c9550 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -788,6 +788,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. +- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 79070bbbfd11a..8735284617f31 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -45,6 +45,8 @@ import pandas.core.algorithms as algorithms +from pandas.core.sorting import nargsort + from pandas.io.formats import console from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs @@ -1605,32 +1607,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): msg = 'invalid na_position: {na_position!r}' raise ValueError(msg.format(na_position=na_position)) - codes = np.sort(self._codes) - if not ascending: - codes = codes[::-1] - - # NaN handling - na_mask = (codes == -1) - if na_mask.any(): - n_nans = len(codes[na_mask]) - if na_position == "first": - # in this case sort to the front - new_codes = codes.copy() - new_codes[0:n_nans] = -1 - new_codes[n_nans:] = codes[~na_mask] - codes = new_codes - elif na_position == "last": - # ... and to the end - new_codes = codes.copy() - pos = len(codes) - n_nans - new_codes[0:pos] = codes[~na_mask] - new_codes[pos:] = -1 - codes = new_codes + sorted_idx = nargsort(self, + ascending=ascending, + na_position=na_position) + if inplace: - self._codes = codes - return + self._codes = self._codes[sorted_idx] else: - return self._constructor(values=codes, dtype=self.dtype, + return self._constructor(values=self._codes[sorted_idx], + dtype=self.dtype, fastpath=True) def _values_for_rank(self): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5aa9ea658482b..ee1c62f3decf9 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): - return items.argsort(ascending=ascending, kind=kind) + if na_position not in {'first', 'last'}: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + mask = isna(items) + cnt_null = mask.sum() + sorted_idx = items.argsort(ascending=ascending, kind=kind) + if ascending and na_position == 'last': + # NaN is coded as -1 and is listed in front after sorting + sorted_idx = np.roll(sorted_idx, -cnt_null) + elif not ascending and na_position == 'first': + # NaN is coded as -1 and is listed in the end after sorting + sorted_idx = np.roll(sorted_idx, cnt_null) + return sorted_idx items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 599ae683f914b..41b11d9c15f35 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -10,7 +10,7 @@ from pandas.compat import lrange from pandas.api.types import CategoricalDtype from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range, NaT, IntervalIndex) + date_range, NaT, IntervalIndex, Categorical) from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -161,7 +161,7 @@ def test_sort_nan(self): 'B': [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5]) sorted_df = df.sort_values(['A', 'B'], ascending=[ - 1, 0], na_position='first') + 1, 0], na_position='first') assert_frame_equal(sorted_df, expected) # na_position='last', not order @@ -170,7 +170,7 @@ def test_sort_nan(self): 'B': [4, 5, 5, nan, 2, 9, 5]}, index=[5, 4, 6, 1, 3, 0, 2]) sorted_df = df.sort_values(['A', 'B'], ascending=[ - 0, 1], na_position='last') + 0, 1], na_position='last') assert_frame_equal(sorted_df, expected) # Test DataFrame with nan label @@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self): df = (DataFrame({'A': np.arange(6, dtype='int64'), 'B': Series(list('aabbca')) - .astype(CategoricalDtype(list('cab')))}) + .astype(CategoricalDtype(list('cab')))}) .set_index('B')) result = df.sort_index() @@ -598,3 +598,81 @@ def test_sort_index_intervalindex(self): closed='right') result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) + + def test_sort_index_na_position_with_categories(self): + # GH 22556 + # Positioning missing value properly when column is Categorical. + categories = ['A', 'B', 'C'] + category_indices = [0, 2, 4] + list_of_nans = [np.nan, np.nan] + na_indices = [1, 3] + na_position_first = 'first' + na_position_last = 'last' + column_name = 'c' + + reversed_categories = sorted(categories, reverse=True) + reversed_category_indices = sorted(category_indices, reverse=True) + reversed_na_indices = sorted(na_indices, reverse=True) + + df = pd.DataFrame({ + column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + categories=categories, + ordered=True)}) + # sort ascending with na first + result = df.sort_values(by=column_name, + ascending=True, + na_position=na_position_first) + expected = DataFrame({ + column_name: Categorical(list_of_nans + categories, + categories=categories, + ordered=True) + }, index=na_indices + category_indices) + + assert_frame_equal(result, expected) + + # sort ascending with na last + result = df.sort_values(by=column_name, + ascending=True, + na_position=na_position_last) + expected = DataFrame({ + column_name: Categorical(categories + list_of_nans, + categories=categories, + ordered=True) + }, index=category_indices + na_indices) + + assert_frame_equal(result, expected) + + # sort descending with na first + result = df.sort_values(by=column_name, + ascending=False, + na_position=na_position_first) + expected = DataFrame({ + column_name: Categorical(list_of_nans + reversed_categories, + categories=categories, + ordered=True) + }, index=reversed_na_indices + reversed_category_indices) + + assert_frame_equal(result, expected) + + # sort descending with na last + result = df.sort_values(by=column_name, + ascending=False, + na_position=na_position_last) + expected = DataFrame({ + column_name: Categorical(reversed_categories + list_of_nans, + categories=categories, + ordered=True) + }, index=reversed_category_indices + reversed_na_indices) + + assert_frame_equal(result, expected) + + def test_sort_index_na_position_with_categories_raises(self): + df = pd.DataFrame({ + 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + categories=['A', 'B', 'C'], + ordered=True)}) + + with pytest.raises(ValueError): + df.sort_values(by='c', + ascending=False, + na_position='bad_position') diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 303d3a3d8dbe9..7b9e23fca59aa 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -995,7 +995,7 @@ def test_categorical_sorting(self, file): parsed = read_stata(getattr(self, file)) # Sort based on codes, not strings - parsed = parsed.sort_values("srh") + parsed = parsed.sort_values("srh", na_position='first') # Don't sort index parsed.index = np.arange(parsed.shape[0])