From d119a001d1ad986af60f32e06e9a0f4d30588bd3 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 8 Sep 2018 12:28:58 -0500 Subject: [PATCH 01/14] fix na_position when sort_values by categorical values --- pandas/core/sorting.py | 14 ++++++++- .../frame/test_sort_values_level_as_str.py | 29 ++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5aa9ea658482b..07d54f387e4d1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): - return items.argsort(ascending=ascending, kind=kind) + mask = items._codes == -1 + idx = np.arange(len(items)) + null_idx = np.nonzero(mask)[0] + sorted_idx = items[~mask].argsort(ascending=ascending, kind=kind) + if na_position == 'first': + idx[~mask] = sorted_idx+len(null_idx) + idx[mask] = np.arange(len(null_idx)) + elif na_position == 'last': + idx[~mask] = sorted_idx + idx[mask] = np.arange(len(null_idx)) + len(sorted_idx) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + return idx items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 2653cc77b27a4..f857c18520932 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame +from pandas import DataFrame, Categorical from pandas.errors import PerformanceWarning from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -16,6 +16,14 @@ def df_none(): ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']}) +@pytest.fixture +def df_category_with_nan(): + return DataFrame({ + 'c': Categorical(['A', np.nan, 'B'], + categories=['A', 'B'], + ordered=True)}) + + @pytest.fixture(params=[ ['outer'], ['outer', 'inner'] @@ -93,3 +101,22 @@ def test_sort_column_level_and_index_label( assert_frame_equal(result, expected) else: assert_frame_equal(result, expected) + + +def test_sort_index_na_position_with_categories( + df_category_with_nan, sort_names): + result = df_category_with_nan.sort_values(by='c', na_position='first') + expected = DataFrame({ + 'c': Categorical([np.nan, 'A', 'B'], + categories=['A', 'B'], + ordered=True)}) + + assert_frame_equal(result, expected) + + result = df_category_with_nan.sort_values(by='c', na_position='last') + expected = DataFrame({ + 'c': Categorical(['A', 'B', np.nan], + categories=['A', 'B'], + ordered=True)}) + + assert_frame_equal(result, expected) \ No newline at end of file From f36346b517d0704edfc2d21cdfcb2112f4e0c06c Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 8 Sep 2018 12:40:02 -0500 Subject: [PATCH 02/14] move test to test_sort.py and fix index --- .../frame/test_sort_values_level_as_str.py | 29 +------------------ pandas/tests/frame/test_sorting.py | 23 ++++++++++++++- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index f857c18520932..2653cc77b27a4 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Categorical +from pandas import DataFrame from pandas.errors import PerformanceWarning from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -16,14 +16,6 @@ def df_none(): ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']}) -@pytest.fixture -def df_category_with_nan(): - return DataFrame({ - 'c': Categorical(['A', np.nan, 'B'], - categories=['A', 'B'], - ordered=True)}) - - @pytest.fixture(params=[ ['outer'], ['outer', 'inner'] @@ -101,22 +93,3 @@ def test_sort_column_level_and_index_label( assert_frame_equal(result, expected) else: assert_frame_equal(result, expected) - - -def test_sort_index_na_position_with_categories( - df_category_with_nan, sort_names): - result = df_category_with_nan.sort_values(by='c', na_position='first') - expected = DataFrame({ - 'c': Categorical([np.nan, 'A', 'B'], - categories=['A', 'B'], - ordered=True)}) - - assert_frame_equal(result, expected) - - result = df_category_with_nan.sort_values(by='c', na_position='last') - expected = DataFrame({ - 'c': Categorical(['A', 'B', np.nan], - categories=['A', 'B'], - ordered=True)}) - - assert_frame_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 599ae683f914b..3277c19b63117 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -10,7 +10,7 @@ from pandas.compat import lrange from pandas.api.types import CategoricalDtype from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range, NaT, IntervalIndex) + date_range, NaT, IntervalIndex, Categorical) from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -598,3 +598,24 @@ def test_sort_index_intervalindex(self): closed='right') result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) + + + def test_sort_index_na_position_with_categories(self): + df_category_with_nan = pd.DataFrame({'c': pd.Categorical(['A', np.nan, 'B'], + categories=['A', 'B'], + ordered=True)}) + result = df_category_with_nan.sort_values(by='c', na_position='first') + expected = DataFrame({ + 'c': Categorical([np.nan, 'A', 'B'], + categories=['A', 'B'], + ordered=True)}, index=[1, 0, 2]) + + assert_frame_equal(result, expected) + + result = df_category_with_nan.sort_values(by='c', na_position='last') + expected = DataFrame({ + 'c': Categorical(['A', 'B', np.nan], + categories=['A', 'B'], + ordered=True)}, index=[0, 2, 1]) + + assert_frame_equal(result, expected) \ No newline at end of file From 1e269890f89c22f9ce9d81803bb3c7c3b96ec39a Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 8 Sep 2018 15:34:46 -0500 Subject: [PATCH 03/14] using pandas.isna to allow CategoricalIndex calling nargsort --- pandas/core/sorting.py | 22 ++++++++++------- pandas/tests/frame/test_sorting.py | 38 ++++++++++++++++++++++-------- pandas/tests/io/test_stata.py | 2 +- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 07d54f387e4d1..b1ba8ae21f3aa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -241,19 +241,23 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): - mask = items._codes == -1 - idx = np.arange(len(items)) - null_idx = np.nonzero(mask)[0] - sorted_idx = items[~mask].argsort(ascending=ascending, kind=kind) + mask = isna(items) + null_idx = np.where(mask)[0] + sorted_idx = items.argsort(ascending=ascending, kind=kind) + if ascending: + # NaN is coded as -1 and is listed in front after sorting + non_null_sorted_idx = sorted_idx[len(null_idx):] + null_sorted_idx = sorted_idx[:len(null_idx)] + else: + # NaN is coded as -1 and is listed in the end after sorting + non_null_sorted_idx = sorted_idx[:-len(null_idx)] + null_sorted_idx = sorted_idx[-len(null_idx):] if na_position == 'first': - idx[~mask] = sorted_idx+len(null_idx) - idx[mask] = np.arange(len(null_idx)) + return np.concatenate([null_sorted_idx, non_null_sorted_idx]) elif na_position == 'last': - idx[~mask] = sorted_idx - idx[mask] = np.arange(len(null_idx)) + len(sorted_idx) + return np.concatenate([non_null_sorted_idx, null_sorted_idx]) else: raise ValueError('invalid na_position: {!r}'.format(na_position)) - return idx items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 3277c19b63117..6d2ab861c12fa 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -601,21 +601,39 @@ def test_sort_index_intervalindex(self): def test_sort_index_na_position_with_categories(self): - df_category_with_nan = pd.DataFrame({'c': pd.Categorical(['A', np.nan, 'B'], - categories=['A', 'B'], + # GH 22556 + # Positioning missing value properly when column is Categorical. + df_category_with_nan = pd.DataFrame({'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + categories=['A', 'B', 'C'], ordered=True)}) - result = df_category_with_nan.sort_values(by='c', na_position='first') + result = df_category_with_nan.sort_values(by='c', ascending=True, na_position='first') expected = DataFrame({ - 'c': Categorical([np.nan, 'A', 'B'], - categories=['A', 'B'], - ordered=True)}, index=[1, 0, 2]) + 'c': Categorical([np.nan, np.nan, 'A', 'B', 'C'], + categories=['A', 'B', 'C'], + ordered=True)}, index=[1, 3, 0, 2, 4]) assert_frame_equal(result, expected) - result = df_category_with_nan.sort_values(by='c', na_position='last') + result = df_category_with_nan.sort_values(by='c', ascending=True, na_position='last') expected = DataFrame({ - 'c': Categorical(['A', 'B', np.nan], - categories=['A', 'B'], - ordered=True)}, index=[0, 2, 1]) + 'c': Categorical(['A', 'B', 'C', np.nan, np.nan], + categories=['A', 'B', 'C'], + ordered=True)}, index=[0, 2, 4, 1, 3]) + + assert_frame_equal(result, expected) + + result = df_category_with_nan.sort_values(by='c', ascending=False, na_position='first') + expected = DataFrame({ + 'c': Categorical([np.nan, np.nan, 'C', 'B', 'A'], + categories=['A', 'B', 'C'], + ordered=True)}, index=[3, 1, 4, 2, 0]) + + assert_frame_equal(result, expected) + + result = df_category_with_nan.sort_values(by='c', ascending=False, na_position='last') + expected = DataFrame({ + 'c': Categorical(['C', 'B', 'A', np.nan, np.nan], + categories=['A', 'B', 'C'], + ordered=True)}, index=[4, 2, 0, 3, 1]) assert_frame_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index cfe47cae7e5e1..83b5290d82890 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -997,7 +997,7 @@ def test_categorical_sorting(self, file): parsed = read_stata(getattr(self, file)) # Sort based on codes, not strings - parsed = parsed.sort_values("srh") + parsed = parsed.sort_values("srh", na_position='first') # Don't sort index parsed.index = np.arange(parsed.shape[0]) From dc4b4c49f1cfce36ffeb288ac59167bf3164c630 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 8 Sep 2018 15:40:42 -0500 Subject: [PATCH 04/14] linter fix in test_sorting.py --- pandas/tests/frame/test_sorting.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 6d2ab861c12fa..15e08985f9205 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -599,14 +599,16 @@ def test_sort_index_intervalindex(self): result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) - def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning missing value properly when column is Categorical. - df_category_with_nan = pd.DataFrame({'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=['A', 'B', 'C'], - ordered=True)}) - result = df_category_with_nan.sort_values(by='c', ascending=True, na_position='first') + df_category_with_nan = pd.DataFrame({ + 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + categories=['A', 'B', 'C'], + ordered=True)}) + result = df_category_with_nan.sort_values(by='c', + ascending=True, + na_position='first') expected = DataFrame({ 'c': Categorical([np.nan, np.nan, 'A', 'B', 'C'], categories=['A', 'B', 'C'], @@ -614,7 +616,9 @@ def test_sort_index_na_position_with_categories(self): assert_frame_equal(result, expected) - result = df_category_with_nan.sort_values(by='c', ascending=True, na_position='last') + result = df_category_with_nan.sort_values(by='c', + ascending=True, + na_position='last') expected = DataFrame({ 'c': Categorical(['A', 'B', 'C', np.nan, np.nan], categories=['A', 'B', 'C'], @@ -622,7 +626,9 @@ def test_sort_index_na_position_with_categories(self): assert_frame_equal(result, expected) - result = df_category_with_nan.sort_values(by='c', ascending=False, na_position='first') + result = df_category_with_nan.sort_values(by='c', + ascending=False, + na_position='first') expected = DataFrame({ 'c': Categorical([np.nan, np.nan, 'C', 'B', 'A'], categories=['A', 'B', 'C'], @@ -630,10 +636,12 @@ def test_sort_index_na_position_with_categories(self): assert_frame_equal(result, expected) - result = df_category_with_nan.sort_values(by='c', ascending=False, na_position='last') + result = df_category_with_nan.sort_values(by='c', + ascending=False, + na_position='last') expected = DataFrame({ 'c': Categorical(['C', 'B', 'A', np.nan, np.nan], categories=['A', 'B', 'C'], ordered=True)}, index=[4, 2, 0, 3, 1]) - assert_frame_equal(result, expected) \ No newline at end of file + assert_frame_equal(result, expected) From a4ee75f8192ecd8d190ba890a2a73f0cbd1d0347 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 8 Sep 2018 15:49:25 -0500 Subject: [PATCH 05/14] add to Whatsnew --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fb7af00f61534..f02bd1557ef34 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -608,6 +608,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in `codes` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of `.from_codes([1.1, 2.0])`. +- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ```na_position`` value. (:issue:`22556`). `test_stata.py` was incorrectly passing using default ``na_position='last'``. Datetimelike ^^^^^^^^^^^^ From 2b6f0b080b9257e19b1773854a906dacde10080f Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sun, 9 Sep 2018 10:04:52 -0500 Subject: [PATCH 06/14] test ValueError exception and use np.roll to do the rotate --- pandas/core/sorting.py | 16 +++++----------- pandas/tests/frame/test_sorting.py | 5 +++++ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b1ba8ae21f3aa..ba49b8b462e65 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -241,23 +241,17 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): + if na_position not in ['first', 'last']: + raise ValueError('invalid na_position: {!r}'.format(na_position)) mask = isna(items) - null_idx = np.where(mask)[0] + cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) if ascending: # NaN is coded as -1 and is listed in front after sorting - non_null_sorted_idx = sorted_idx[len(null_idx):] - null_sorted_idx = sorted_idx[:len(null_idx)] + return sorted_idx if na_position == 'first' else np.roll(sorted_idx, -cnt_null) else: # NaN is coded as -1 and is listed in the end after sorting - non_null_sorted_idx = sorted_idx[:-len(null_idx)] - null_sorted_idx = sorted_idx[-len(null_idx):] - if na_position == 'first': - return np.concatenate([null_sorted_idx, non_null_sorted_idx]) - elif na_position == 'last': - return np.concatenate([non_null_sorted_idx, null_sorted_idx]) - else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + return sorted_idx if na_position == 'last' else np.roll(sorted_idx, cnt_null) items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 15e08985f9205..c02a8c6dc5819 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -645,3 +645,8 @@ def test_sort_index_na_position_with_categories(self): ordered=True)}, index=[4, 2, 0, 3, 1]) assert_frame_equal(result, expected) + + with pytest.raises(ValueError) as err: + df_category_with_nan.sort_values(by='c', + ascending=False, + na_position='bad_position') From 229fff57137324d5723f9c92497ee30944431ad5 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sun, 9 Sep 2018 10:09:58 -0500 Subject: [PATCH 07/14] flake8 linter fix --- pandas/core/sorting.py | 6 ++++-- pandas/tests/frame/test_sorting.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ba49b8b462e65..49552788ed930 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -248,10 +248,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): sorted_idx = items.argsort(ascending=ascending, kind=kind) if ascending: # NaN is coded as -1 and is listed in front after sorting - return sorted_idx if na_position == 'first' else np.roll(sorted_idx, -cnt_null) + return sorted_idx if na_position == 'first' \ + else np.roll(sorted_idx, -cnt_null) else: # NaN is coded as -1 and is listed in the end after sorting - return sorted_idx if na_position == 'last' else np.roll(sorted_idx, cnt_null) + return sorted_idx if na_position == 'last' else \ + np.roll(sorted_idx, cnt_null) items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index c02a8c6dc5819..434a74b6109bd 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -646,7 +646,7 @@ def test_sort_index_na_position_with_categories(self): assert_frame_equal(result, expected) - with pytest.raises(ValueError) as err: + with pytest.raises(ValueError): df_category_with_nan.sort_values(by='c', ascending=False, na_position='bad_position') From fbc8a4e3c6467140a173f2f0f5690cd1bbd667d1 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Mon, 10 Sep 2018 22:04:59 -0500 Subject: [PATCH 08/14] address comments --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/sorting.py | 11 +++--- pandas/tests/frame/test_sorting.py | 54 ++++++++++++++++++------------ 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index f02bd1557ef34..ff0aa63912bfb 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -608,7 +608,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in `codes` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of `.from_codes([1.1, 2.0])`. -- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ```na_position`` value. (:issue:`22556`). `test_stata.py` was incorrectly passing using default ``na_position='last'``. +- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). `test_stata.py` was incorrectly passing using default ``na_position='last'``. Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 49552788ed930..2eb5557d1f74c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -246,14 +246,13 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) - if ascending: + if ascending and na_position == 'last': # NaN is coded as -1 and is listed in front after sorting - return sorted_idx if na_position == 'first' \ - else np.roll(sorted_idx, -cnt_null) - else: + sorted_idx = np.roll(sorted_idx, -cnt_null) + elif not ascending and na_position == 'first': # NaN is coded as -1 and is listed in the end after sorting - return sorted_idx if na_position == 'last' else \ - np.roll(sorted_idx, cnt_null) + sorted_idx = np.roll(sorted_idx, cnt_null) + return sorted_idx items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 434a74b6109bd..6571435fbab94 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -602,51 +602,61 @@ def test_sort_index_intervalindex(self): def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning missing value properly when column is Categorical. - df_category_with_nan = pd.DataFrame({ + df = pd.DataFrame({ 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], categories=['A', 'B', 'C'], ordered=True)}) - result = df_category_with_nan.sort_values(by='c', - ascending=True, - na_position='first') - expected = DataFrame({ + result_ascending_na_first = df.sort_values(by='c', + ascending=True, + na_position='first') + expected_ascending_na_first = DataFrame({ 'c': Categorical([np.nan, np.nan, 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)}, index=[1, 3, 0, 2, 4]) - assert_frame_equal(result, expected) + assert_frame_equal(result_ascending_na_first, + expected_ascending_na_first) - result = df_category_with_nan.sort_values(by='c', + result_ascending_na_last = df.sort_values(by='c', ascending=True, na_position='last') - expected = DataFrame({ + expected_ascending_na_last = DataFrame({ 'c': Categorical(['A', 'B', 'C', np.nan, np.nan], categories=['A', 'B', 'C'], ordered=True)}, index=[0, 2, 4, 1, 3]) - assert_frame_equal(result, expected) + assert_frame_equal(result_ascending_na_last, + expected_ascending_na_last) - result = df_category_with_nan.sort_values(by='c', - ascending=False, - na_position='first') - expected = DataFrame({ + result_descending_na_first = df.sort_values(by='c', + ascending=False, + na_position='first') + expected_descending_na_first = DataFrame({ 'c': Categorical([np.nan, np.nan, 'C', 'B', 'A'], categories=['A', 'B', 'C'], ordered=True)}, index=[3, 1, 4, 2, 0]) - assert_frame_equal(result, expected) + assert_frame_equal(result_descending_na_first, + expected_descending_na_first) - result = df_category_with_nan.sort_values(by='c', - ascending=False, - na_position='last') - expected = DataFrame({ + result_descending_na_last = df.sort_values(by='c', + ascending=False, + na_position='last') + expected_descending_na_last = DataFrame({ 'c': Categorical(['C', 'B', 'A', np.nan, np.nan], categories=['A', 'B', 'C'], ordered=True)}, index=[4, 2, 0, 3, 1]) - assert_frame_equal(result, expected) + assert_frame_equal(result_descending_na_last, + expected_descending_na_last) + + def test_sort_index_na_position_with_categories_raises(self): + df = pd.DataFrame({ + 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + categories=['A', 'B', 'C'], + ordered=True)}) with pytest.raises(ValueError): - df_category_with_nan.sort_values(by='c', - ascending=False, - na_position='bad_position') + df.sort_values(by='c', + ascending=False, + na_position='bad_position') From 1999d58cda2d0617f536344f14c4b585a52b3e5b Mon Sep 17 00:00:00 2001 From: Weiwen Date: Wed, 26 Sep 2018 18:36:51 -0500 Subject: [PATCH 09/14] call narg in Categorical.sort_values and address linter --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/arrays/categorical.py | 33 ++++++++---------------------- pandas/core/sorting.py | 1 + pandas/tests/frame/test_sorting.py | 20 ++++++++++-------- 4 files changed, 22 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5a5fed88681a2..7b6a23fc4121a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -613,7 +613,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). `test_stata.py` was incorrectly passing using default ``na_position='last'``. +- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 63a1dacb47abb..375fd9f4267d2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -45,6 +45,8 @@ import pandas.core.algorithms as algorithms +from pandas.core.sorting import nargsort + from pandas.io.formats import console from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs @@ -1605,32 +1607,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): msg = 'invalid na_position: {na_position!r}' raise ValueError(msg.format(na_position=na_position)) - codes = np.sort(self._codes) - if not ascending: - codes = codes[::-1] - - # NaN handling - na_mask = (codes == -1) - if na_mask.any(): - n_nans = len(codes[na_mask]) - if na_position == "first": - # in this case sort to the front - new_codes = codes.copy() - new_codes[0:n_nans] = -1 - new_codes[n_nans:] = codes[~na_mask] - codes = new_codes - elif na_position == "last": - # ... and to the end - new_codes = codes.copy() - pos = len(codes) - n_nans - new_codes[0:pos] = codes[~na_mask] - new_codes[pos:] = -1 - codes = new_codes + sorted_idx = nargsort(self, + ascending=ascending, + na_position=na_position) + if inplace: - self._codes = codes - return + self._codes = self._codes[sorted_idx] else: - return self._constructor(values=codes, dtype=self.dtype, + return self._constructor(values=self._codes[sorted_idx], + dtype=self.dtype, fastpath=True) def _values_for_rank(self): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2eb5557d1f74c..932ad888bb3ae 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -243,6 +243,7 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): if is_categorical_dtype(items): if na_position not in ['first', 'last']: raise ValueError('invalid na_position: {!r}'.format(na_position)) + mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 6571435fbab94..4208e6106cf68 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -602,16 +602,18 @@ def test_sort_index_intervalindex(self): def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning missing value properly when column is Categorical. + categories = ['A', 'B', 'C'] + list_of_nans = [np.nan, np.nan] df = pd.DataFrame({ 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=['A', 'B', 'C'], + categories=categories, ordered=True)}) result_ascending_na_first = df.sort_values(by='c', ascending=True, na_position='first') expected_ascending_na_first = DataFrame({ - 'c': Categorical([np.nan, np.nan, 'A', 'B', 'C'], - categories=['A', 'B', 'C'], + 'c': Categorical(list_of_nans + categories, + categories=categories, ordered=True)}, index=[1, 3, 0, 2, 4]) assert_frame_equal(result_ascending_na_first, @@ -621,8 +623,8 @@ def test_sort_index_na_position_with_categories(self): ascending=True, na_position='last') expected_ascending_na_last = DataFrame({ - 'c': Categorical(['A', 'B', 'C', np.nan, np.nan], - categories=['A', 'B', 'C'], + 'c': Categorical(categories + list_of_nans, + categories=categories, ordered=True)}, index=[0, 2, 4, 1, 3]) assert_frame_equal(result_ascending_na_last, @@ -632,8 +634,8 @@ def test_sort_index_na_position_with_categories(self): ascending=False, na_position='first') expected_descending_na_first = DataFrame({ - 'c': Categorical([np.nan, np.nan, 'C', 'B', 'A'], - categories=['A', 'B', 'C'], + 'c': Categorical(list_of_nans + sorted(categories, reverse=True), + categories=categories, ordered=True)}, index=[3, 1, 4, 2, 0]) assert_frame_equal(result_descending_na_first, @@ -643,8 +645,8 @@ def test_sort_index_na_position_with_categories(self): ascending=False, na_position='last') expected_descending_na_last = DataFrame({ - 'c': Categorical(['C', 'B', 'A', np.nan, np.nan], - categories=['A', 'B', 'C'], + 'c': Categorical(sorted(categories, reverse=True) + list_of_nans, + categories=categories, ordered=True)}, index=[4, 2, 0, 3, 1]) assert_frame_equal(result_descending_na_last, From 8943280372204fcd52258e961e28338d6653db51 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sun, 7 Oct 2018 21:27:08 -0500 Subject: [PATCH 10/14] comment rest --- pandas/tests/frame/test_sorting.py | 52 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 4208e6106cf68..069b6ef3ef205 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -608,49 +608,53 @@ def test_sort_index_na_position_with_categories(self): 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], categories=categories, ordered=True)}) - result_ascending_na_first = df.sort_values(by='c', - ascending=True, - na_position='first') - expected_ascending_na_first = DataFrame({ + # sort ascending with na first + result = df.sort_values(by='c', + ascending=True, + na_position='first') + expected = DataFrame({ 'c': Categorical(list_of_nans + categories, categories=categories, ordered=True)}, index=[1, 3, 0, 2, 4]) - assert_frame_equal(result_ascending_na_first, - expected_ascending_na_first) + assert_frame_equal(result, + expected) - result_ascending_na_last = df.sort_values(by='c', - ascending=True, - na_position='last') - expected_ascending_na_last = DataFrame({ + # sort ascending with na last + result = df.sort_values(by='c', + ascending=True, + na_position='last') + expected = DataFrame({ 'c': Categorical(categories + list_of_nans, categories=categories, ordered=True)}, index=[0, 2, 4, 1, 3]) - assert_frame_equal(result_ascending_na_last, - expected_ascending_na_last) + assert_frame_equal(result, + expected) - result_descending_na_first = df.sort_values(by='c', - ascending=False, - na_position='first') - expected_descending_na_first = DataFrame({ + # sort descending with na first + result = df.sort_values(by='c', + ascending=False, + na_position='first') + expected = DataFrame({ 'c': Categorical(list_of_nans + sorted(categories, reverse=True), categories=categories, ordered=True)}, index=[3, 1, 4, 2, 0]) - assert_frame_equal(result_descending_na_first, - expected_descending_na_first) + assert_frame_equal(result, + expected) - result_descending_na_last = df.sort_values(by='c', - ascending=False, - na_position='last') - expected_descending_na_last = DataFrame({ + # sort descending with na last + result = df.sort_values(by='c', + ascending=False, + na_position='last') + expected = DataFrame({ 'c': Categorical(sorted(categories, reverse=True) + list_of_nans, categories=categories, ordered=True)}, index=[4, 2, 0, 3, 1]) - assert_frame_equal(result_descending_na_last, - expected_descending_na_last) + assert_frame_equal(result, + expected) def test_sort_index_na_position_with_categories_raises(self): df = pd.DataFrame({ From 1709c91cc2e67812b972013cde530d4277d56a48 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Tue, 9 Oct 2018 18:52:36 -0500 Subject: [PATCH 11/14] parametrize test and use set --- pandas/core/sorting.py | 2 +- pandas/tests/frame/test_sorting.py | 67 +++++++++++++++++------------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 932ad888bb3ae..ee1c62f3decf9 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -241,7 +241,7 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): - if na_position not in ['first', 'last']: + if na_position not in {'first', 'last'}: raise ValueError('invalid na_position: {!r}'.format(na_position)) mask = isna(items) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 069b6ef3ef205..b96bfa6b99034 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -603,58 +603,67 @@ def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning missing value properly when column is Categorical. categories = ['A', 'B', 'C'] + reversed_categories = sorted(categories, reverse=True) + category_indices = [0, 2, 4] + reversed_category_indices = sorted(category_indices, reverse=True) + na_indices = [1, 3] + reversed_na_indices = sorted(na_indices, reverse=True) list_of_nans = [np.nan, np.nan] + is_ascending = True + not_ascending = not is_ascending + na_position_first = 'first' + na_position_last = 'last' + column_name = 'c' + df = pd.DataFrame({ - 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], + column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], categories=categories, ordered=True)}) # sort ascending with na first - result = df.sort_values(by='c', - ascending=True, - na_position='first') + result = df.sort_values(by=column_name, + ascending=is_ascending, + na_position=na_position_first) expected = DataFrame({ - 'c': Categorical(list_of_nans + categories, + column_name: Categorical(list_of_nans + categories, categories=categories, - ordered=True)}, index=[1, 3, 0, 2, 4]) + ordered=True)}, index=na_indices+category_indices) - assert_frame_equal(result, - expected) + assert_frame_equal(result, expected) # sort ascending with na last - result = df.sort_values(by='c', - ascending=True, - na_position='last') + result = df.sort_values(by=column_name, + ascending=is_ascending, + na_position=na_position_last) expected = DataFrame({ - 'c': Categorical(categories + list_of_nans, + column_name: Categorical(categories + list_of_nans, categories=categories, - ordered=True)}, index=[0, 2, 4, 1, 3]) + ordered=True)}, index=category_indices+na_indices) - assert_frame_equal(result, - expected) + assert_frame_equal(result, expected) # sort descending with na first - result = df.sort_values(by='c', - ascending=False, - na_position='first') + result = df.sort_values(by=column_name, + ascending=not_ascending, + na_position=na_position_first) expected = DataFrame({ - 'c': Categorical(list_of_nans + sorted(categories, reverse=True), + column_name: Categorical(list_of_nans + reversed_categories, categories=categories, - ordered=True)}, index=[3, 1, 4, 2, 0]) + ordered=True)}, + index=reversed_na_indices + reversed_category_indices) - assert_frame_equal(result, - expected) + assert_frame_equal(result, expected) # sort descending with na last - result = df.sort_values(by='c', - ascending=False, - na_position='last') + result = df.sort_values(by=column_name, + ascending=not_ascending, + na_position=na_position_last) expected = DataFrame({ - 'c': Categorical(sorted(categories, reverse=True) + list_of_nans, + column_name: Categorical(reversed_categories + list_of_nans, categories=categories, - ordered=True)}, index=[4, 2, 0, 3, 1]) + ordered=True)}, + index=reversed_category_indices + reversed_na_indices) - assert_frame_equal(result, - expected) + assert_frame_equal(result, expected) def test_sort_index_na_position_with_categories_raises(self): df = pd.DataFrame({ From 7f6c68a721cd5bfe1e751f6a174d66dc0e61a346 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Tue, 9 Oct 2018 20:02:28 -0500 Subject: [PATCH 12/14] PEP8 indent --- pandas/tests/frame/test_sorting.py | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b96bfa6b99034..e2f925e54eb15 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -161,7 +161,7 @@ def test_sort_nan(self): 'B': [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5]) sorted_df = df.sort_values(['A', 'B'], ascending=[ - 1, 0], na_position='first') + 1, 0], na_position='first') assert_frame_equal(sorted_df, expected) # na_position='last', not order @@ -170,7 +170,7 @@ def test_sort_nan(self): 'B': [4, 5, 5, nan, 2, 9, 5]}, index=[5, 4, 6, 1, 3, 0, 2]) sorted_df = df.sort_values(['A', 'B'], ascending=[ - 0, 1], na_position='last') + 0, 1], na_position='last') assert_frame_equal(sorted_df, expected) # Test DataFrame with nan label @@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self): df = (DataFrame({'A': np.arange(6, dtype='int64'), 'B': Series(list('aabbca')) - .astype(CategoricalDtype(list('cab')))}) + .astype(CategoricalDtype(list('cab')))}) .set_index('B')) result = df.sort_index() @@ -617,16 +617,18 @@ def test_sort_index_na_position_with_categories(self): df = pd.DataFrame({ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=categories, - ordered=True)}) + categories=categories, + ordered=True)}) # sort ascending with na first result = df.sort_values(by=column_name, ascending=is_ascending, na_position=na_position_first) expected = DataFrame({ column_name: Categorical(list_of_nans + categories, - categories=categories, - ordered=True)}, index=na_indices+category_indices) + categories=categories, + ordered=True) + }, + index=na_indices + category_indices) assert_frame_equal(result, expected) @@ -636,8 +638,10 @@ def test_sort_index_na_position_with_categories(self): na_position=na_position_last) expected = DataFrame({ column_name: Categorical(categories + list_of_nans, - categories=categories, - ordered=True)}, index=category_indices+na_indices) + categories=categories, + ordered=True) + }, + index=category_indices + na_indices) assert_frame_equal(result, expected) @@ -647,8 +651,9 @@ def test_sort_index_na_position_with_categories(self): na_position=na_position_first) expected = DataFrame({ column_name: Categorical(list_of_nans + reversed_categories, - categories=categories, - ordered=True)}, + categories=categories, + ordered=True) + }, index=reversed_na_indices + reversed_category_indices) assert_frame_equal(result, expected) @@ -659,8 +664,9 @@ def test_sort_index_na_position_with_categories(self): na_position=na_position_last) expected = DataFrame({ column_name: Categorical(reversed_categories + list_of_nans, - categories=categories, - ordered=True)}, + categories=categories, + ordered=True) + }, index=reversed_category_indices + reversed_na_indices) assert_frame_equal(result, expected) From 9f0343b61a57c8bf495d9dfbb4eb3a1be737a461 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Wed, 10 Oct 2018 22:59:00 -0500 Subject: [PATCH 13/14] pytest parametrization --- pandas/tests/frame/test_sorting.py | 44 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index e2f925e54eb15..eaef5191dffc7 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -599,21 +599,25 @@ def test_sort_index_intervalindex(self): result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) - def test_sort_index_na_position_with_categories(self): + @pytest.mark.parametrize("categories", [['A', 'B', 'C']]) + @pytest.mark.parametrize("category_indices", [[0, 2, 4]]) + @pytest.mark.parametrize("list_of_nans", [[np.nan, np.nan]]) + @pytest.mark.parametrize("na_indices", [[1, 3]]) + @pytest.mark.parametrize("na_position_first", ['first']) + @pytest.mark.parametrize("na_position_last", ['last']) + @pytest.mark.parametrize("column_name", ['c']) + def test_sort_index_na_position_with_categories(self, categories, + category_indices, + na_indices, + list_of_nans, + na_position_first, + na_position_last, + column_name): # GH 22556 # Positioning missing value properly when column is Categorical. - categories = ['A', 'B', 'C'] reversed_categories = sorted(categories, reverse=True) - category_indices = [0, 2, 4] reversed_category_indices = sorted(category_indices, reverse=True) - na_indices = [1, 3] reversed_na_indices = sorted(na_indices, reverse=True) - list_of_nans = [np.nan, np.nan] - is_ascending = True - not_ascending = not is_ascending - na_position_first = 'first' - na_position_last = 'last' - column_name = 'c' df = pd.DataFrame({ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], @@ -621,53 +625,49 @@ def test_sort_index_na_position_with_categories(self): ordered=True)}) # sort ascending with na first result = df.sort_values(by=column_name, - ascending=is_ascending, + ascending=True, na_position=na_position_first) expected = DataFrame({ column_name: Categorical(list_of_nans + categories, categories=categories, ordered=True) - }, - index=na_indices + category_indices) + }, index=na_indices + category_indices) assert_frame_equal(result, expected) # sort ascending with na last result = df.sort_values(by=column_name, - ascending=is_ascending, + ascending=True, na_position=na_position_last) expected = DataFrame({ column_name: Categorical(categories + list_of_nans, categories=categories, ordered=True) - }, - index=category_indices + na_indices) + }, index=category_indices + na_indices) assert_frame_equal(result, expected) # sort descending with na first result = df.sort_values(by=column_name, - ascending=not_ascending, + ascending=False, na_position=na_position_first) expected = DataFrame({ column_name: Categorical(list_of_nans + reversed_categories, categories=categories, ordered=True) - }, - index=reversed_na_indices + reversed_category_indices) + }, index=reversed_na_indices + reversed_category_indices) assert_frame_equal(result, expected) # sort descending with na last result = df.sort_values(by=column_name, - ascending=not_ascending, + ascending=False, na_position=na_position_last) expected = DataFrame({ column_name: Categorical(reversed_categories + list_of_nans, categories=categories, ordered=True) - }, - index=reversed_category_indices + reversed_na_indices) + }, index=reversed_category_indices + reversed_na_indices) assert_frame_equal(result, expected) From 9e7afb1bc71f370be60e38ebb7f08b2992391918 Mon Sep 17 00:00:00 2001 From: Weiwen Date: Sat, 13 Oct 2018 14:07:16 -0500 Subject: [PATCH 14/14] reverse back from pytest parametrization --- pandas/tests/frame/test_sorting.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index eaef5191dffc7..41b11d9c15f35 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -599,22 +599,17 @@ def test_sort_index_intervalindex(self): result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("categories", [['A', 'B', 'C']]) - @pytest.mark.parametrize("category_indices", [[0, 2, 4]]) - @pytest.mark.parametrize("list_of_nans", [[np.nan, np.nan]]) - @pytest.mark.parametrize("na_indices", [[1, 3]]) - @pytest.mark.parametrize("na_position_first", ['first']) - @pytest.mark.parametrize("na_position_last", ['last']) - @pytest.mark.parametrize("column_name", ['c']) - def test_sort_index_na_position_with_categories(self, categories, - category_indices, - na_indices, - list_of_nans, - na_position_first, - na_position_last, - column_name): + def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning missing value properly when column is Categorical. + categories = ['A', 'B', 'C'] + category_indices = [0, 2, 4] + list_of_nans = [np.nan, np.nan] + na_indices = [1, 3] + na_position_first = 'first' + na_position_last = 'last' + column_name = 'c' + reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) reversed_na_indices = sorted(na_indices, reverse=True)