From 31f1799644cc2e1975651207bc97234c03be19f2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Mar 2018 12:43:01 -0800 Subject: [PATCH 1/7] Added GroupBy mad tests --- pandas/tests/groupby/test_groupby.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0561b3a1d8592..f41d4671e4a43 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2141,6 +2141,30 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): result = getattr(df.groupby('key'), agg_func)(skipna=skipna) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("dtype", ['int', 'float']) + def test_groupby_mad(self, dtype): + vals = np.array(range(10)).astype(dtype) + df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) + exp_df = DataFrame({'val': [1.2, 1.2]}, index=pd.Index(['a', 'b'], + name='key')) + + result = df.groupby('key').mad() + tm.assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("vals", [ + ['foo'] * 10, [True] * 10]) + def test_groupby_mad_raises(self, vals): + df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) + + with tm.assert_raises_regex(TypeError, "Invalid type for mad"): + df.groupby('key').mad() + + def test_groupby_mad_skipna(self): + df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) + with tm.assert_raises_regex( + NotImplementedError, "'skipna=False' not yet implemented"): + df.groupby('key').mad(skipna=False) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From 0c103697a00451853eba086fd15263f7add5c622 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Mar 2018 14:38:14 -0800 Subject: [PATCH 2/7] Implemented Cythonized GroupBy mad; fixed tests --- pandas/core/groupby.py | 20 ++++++++++++++++ pandas/tests/groupby/test_groupby.py | 33 ++++++++++++++------------ pandas/tests/groupby/test_whitelist.py | 2 +- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4a09d636ee320..587e8691314c0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1353,6 +1353,26 @@ def var(self, ddof=1, *args, **kwargs): f = lambda x: x.var(ddof=ddof, **kwargs) return self._python_agg_general(f) + @Substitution(name='groupby') + @Appender(_doc_template) + def mad(self, skipna=True): + if not skipna: + raise NotImplementedError("'skipna=False' not yet implemented") + + if self.axis != 0: + return self.apply(lambda x: x.mad(axis=self.axis)) + + # Wrap in a try..except. Ideally this wouldn't be required here but + # rather be moved to the underlying function calls + try: + demeaned = np.abs(self.shift(0) - self.transform('mean')) + demeaned = demeaned.set_index(self.grouper.labels) + agg = demeaned.groupby(demeaned.index) + except TypeError: + raise DataError('No numeric types to aggregate') + + return agg.mean().set_index(self.grouper.result_index) + @Substitution(name='groupby') @Appender(_doc_template) def sem(self, ddof=1): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f41d4671e4a43..cedf4093c4066 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,6 +9,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, concat, Panel, DatetimeIndex, read_csv) +from pandas.core.base import DataError from pandas.core.dtypes.missing import isna from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_frame_equal, assert_index_equal, @@ -1300,17 +1301,6 @@ def test_non_cython_api(self): g = df.groupby('A') gni = df.groupby('A', as_index=False) - # mad - expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' - result = g.mad() - assert_frame_equal(result, expected) - - expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], - index=[0, 1]) - result = gni.mad() - assert_frame_equal(result, expected) - # describe expected_index = pd.Index([1, 3], name='A') expected_col = pd.MultiIndex(levels=[['B'], @@ -2141,14 +2131,26 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): result = getattr(df.groupby('key'), agg_func)(skipna=skipna) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("test_mi", [True, False]) @pytest.mark.parametrize("dtype", ['int', 'float']) - def test_groupby_mad(self, dtype): + def test_groupby_mad(self, test_mi, dtype): vals = np.array(range(10)).astype(dtype) df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) exp_df = DataFrame({'val': [1.2, 1.2]}, index=pd.Index(['a', 'b'], name='key')) - result = df.groupby('key').mad() + grping = ['key'] + if test_mi: + df = df.append(df) # Double the size of the frame + df['newcol'] = ['foo'] * 10 + ['bar'] * 10 + grping.append('newcol') + + mi = pd.MultiIndex.from_product((exp_df.index.values, + ['bar', 'foo']), + names=['key', 'newcol']) + exp_df = exp_df.append(exp_df).set_index(mi) + + result = df.groupby(grping).mad() tm.assert_frame_equal(result, exp_df) @pytest.mark.parametrize("vals", [ @@ -2156,11 +2158,12 @@ def test_groupby_mad(self, dtype): def test_groupby_mad_raises(self, vals): df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) - with tm.assert_raises_regex(TypeError, "Invalid type for mad"): + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): df.groupby('key').mad() def test_groupby_mad_skipna(self): - df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) + df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': range(10)}) with tm.assert_raises_regex( NotImplementedError, "'skipna=False' not yet implemented"): df.groupby('key').mad(skipna=False) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 8d6e074881cbb..4bea747963dc8 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -12,7 +12,7 @@ AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] -AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] +AGG_FUNCTIONS_WITH_SKIPNA = ['skew'] df_whitelist = frozenset([ 'last', From 192253fbd08d5eaa9158d2f9f0bb177220553a33 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Mar 2018 15:33:15 -0800 Subject: [PATCH 3/7] Fixed support issue for series; added tests --- pandas/core/groupby.py | 7 +++++-- pandas/tests/groupby/test_groupby.py | 23 ++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 587e8691314c0..a6916557c8b77 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1366,12 +1366,15 @@ def mad(self, skipna=True): # rather be moved to the underlying function calls try: demeaned = np.abs(self.shift(0) - self.transform('mean')) - demeaned = demeaned.set_index(self.grouper.labels) + demeaned.index = self.grouper.labels agg = demeaned.groupby(demeaned.index) except TypeError: raise DataError('No numeric types to aggregate') - return agg.mean().set_index(self.grouper.result_index) + result = agg.mean() + result.index = self.grouper.result_index + + return result @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cedf4093c4066..acb3c2f98929c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2131,27 +2131,36 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): result = getattr(df.groupby('key'), agg_func)(skipna=skipna) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("obj", [Series, DataFrame]) @pytest.mark.parametrize("test_mi", [True, False]) @pytest.mark.parametrize("dtype", ['int', 'float']) - def test_groupby_mad(self, test_mi, dtype): + def test_groupby_mad(self, obj, test_mi, dtype): vals = np.array(range(10)).astype(dtype) df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) - exp_df = DataFrame({'val': [1.2, 1.2]}, index=pd.Index(['a', 'b'], - name='key')) + idx = pd.Index(['a', 'b'], name='key') + exp = obj([1.2, 1.2], index=idx) grping = ['key'] + if test_mi: df = df.append(df) # Double the size of the frame df['newcol'] = ['foo'] * 10 + ['bar'] * 10 grping.append('newcol') - mi = pd.MultiIndex.from_product((exp_df.index.values, + mi = pd.MultiIndex.from_product((exp.index.values, ['bar', 'foo']), names=['key', 'newcol']) - exp_df = exp_df.append(exp_df).set_index(mi) + exp = exp.append(exp) + exp.index = mi - result = df.groupby(grping).mad() - tm.assert_frame_equal(result, exp_df) + if obj is Series: + exp.name = 'val' + result = df.groupby(grping)['val'].mad() + tm.assert_series_equal(result, exp) + else: + exp = exp.rename(columns={0: 'val'}) + result = df.groupby(grping).mad() + tm.assert_frame_equal(result, exp) @pytest.mark.parametrize("vals", [ ['foo'] * 10, [True] * 10]) From 9d7f0ac90a0d0210d78b92b1cc4648e14c301e7b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Mar 2018 15:51:02 -0800 Subject: [PATCH 4/7] Code refactor / cleanup --- pandas/core/groupby.py | 11 ++++------- pandas/tests/groupby/test_groupby.py | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a6916557c8b77..bf05ed0785e52 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1362,18 +1362,15 @@ def mad(self, skipna=True): if self.axis != 0: return self.apply(lambda x: x.mad(axis=self.axis)) - # Wrap in a try..except. Ideally this wouldn't be required here but - # rather be moved to the underlying function calls + # Wrap in a try..except to catch a TypeError with bool data + # Ideally this would be implemented in `mean` instead of here try: demeaned = np.abs(self.shift(0) - self.transform('mean')) - demeaned.index = self.grouper.labels - agg = demeaned.groupby(demeaned.index) + result = demeaned.groupby(self.grouper.labels).mean() + result.index = self.grouper.result_index except TypeError: raise DataError('No numeric types to aggregate') - result = agg.mean() - result.index = self.grouper.result_index - return result @Substitution(name='groupby') diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index acb3c2f98929c..7e08a7e96d23c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2131,15 +2131,15 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): result = getattr(df.groupby('key'), agg_func)(skipna=skipna) assert_frame_equal(result, exp_df) - @pytest.mark.parametrize("obj", [Series, DataFrame]) + @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize("test_mi", [True, False]) @pytest.mark.parametrize("dtype", ['int', 'float']) - def test_groupby_mad(self, obj, test_mi, dtype): + def test_groupby_mad(self, klass, test_mi, dtype): vals = np.array(range(10)).astype(dtype) df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) idx = pd.Index(['a', 'b'], name='key') - exp = obj([1.2, 1.2], index=idx) + exp = klass([1.2, 1.2], index=idx) grping = ['key'] if test_mi: @@ -2153,7 +2153,7 @@ def test_groupby_mad(self, obj, test_mi, dtype): exp = exp.append(exp) exp.index = mi - if obj is Series: + if klass is Series: exp.name = 'val' result = df.groupby(grping)['val'].mad() tm.assert_series_equal(result, exp) From 57152e6480f8f9622aaf44875e4148f109918085 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Mar 2018 16:19:59 -0800 Subject: [PATCH 5/7] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 79d85513efa26..c332b977bd362 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -791,6 +791,7 @@ Performance Improvements - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.mad` (:issue:`19165`) .. _whatsnew_0230.docs: From 5307ac3d9e7c1398e53f06a57d8e4dc40f23930f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Aug 2018 20:57:00 -0700 Subject: [PATCH 6/7] Test fixup --- pandas/tests/groupby/test_function.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 8cd95975fdb96..b337fa8e5f5fa 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import (DataFrame, Index, compat, isna, Series, MultiIndex, Timestamp, date_range) +from pandas.core.base import DataError from pandas.errors import UnsupportedFunctionCall from pandas.util import testing as tm import pandas.core.nanops as nanops @@ -472,7 +473,7 @@ def test_max_nan_bug(): @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize("test_mi", [True, False]) @pytest.mark.parametrize("dtype", ['int', 'float']) -def test_groupby_mad(self, klass, test_mi, dtype): +def test_groupby_mad(klass, test_mi, dtype): vals = np.array(range(10)).astype(dtype) df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) @@ -503,7 +504,7 @@ def test_groupby_mad(self, klass, test_mi, dtype): @pytest.mark.parametrize("vals", [ ['foo'] * 10, [True] * 10]) -def test_groupby_mad_raises(self, vals): +def test_groupby_mad_raises(vals): df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) with tm.assert_raises_regex(DataError, @@ -511,7 +512,7 @@ def test_groupby_mad_raises(self, vals): df.groupby('key').mad() -def test_groupby_mad_skipna(self): +def test_groupby_mad_skipna(): df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': range(10)}) with tm.assert_raises_regex( NotImplementedError, "'skipna=False' not yet implemented"): From 5cab1eb6c10ab071d92e02946048f048cfe7158f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Aug 2018 20:57:35 -0700 Subject: [PATCH 7/7] LINT fixup --- pandas/tests/groupby/test_function.py | 1 + pandas/tests/groupby/test_groupby.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b337fa8e5f5fa..69b87b0689f23 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -470,6 +470,7 @@ def test_max_nan_bug(): tm.assert_frame_equal(r, e) assert not r['File'].isna().any() + @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize("test_mi", [True, False]) @pytest.mark.parametrize("dtype", ['int', 'float']) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ddf5ba3460d83..ca3ed354809a1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,6 @@ from pandas import (date_range, Timestamp, Index, MultiIndex, DataFrame, Series, Panel, DatetimeIndex, read_csv) -from pandas.core.base import DataError from pandas.errors import PerformanceWarning from pandas.util.testing import (assert_frame_equal, assert_series_equal, assert_almost_equal)