REF: re-use machinery for DataFrameGroupBy.nunique (#41390)

jbrockmendel · web-flow · commit e3a9618b2b71 · 2021-05-10T19:26:13.000-04:00
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -22,7 +22,6 @@
     Mapping,
     TypeVar,
     Union,
-    cast,
 )
 import warnings
 
@@ -1626,6 +1625,10 @@ def _wrap_aggregated_output(
 
         if self.axis == 1:
             result = result.T
+            if result.index.equals(self.obj.index):
+                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
+                result.index = self.obj.index.copy()
+                # TODO: Do this more systematically
 
         return self._reindex_output(result)
 
@@ -1677,21 +1680,21 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
 
         return self._reindex_output(result)._convert(datetime=True)
 
-    def _iterate_column_groupbys(self):
-        for i, colname in enumerate(self._selected_obj.columns):
+    def _iterate_column_groupbys(self, obj: FrameOrSeries):
+        for i, colname in enumerate(obj.columns):
             yield colname, SeriesGroupBy(
-                self._selected_obj.iloc[:, i],
+                obj.iloc[:, i],
                 selection=colname,
                 grouper=self.grouper,
                 exclusions=self.exclusions,
             )
 
-    def _apply_to_column_groupbys(self, func) -> DataFrame:
+    def _apply_to_column_groupbys(self, func, obj: FrameOrSeries) -> DataFrame:
         from pandas.core.reshape.concat import concat
 
-        columns = self._selected_obj.columns
+        columns = obj.columns
         results = [
-            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()
+            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
         ]
 
         if not len(results):
@@ -1778,41 +1781,21 @@ def nunique(self, dropna: bool = True) -> DataFrame:
         4   ham       5      x
         5   ham       5      y
         """
-        from pandas.core.reshape.concat import concat
 
-        # TODO: this is duplicative of how GroupBy naturally works
-        # Try to consolidate with normal wrapping functions
+        if self.axis != 0:
+            # see test_groupby_crash_on_nunique
+            return self._python_agg_general(lambda sgb: sgb.nunique(dropna))
 
         obj = self._obj_with_exclusions
-        if self.axis == 0:
-            iter_func = obj.items
-        else:
-            iter_func = obj.iterrows
-
-        res_list = [
-            SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
-                dropna
-            )
-            for label, content in iter_func()
-        ]
-        if res_list:
-            results = concat(res_list, axis=1)
-            results = cast(DataFrame, results)
-        else:
-            # concat would raise
-            results = DataFrame(
-                [], index=self.grouper.result_index, columns=obj.columns[:0]
-            )
-
-        if self.axis == 1:
-            results = results.T
-
-        other_axis = 1 - self.axis
-        results._get_axis(other_axis).names = obj._get_axis(other_axis).names
+        results = self._apply_to_column_groupbys(
+            lambda sgb: sgb.nunique(dropna), obj=obj
+        )
+        results.columns.names = obj.columns.names  # TODO: do at higher level?
 
         if not self.as_index:
             results.index = ibase.default_index(len(results))
             self._insert_inaxis_grouper_inplace(results)
+
         return results
 
     @Appender(DataFrame.idxmax.__doc__)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1882,7 +1882,9 @@ def ohlc(self) -> DataFrame:
             )
             return self._reindex_output(result)
 
-        return self._apply_to_column_groupbys(lambda x: x.ohlc())
+        return self._apply_to_column_groupbys(
+            lambda x: x.ohlc(), self._obj_with_exclusions
+        )
 
     @final
     @doc(DataFrame.describe)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2058,24 +2058,36 @@ def test_dup_labels_output_shape(groupby_func, idx):
 
 def test_groupby_crash_on_nunique(axis):
     # Fix following 30253
+    dti = date_range("2016-01-01", periods=2, name="foo")
     df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
+    df.columns.names = ("bar", "baz")
+    df.index = dti
 
     axis_number = df._get_axis_number(axis)
     if not axis_number:
         df = df.T
 
-    result = df.groupby(axis=axis_number, level=0).nunique()
+    gb = df.groupby(axis=axis_number, level=0)
+    result = gb.nunique()
 
-    expected = DataFrame({"A": [1, 2], "D": [1, 1]})
+    expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
+    expected.columns.name = "bar"
     if not axis_number:
         expected = expected.T
 
     tm.assert_frame_equal(result, expected)
 
-    # same thing, but empty columns
-    gb = df[[]].groupby(axis=axis_number, level=0)
-    res = gb.nunique()
-    exp = expected[[]]
+    if axis_number == 0:
+        # same thing, but empty columns
+        gb2 = df[[]].groupby(axis=axis_number, level=0)
+        exp = expected[[]]
+    else:
+        # same thing, but empty rows
+        gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
+        # default for empty when we can't infer a dtype is float64
+        exp = expected.loc[[]].astype(np.float64)
+
+    res = gb2.nunique()
     tm.assert_frame_equal(res, exp)
 
 
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
@@ -121,12 +121,8 @@ def test_aaa_group_order():
     tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5])
 
 
-def test_aggregate_normal(request, resample_method):
+def test_aggregate_normal(resample_method):
     """Check TimeGrouper's aggregation is identical as normal groupby."""
-    if resample_method == "ohlc":
-        request.node.add_marker(
-            pytest.mark.xfail(reason="DataError: No numeric types to aggregate")
-        )
 
     data = np.random.randn(20, 4)
     normal_df = DataFrame(data, columns=["A", "B", "C", "D"])

Original file line number	Diff line number	Diff line change
`@@ -1882,7 +1882,9 @@ def ohlc(self) -> DataFrame:`
`1882`	`1882`	`)`
`1883`	`1883`	`return self._reindex_output(result)`
`1884`	`1884`
`1885`		`- return self._apply_to_column_groupbys(lambda x: x.ohlc())`
	`1885`	`+ return self._apply_to_column_groupbys(`
	`1886`	`+ lambda x: x.ohlc(), self._obj_with_exclusions`
	`1887`	`+ )`
`1886`	`1888`
`1887`	`1889`	`@final`
`1888`	`1890`	`@doc(DataFrame.describe)`