Improve the speed of from_dataframe with a MultiIndex (by 40x!) (#4184)

shoyer · web-flow · commit 03d409ec3503 · 2020-07-02T13:39:00.000-07:00
* Add MultiIndexSeries.time_to_xarray() benchmark * Improve the speed of from_dataframe with a MultiIndex Fixes GH-2459 Before: pandas.MultiIndexSeries.time_to_xarray ======= ========= ========== -- subset ------- -------------------- dtype True False ======= ========= ========== int 505±0ms 37.1±0ms float 485±0ms 38.3±0ms ======= ========= ========== After: pandas.MultiIndexSeries.time_to_xarray ======= ========= ========== -- subset ------- -------------------- dtype True False ======= ========= ========== int 11.5±0ms 39.2±0ms float 12.5±0ms 26.6±0ms ======= ========= ========== There are still some cases where we have to fall back to the existing slow implementation, but hopefully they should now be relatively rare. * remove unused import * Simplify converting MultiIndex dataframes * remove comments * remove types with NA * more multiindex dataframe tests * add whats new note * Preserve order of MultiIndex levels in from_dataframe * Add todo note * Rewrite from_dataframe to avoid passing around a dataframe * Require that MultiIndexes are unique even with sparse=True * clarify comment
diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
@@ -0,0 +1,24 @@
+import numpy as np
+import pandas as pd
+
+from . import parameterized
+
+
+class MultiIndexSeries:
+    def setup(self, dtype, subset):
+        data = np.random.rand(100000).astype(dtype)
+        index = pd.MultiIndex.from_product(
+            [
+                list("abcdefhijk"),
+                list("abcdefhijk"),
+                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
+            ]
+        )
+        series = pd.Series(data, index)
+        if subset:
+            series = series[::3]
+        self.series = series
+
+    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
+    def time_to_xarray(self, dtype, subset):
+        self.series.to_xarray()
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -49,7 +49,10 @@ Enhancements
   For orthogonal linear- and nearest-neighbor interpolation, we do 1d-interpolation sequentially
   rather than interpolating in multidimensional space. (:issue:`2223`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
-- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
+- Major performance improvement for :py:meth:`Dataset.from_dataframe` when the
+  dataframe has a MultiIndex (:pull:`4184`).
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
+  - :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
   coordinate attributes (:pull:`4103`). By `Oriol Abril <https://github.com/OriolAbril>`_.
 
 New Features
@@ -133,8 +136,9 @@ Bug fixes
   By `Deepak Cherian <https://github.com/dcherian>`_.
 - ``ValueError`` is raised when ``fill_value`` is not a scalar in :py:meth:`full_like`. (:issue:`3977`)
   By `Huite Bootsma <https://github.com/huite>`_.
-- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``. (:issue:`3951`)
-  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
+- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``.
+  (:issue:`3951`, :issue:`4186`)
+  By `Keisuke Fujii <https://github.com/fujiisoup>`_ and `Stephan Hoyer <https://github.com/shoyer>`_.
 - Fix renaming of coords when one or more stacked coords is not in
   sorted order during stack+groupby+apply operations. (:issue:`3287`,
   :pull:`3906`) By `Spencer Hill <https://github.com/spencerahill>`_
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -4543,11 +4543,10 @@ def to_dataframe(self):
         return self._to_dataframe(self.dims)
 
     def _set_sparse_data_from_dataframe(
-        self, dataframe: pd.DataFrame, dims: tuple
+        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
     ) -> None:
         from sparse import COO
 
-        idx = dataframe.index
         if isinstance(idx, pd.MultiIndex):
             coords = np.stack([np.asarray(code) for code in idx.codes], axis=0)
             is_sorted = idx.is_lexsorted()
@@ -4557,11 +4556,7 @@ def _set_sparse_data_from_dataframe(
             is_sorted = True
             shape = (idx.size,)
 
-        for name, series in dataframe.items():
-            # Cast to a NumPy array first, in case the Series is a pandas
-            # Extension array (which doesn't have a valid NumPy dtype)
-            values = np.asarray(series)
-
+        for name, values in arrays:
             # In virtually all real use cases, the sparse array will now have
             # missing values and needs a fill_value. For consistency, don't
             # special case the rare exceptions (e.g., dtype=int without a
@@ -4580,18 +4575,36 @@ def _set_sparse_data_from_dataframe(
             self[name] = (dims, data)
 
     def _set_numpy_data_from_dataframe(
-        self, dataframe: pd.DataFrame, dims: tuple
+        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
     ) -> None:
-        idx = dataframe.index
-        if isinstance(idx, pd.MultiIndex):
-            # expand the DataFrame to include the product of all levels
-            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
-            dataframe = dataframe.reindex(full_idx)
-            shape = tuple(lev.size for lev in idx.levels)
-        else:
-            shape = (idx.size,)
-        for name, series in dataframe.items():
-            data = np.asarray(series).reshape(shape)
+        if not isinstance(idx, pd.MultiIndex):
+            for name, values in arrays:
+                self[name] = (dims, values)
+            return
+
+        shape = tuple(lev.size for lev in idx.levels)
+        indexer = tuple(idx.codes)
+
+        # We already verified that the MultiIndex has all unique values, so
+        # there are missing values if and only if the size of output arrays is
+        # larger that the index.
+        missing_values = np.prod(shape) > idx.shape[0]
+
+        for name, values in arrays:
+            # NumPy indexing is much faster than using DataFrame.reindex() to
+            # fill in missing values:
+            # https://stackoverflow.com/a/35049899/809705
+            if missing_values:
+                dtype, fill_value = dtypes.maybe_promote(values.dtype)
+                data = np.full(shape, fill_value, dtype)
+            else:
+                # If there are no missing values, keep the existing dtype
+                # instead of promoting to support NA, e.g., keep integer
+                # columns as integers.
+                # TODO: consider removing this special case, which doesn't
+                # exist for sparse=True.
+                data = np.zeros(shape, values.dtype)
+            data[indexer] = values
             self[name] = (dims, data)
 
     @classmethod
@@ -4631,7 +4644,19 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Datas
         if not dataframe.columns.is_unique:
             raise ValueError("cannot convert DataFrame with non-unique columns")
 
-        idx, dataframe = remove_unused_levels_categories(dataframe.index, dataframe)
+        idx = remove_unused_levels_categories(dataframe.index)
+
+        if isinstance(idx, pd.MultiIndex) and not idx.is_unique:
+            raise ValueError(
+                "cannot convert a DataFrame with a non-unique MultiIndex into xarray"
+            )
+
+        # Cast to a NumPy array first, in case the Series is a pandas Extension
+        # array (which doesn't have a valid NumPy dtype)
+        # TODO: allow users to control how this casting happens, e.g., by
+        # forwarding arguments to pandas.Series.to_numpy?
+        arrays = [(k, np.asarray(v)) for k, v in dataframe.items()]
+
         obj = cls()
 
         if isinstance(idx, pd.MultiIndex):
@@ -4647,9 +4672,9 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Datas
             obj[index_name] = (dims, idx)
 
         if sparse:
-            obj._set_sparse_data_from_dataframe(dataframe, dims)
+            obj._set_sparse_data_from_dataframe(idx, arrays, dims)
         else:
-            obj._set_numpy_data_from_dataframe(dataframe, dims)
+            obj._set_numpy_data_from_dataframe(idx, arrays, dims)
         return obj
 
     def to_dask_dataframe(self, dim_order=None, set_index=False):
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
@@ -9,7 +9,7 @@
 from .variable import Variable
 
 
-def remove_unused_levels_categories(index, dataframe=None):
+def remove_unused_levels_categories(index: pd.Index) -> pd.Index:
     """
     Remove unused levels from MultiIndex and unused categories from CategoricalIndex
     """
@@ -25,14 +25,15 @@ def remove_unused_levels_categories(index, dataframe=None):
                 else:
                     level = level[index.codes[i]]
                 levels.append(level)
+            # TODO: calling from_array() reorders MultiIndex levels. It would
+            # be best to avoid this, if possible, e.g., by using
+            # MultiIndex.remove_unused_levels() (which does not reorder) on the
+            # part of the MultiIndex that is not categorical, or by fixing this
+            # upstream in pandas.
             index = pd.MultiIndex.from_arrays(levels, names=index.names)
     elif isinstance(index, pd.CategoricalIndex):
         index = index.remove_unused_categories()
-
-    if dataframe is None:
-        return index
-    dataframe = dataframe.set_index(index)
-    return dataframe.index, dataframe
+    return index
 
 
 class Indexes(collections.abc.Mapping):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -4013,6 +4013,49 @@ def test_to_and_from_empty_dataframe(self):
         assert len(actual) == 0
         assert expected.equals(actual)
 
+    def test_from_dataframe_multiindex(self):
+        index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
+        df = pd.DataFrame({"z": np.arange(6)}, index=index)
+
+        expected = Dataset(
+            {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])},
+            coords={"x": ["a", "b"], "y": [1, 2, 3]},
+        )
+        actual = Dataset.from_dataframe(df)
+        assert_identical(actual, expected)
+
+        df2 = df.iloc[[3, 2, 1, 0, 4, 5], :]
+        actual = Dataset.from_dataframe(df2)
+        assert_identical(actual, expected)
+
+        df3 = df.iloc[:4, :]
+        expected3 = Dataset(
+            {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])},
+            coords={"x": ["a", "b"], "y": [1, 2, 3]},
+        )
+        actual = Dataset.from_dataframe(df3)
+        assert_identical(actual, expected3)
+
+        df_nonunique = df.iloc[[0, 0], :]
+        with raises_regex(ValueError, "non-unique MultiIndex"):
+            Dataset.from_dataframe(df_nonunique)
+
+    def test_from_dataframe_unsorted_levels(self):
+        # regression test for GH-4186
+        index = pd.MultiIndex(
+            levels=[["b", "a"], ["foo"]], codes=[[0, 1], [0, 0]], names=["lev1", "lev2"]
+        )
+        df = pd.DataFrame({"c1": [0, 2], "c2": [1, 3]}, index=index)
+        expected = Dataset(
+            {
+                "c1": (("lev1", "lev2"), [[0], [2]]),
+                "c2": (("lev1", "lev2"), [[1], [3]]),
+            },
+            coords={"lev1": ["b", "a"], "lev2": ["foo"]},
+        )
+        actual = Dataset.from_dataframe(df)
+        assert_identical(actual, expected)
+
     def test_from_dataframe_non_unique_columns(self):
         # regression test for GH449
         df = pd.DataFrame(np.zeros((2, 2)))