Add create_index option to to_dataframe; mypy fixes

DHRUVA KUMAR KAUSHAL · DHRUVA KUMAR KAUSHAL · commit 7deee5707e43 · 2025-12-01T21:07:44.000+05:30
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame:
         return pandas_object
 
     def to_dataframe(
-        self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None
+        self,
+        name: Hashable | None = None,
+        dim_order: Sequence[Hashable] | None = None,
+        create_index: bool = True,
     ) -> pd.DataFrame:
         """Convert this array and its coordinates into a tidy pandas.DataFrame.
 
@@ -3979,6 +3982,11 @@ def to_dataframe(
 
             If provided, must include all dimensions of this DataArray. By default,
             dimensions are sorted according to the DataArray dimensions order.
+        create_index : bool, default: True
+            If True (default), create a MultiIndex from the Cartesian product
+            of this DataArray's indices. If False, use a RangeIndex instead.
+            This can be useful to avoid the potentially expensive MultiIndex
+            creation.
 
         Returns
         -------
@@ -4013,7 +4021,7 @@ def to_dataframe(
         else:
             ordered_dims = ds._normalize_dim_order(dim_order=dim_order)
 
-        df = ds._to_dataframe(ordered_dims)
+        df = ds._to_dataframe(ordered_dims, create_index=create_index)
         df.columns = [name if c == unique_name else c for c in df.columns]
         return df
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
             "Please use Dataset.to_dataframe() instead."
         )
 
-    def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
+    def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True):
         from xarray.core.extension_array import PandasExtensionArray
 
         # All and only non-index arrays (whether data or coordinates) should
@@ -7231,7 +7231,13 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
             for k in non_extension_array_columns
         ]
-        index = self.coords.to_index([*ordered_dims])
+        if create_index:
+            index = self.coords.to_index([*ordered_dims])
+        else:
+            # Skip building the product index: rows are simply numbered 0..N-1,
+            # where N is the product of the ordered_dims values (0 if it is empty).
+            total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0
+            index = pd.RangeIndex(total_size)
         broadcasted_df = pd.DataFrame(
             {
                 **dict(zip(non_extension_array_columns, data, strict=True)),
@@ -7259,7 +7265,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
             broadcasted_df = broadcasted_df.join(extension_array_df)
         return broadcasted_df[columns_in_order]
 
-    def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
+    def to_dataframe(
+        self,
+        dim_order: Sequence[Hashable] | None = None,
+        create_index: bool = True,
+    ) -> pd.DataFrame:
         """Convert this dataset into a pandas.DataFrame.
 
         Non-index variables in this dataset form the columns of the
@@ -7278,6 +7288,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr
 
             If provided, must include all dimensions of this dataset. By
             default, dimensions are in the same order as in `Dataset.sizes`.
+        create_index : bool, default: True
+            If True (default), create a MultiIndex from the Cartesian product
+            of this dataset's indices. If False, use a RangeIndex instead.
+            This can be useful to avoid the potentially expensive MultiIndex
+            creation.
 
         Returns
         -------
@@ -7288,7 +7303,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr
 
         ordered_dims = self._normalize_dim_order(dim_order=dim_order)
 
-        return self._to_dataframe(ordered_dims=ordered_dims)
+        return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index)
 
     def _set_sparse_data_from_dataframe(
         self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -3553,6 +3553,32 @@ def test_to_dataframe_0length(self) -> None:
         assert len(actual) == 0
         assert_array_equal(actual.index.names, list("ABC"))
 
+    def test_to_dataframe_create_index(self) -> None:
+        # DataArray.to_dataframe(create_index=...): MultiIndex vs. RangeIndex.
+        arr_np = np.arange(12).reshape(3, 4)
+        arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo")
+
+        # Default (create_index=True): rows are indexed by the (x, y) MultiIndex.
+        df_with_index = arr.to_dataframe()
+        assert isinstance(df_with_index.index, pd.MultiIndex)
+        assert df_with_index.index.names == ["x", "y"]
+        assert len(df_with_index) == 12
+
+        # create_index=False: rows get a RangeIndex; the row count is unchanged.
+        df_without_index = arr.to_dataframe(create_index=False)
+        assert isinstance(df_without_index.index, pd.RangeIndex)
+        assert len(df_without_index) == 12
+
+        # The data column must hold the same values in the same order either way.
+        assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values)
+
+        # A coordinate spanning only x (z) must still show up as a column.
+        arr.coords["z"] = ("x", [-1, -2, -3])
+        df_with_coords = arr.to_dataframe(create_index=False)
+        assert isinstance(df_with_coords.index, pd.RangeIndex)
+        assert "z" in df_with_coords.columns
+        assert len(df_with_coords) == 12
+
     @pytest.mark.parametrize(
         "x_dtype,y_dtype,v_dtype",
         [
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -2259,6 +2259,38 @@ def test_to_pandas(self) -> None:
         with pytest.raises(ValueError, match=r"cannot convert Datasets"):
             Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas()
 
+    def test_to_dataframe_create_index(self) -> None:
+        # Dataset.to_dataframe(create_index=...): MultiIndex vs. RangeIndex.
+        x = np.random.randn(3, 4)
+        y = np.random.randn(3, 4)
+        ds = Dataset(
+            {"a": (("x", "y"), x), "b": (("x", "y"), y)},
+            coords={"x": [1, 2, 3], "y": list("abcd")},
+        )
+
+        # Default (create_index=True): rows are indexed by the (x, y) MultiIndex.
+        df_with_index = ds.to_dataframe()
+        assert isinstance(df_with_index.index, pd.MultiIndex)
+        assert df_with_index.index.names == ["x", "y"]
+        assert len(df_with_index) == 12
+
+        # create_index=False: rows get a RangeIndex; the row count is unchanged.
+        df_without_index = ds.to_dataframe(create_index=False)
+        assert isinstance(df_without_index.index, pd.RangeIndex)
+        assert len(df_without_index) == 12
+
+        # Both data columns must hold the same values in the same order either way.
+        assert_array_equal(df_with_index["a"].values, df_without_index["a"].values)
+        assert_array_equal(df_with_index["b"].values, df_without_index["b"].values)
+
+        # dim_order must still be respected when create_index=False.
+        df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False)
+        assert isinstance(df_reordered.index, pd.RangeIndex)
+        assert len(df_reordered) == 12
+        # Rows must follow the requested (y, x) order, matching the indexed result.
+        df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"])
+        assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values)
+
     def test_reindex_like(self) -> None:
         data = create_test_data()
         data["letters"] = ("dim3", 10 * ["a"])