Skip to content

Commit 7deee57

Browse files
DHRUVA KUMAR KAUSHAL
authored and committed
mypy fixes
1 parent 52283c6 commit 7deee57

File tree

4 files changed

+87
-6
lines changed

4 files changed

+87
-6
lines changed

xarray/core/dataarray.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame:
39543954
return pandas_object
39553955

39563956
def to_dataframe(
3957-
self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None
3957+
self,
3958+
name: Hashable | None = None,
3959+
dim_order: Sequence[Hashable] | None = None,
3960+
create_index: bool = True,
39583961
) -> pd.DataFrame:
39593962
"""Convert this array and its coordinates into a tidy pandas.DataFrame.
39603963
@@ -3979,6 +3982,11 @@ def to_dataframe(
39793982
39803983
If provided, must include all dimensions of this DataArray. By default,
39813984
dimensions are sorted according to the DataArray dimensions order.
3985+
create_index : bool, default: True
3986+
If True (default), create a MultiIndex from the Cartesian product
3987+
of this DataArray's indices. If False, use a RangeIndex instead.
3988+
This can be useful to avoid the potentially expensive MultiIndex
3989+
creation.
39823990
39833991
Returns
39843992
-------
@@ -4013,7 +4021,7 @@ def to_dataframe(
40134021
else:
40144022
ordered_dims = ds._normalize_dim_order(dim_order=dim_order)
40154023

4016-
df = ds._to_dataframe(ordered_dims)
4024+
df = ds._to_dataframe(ordered_dims, create_index=create_index)
40174025
df.columns = [name if c == unique_name else c for c in df.columns]
40184026
return df
40194027

xarray/core/dataset.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
72007200
"Please use Dataset.to_dataframe() instead."
72017201
)
72027202

7203-
def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
7203+
def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True):
72047204
from xarray.core.extension_array import PandasExtensionArray
72057205

72067206
# All and only non-index arrays (whether data or coordinates) should
@@ -7231,7 +7231,13 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
72317231
self._variables[k].set_dims(ordered_dims).values.reshape(-1)
72327232
for k in non_extension_array_columns
72337233
]
7234-
index = self.coords.to_index([*ordered_dims])
7234+
if create_index:
7235+
index = self.coords.to_index([*ordered_dims])
7236+
else:
7237+
# Use a simple RangeIndex when create_index=False
7238+
# Calculate the total size from ordered_dims
7239+
total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0
7240+
index = pd.RangeIndex(total_size)
72357241
broadcasted_df = pd.DataFrame(
72367242
{
72377243
**dict(zip(non_extension_array_columns, data, strict=True)),
@@ -7259,7 +7265,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
72597265
broadcasted_df = broadcasted_df.join(extension_array_df)
72607266
return broadcasted_df[columns_in_order]
72617267

7262-
def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
7268+
def to_dataframe(
7269+
self,
7270+
dim_order: Sequence[Hashable] | None = None,
7271+
create_index: bool = True,
7272+
) -> pd.DataFrame:
72637273
"""Convert this dataset into a pandas.DataFrame.
72647274
72657275
Non-index variables in this dataset form the columns of the
@@ -7278,6 +7288,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr
72787288
72797289
If provided, must include all dimensions of this dataset. By
72807290
default, dimensions are in the same order as in `Dataset.sizes`.
7291+
create_index : bool, default: True
7292+
If True (default), create a MultiIndex from the Cartesian product
7293+
of this dataset's indices. If False, use a RangeIndex instead.
7294+
This can be useful to avoid the potentially expensive MultiIndex
7295+
creation.
72817296
72827297
Returns
72837298
-------
@@ -7288,7 +7303,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr
72887303

72897304
ordered_dims = self._normalize_dim_order(dim_order=dim_order)
72907305

7291-
return self._to_dataframe(ordered_dims=ordered_dims)
7306+
return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index)
72927307

72937308
def _set_sparse_data_from_dataframe(
72947309
self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple

xarray/tests/test_dataarray.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3553,6 +3553,32 @@ def test_to_dataframe_0length(self) -> None:
35533553
assert len(actual) == 0
35543554
assert_array_equal(actual.index.names, list("ABC"))
35553555

3556+
def test_to_dataframe_create_index(self) -> None:
3557+
# Test create_index parameter
3558+
arr_np = np.arange(12).reshape(3, 4)
3559+
arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo")
3560+
3561+
# Default behavior: create MultiIndex
3562+
df_with_index = arr.to_dataframe()
3563+
assert isinstance(df_with_index.index, pd.MultiIndex)
3564+
assert df_with_index.index.names == ["x", "y"]
3565+
assert len(df_with_index) == 12
3566+
3567+
# With create_index=False: use RangeIndex
3568+
df_without_index = arr.to_dataframe(create_index=False)
3569+
assert isinstance(df_without_index.index, pd.RangeIndex)
3570+
assert len(df_without_index) == 12
3571+
3572+
# Data should be the same regardless
3573+
assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values)
3574+
3575+
# Test with coords that have different dimensions
3576+
arr.coords["z"] = ("x", [-1, -2, -3])
3577+
df_with_coords = arr.to_dataframe(create_index=False)
3578+
assert isinstance(df_with_coords.index, pd.RangeIndex)
3579+
assert "z" in df_with_coords.columns
3580+
assert len(df_with_coords) == 12
3581+
35563582
@pytest.mark.parametrize(
35573583
"x_dtype,y_dtype,v_dtype",
35583584
[

xarray/tests/test_dataset.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2259,6 +2259,38 @@ def test_to_pandas(self) -> None:
22592259
with pytest.raises(ValueError, match=r"cannot convert Datasets"):
22602260
Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas()
22612261

2262+
def test_to_dataframe_create_index(self) -> None:
2263+
# Test create_index parameter for Dataset
2264+
x = np.random.randn(3, 4)
2265+
y = np.random.randn(3, 4)
2266+
ds = Dataset(
2267+
{"a": (("x", "y"), x), "b": (("x", "y"), y)},
2268+
coords={"x": [1, 2, 3], "y": list("abcd")},
2269+
)
2270+
2271+
# Default behavior: create MultiIndex
2272+
df_with_index = ds.to_dataframe()
2273+
assert isinstance(df_with_index.index, pd.MultiIndex)
2274+
assert df_with_index.index.names == ["x", "y"]
2275+
assert len(df_with_index) == 12
2276+
2277+
# With create_index=False: use RangeIndex
2278+
df_without_index = ds.to_dataframe(create_index=False)
2279+
assert isinstance(df_without_index.index, pd.RangeIndex)
2280+
assert len(df_without_index) == 12
2281+
2282+
# Data should be the same regardless
2283+
assert_array_equal(df_with_index["a"].values, df_without_index["a"].values)
2284+
assert_array_equal(df_with_index["b"].values, df_without_index["b"].values)
2285+
2286+
# Test with dim_order and create_index=False
2287+
df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False)
2288+
assert isinstance(df_reordered.index, pd.RangeIndex)
2289+
assert len(df_reordered) == 12
2290+
# Check that dim_order affects the data ordering
2291+
df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"])
2292+
assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values)
2293+
22622294
def test_reindex_like(self) -> None:
22632295
data = create_test_data()
22642296
data["letters"] = ("dim3", 10 * ["a"])

0 commit comments

Comments
 (0)