Skip to content

Commit 1f45bca

Browse files
Thomas-Z and tzilio
authored
Support explicitly setting a dimension order with to_dataframe() (#4333)
* #4331: Adding dim_order parameter to Dataset.to_dataframe
* #4331: Typo
* #4331: Adding dim_order parameter to DataArray.to_dataframe. Refactoring some code, fixing some docstring.
* #4331: Updating whats-new.rst
* #4331: Updating whats-new.rst (bis)

Co-authored-by: tzilio <[email protected]>
1 parent 8fab5a2 commit 1f45bca

File tree

5 files changed

+133
-22
lines changed

5 files changed

+133
-22
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ New Features
4545
of :py:class:`DataArray` and :py:class:`Dataset` objects and
4646
document the new method in :doc:`internals`. (:pull:`4248`).
4747
By `Justus Magin <https://github.com/keewis>`_.
48+
- :py:meth:`~xarray.DataArray.to_dataframe` and :py:meth:`~xarray.Dataset.to_dataframe`
49+
now accept a ``dim_order`` parameter that specifies the order of the resulting
50+
dataframe's dimensions (:issue:`4331`, :pull:`4333`).
51+
By `Thomas Zilio <https://github.com/thomas-z>`_.
4852

4953

5054
Bug fixes

xarray/core/dataarray.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2365,13 +2365,36 @@ def to_pandas(self) -> Union["DataArray", pd.Series, pd.DataFrame]:
23652365
indexes = [self.get_index(dim) for dim in self.dims]
23662366
return constructor(self.values, *indexes)
23672367

2368-
def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
2368+
def to_dataframe(
2369+
self, name: Hashable = None, dim_order: List[Hashable] = None
2370+
) -> pd.DataFrame:
23692371
"""Convert this array and its coordinates into a tidy pandas.DataFrame.
23702372
23712373
The DataFrame is indexed by the Cartesian product of index coordinates
23722374
(in the form of a :py:class:`pandas.MultiIndex`).
23732375
23742376
Other coordinates are included as columns in the DataFrame.
2377+
2378+
Parameters
2379+
----------
2380+
name
2381+
Name to give to this array (required if unnamed).
2382+
dim_order
2383+
Hierarchical dimension order for the resulting dataframe.
2384+
Array content is transposed to this order and then written out as flat
2385+
vectors in contiguous order, so the last dimension in this list
2386+
will be contiguous in the resulting DataFrame. This has a major
2387+
influence on which operations are efficient on the resulting
2388+
dataframe.
2389+
2390+
If provided, must include all dimensions of this DataArray. By default,
2391+
dimensions keep the order in which they appear in the DataArray's ``dims``.
2392+
2393+
Returns
2394+
-------
2395+
result
2396+
DataArray as a pandas DataFrame.
2397+
23752398
"""
23762399
if name is None:
23772400
name = self.name
@@ -2381,15 +2404,20 @@ def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
23812404
"DataFrame: use the ``name`` parameter"
23822405
)
23832406

2384-
dims = dict(zip(self.dims, self.shape))
23852407
# By using a unique name, we can convert a DataArray into a DataFrame
23862408
# even if it shares a name with one of its coordinates.
23872409
# I would normally use unique_name = object() but that results in a
23882410
# dataframe with columns in the wrong order, for reasons I have not
23892411
# been able to debug (possibly a pandas bug?).
23902412
unique_name = "__unique_name_identifier_z98xfz98xugfg73ho__"
23912413
ds = self._to_dataset_whole(name=unique_name)
2392-
df = ds._to_dataframe(dims)
2414+
2415+
if dim_order is None:
2416+
ordered_dims = dict(zip(self.dims, self.shape))
2417+
else:
2418+
ordered_dims = ds._normalize_dim_order(dim_order=dim_order)
2419+
2420+
df = ds._to_dataframe(ordered_dims)
23932421
df.columns = [name if c == unique_name else c for c in df.columns]
23942422
return df
23952423

xarray/core/dataset.py

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4524,23 +4524,75 @@ def to_array(self, dim="variable", name=None):
45244524
data, coords, dims, attrs=self.attrs, name=name, indexes=indexes
45254525
)
45264526

4527-
def _to_dataframe(self, ordered_dims):
4527+
def _normalize_dim_order(
4528+
self, dim_order: List[Hashable] = None
4529+
) -> Dict[Hashable, int]:
4530+
"""
4531+
Check the validity of the provided dimensions, if any, and return the mapping
4532+
between dimension names and their sizes.
4533+
4534+
Parameters
4535+
----------
4536+
dim_order
4537+
Dimension order to validate (defaults to the alphabetical order if None).
4538+
4539+
Returns
4540+
-------
4541+
result
4542+
Validated dimensions mapping.
4543+
4544+
"""
4545+
if dim_order is None:
4546+
dim_order = list(self.dims)
4547+
elif set(dim_order) != set(self.dims):
4548+
raise ValueError(
4549+
"dim_order {} does not match the set of dimensions of this "
4550+
"Dataset: {}".format(dim_order, list(self.dims))
4551+
)
4552+
4553+
ordered_dims = {k: self.dims[k] for k in dim_order}
4554+
4555+
return ordered_dims
4556+
4557+
def _to_dataframe(self, ordered_dims: Mapping[Hashable, int]):
45284558
columns = [k for k in self.variables if k not in self.dims]
45294559
data = [
45304560
self._variables[k].set_dims(ordered_dims).values.reshape(-1)
45314561
for k in columns
45324562
]
4533-
index = self.coords.to_index(ordered_dims)
4563+
index = self.coords.to_index([*ordered_dims])
45344564
return pd.DataFrame(dict(zip(columns, data)), index=index)
45354565

4536-
def to_dataframe(self):
4566+
def to_dataframe(self, dim_order: List[Hashable] = None) -> pd.DataFrame:
45374567
"""Convert this dataset into a pandas.DataFrame.
45384568
45394569
Non-index variables in this dataset form the columns of the
4540-
DataFrame. The DataFrame is be indexed by the Cartesian product of
4570+
DataFrame. The DataFrame is indexed by the Cartesian product of
45414571
this dataset's indices.
4572+
4573+
Parameters
4574+
----------
4575+
dim_order
4576+
Hierarchical dimension order for the resulting dataframe. All
4577+
arrays are transposed to this order and then written out as flat
4578+
vectors in contiguous order, so the last dimension in this list
4579+
will be contiguous in the resulting DataFrame. This has a major
4580+
influence on which operations are efficient on the resulting
4581+
dataframe.
4582+
4583+
If provided, must include all dimensions of this dataset. By
4584+
default, dimensions are sorted alphabetically.
4585+
4586+
Returns
4587+
-------
4588+
result
4589+
Dataset as a pandas DataFrame.
4590+
45424591
"""
4543-
return self._to_dataframe(self.dims)
4592+
4593+
ordered_dims = self._normalize_dim_order(dim_order=dim_order)
4594+
4595+
return self._to_dataframe(ordered_dims=ordered_dims)
45444596

45454597
def _set_sparse_data_from_dataframe(
45464598
self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
@@ -4694,11 +4746,11 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
46944746
influence on which operations are efficient on the resulting dask
46954747
dataframe.
46964748
4697-
If provided, must include all dimensions on this dataset. By
4749+
If provided, must include all dimensions of this dataset. By
46984750
default, dimensions are sorted alphabetically.
46994751
set_index : bool, optional
47004752
If set_index=True, the dask DataFrame is indexed by this dataset's
4701-
coordinate. Since dask DataFrames to not support multi-indexes,
4753+
coordinate. Since dask DataFrames do not support multi-indexes,
47024754
set_index only works if the dataset only contains one dimension.
47034755
47044756
Returns
@@ -4709,15 +4761,7 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
47094761
import dask.array as da
47104762
import dask.dataframe as dd
47114763

4712-
if dim_order is None:
4713-
dim_order = list(self.dims)
4714-
elif set(dim_order) != set(self.dims):
4715-
raise ValueError(
4716-
"dim_order {} does not match the set of dimensions on this "
4717-
"Dataset: {}".format(dim_order, list(self.dims))
4718-
)
4719-
4720-
ordered_dims = {k: self.dims[k] for k in dim_order}
4764+
ordered_dims = self._normalize_dim_order(dim_order=dim_order)
47214765

47224766
columns = list(ordered_dims)
47234767
columns.extend(k for k in self.coords if k not in self.dims)
@@ -4744,6 +4788,8 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
47444788
df = dd.concat(series_list, axis=1)
47454789

47464790
if set_index:
4791+
dim_order = [*ordered_dims]
4792+
47474793
if len(dim_order) == 1:
47484794
(dim,) = dim_order
47494795
df = df.set_index(dim)

xarray/tests/test_dataarray.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3463,15 +3463,18 @@ def test_to_pandas(self):
34633463

34643464
def test_to_dataframe(self):
34653465
# regression test for #260
3466-
arr = DataArray(
3467-
np.random.randn(3, 4), [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo"
3468-
)
3466+
arr_np = np.random.randn(3, 4)
3467+
3468+
arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo")
34693469
expected = arr.to_series()
34703470
actual = arr.to_dataframe()["foo"]
34713471
assert_array_equal(expected.values, actual.values)
34723472
assert_array_equal(expected.name, actual.name)
34733473
assert_array_equal(expected.index.values, actual.index.values)
34743474

3475+
actual = arr.to_dataframe(dim_order=["A", "B"])["foo"]
3476+
assert_array_equal(arr_np.transpose().reshape(-1), actual.values)
3477+
34753478
# regression test for coords with different dimensions
34763479
arr.coords["C"] = ("B", [-1, -2, -3])
34773480
expected = arr.to_series().to_frame()
@@ -3482,6 +3485,9 @@ def test_to_dataframe(self):
34823485
assert_array_equal(expected.columns.values, actual.columns.values)
34833486
assert_array_equal(expected.index.values, actual.index.values)
34843487

3488+
with pytest.raises(ValueError, match="does not match the set of dimensions"):
3489+
arr.to_dataframe(dim_order=["B", "A", "C"])
3490+
34853491
arr.name = None # unnamed
34863492
with raises_regex(ValueError, "unnamed"):
34873493
arr.to_dataframe()

xarray/tests/test_dataset.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3939,6 +3939,33 @@ def test_to_and_from_dataframe(self):
39393939
# check roundtrip
39403940
assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual))
39413941

3942+
# Check multiindex reordering
3943+
new_order = ["x", "y"]
3944+
actual = ds.to_dataframe(dim_order=new_order)
3945+
assert expected.equals(actual)
3946+
3947+
new_order = ["y", "x"]
3948+
exp_index = pd.MultiIndex.from_arrays(
3949+
[["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], names=["y", "x"]
3950+
)
3951+
expected = pd.DataFrame(
3952+
w.transpose().reshape(-1), columns=["w"], index=exp_index
3953+
)
3954+
actual = ds.to_dataframe(dim_order=new_order)
3955+
assert expected.equals(actual)
3956+
3957+
invalid_order = ["x"]
3958+
with pytest.raises(
3959+
ValueError, match="does not match the set of dimensions of this"
3960+
):
3961+
ds.to_dataframe(dim_order=invalid_order)
3962+
3963+
invalid_order = ["x", "z"]
3964+
with pytest.raises(
3965+
ValueError, match="does not match the set of dimensions of this"
3966+
):
3967+
ds.to_dataframe(dim_order=invalid_order)
3968+
39423969
# check pathological cases
39433970
df = pd.DataFrame([1])
39443971
actual = Dataset.from_dataframe(df)

0 commit comments

Comments
 (0)