diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index aec31f40f8570..3d8346aeb97a2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -24,7 +24,9 @@ from pandas import ( DataFrame, + Index, MultiIndex, + RangeIndex, arrays, get_option, ) @@ -250,14 +252,37 @@ def read( if dtype_backend == "pandas": result = pa_table.to_pandas(**to_pandas_kwargs) elif dtype_backend == "pyarrow": - result = DataFrame( - { - col_name: arrays.ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + index_columns = pa_table.schema.pandas_metadata.get("index_columns", []) + result_dc = { + col_name: arrays.ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } + idx: Index | None + if len(index_columns) == 0: + idx = None + elif len(index_columns) == 1 and isinstance(index_columns[0], dict): + params = index_columns[0] + idx = RangeIndex( + params.get("start"), + params.get("stop"), + params.get("step"), + name=params.get("name"), + ) + + else: + index_data = [ + result_dc.pop(index_col) for index_col in index_columns + ] + if len(index_data) == 1: + name = index_columns[0] + if isinstance(name, str) and name.startswith("__index_level_"): + name = None + idx = Index(index_data[0], name=name) + else: + idx = MultiIndex.from_arrays(index_data, names=index_columns) + result = DataFrame(result_dc, index=idx) if manager == "array": result = result._as_manager("array", copy=False) return result diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 353dc4f1cbd8a..b8d02fc04f90e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -18,6 +18,7 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import RangeIndex import pandas._testing as tm from pandas.util.version import Version @@ -1225,3 +1226,38 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("index", ["A", ["A", "B"]]) + def test_pyarrow_backed_df_index(self, index, pa): + # GH#48944 + obj = pd.DataFrame(data={"A": [0, 1], "B": [1, 0], "C": 1}) + df = obj.set_index(index) + with tm.ensure_clean("test.parquet") as path: + with open(path.encode(), "wb") as f: + df.to_parquet(f) + + with pd.option_context("mode.dtype_backend", "pyarrow"): + result = read_parquet(path, engine="pyarrow") + expected = obj.astype("int64[pyarrow]").set_index(index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "test"]) + @pytest.mark.parametrize("index", [True, False, None]) + def test_pyarrow_backed_df_range_index(self, pa, index, name): + # GH#48944 + df = pd.DataFrame( + data={"A": [0, 1], "B": [1, 0]}, + index=RangeIndex(start=100, stop=102, name=name), + ) + with tm.ensure_clean("test.parquet") as path: + with open(path.encode(), "wb") as f: + df.to_parquet(f, index=index) + + with pd.option_context("mode.dtype_backend", "pyarrow"): + result = read_parquet(path, engine="pyarrow") + expected = df.astype("int64[pyarrow]") + if index is False: + expected = expected.reset_index(drop=True) + elif index: + expected.index = pd.Index([100, 101], dtype="int64[pyarrow]", name=name) + tm.assert_frame_equal(result, expected)