Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ Other enhancements
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
- :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
- Added methods :meth:`DataFrame.from_pyarrow` and :meth:`DataFrame.to_pyarrow` to convert data from and to PyArrow tables (:issue:`51760`)

.. ---------------------------------------------------------------------------
.. _whatsnew_200.notable_bug_fixes:
Expand Down
77 changes: 77 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.arrow import (
ArrowDtype,
ArrowExtensionArray,
)
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
Expand Down Expand Up @@ -1763,6 +1767,79 @@ def create_index(indexlist, namelist):
columns = create_index(data["columns"], data["column_names"])
return cls(realdata, index=index, columns=columns, dtype=dtype)

@classmethod
def from_pyarrow(
cls,
table,
) -> DataFrame:
"""
Convert a pyarrow table to DataFrame.

Parameters
----------
table: pyarrow.Table

Returns
-------
DataFrame

See Also
--------
DataFrame.to_pyarrow
pyarrow.Table.to_pandas

Notes
-----
The conversion is zero-copy, and the resulting DataFrame uses
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure that the term "zero-copy" is well understood, so maybe elaborate?

Arrow-backend data types.

For customization of the conversion use the
`to_pandas <https://arrow.apache.org/docs/python/generated/\
pyarrow.Table.html#pyarrow.Table.to_pandas>`__
method of the Table object.

Examples
--------
>>> import pyarrow as pa
>>> table = pa.table([pa.array([1, 2, 3])], names=["my_column"])
>>> pd.DataFrame.from_pyarrow(table)
my_column
0 1
1 2
2 3
"""
return table.to_pandas(types_mapper=ArrowDtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you should do an instance check here ; eg if this is called with a non arrow table you would get a hard to understand error


def to_pyarrow(
self,
):
"""
Convert the DataFrame to a pyarrow.Table object.

Returns
-------
pyarrow.Table

See Also
--------
DataFrame.from_pyarrow
pyarrow.Table.from_pandas

Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_pyarrow()
pyarrow.Table
col1: int64
col2: int64
----
col1: [[1,2]]
col2: [[3,4]]
"""
import pyarrow as pa

return pa.Table.from_pandas(self)

def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
Expand Down
92 changes: 92 additions & 0 deletions pandas/tests/frame/constructors/test_pyarrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import pytest

from pandas import (
NA,
DataFrame,
Series,
)
from pandas.core.arrays.arrow import ArrowDtype

pa = pytest.importorskip("pyarrow", minversion="7.0.0")
import pandas._testing as tm


@pytest.fixture(scope="module")
def sample_dataframe_numpy_backend():
return DataFrame(
{
"u8": Series([1, 2, 3, NA], dtype="UInt8"),
"f64": Series([float("NaN"), 1.0, 2.0, 3.0], dtype="float64"),
"s": Series(["foo", "bar", None, "foobar"], dtype="object"),
}
)


@pytest.fixture(scope="module")
def sample_dataframe_pyarrow_backend():
return DataFrame(
{
"u8": Series([1, 2, 3, NA], dtype="uint8[pyarrow]"),
"f64": Series([NA, 1.0, 2.0, 3.0], dtype="float64[pyarrow]"),
"s": Series(["foo", "bar", NA, "foobar"], dtype="string[pyarrow]"),
}
)


@pytest.fixture(scope="module")
def sample_pyarrow_table():
return pa.table(
[
pa.array([1, 2, 3, None], type=pa.uint8()),
pa.array([None, 1.0, 2.0, 3.0], type=pa.float64()),
pa.array(["foo", "bar", None, "foobar"], type=pa.string()),
],
names=["u8", "f64", "s"],
)


class TestPyArrow:
@pytest.mark.parametrize(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need a skip if pyarrow is not installed

"column,dtype", [("u8", pa.uint8()), ("f64", pa.float64()), ("s", pa.string())]
)
def test_from_pyarrow_uses_right_pandas_types(
self, sample_pyarrow_table, column, dtype
):
result = DataFrame.from_pyarrow(sample_pyarrow_table)
assert result[column].dtype == ArrowDtype(dtype)

@pytest.mark.parametrize("column", ["u8", "f64", "s"])
def test_from_pyarrow_keeps_correct_data(self, sample_pyarrow_table, column):
result = DataFrame.from_pyarrow(sample_pyarrow_table)
assert result[column]._data.array._data == sample_pyarrow_table[column]

@pytest.mark.parametrize("column", ["u8", "f64", "s"])
def test_from_pyarrow_does_not_copy_memory(self, sample_pyarrow_table, column):
result = DataFrame.from_pyarrow(sample_pyarrow_table)

result_buffers = result[column]._data.array._data.chunks[0].buffers()
expected_buffers = sample_pyarrow_table[column].chunks[0].buffers()

for result_buffer, expected_buffer in zip(result_buffers, expected_buffers):
if result_buffer is None and expected_buffer is None:
continue
assert result_buffer.address == expected_buffer.address
assert result_buffer.size == expected_buffer.size

def test_to_pyarrow_numpy_backend(
self, sample_dataframe_numpy_backend, sample_pyarrow_table
):
result = sample_dataframe_numpy_backend.to_pyarrow()
assert result == sample_pyarrow_table

def test_to_pyarrow_pyarrow_backend(
self, sample_dataframe_pyarrow_backend, sample_pyarrow_table
):
result = sample_dataframe_pyarrow_backend.to_pyarrow()
assert result == sample_pyarrow_table

def test_pyarrow_roundtrip(
self, sample_dataframe_numpy_backend, sample_dataframe_pyarrow_backend
):
result = DataFrame.from_pyarrow(sample_dataframe_numpy_backend.to_pyarrow())
tm.assert_frame_equal(result, sample_dataframe_pyarrow_backend)
1 change: 1 addition & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype

MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
MIXED_INT_DTYPES = [
Expand Down