Skip to content

Commit aae43b2

Browse files
mkviatkovskii and claude committed
BUG: to_parquet (pyarrow) opens local path twice
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent b5c64e2 commit aae43b2

3 files changed

Lines changed: 81 additions & 18 deletions

File tree

doc/source/whatsnew/v3.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -360,6 +360,7 @@ I/O
360360
- Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
361361
- Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`)
362362
- Writing a :class:`DataFrame` with ``format='table'`` and a column named ``'index'`` as a ``data_columns`` entry (including ``data_columns=True``) now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`41437`)
363+
- Fixed bug in :meth:`DataFrame.to_parquet` (``pyarrow`` engine) where a local file path was opened twice, once by pandas and again by pyarrow, wasting a syscall and silently truncating output to 0 bytes on filesystems that finalize a file's contents on close (:issue:`65810`)
363364

364365
Period
365366
^^^^^^

pandas/io/parquet.py

Lines changed: 21 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
from pandas.io._util import arrow_table_to_pandas
3434
from pandas.io.common import (
3535
IOHandles,
36+
check_parent_directory,
3637
get_handle,
3738
is_fsspec_url,
3839
is_url,
@@ -142,24 +143,26 @@ def _get_path_or_handle(
142143
and isinstance(path_or_handle, str)
143144
and not os.path.isdir(path_or_handle)
144145
):
145-
# use get_handle only when we are very certain that it is not a directory
146-
# fsspec resources can also point to directories
147-
# this branch is used for example when reading from non-fsspec URLs
148-
handles = get_handle(
149-
path_or_handle, mode, is_text=False, storage_options=storage_options
150-
)
151-
fs = None
152-
path_or_handle = handles.handle
153-
if hasattr(path_or_handle, "name") and isinstance(
154-
path_or_handle.name, (str, bytes)
155-
):
156-
# Unwrap the Python file handle back to a string path so that
157-
# PyArrow can use memory-mapped and multithreaded C++ I/O
158-
# instead of going through the Python I/O layer. GH#47702
159-
if isinstance(path_or_handle.name, bytes):
160-
path_or_handle = path_or_handle.name.decode()
161-
else:
162-
path_or_handle = path_or_handle.name
146+
if is_url(path_or_handle):
147+
# pyarrow cannot read non-fsspec URLs (e.g. http/https), so let
148+
# get_handle download them into a buffer for pyarrow to consume.
149+
handles = get_handle(
150+
path_or_handle, mode, is_text=False, storage_options=storage_options
151+
)
152+
path_or_handle = handles.handle
153+
else:
154+
# Local path: hand the string to pyarrow so it can use memory-mapped,
155+
# multithreaded C++ I/O rather than the Python I/O layer (GH#47702).
156+
# Do not open it via get_handle as well: pyarrow opens the path
157+
# itself, so going through get_handle too would open the file twice.
158+
# That wastes a syscall on POSIX and, on filesystems that finalize a
159+
# file's contents on close, lets the empty pandas-side descriptor
160+
# close last and clobber pyarrow's data to 0 bytes. get_handle would
161+
# also expand "~" and check the parent directory on write, so
162+
# reproduce both below to keep behavior unchanged.
163+
path_or_handle = os.path.expanduser(path_or_handle)
164+
if "w" in mode or "a" in mode or "x" in mode:
165+
check_parent_directory(path_or_handle)
163166
return path_or_handle, handles, fs
164167

165168

pandas/tests/io/test_parquet.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1255,6 +1255,65 @@ def test_maps_as_pydicts(self, pa, temp_file):
12551255
read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}},
12561256
)
12571257

1258+
def test_to_parquet_local_path_does_not_call_get_handle(
1259+
self, pa, temp_file, monkeypatch
1260+
):
1261+
# GH#65810 local paths are handed to pyarrow directly; get_handle used
1262+
# to open them only to unwrap the name back to a string, opening the
1263+
# path a second time and truncating output to 0 bytes on filesystems
1264+
# that finalize contents on close
1265+
def fail(*args, **kwargs):
1266+
pytest.fail("get_handle should not be called for a local path")
1267+
1268+
monkeypatch.setattr("pandas.io.parquet.get_handle", fail)
1269+
1270+
df = pd.DataFrame({"a": [1, 2, 3]})
1271+
df.to_parquet(temp_file, engine=pa)
1272+
1273+
def test_to_parquet_local_path_opens_destination_once(
1274+
self, pa, temp_file, monkeypatch
1275+
):
1276+
# GH#65810 pandas must not open the destination itself; pyarrow opens it
1277+
# via C++ (bypassing builtins.open), so no Python-level open is expected
1278+
opens = []
1279+
real_open = open
1280+
target = os.fspath(temp_file)
1281+
1282+
def spy_open(file, *args, **kwargs):
1283+
if os.fspath(file) == target:
1284+
opens.append(file)
1285+
return real_open(file, *args, **kwargs)
1286+
1287+
monkeypatch.setattr("builtins.open", spy_open)
1288+
1289+
df = pd.DataFrame({"a": [1, 2, 3]})
1290+
df.to_parquet(temp_file, engine=pa)
1291+
assert opens == []
1292+
1293+
def test_to_parquet_missing_parent_directory_raises(self, pa, tmp_path):
1294+
# GH#65810 skipping get_handle must keep its parent-directory check
1295+
df = pd.DataFrame({"a": [1, 2, 3]})
1296+
path = tmp_path / "missing" / "out.parquet"
1297+
msg = "Cannot save file into a non-existent directory"
1298+
with pytest.raises(OSError, match=msg):
1299+
df.to_parquet(path, engine=pa)
1300+
1301+
def test_read_parquet_url_still_uses_get_handle(self, pa, monkeypatch):
1302+
# GH#65810 only local paths skip get_handle; non-fsspec URLs must still
1303+
# be routed through it because pyarrow cannot fetch them
1304+
url = "http://example.com/nonexistent.parquet"
1305+
calls = []
1306+
1307+
def stub(path, mode, **kwargs):
1308+
calls.append((path, mode))
1309+
raise ValueError("reached get_handle")
1310+
1311+
monkeypatch.setattr("pandas.io.parquet.get_handle", stub)
1312+
1313+
with pytest.raises(ValueError, match="reached get_handle"):
1314+
read_parquet(url, engine=pa)
1315+
assert calls == [(url, "rb")]
1316+
12581317

12591318
class TestParquetFastParquet(Base):
12601319
def test_basic(self, fp, df_full, request, temp_file):

0 commit comments

Comments
 (0)