diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index a3b5ba616b258..e8680d7aae042 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -253,6 +253,8 @@ Other enhancements
 - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
 - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
 - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
+- :func:`read_pickle` (and other ``read_*`` functions that handle compressed inputs) can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`).
+
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 8ec0a869c7042..9a252f1599012 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -616,7 +616,14 @@ def get_handle(
             handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
             if handle.mode == "r":
                 handles.append(handle)
-                zip_names = handle.namelist()
+
+                # Ignore hidden folders added by OS X/macOS on .zip creation
+                zip_names = [
+                    _
+                    for _ in handle.namelist()
+                    if not (_.startswith("__MACOSX/") or _.startswith(".DS_STORE"))
+                ]
+
                 if len(zip_names) == 1:
                     handle = handle.open(zip_names.pop())
                 elif len(zip_names) == 0:
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
index 690d3133dae5e..27315880f7a87 100644
--- a/pandas/tests/io/parser/test_compression.py
+++ b/pandas/tests/io/parser/test_compression.py
@@ -61,6 +61,20 @@ def test_zip_error_multiple_files(parser_and_data, compression):
             parser.read_csv(path, compression=compression)
 
 
+@pytest.mark.parametrize("compression", ["zip", "infer"])
+def test_zip_no_error_hidden_files(parser_and_data, compression, python_parser_only):
+    _, data, expected = parser_and_data
+
+    with tm.ensure_clean("combined_zip.zip") as path:
+        inner_file_names = ["test_file", "__MACOSX/dummy", ".DS_STORE"]
+
+        with zipfile.ZipFile(path, mode="w") as tmp:
+            for file_name in inner_file_names:
+                tmp.writestr(file_name, data)
+
+        python_parser_only.read_csv(path, compression=compression)
+
+
 def test_zip_error_no_files(parser_and_data):
     parser, _, _ = parser_and_data
 
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 34b36e2549b62..7a881cd46a75a 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -393,6 +393,33 @@ def test_read_infer(self, ext, get_random_path):
 
             tm.assert_frame_equal(df, df2)
 
+    @pytest.mark.parametrize("cruft", ["__MACOSX/", ".DS_STORE"])
+    def test_load_zip_with_hidden_folders(self, cruft, get_random_path):
+        # Test loading .zip files with platform-specific hidden folders (issue #37098)
+        base = get_random_path
+        path1 = f"{base}.raw"
+        path2 = f"{base}.zip"
+        dummy = f"{base}.dummy"
+        compression = "zip"
+
+        with tm.ensure_clean(path1) as p1, tm.ensure_clean(
+            path2
+        ) as p2, tm.ensure_clean(dummy) as dummy_path:
+
+            df = tm.makeDataFrame()
+            df.to_pickle(p1, compression=None)
+            self.compress_file(p1, p2, compression=compression)
+
+            # add dummy file `{cruft}{dummy}` to the archive
+            with zipfile.ZipFile(p2, "a", compression=zipfile.ZIP_DEFLATED) as f:
+                f.write(dummy_path, f"{cruft}{dummy}")
+            with zipfile.ZipFile(p2, "r") as f:
+                assert f"{cruft}{dummy}" in f.namelist()
+
+            # dummy file should be ignored on reading, otherwise read_pickle will fail
+            df2 = pd.read_pickle(p2)
+            tm.assert_frame_equal(df, df2)
+
 
 # ---------------------
 # test pickle compression
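
For reference, a minimal sketch (not part of the patch) of the behaviour this change enables, assuming a pandas build that includes the filtering above; the archive name "data.zip", the member names, and the sample data are illustrative only:

    import zipfile

    import pandas as pd

    # Mimic an archive produced by macOS "Compress": the real CSV plus the
    # hidden __MACOSX/ and .DS_STORE entries. Before this change, get_handle
    # saw multiple members and raised a ValueError about multiple files in
    # the ZIP archive.
    with zipfile.ZipFile("data.zip", mode="w") as zf:
        zf.writestr("data.csv", "a,b\n1,2\n3,4\n")
        zf.writestr("__MACOSX/data.csv", "")
        zf.writestr(".DS_STORE", "")

    # With the hidden entries filtered out, only "data.csv" remains,
    # so the read succeeds.
    df = pd.read_csv("data.zip", compression="zip")
    print(df)

Because the filtering lives in get_handle, the same behaviour carries over to read_pickle and the other read_* functions that route compressed input through it, which is what the whatsnew entry describes.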