Skip to content

Commit 37bd84f

Browse files
ejguan authored and facebook-github-bot committed
Fix OnDiskCacheHolder to list all files for decompressing operations (#203)
Summary: Pull Request resolved: #203 Add `FileLister` to make sure `OnDiskCacheHolder` can list all of files after any 1-to-N operations like decompression. Test Plan: Imported from OSS Reviewed By: VitalyFedyunin, NivekT Differential Revision: D34085743 Pulled By: ejguan fbshipit-source-id: 3f2461b0e77eb015ec4e8b5b5a936505380f5a76
1 parent 8e4dd06 commit 37bd84f

File tree

3 files changed

+33
-7
lines changed

3 files changed

+33
-7
lines changed

test/test_remote_io.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,30 @@ def _read_and_decode(x):
137137
self.assertTrue(os.path.exists(expected_csv_path))
138138
self.assertEqual(expected_csv_path, csv_path)
139139

140+
# Cache decompressed archive but only check root directory
141+
root_dir = "temp"
142+
143+
file_cache_dp = OnDiskCacheHolder(
144+
tar_cache_dp, filepath_fn=lambda tar_path: os.path.join(os.path.dirname(tar_path), root_dir)
145+
)
146+
file_cache_dp = FileOpener(file_cache_dp, mode="rb").read_from_tar()
147+
file_cache_dp = file_cache_dp.end_caching(
148+
mode="wb",
149+
filepath_fn=lambda file_path: os.path.join(self.temp_dir.name, root_dir, os.path.basename(file_path)),
150+
)
151+
152+
cached_it = iter(file_cache_dp)
153+
for i in range(3):
154+
expected_csv_path = os.path.join(self.temp_dir.name, root_dir, f"{i}.csv")
155+
# File doesn't exist on disk
156+
self.assertFalse(os.path.exists(expected_csv_path))
157+
158+
csv_path = next(cached_it)
159+
160+
# File is cached to disk
161+
self.assertTrue(os.path.exists(expected_csv_path))
162+
self.assertEqual(expected_csv_path, csv_path)
163+
140164

141165
if __name__ == "__main__":
142166
unittest.main()

torchdata/datapipes/iter/util/cacheholder.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
from torch.utils.data.graph import traverse
1414
from torchdata.datapipes import functional_datapipe
15-
from torchdata.datapipes.iter import IterDataPipe
15+
from torchdata.datapipes.iter import FileLister, IterDataPipe
1616

1717
if DILL_AVAILABLE:
1818
import dill
@@ -135,7 +135,7 @@ def _filepath_fn(url):
135135
hash_dict = {"expected_filepaht": expected_MD5_hash}
136136
137137
cache_dp = url.on_disk_cache(filepath_fn=_filepath_fn, hash_dict=_hash_dict, hash_type="md5")
138-
cache_dp = HttpReader(cache_dp).end_caching(mode="wb". filepath_fn=_filepath_fn)
138+
cache_dp = HttpReader(cache_dp).end_caching(filepath_fn=_filepath_fn)
139139
"""
140140

141141
_temp_dict: Dict = {}
@@ -234,14 +234,15 @@ class EndOnDiskCacheHolderIterDataPipe(IterDataPipe):
234234
datapipe: IterDataPipe with at least one `OnDiskCacheHolder` in the graph.
235235
mode: Mode in which cached files are opened for write the data. This is needed
236236
to be aligned with the type of data or file handle from `datapipe`.
237+
``"wb"`` is used by default.
237238
filepath_fn: Optional function to extract filepath from the metadata from `datapipe`.
238239
As default, it would directly use the metadata as file path.
239240
same_filepath_fn: Set to `True` to use same `filepath_fn` from the `OnDiskCacheHolder`.
240241
skip_read: Boolean value to skip reading the file handle from `datapipe`.
241242
As default, reading is enabled and reading function is created based on the `mode`.
242243
"""
243244

244-
def __new__(cls, datapipe, mode="w", filepath_fn=None, *, same_filepath_fn=False, skip_read=False):
245+
def __new__(cls, datapipe, mode="wb", filepath_fn=None, *, same_filepath_fn=False, skip_read=False):
245246
if filepath_fn is not None and same_filepath_fn:
246247
raise ValueError("`filepath_fn` is mutually exclusive with `same_filepath_fn`")
247248

@@ -255,16 +256,17 @@ def __new__(cls, datapipe, mode="w", filepath_fn=None, *, same_filepath_fn=False
255256

256257
_filepath_fn, _hash_dict, _hash_type, _ = OnDiskCacheHolderIterDataPipe._temp_dict[cache_holder]
257258
cached_dp = cache_holder._end_caching()
259+
cached_dp = FileLister(cached_dp, recursive=True)
258260

259261
if same_filepath_fn:
260262
filepath_fn = _filepath_fn
261263

262264
todo_dp = datapipe
263265
if not skip_read:
264-
if "b" in mode:
265-
todo_dp = todo_dp.map(fn=_read_bytes, input_col=1)
266-
else:
266+
if "t" in mode:
267267
todo_dp = todo_dp.map(fn=_read_str, input_col=1)
268+
else:
269+
todo_dp = todo_dp.map(fn=_read_bytes, input_col=1)
268270

269271
if filepath_fn is not None:
270272
todo_dp = todo_dp.map(fn=filepath_fn, input_col=0)

torchdata/datapipes/iter/util/saver.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def __init__(
2828
filepath_fn: Optional[Callable] = None,
2929
):
3030
self.source_datapipe: IterDataPipe[Tuple[Any, U]] = source_datapipe
31-
self.mode: str = mode
31+
self.mode: str = mode if "w" in mode else "w" + mode
3232
self.fn: Optional[Callable] = filepath_fn
3333

3434
def __iter__(self) -> Iterator[str]:

0 commit comments

Comments (0)