change saving logic (meta-pytorch#2182)

felipemello1 · Felipe Mello · rahul-sarvam · commit 608fbee27bf4 · 2024-12-23T14:04:09.000+01:00
Co-authored-by: Felipe Mello <felipemello@fb.com>
diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py
@@ -30,7 +30,6 @@
 from torchtune.training.checkpointing._utils import (
     ADAPTER_CONFIG_FNAME,
     ADAPTER_MODEL_FNAME,
-    BASE_MODEL_DIRNAME,
     copy_files,
     get_adapter_checkpoint_path,
     get_model_checkpoint_path,
@@ -180,14 +179,6 @@ def __init__(
         self._output_dir = Path(output_dir)
         self._output_dir.mkdir(parents=True, exist_ok=True)
 
-        # save all files in input_dir, except model weights and mapping, to output_dir
-        # this is useful to preserve the tokenizer, configs, license, etc.
-        copy_files(
-            self._checkpoint_dir,
-            Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME),
-            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
-        )
-
         #  resume from adapter_model ckpt
         self._adapter_checkpoint = get_adapter_checkpoint_path(
             output_dir=self._output_dir,
@@ -331,6 +322,14 @@ def save_checkpoint(
                 "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights."
             )
 
+        # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch},
+        # so it's easy to run inference with the model using this epoch's checkpoint.
+        copy_files(
+            self._checkpoint_dir,
+            Path.joinpath(self._output_dir, f"epoch_{epoch}"),
+            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
+        )
+
         # If the recipe state needs to be output, first remove the model state dict
         if intermediate_checkpoint:
             _ = state_dict.pop(training.MODEL_KEY, None)
@@ -435,14 +434,6 @@ def __init__(
             Path.joinpath(self._checkpoint_dir, "config.json").read_text()
         )
 
-        # save all files in input_dir, except model weights and mapping, to output_dir
-        # this is useful to preserve the tokenizer, configs, license, etc.
-        copy_files(
-            self._checkpoint_dir,
-            Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME),
-            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
-        )
-
         # repo_id is necessary for when saving an adapter config, so its compatible with HF.
         # This json file is produced and saved in the download step.
         # contents are {"repo_id": "some_model/some_model_version"}
@@ -873,6 +864,14 @@ def save_checkpoint(
                     f"saved to {output_path}"
                 )
 
+        # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch},
+        # so it's easy to run inference with the model using this epoch's checkpoint.
+        copy_files(
+            self._checkpoint_dir,
+            Path.joinpath(self._output_dir, f"epoch_{epoch}"),
+            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
+        )
+
         # If the recipe state needs to be output, first remove the model state dict
         # and if it exists, remove the adapter state dict as well
         if intermediate_checkpoint:
@@ -966,14 +965,6 @@ def __init__(
         self._output_dir = Path(output_dir)
         self._output_dir.mkdir(parents=True, exist_ok=True)
 
-        # save all files in input_dir, except model weights and mapping, to output_dir
-        # this is useful to preserve the tokenizer, configs, license, etc.
-        copy_files(
-            self._checkpoint_dir,
-            Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME),
-            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
-        )
-
         #  resume from adapter_model ckpt
         self._adapter_checkpoint = get_adapter_checkpoint_path(
             output_dir=self._output_dir,
@@ -1126,6 +1117,14 @@ def save_checkpoint(
                 "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights."
             )
 
+        # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch},
+        # so it's easy to run inference with the model using this epoch's checkpoint.
+        copy_files(
+            self._checkpoint_dir,
+            Path.joinpath(self._output_dir, f"epoch_{epoch}"),
+            ignore_suffixes=SUFFIXES_TO_NOT_COPY,
+        )
+
         # If the recipe state needs to be output, first remove the model state dict
         # and if it exists, remove the adapter state dict as well
         if intermediate_checkpoint:
diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py
@@ -38,7 +38,6 @@
 # standardize checkpointing
 SHARD_FNAME = "ft-model-{cpt_idx}-of-{num_shards}"
 RECIPE_STATE_DIRNAME = "recipe_state"
-BASE_MODEL_DIRNAME = "base_model"
 
 # Needed when setting up output dir in checkpointing
 REPO_ID_FNAME = "original_repo_id"
@@ -334,6 +333,7 @@ def copy_files(
     output_dir: Union[str, Path],
     *,
     ignore_suffixes: Optional[List[str]] = None,
+    max_file_size_mb: int = 100,
 ) -> None:
     """
     Copies files from the input directory to the output directory, preserving the directory structure.
@@ -346,6 +346,7 @@ def copy_files(
         output_dir (Union[str, Path]): The path to the output directory where files should be copied.
         ignore_suffixes (Optional[List[str]]): A list of file suffixes to exclude from copying.
           Defaults to ['.pt', '.bin', '.safetensors'] if not provided.
+        max_file_size_mb (int): The maximum size of a file to copy, in MiB (1024 * 1024 bytes). Larger files are skipped. Defaults to 100.
     Returns:
         None
     Example:
@@ -355,6 +356,7 @@ def copy_files(
     already exist in the destination or have the specified suffixes.
     """
 
+    max_file_size = max_file_size_mb * 1024 * 1024
     for root, dirs, files in os.walk(input_dir):
 
         # Filter out directories that start with '.'. E.g. ".cache/"
@@ -381,6 +383,13 @@ def copy_files(
             src_file = os.path.join(root, file)
             dest_file = os.path.join(dest_dir, file)
 
+            # Check the file size
+            if os.path.getsize(src_file) > max_file_size:
+                print(
+                    f"Skipping copying {src_file} to {output_dir} as it exceeds the size limit of {max_file_size_mb} MiB."
+                )
+                continue
+
             # Copy the file if it doesn't already exist in the destination
             if not os.path.exists(dest_file):
                 shutil.copy2(src_file, dest_file)