 import sys
 import time
 from functools import partial
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union
 from warnings import warn

 import torch
 from omegaconf import DictConfig, ListConfig

 from torch import nn
 from torch.optim import Optimizer
-from torch.utils.data import DataLoader, DistributedSampler
+from torchdata.stateful_dataloader import StatefulDataLoader

 from torchtune import config, modules, training, utils
 from torchtune.config._utils import _get_component_from_path
@@ -302,11 +302,16 @@ def setup(self, cfg: DictConfig) -> None:
         # sampler and dataloader depend on the tokenizer and loss_fn and should be
         # setup after both of these are initialized
         collate_name = cfg.get("collate_fn", "torchtune.data.padded_collate_sft")
-        self._sampler, self._dataloader = self._setup_data(
+        self._dataloader = self._setup_data(
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            dataloader_state_dict=(
+                ckpt_dict[training.DATALOADER_KEY]
+                if self._resume_from_checkpoint
+                else None
+            ),
         )

         # Finally update the recipe state which can only be correctly set after all of the
@@ -548,11 +553,12 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
-    ) -> Tuple[DistributedSampler, DataLoader]:
+        dataloader_state_dict: Optional[Dict[str, Any]] = None,
+    ) -> StatefulDataLoader:
         """
-        All data related setup happens here. Currently this recipe only supports the
-        DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
-        iterable datasets and streaming datasets are not supported.
+        All data-related setup happens here. This recipe currently supports only
+        map-style datasets. If a state_dict is provided (meaning we are resuming a
+        training run), it is loaded into the dataloader.
         """
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -570,19 +576,10 @@ def _setup_data(
             raise RuntimeError("left_pad_sequence collator is only for inference.")
         collate_fn = _get_component_from_path(collate_fn)

-        sampler = DistributedSampler(
-            ds,
-            num_replicas=1,
-            rank=0,
-            shuffle=shuffle,
-            seed=0,
-        )
-        dataloader = DataLoader(
+        dataloader = StatefulDataLoader(
             dataset=ds,
             batch_size=batch_size,
-            sampler=sampler,
-            # dropping last avoids shape issues with compile + flex attention
-            drop_last=True,
+            shuffle=shuffle,
             collate_fn=(
                 partial(
                     collate_fn,
@@ -592,11 +589,12 @@ def _setup_data(
                 if not packed
                 else padded_collate_packed
             ),
+            # dropping last avoids shape issues with compile + flex attention
+            drop_last=True,
         )
-
-        log.info("Dataset and Sampler are initialized.")
-
-        return sampler, dataloader
+        if dataloader_state_dict is not None:
+            dataloader.load_state_dict(dataloader_state_dict)
+        return dataloader

     def save_checkpoint(self, epoch: int) -> None:
         """
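
A minimal sketch of the checkpoint/resume round trip that the new dataloader_state_dict argument enables, assuming torchdata's StatefulDataLoader; the toy dataset, batch size, and variable names below are illustrative only, not part of the recipe:

from torchdata.stateful_dataloader import StatefulDataLoader

# Any map-style dataset works; a plain list stands in for the recipe's dataset.
dataset = list(range(16))

loader = StatefulDataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
it = iter(loader)
next(it)                              # consume one batch
state = loader.state_dict()           # snapshot of the loader's position and sampler state

# A fresh loader (e.g. after restarting the job) resumes from that snapshot,
# mirroring the load_state_dict call at the end of _setup_data above.
resumed = StatefulDataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
resumed.load_state_dict(state)
for batch in resumed:                 # continues from the second batch onward
    print(batch)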
@@ -606,12 +604,16 @@ def save_checkpoint(self, epoch: int) -> None:
         ckpt_dict = {training.MODEL_KEY: self._model.state_dict()}
         # if training is in-progress, checkpoint the optimizer state as well
         if epoch + 1 < self.total_epochs:
+            dataloader_sd = self._dataloader.state_dict()
+            # Hardcode _iterator_finished to True to avoid issues with resuming from a checkpoint
+            dataloader_sd["_iterator_finished"] = True
             ckpt_dict.update(
                 {
                     training.SEED_KEY: self.seed,
                     training.EPOCHS_KEY: self.epochs_run,
                     training.TOTAL_EPOCHS_KEY: self.total_epochs,
                     training.MAX_STEPS_KEY: self.max_steps_per_epoch,
+                    training.DATALOADER_KEY: dataloader_sd,
                 }
             )
             if not self._optimizer_in_bwd:
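For context, a sketch of what this epoch-boundary save amounts to. The exact contents of StatefulDataLoader.state_dict() are an implementation detail; the recipe only pins the _iterator_finished key (per the comment above) before storing the state alongside the rest of the checkpoint. The toy model, loader, and file name here are illustrative:

import torch
from torch import nn
from torchdata.stateful_dataloader import StatefulDataLoader

model = nn.Linear(8, 2)                          # stand-in for the recipe's model
loader = StatefulDataLoader(list(range(12)), batch_size=4)
for _batch in loader:                            # run the epoch to completion
    pass

dataloader_sd = loader.state_dict()
dataloader_sd["_iterator_finished"] = True       # hardcoded to True, as in the recipe above

ckpt = {
    "model": model.state_dict(),
    "dataloader": dataloader_sd,                 # stored under training.DATALOADER_KEY in the recipe
}
torch.save(ckpt, "epoch_checkpoint.pt")          # illustrative path
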
@@ -669,19 +671,8 @@ def train(self) -> None:
         self._profiler.start()
         # self.epochs_run should be non-zero when we're resuming from a checkpoint
         for curr_epoch in range(self.epochs_run, self.total_epochs):
-            # Update the sampler to ensure data is correctly shuffled across epochs
-            # in case shuffle is True
-            self._sampler.set_epoch(curr_epoch)
-
             pbar = tqdm(total=self._steps_per_epoch)
             for idx, batch in enumerate(self._dataloader):
-                if (
-                    self.max_steps_per_epoch is not None
-                    and (idx // self._gradient_accumulation_steps)
-                    == self.max_steps_per_epoch
-                ):
-                    break
-
                 # Start tracking CUDA memory for active steps for just the first epoch
                 if (
                     curr_epoch == 0
@@ -777,6 +768,12 @@ def train(self) -> None:
                 # if the schedule cycle doesn't align with gradient accumulation.
                 self._profiler.step()

+                # Stop the epoch early once max_steps_per_epoch optimizer steps have been taken
+                if (
+                    (idx + 1) // self._gradient_accumulation_steps
+                ) == self.max_steps_per_epoch:
+                    break
+
             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)

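With the check at the end of the loop body, the batch that triggers the stop has already been trained on, and the loop no longer fetches an extra batch just to discard it before breaking; when max_steps_per_epoch is None the comparison never matches, so the loop runs over the full dataloader. A small worked example of the stop condition, with illustrative values:

# Illustrative numbers: 2 optimizer steps per epoch, 4 batches per step.
gradient_accumulation_steps = 4
max_steps_per_epoch = 2

consumed = 0
for idx in range(1000):                          # stands in for iterating the dataloader
    consumed += 1                                # forward/backward happens here
    # the optimizer steps whenever (idx + 1) % gradient_accumulation_steps == 0
    if (idx + 1) // gradient_accumulation_steps == max_steps_per_epoch:
        break

print(consumed)                                  # 8 batches: exactly 2 full optimizer steps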