
Commit 5402b29

Thomas Capelle authored
Improve Wandb experience (#660)
Co-authored-by: Thomas Capelle <[email protected]>
Co-authored-by: Kartikay Khandelwal <[email protected]>
Co-authored-by: yechenzhi <[email protected]>
Co-authored-by: Rafi Ayub <[email protected]>
Co-authored-by: Rohan Varma <[email protected]>
Co-authored-by: Joe Cummings <[email protected]>
Co-authored-by: Botao Chen <[email protected]>
Co-authored-by: Nicolas Hug <[email protected]>
Co-authored-by: ebsmothers <[email protected]>
Co-authored-by: Mengtao Yuan <[email protected]>
Co-authored-by: solitude-alive <[email protected]>
Co-authored-by: Jerry Zhang <[email protected]>
Co-authored-by: RdoubleA <[email protected]>
1 parent 053d0ae commit 5402b29

File tree

11 files changed: +239 −77 lines

docs/source/_static/img/torchtune_workspace.png

645 KB (binary file)
docs/source/examples/wandb_logging.rst

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
.. _wandb_logging:

===========================
Logging to Weights & Biases
===========================

.. customcarditem::
   :header: Logging to Weights & Biases
   :card_description: Log metrics and model checkpoints to W&B
   :image: _static/img/torchtune_workspace.png
   :link: examples/wandb_logging.html
   :tags: logging,wandb


Torchtune supports logging your training runs to `Weights & Biases <https://wandb.ai>`_.

.. note::

  You will need to install the ``wandb`` package to use this feature.
  You can install it via pip:

  .. code-block:: bash

    pip install wandb

  Then you need to log in with your API key using the W&B CLI:

  .. code-block:: bash

    wandb login

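If you prefer to avoid the interactive prompt, ``wandb`` can also authenticate from Python. A minimal sketch (this is standard ``wandb`` behavior, not specific to torchtune; ``WANDB_API_KEY`` is the conventional environment variable):

.. code-block:: python

  import os

  import wandb

  # Reads the key from the environment; wandb.login() is a no-op
  # if you are already logged in.
  wandb.login(key=os.environ.get("WANDB_API_KEY"))
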
Metric Logger
-------------

The only change you need to make is to add the metric logger to your config. Weights & Biases will log the metrics and model checkpoints for you.

.. code-block:: yaml

  # enable logging to the built-in WandBLogger
  metric_logger:
    _component_: torchtune.utils.metric_logging.WandBLogger
    # the W&B project to log to
    project: torchtune


We automatically grab the config from the recipe you are running and log it to W&B. You can find it in the W&B Overview tab, and the actual file in the ``Files`` tab.

.. note::

  Click on this sample `project to see the W&B workspace <https://wandb.ai/capecape/torchtune>`_.
  The config used to train the models can be found `here <https://wandb.ai/capecape/torchtune/runs/6053ofw0/files/torchtune_config_j67sb73v.yaml>`_.

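Under the hood, each recipe calls ``log_config`` on the metric logger right after instantiating it. A rough sketch of what a W&B ``log_config`` can do (simplified; see ``torchtune.utils.metric_logging`` for the actual implementation):

.. code-block:: python

  from omegaconf import DictConfig, OmegaConf

  import wandb

  def log_config(cfg: DictConfig) -> None:
      # Make the resolved config browsable in the W&B Overview tab...
      resolved = OmegaConf.to_container(cfg, resolve=True)
      wandb.config.update(resolved)
      # ...and upload the raw YAML to the run's Files tab.
      OmegaConf.save(config=cfg, f="torchtune_config.yaml")
      wandb.save("torchtune_config.yaml")
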
Logging Model Checkpoints to W&B
--------------------------------

You can also log the model checkpoints to W&B by modifying the ``save_checkpoint`` method of the desired recipe.

A suggested approach would be something like this:

.. code-block:: python

  def save_checkpoint(self, epoch: int) -> None:
      ...
      # Let's save the checkpoint to W&B.
      # Depending on the Checkpointer class, the file will be named differently.
      # Here is an example for the full_finetune case.
      checkpoint_file = Path.joinpath(
          self._checkpointer._output_dir, f"torchtune_model_{epoch}"
      ).with_suffix(".pt")
      wandb_at = wandb.Artifact(
          name=f"torchtune_model_{epoch}",
          type="model",
          # description of the model checkpoint
          description="Model checkpoint",
          # you can add whatever metadata you want as a dict
          metadata={
              utils.SEED_KEY: self.seed,
              utils.EPOCHS_KEY: self.epochs_run,
              utils.TOTAL_EPOCHS_KEY: self.total_epochs,
              utils.MAX_STEPS_KEY: self.max_steps_per_epoch,
          },
      )
      wandb_at.add_file(checkpoint_file)
      wandb.log_artifact(wandb_at)
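
Once logged, a checkpoint artifact can be pulled back down with the public W&B API. A short usage sketch (``<entity>`` and ``<project>`` are placeholders for your own values):

.. code-block:: python

  import wandb

  api = wandb.Api()
  artifact = api.artifact("<entity>/<project>/torchtune_model_0:latest")
  checkpoint_dir = artifact.download()  # returns the local download path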

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ torchtune tutorials.

    examples/checkpointer
    examples/configs
    examples/recipe_deepdive
+   examples/wandb_logging

 .. toctree::
    :glob:

recipes/full_finetune_distributed.py

Lines changed: 13 additions & 9 deletions
@@ -171,7 +171,11 @@ def setup(self, cfg: DictConfig) -> None:
         Sets up the recipe state correctly. This includes setting recipe attributes based
         on the ``resume_from_checkpoint`` flag.
         """
-        self._metric_logger = config.instantiate(cfg.metric_logger)
+        if self._is_rank_zero:
+            self._metric_logger = config.instantiate(cfg.metric_logger)
+
+            # log config with parameter override
+            self._metric_logger.log_config(cfg)

         ckpt_dict = self.load_checkpoint(cfg.checkpointer)

@@ -291,11 +295,8 @@ def _setup_model(
             model, auto_wrap_policy={modules.TransformerDecoderLayer}
         )
         if self._is_rank_zero:
-            log.info(
-                utils.memory_stats_log(
-                    "Memory Stats after model init", device=self._device
-                )
-            )
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")

         # synchronize before training begins
         torch.distributed.barrier()

@@ -475,15 +476,18 @@ def train(self) -> None:
                     self.total_training_steps % self._log_peak_memory_every_n_steps == 0
                     and self._is_rank_zero
                 ):
-                    log.info(
-                        utils.memory_stats_log("Memory Stats", device=self._device)
+                    # Log peak memory for iteration
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )

             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)

     def cleanup(self) -> None:
-        self._metric_logger.close()
+        if self._is_rank_zero:
+            self._metric_logger.close()
         torch.distributed.destroy_process_group()
recipes/full_finetune_single_device.py

Lines changed: 13 additions & 8 deletions
@@ -176,6 +176,9 @@ def setup(self, cfg: DictConfig) -> None:
         """
         self._metric_logger = config.instantiate(cfg.metric_logger)

+        # log config with parameter override
+        self._metric_logger.log_config(cfg)
+
         ckpt_dict = self.load_checkpoint(cfg.checkpointer)

         # ``_setup_model`` handles initialization and loading the state dict. This method

@@ -257,11 +260,9 @@ def _setup_model(
         if compile_model:
             log.info("Compiling model with torch.compile...")
             model = utils.wrap_compile(model)
-        log.info(
-            utils.memory_stats_log(
-                "Memory Stats after model init:", device=self._device
-            )
-        )
+        if self._device == torch.device("cuda"):
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")
         return model

     def _setup_optimizer(

@@ -440,9 +441,13 @@ def train(self) -> None:
                     self.total_training_steps += 1

                 # Log peak memory for iteration
-                if self.total_training_steps % self._log_peak_memory_every_n_steps == 0:
-                    log.info(
-                        utils.memory_stats_log("Memory Stats:", device=self._device)
+                if (
+                    self.total_training_steps % self._log_peak_memory_every_n_steps == 0
+                    and self._device == torch.device("cuda")
+                ):
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )
             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)
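
Note that ``utils.memory_stats_log`` now returns a dictionary of metrics rather than a preformatted string, which is what lets the recipes feed it straight to ``log_dict`` (and hence to W&B) as scalar time series. Its exact keys aren't visible in this diff; a rough stand-in built on public ``torch.cuda`` counters might look like:

.. code-block:: python

  import torch

  def memory_stats_sketch(device: torch.device) -> dict:
      # Hypothetical stand-in for utils.memory_stats_log; key names are illustrative.
      gib = 1024**3
      stats = torch.cuda.memory_stats(device)
      return {
          "peak_memory_active": stats.get("active_bytes.all.peak", 0) / gib,
          "peak_memory_alloc": torch.cuda.max_memory_allocated(device) / gib,
          "peak_memory_reserved": torch.cuda.max_memory_reserved(device) / gib,
      }

The added ``self._device == torch.device("cuda")`` guard matters because these counters only exist for CUDA devices.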

recipes/gemma_full_finetune_distributed.py

Lines changed: 13 additions & 10 deletions
@@ -146,7 +146,11 @@ def setup(self, cfg: DictConfig) -> None:
         Sets up the recipe state correctly. This includes setting recipe attributes based
         on the ``resume_from_checkpoint`` flag.
         """
-        self._metric_logger = config.instantiate(cfg.metric_logger)
+        if self._is_rank_zero:
+            self._metric_logger = config.instantiate(cfg.metric_logger)
+
+            # log config with parameter override
+            self._metric_logger.log_config(cfg)

         ckpt_dict = self.load_checkpoint(cfg.checkpointer)

@@ -263,12 +267,8 @@ def _setup_model(
             model, auto_wrap_policy={modules.TransformerDecoderLayer}
         )
         if self._is_rank_zero:
-            log.info(
-                utils.memory_stats_log(
-                    "Memory Stats after model init", device=self._device
-                )
-            )
-
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")
         # synchronize before training begins
         torch.distributed.barrier()

@@ -458,15 +458,18 @@ def train(self) -> None:
                     self.total_training_steps % self._log_peak_memory_every_n_steps == 0
                     and self._is_rank_zero
                 ):
-                    log.info(
-                        utils.memory_stats_log("Memory Stats", device=self._device)
+                    # Log peak memory for iteration
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )

             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)

     def cleanup(self) -> None:
-        self._metric_logger.close()
+        if self._is_rank_zero:
+            self._metric_logger.close()
         torch.distributed.destroy_process_group()
recipes/lora_dpo_single_device.py

Lines changed: 14 additions & 8 deletions
@@ -148,6 +148,9 @@ def setup(self, cfg: DictConfig) -> None:
         """
         self._metric_logger = config.instantiate(cfg.metric_logger)

+        # log config with parameter override
+        self._metric_logger.log_config(cfg)
+
         checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)

         self._model = self._setup_model(

@@ -252,11 +255,9 @@ def _setup_model(
         )

         log.info(f"Model is initialized with precision {self._dtype}.")
-        log.info(
-            utils.memory_stats_log(
-                "Memory Stats after model init:", device=self._device
-            )
-        )
+        if self._device == torch.device("cuda"):
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")
         return model

     def _setup_optimizer(

@@ -490,9 +491,14 @@ def train(self) -> None:
                     # Update the number of steps when the weights are updated
                     self.total_training_steps += 1
                 # Log peak memory for iteration
-                if self.total_training_steps % self._log_peak_memory_every_n_steps == 0:
-                    log.info(
-                        utils.memory_stats_log("Memory Stats:", device=self._device)
+                if (
+                    self.total_training_steps % self._log_peak_memory_every_n_steps == 0
+                    and self._device == torch.device("cuda")
+                ):
+                    # Log peak memory for iteration
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )
             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)
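
``log_dict`` is part of torchtune's metric-logger interface, and for W&B it boils down to a single ``wandb.log`` call per step. A simplified sketch (not the full ``WandBLogger`` from ``torchtune.utils.metric_logging``):

.. code-block:: python

  from typing import Mapping, Optional

  import wandb

  class MiniWandBLogger:
      # Stripped-down sketch of a W&B metric logger.
      def __init__(self, project: str) -> None:
          self._run = wandb.init(project=project)

      def log_dict(self, payload: Mapping[str, float], step: Optional[int] = None) -> None:
          # wandb.log takes a dict of scalars plus an optional global step.
          self._run.log(dict(payload), step=step)

      def close(self) -> None:
          self._run.finish()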

recipes/lora_finetune_distributed.py

Lines changed: 9 additions & 7 deletions
@@ -197,6 +197,9 @@ def setup(self, cfg: DictConfig) -> None:
         if self._is_rank_zero:
             self._metric_logger = config.instantiate(cfg.metric_logger)

+            # log config with parameter override
+            self._metric_logger.log_config(cfg)
+
         checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)

         self._model = self._setup_model(

@@ -353,11 +356,8 @@ def _setup_model(
             model, auto_wrap_policy={modules.TransformerDecoderLayer}
         )
         if self._is_rank_zero:
-            log.info(
-                utils.memory_stats_log(
-                    "Memory Stats after model init:", device=self._device
-                )
-            )
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")

         # synchronize before training begins
         torch.distributed.barrier()

@@ -571,8 +571,10 @@ def train(self) -> None:
                     self.total_training_steps % self._log_peak_memory_every_n_steps == 0
                     and self._is_rank_zero
                 ):
-                    log.info(
-                        utils.memory_stats_log("Memory Stats:", device=self._device)
+                    # Log peak memory for iteration
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )

             self.epochs_run += 1

recipes/lora_finetune_single_device.py

Lines changed: 12 additions & 7 deletions
@@ -174,6 +174,10 @@ def setup(self, cfg: DictConfig) -> None:
         model, tokenizer, loss, optimizer, learning rate scheduler, sampler, and dataloader.
         """
         self._metric_logger = config.instantiate(cfg.metric_logger)
+
+        # log config with parameter override
+        self._metric_logger.log_config(cfg)
+
         self._model_compile = cfg.compile
         checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)

@@ -291,11 +295,9 @@ def _setup_model(
         if compile_model:
             log.info("Compiling model with torch.compile...")
             model = utils.wrap_compile(model)
-        log.info(
-            utils.memory_stats_log(
-                "Memory Stats after model init:", device=self._device
-            )
-        )
+        if self._device == torch.device("cuda"):
+            memory_stats = utils.memory_stats_log(device=self._device)
+            log.info(f"Memory Stats after model init:\n{memory_stats}")
         return model

     def _setup_optimizer(

@@ -474,9 +476,12 @@ def train(self) -> None:
                 if (
                     self.total_training_steps % self._log_peak_memory_every_n_steps
                     == 0
+                    and self._device == torch.device("cuda")
                 ):
-                    log.info(
-                        utils.memory_stats_log("Memory Stats:", device=self._device)
+                    # Log peak memory for iteration
+                    memory_stats = utils.memory_stats_log(device=self._device)
+                    self._metric_logger.log_dict(
+                        memory_stats, step=self.total_training_steps
                     )
             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)
