Commit 96a63f3

starting fp8 logging
Signed-off-by: Jonathan Mitchell <[email protected]>
1 parent 20076cc commit 96a63f3

4 files changed: +50 -30 lines


.devcontainer/recipes/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -14,3 +14,6 @@ transformers
 typer
 wandb
 zstandard
+nvdlfw_inspect @ git+https://github.com/NVIDIA/nvidia-dlfw-inspect
+
+

Lines changed: 6 additions & 11 deletions
@@ -1,28 +1,23 @@
 example_fp8_tensor_stat_collection:
   enabled: True
   layers:
-    layer_types: [layernorm_linear]
+    # Match the actual linear layers within attention that support FP8 stats
+    layer_types: [layernorm_qkv, proj]
   transformer_engine:
     LogFp8TensorStats:
       enabled: True
       tensors_struct:
         - tensor: activation
-          stats: [fp8_block_scaling_underflows%]
+          stats: [underflows%, overflows%]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_overflows%]
+          stats: [scale_inv_min, scale_inv_max]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_scale_inv_min]
-          freq: 1
-        - tensor: activation
-          stats: [fp8_block_scaling_scale_inv_max]
-          freq: 1
-        - tensor: activation
-          stats: [fp8_block_scaling_mse]
+          stats: [mse]
           freq: 1
         - tensor: gradient
           stats: [underflows%]
           freq: 5
       start_step: 0
-      end_step: 80
+      end_step: 80
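
For context, this is an nvidia-dlfw-inspect stats-collection config consumed by Transformer Engine's debug features: `layer_types` restricts which layers the feature applies to, each `tensors_struct` entry pairs a tensor with the stats to log at a given `freq`, and `start_step`/`end_step` bound the collection window. A minimal sketch of how a config like this gets loaded (the filename and log directory below are placeholders; the commit itself hard-codes absolute paths inside the recipe):

import nvdlfw_inspect.api as debug_api

debug_api.initialize(
    config_file="fp8_stats_block_scaling.yaml",  # a stats config like the one above (placeholder path)
    feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
    log_dir="./log",  # collected statistics are written under this directory
    default_logging_enabled=True,
)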

bionemo-recipes/recipes/esm2_native_te/train_ddp.py

Lines changed: 17 additions & 3 deletions
@@ -16,6 +16,7 @@
 import logging
 from pathlib import Path
 
+import nvdlfw_inspect.api as debug_api
 import hydra
 import torch
 import transformer_engine.pytorch

@@ -43,6 +44,13 @@ def main(args: DictConfig) -> float | None:
     Returns:
         float: The loss value for the final batch.
     """
+    # TE Debug feature logging - MUST be done BEFORE FSDP wrapping
+    debug_api.initialize(
+        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats_block_scaling_ddp.yaml",
+        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
+        log_dir="./logddp",
+        default_logging_enabled=True,
+    )
     # Initialize the distributed configuration, including creating the distributed process group.
     dist_config = DistributedConfig()
     logger.info("Initializing distributed training: %s", dist_config)

@@ -65,6 +73,7 @@ def main(args: DictConfig) -> float | None:
     if args.use_sequence_packing:
         config.attn_input_format = "thd"
 
+
     # Optionally use transformer engine to initialize only fp8 versions of weights by setting
     # `fp8_config.fp8_model_init_kwargs.enabled` to `True`, as opposed to using the default where both bfloat16 and fp8
     # versions of weights are kept.

@@ -80,10 +89,14 @@ def main(args: DictConfig) -> float | None:
     except AttributeError:
         pass
 
+
+
     # Create optimizer.
     optimizer = AdamW(model.parameters(), **args.adamw_kwargs)
     scheduler = get_linear_schedule_with_warmup(optimizer, **args.lr_scheduler_kwargs)
 
+    debug_api.infer_and_assign_layer_names(model)
+
     model = model.to(device=device)
     model = torch.nn.parallel.DistributedDataParallel(
         model,

@@ -99,9 +112,9 @@ def main(args: DictConfig) -> float | None:
         else create_bshd_dataloader(dist_config, **args.dataset)
     )
 
-    if args.use_torch_compile:
-        # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
-        model = torch.compile(model)
+    # if args.use_torch_compile:
+    #     # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
+    #     model = torch.compile(model)
 
     # If we're resuming from a checkpoint, load it and set the start step. Otherwise, start from step 0.
     ckpt_path = Path(args.checkpoint.ckpt_dir) / "train_ddp" if args.checkpoint.ckpt_dir else None

@@ -134,6 +147,7 @@ def main(args: DictConfig) -> float | None:
         loss = outputs.loss
         loss.backward()
 
+        debug_api.step()
        # Compute and clip gradient norms.
         total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()
 
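
One note on the hard-coded `feature_dirs` path above: it pins a specific Python version's dist-packages location. A hedged alternative (not part of this commit) is to derive Transformer Engine's debug-features directory from the installed package instead:

import os

import transformer_engine

# Resolve the debug/features directory shipped with the installed Transformer Engine,
# instead of hard-coding /usr/local/lib/python3.12/dist-packages/...
te_feature_dir = os.path.join(os.path.dirname(transformer_engine.__file__), "debug", "features")

# e.g. debug_api.initialize(..., feature_dirs=[te_feature_dir], ...)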

bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py

Lines changed: 24 additions & 16 deletions
@@ -18,6 +18,7 @@
 from pathlib import Path
 
 import hydra
+import nvdlfw_inspect.api as debug_api
 import torch
 import transformer_engine.pytorch
 from omegaconf import DictConfig, OmegaConf

@@ -36,8 +37,6 @@
 from perf_logger import PerfLogger
 from scheduler import get_linear_schedule_with_warmup
 
-import nvdlfw_inspect.api as debug_api
-
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)

@@ -86,12 +85,31 @@ def main(args: DictConfig) -> float | None:
 
     logger.info("Initialized Model:\n%s", model)
 
+    # TE Debug feature logging - MUST be done BEFORE FSDP wrapping
+    debug_api.initialize(
+        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats_block_scaling.yaml",
+        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
+        log_dir="./log",
+        default_logging_enabled=True,
+    )
+    # Debug: Print module types to verify what we're working with
+    if dist_config.local_rank == 0:
+        logger.info("=== DEBUG: Module types in model ===")
+        for name, module in model.named_modules():
+            if 'layernorm_qkv' in name or 'proj' in name or 'self_attention' in name:
+                logger.info(f"  -----> {name}: {type(module)} <----")
+        logger.info(f"=== DEBUG: FP8 config enabled={args.fp8_config.enabled}, recipe={args.fp8_config.fp8_recipe} ===")
+
     # We call the transformer stack "layers" in our TE models, but it's called "layer" in the original ESM-2 models.
     transformer_stack = model.esm.encoder.layers if hasattr(model.esm.encoder, "layers") else model.esm.encoder.layer
+
     for layer in transformer_stack:
         fully_shard(layer, mesh=device_mesh["dp"])
     fully_shard(model, mesh=device_mesh["dp"])
 
+    # Assign names to layers so debug API can identify them - MUST be done BEFORE FSDP wrapping
+    debug_api.infer_and_assign_layer_names(model)
+
     # Create optimizer. Convert OmegaConf to regular dict to avoid serialization issues (BIONEMO-2873).
     optimizer = AdamW(model.parameters(), **OmegaConf.to_container(args.adamw_kwargs, resolve=True))  # type: ignore
     scheduler = get_linear_schedule_with_warmup(optimizer, **args.lr_scheduler_kwargs)

@@ -107,9 +125,9 @@ def main(args: DictConfig) -> float | None:
         else create_bshd_dataloader(dist_config, **args.dataset)
     )
 
-    if args.use_torch_compile:
-        # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
-        model = torch.compile(model)
+    # if args.use_torch_compile:
+    #     # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
+    #     model = torch.compile(model)
 
     # If we're resuming from a checkpoint, load it and set the start step. Otherwise, start from step 0.
     ckpt_path = Path(args.checkpoint.ckpt_dir) / "train_fsdp2" if args.checkpoint.ckpt_dir else None

@@ -128,15 +146,6 @@ def main(args: DictConfig) -> float | None:
 
     perf_logger = PerfLogger(dist_config, args)
 
-    # TE Debug feature logging
-    debug_api.initialize(
-        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml",
-        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
-        log_dir="./log",
-        default_logging_enabled=True
-    )
-
-
     # Training loop
     step = start_step
     while step < args.num_train_steps:

@@ -159,7 +168,7 @@ def main(args: DictConfig) -> float | None:
         scheduler.step()
 
         debug_api.step()
-
+
         optimizer.zero_grad()
 
         perf_logger.log_step(

@@ -183,7 +192,6 @@ def main(args: DictConfig) -> float | None:
                 max_checkpoints=args.checkpoint.max_checkpoints,
             )
 
-
         step += 1
         if step >= args.num_train_steps:
             break
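
Taken together, the FSDP2 changes move the debug setup ahead of sharding and name the layers afterwards. A condensed sketch of the ordering this commit establishes, wrapped in a hypothetical helper (`wire_up_fp8_stats` is not in the recipe, and the `fully_shard` import path assumes a recent PyTorch and may differ from the recipe's own import):

import nvdlfw_inspect.api as debug_api
from torch.distributed.fsdp import fully_shard  # assumption: PyTorch >= 2.6 public path


def wire_up_fp8_stats(model, transformer_stack, device_mesh, config_file, log_dir):
    """Hypothetical helper mirroring the order of operations in train_fsdp2.py."""
    # 1. Initialize the debug API before any FSDP wrapping.
    debug_api.initialize(
        config_file=config_file,
        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
        log_dir=log_dir,
        default_logging_enabled=True,
    )
    # 2. Shard each transformer layer, then the full model.
    for layer in transformer_stack:
        fully_shard(layer, mesh=device_mesh["dp"])
    fully_shard(model, mesh=device_mesh["dp"])
    # 3. Assign layer names so the YAML's layer/tensor selections can find them.
    debug_api.infer_and_assign_layer_names(model)
    return model

In the training loop itself, debug_api.step() is then called once per optimization step (the commit places it right after scheduler.step()), presumably advancing the step counter that the config's freq/start_step/end_step settings key off.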
