
Commit d1fc224 (parent: 88d735c)

fix train_ddp_cp

Signed-off-by: Peter St. John <[email protected]>

File tree

2 files changed: +7 −8 lines

bionemo-recipes/recipes/esm2_native_te/train_ddp_cp.py (4 additions, 6 deletions)

@@ -109,24 +109,22 @@ def main(args: DictConfig) -> float | None:
         output_device=dist_config.local_rank,
         process_group=group_fsdp_cp,
     )
-    cp_group = device_mesh["cp"].get_group()
-    cp_rank = device_mesh.get_local_rank("cp")

     if args.cp_size > 1:
         for i, transformer_layer in enumerate(model.module.esm.encoder.layers):
             logger.debug(f"Rank {dist_config.rank}: Setting CP group for layer {i}")
             transformer_layer.set_context_parallel_group(
-                cp_group, torch.distributed.get_process_group_ranks(device_mesh["cp"].get_group()), torch.cuda.Stream()
+                device_mesh["cp"].get_group(),
+                torch.distributed.get_process_group_ranks(device_mesh["cp"].get_group()),
+                torch.cuda.Stream(),
             )

     # Context Parallelism requires THD Sequence Packing.
     assert args.use_sequence_packing, "Context Parallelism requires THD Sequence Packing."

     train_dataloader, dataset_or_sampler = create_cp_dataloader(
         dist_config,
-        cp_world_size=torch.distributed.get_world_size(group=cp_group),
-        cp_group=cp_group,
-        cp_rank=cp_rank,
+        cp_mesh=device_mesh["cp"],
         **args.dataset,
     )
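The dataloader change replaces three separate CP keywords with the single "cp" sub-mesh. A minimal sketch of how a helper like create_cp_dataloader could recover the old values from that mesh follows; the recipe's actual implementation is not shown in this commit, so the function body here is assumed:

    from torch.distributed.device_mesh import DeviceMesh


    def create_cp_dataloader(dist_config, cp_mesh: DeviceMesh, **dataset_kwargs):
        # The 1-D "cp" sub-mesh carries everything the removed keywords did.
        cp_group = cp_mesh.get_group()          # previously cp_group=...
        cp_rank = cp_mesh.get_local_rank()      # previously cp_rank=...
        cp_world_size = cp_mesh.size()          # previously cp_world_size=...
        # ... build the THD-packed dataloader using these values (assumed) ...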

bionemo-recipes/recipes/esm2_native_te/train_fsdp2_cp.py (3 additions, 2 deletions)

@@ -114,14 +114,15 @@ def main(args: DictConfig) -> float | None:
     transformer_stack = model.esm.encoder.layers if hasattr(model.esm.encoder, "layers") else model.esm.encoder.layer
     # Fully shard takes in a DeviceMesh object, which is a 2D mesh of dimensions (CP_dimension, DP_dimension).
     # FSDP2 will shard the model across the DP (dim=1) dimension and then duplicate across the CP (dim=0) dimension.
-    cp_group = device_mesh["cp"].get_group()
     for layer in transformer_stack:
         fully_shard(layer, mesh=cp_dp_mesh)
         # Set CP group for layer if CP is enabled.
         if args.cp_size > 1:
             logger.debug(f"Rank {dist_config.rank}: Setting CP group for layer {layer}")
             layer.set_context_parallel_group(
-                cp_group, torch.distributed.get_process_group_ranks(cp_group), torch.cuda.Stream()
+                device_mesh["cp"].get_group(),
+                torch.distributed.get_process_group_ranks(device_mesh["cp"].get_group()),
+                torch.cuda.Stream(),
             )
     fully_shard(model, mesh=cp_dp_mesh)
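For reference, a hypothetical sketch of how a 2-D (cp, dp) DeviceMesh like the one used above can be built with torch.distributed.device_mesh. The recipe's actual mesh construction is outside this diff, so names such as cp_dp_mesh and the value of cp_size are matched to the code above by assumption:

    import torch
    from torch.distributed.device_mesh import init_device_mesh

    # Assumes torch.distributed is already initialized (e.g. via torchrun).
    cp_size = 2  # stands in for args.cp_size
    dp_size = torch.distributed.get_world_size() // cp_size
    device_mesh = init_device_mesh("cuda", (cp_size, dp_size), mesh_dim_names=("cp", "dp"))

    # fully_shard() shards parameters across the last ("dp") dimension and
    # replicates them across the first ("cp") dimension of this 2-D mesh.
    cp_dp_mesh = device_mesh

    # The per-layer CP process group passed to set_context_parallel_group() above.
    cp_group = device_mesh["cp"].get_group()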
