Commit 4873914

Refactor MLMDataCollatorWithFlattening (#1382)
Updates this class to be more general for llama3 usage by making it accept a base collator and perform flattening after the fact. Essentially, we now always do the BSHD-compatible forward call. This will make it easier to implement context parallelism (CP) in the llama3 recipe.

Signed-off-by: Peter St. John <[email protected]>
1 parent 408709d commit 4873914
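
The refactor replaces the specialized MLMDataCollatorWithFlattening constructor with composition: DataCollatorWithFlattening now wraps an arbitrary Hugging Face collator and flattens its padded (BSHD) output into the THD layout. A minimal sketch of the new pattern, mirroring the test updates in this commit (the ESM-2 checkpoint name and example sequences are illustrative, not taken from the commit):

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    from esm.collator import DataCollatorWithFlattening

    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")  # illustrative checkpoint

    # The base collator pads and applies MLM masking in the usual BSHD layout...
    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, seed=42)

    # ...and the wrapper flattens that output into THD form, adding sequence-boundary
    # metadata (cu_seq_lens_*, max_length_q/k) and optional position ids.
    collator = DataCollatorWithFlattening(collator=mlm_collator, return_position_ids=True)

    batch = collator([tokenizer("MKTAYIAKQR"), tokenizer("MLKNV")])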

File tree

14 files changed (+476, -803 lines)

bionemo-recipes/models/esm2/src/esm/collator.py

Lines changed: 82 additions & 207 deletions
Large diffs are not rendered by default.

bionemo-recipes/models/esm2/tests/test_collator.py

Lines changed: 43 additions & 46 deletions

@@ -21,15 +21,16 @@

 from esm.collator import (
     DataCollatorWithFlattening,
-    MLMDataCollatorWithFlattening,
     TokenPackingDataset,
     _split_sample_by_num_tokens,
 )


-def test_data_collator_with_flattening_basic():
+def test_data_collator_with_flattening_basic(tokenizer):
     """Test DataCollatorWithFlattening with input_ids and attention_mask."""
-    collator = DataCollatorWithFlattening(return_position_ids=True)
+    # Use DataCollatorForLanguageModeling with mlm_probability=0.0 to disable masking
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.0)
+    collator = DataCollatorWithFlattening(collator=mlm_collator, return_position_ids=True)

     # Create test sequences of different lengths
     features = [
@@ -70,19 +71,24 @@ def test_data_collator_with_flattening_basic():
     expected_input_ids = torch.tensor([[0, 5, 6, 7, 2, 0, 8, 9, 10, 11, 2, 0, 12, 13, 2]], dtype=torch.int64)
     torch.testing.assert_close(input_ids_tensor, expected_input_ids)

-    # Assert labels are not present when not provided in input
-    assert "labels" not in batch
+    # Assert labels are present (DataCollatorForLanguageModeling always creates them)
+    # With mlm_probability=0.0, all labels should be -100 (ignored)
+    assert "labels" in batch
+    assert (batch["labels"] == -100).all(), "With mlm_probability=0.0, all labels should be -100"


-def test_data_collator_with_flattening_with_labels():
+def test_data_collator_with_flattening_with_labels(tokenizer):
     """Test DataCollatorWithFlattening with input_ids, attention_mask, and labels."""
-    collator = DataCollatorWithFlattening()
+    # Use DataCollatorForLanguageModeling with mlm_probability=0.0 to disable masking
+    # Note: DataCollatorForLanguageModeling ignores input labels and creates its own
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.0)
+    collator = DataCollatorWithFlattening(collator=mlm_collator)

-    # Create test sequences with labels
+    # Create test sequences (labels will be created by DataCollatorForLanguageModeling)
     features = [
-        {"input_ids": [0, 5, 6, 7, 2], "labels": [0, 5, 6, 7, 2]},  # 5 tokens
-        {"input_ids": [0, 8, 9, 10, 11, 2], "labels": [0, 8, 9, 10, 11, 2]},  # 6 tokens
-        {"input_ids": [0, 12, 13, 2], "labels": [0, 12, 13, 2]},  # 4 tokens
+        {"input_ids": [0, 5, 6, 7, 2]},  # 5 tokens
+        {"input_ids": [0, 8, 9, 10, 11, 2]},  # 6 tokens
+        {"input_ids": [0, 12, 13, 2]},  # 4 tokens
     ]

     # Calculate expected total tokens
@@ -114,12 +120,12 @@ def test_data_collator_with_flattening_with_labels():
     assert batch["max_length_q"] == 6, f"Expected max_length_q=6, got {batch['max_length_q']}"
     assert batch["max_length_k"] == 6, f"Expected max_length_k=6, got {batch['max_length_k']}"

-    # Assert flattened input_ids and labels match concatenated original sequences
+    # Assert flattened input_ids match concatenated original sequences
     expected_input_ids = torch.tensor([[0, 5, 6, 7, 2, 0, 8, 9, 10, 11, 2, 0, 12, 13, 2]], dtype=torch.int64)
-    expected_labels = torch.tensor([[0, 5, 6, 7, 2, 0, 8, 9, 10, 11, 2, 0, 12, 13, 2]], dtype=torch.int64)
-
     torch.testing.assert_close(input_ids_tensor, expected_input_ids)
-    torch.testing.assert_close(labels_tensor, expected_labels)
+
+    # With mlm_probability=0.0, all labels should be -100 (ignored)
+    assert (labels_tensor == -100).all(), "With mlm_probability=0.0, all labels should be -100"

     # Assert that sequence boundaries are properly maintained
     # by checking that token positions match expected values
@@ -134,9 +140,11 @@ def test_data_collator_with_flattening_with_labels():
         start_idx = end_idx


-def test_data_collator_pads_to_multiple_of():
+def test_data_collator_pads_to_multiple_of(tokenizer):
     """Test DataCollatorWithFlattening with input_ids and attention_mask."""
-    collator = DataCollatorWithFlattening(pad_to_multiple_of=8, token_pad=1, label_pad=-100, return_position_ids=True)
+    # Use DataCollatorForLanguageModeling with mlm_probability=0.0 to disable masking
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.0)
+    collator = DataCollatorWithFlattening(collator=mlm_collator, pad_to_multiple_of=8, return_position_ids=True)

     # Create test sequences with labels
     features = [
@@ -168,11 +176,8 @@ def test_data_collator_pads_to_multiple_of():

 def test_mlm_data_collator_with_flattening_basic(tokenizer):
     """Test MLMDataCollatorWithFlattening with basic input_ids and verify labels are created."""
-    collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        return_position_ids=True,
-    )
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+    collator = DataCollatorWithFlattening(collator=mlm_collator, return_position_ids=True)

     # Create test sequences of different lengths
     features = [
@@ -232,11 +237,8 @@ def test_mlm_data_collator_with_flattening_basic(tokenizer):
 def test_mlm_data_collator_with_flattening_masking(tokenizer, test_proteins):
     """Test MLMDataCollatorWithFlattening with reproducible masking using a seed."""
     # Use a fixed seed for reproducibility
-    collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        seed=42,
-    )
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, seed=42)
+    collator = DataCollatorWithFlattening(collator=mlm_collator)

     features = [tokenizer(protein) for protein in test_proteins]

@@ -293,11 +295,8 @@ def test_mlm_data_collator_with_flattening_pad_to_multiple_of(tokenizer, test_proteins):
     remainder = -total_tokens % 8
     assert remainder != 0, "Test assumes we need to pad to reach a multiple of 8"

-    collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        pad_to_multiple_of=8,
-    )
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+    collator = DataCollatorWithFlattening(collator=mlm_collator, pad_to_multiple_of=8)

     features = [tokenizer(protein) for protein in test_proteins]

@@ -334,21 +333,22 @@ def test_mlm_data_collator_with_flattening_pad_to_multiple_of(tokenizer, test_proteins):

 def test_mlm_data_collator_with_flattening_bshd_equivalent(tokenizer, test_proteins):
     """Test MLMDataCollatorWithFlattening with bshd_equivalent=True."""
-    thd_collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        seed=42,
-        pad_to_multiple_of=16,
-        bshd_equivalent=True,
-        bshd_pad_to_multiple_of=256,
-    )
-
+    # Create separate collator instances with the same seed to ensure matching masking
+    # The BSHD collator pads to 256
     bshd_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
         mlm_probability=0.15,
         seed=42,
         pad_to_multiple_of=256,
     )
+    thd_collator = DataCollatorWithFlattening(
+        collator=DataCollatorForLanguageModeling(
+            tokenizer=tokenizer,
+            mlm_probability=0.15,
+            seed=42,
+            pad_to_multiple_of=256,
+        )
+    )

     features = [tokenizer(protein) for protein in test_proteins]

@@ -375,11 +375,8 @@ def test_mlm_data_collator_with_flattening_bshd_equivalent(tokenizer, test_proteins):

 def test_mlm_data_collator_with_flattening_pad_sequences_to_be_divisible_by(tokenizer, test_proteins):
     """Test MLMDataCollatorWithFlattening with pad_sequences_to_be_divisible_by."""
-    collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        pad_sequences_to_be_divisible_by=16,
-    )
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+    collator = DataCollatorWithFlattening(collator=mlm_collator, pad_sequences_to_be_divisible_by=16)
     features = [tokenizer(protein) for protein in test_proteins]
     batch = collator(features)
     assert batch["input_ids"].numel() % 16 == 0, (
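
Several of the updated tests disable masking by giving the wrapper a base collator with mlm_probability=0.0: Hugging Face's DataCollatorForLanguageModeling assigns the ignore label -100 to every position it does not mask, so with a zero masking probability every label comes back as -100. A standalone sketch of that behavior (the checkpoint name is illustrative):

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")  # illustrative checkpoint
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.0)

    batch = collator([tokenizer("MKTAYIAKQR"), tokenizer("MLKNV")])

    # No position is selected for masking, so every label is the ignore index -100
    # and a cross-entropy loss with ignore_index=-100 skips the whole batch.
    assert (batch["labels"] == -100).all()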

bionemo-recipes/models/esm2/tests/test_collator_context_parallel.py

Lines changed: 35 additions & 19 deletions

@@ -208,23 +208,35 @@ def __next__(self):
         return copy.deepcopy(self._batch)


-class _DummyCPGroup:
-    def __init__(self, size: int):
+class _DummyDeviceMesh:
+    """Dummy device mesh for testing ContextParallelDataLoaderWrapper."""
+
+    def __init__(self, size: int, rank: int = 0):
         self._size = size
+        self._rank = rank
+        self._group = mock.MagicMock()  # Mock process group
+
+    def get_local_rank(self) -> int:
+        """Return the local rank within this mesh."""
+        return self._rank
+
+    def get_group(self):
+        """Return the process group."""
+        return self._group

     def size(self) -> int:
+        """Return the size of the mesh."""
         return self._size


 def _fake_get_batch(
     cu_seqlens_padded,
     input_ids_padded,
     labels_padded,
-    cp_group,
+    cp_size,
     qvk_format,
     cp_rank,
 ):
-    cp_size = cp_group.size()
     total_slices = 2 * cp_size
     seq_tokens = input_ids_padded.view(-1)
     seq_labels = labels_padded.view(-1)
@@ -250,14 +262,14 @@ def _fake_get_batch(
     )


-def _make_cp_shards(base_batch: Dict[str, torch.Tensor], cp_group: _DummyCPGroup):
+def _make_cp_shards(base_batch: Dict[str, torch.Tensor], cp_size: int):
     combined_batch = []
-    for cp_rank in range(cp_group.size()):
+    for cp_rank in range(cp_size):
         input_ids_sharded, labels_sharded = _fake_get_batch(
             cu_seqlens_padded=base_batch["cu_seq_lens_q_padded"],
             input_ids_padded=base_batch["input_ids"],
             labels_padded=base_batch["labels"],
-            cp_group=cp_group,
+            cp_size=cp_size,
             qvk_format="thd",
             cp_rank=cp_rank,
         )
@@ -368,7 +380,7 @@ def test_dataloader_scatter_nopadding():
     CP0 | 1,2,7,8 | 9, 10, 15, 16 |
     CP1 | 3,4,5,6 | 11, 12, 13, 14|
     """
-    cp_group = _DummyCPGroup(size=2)
+    cp_size = 2

     def run_roundtrip(base_batch):
         combined_batch = [
@@ -381,22 +393,24 @@ def run_roundtrip(base_batch):
                         labels_padded=base_batch["labels"],
                         qvk_format="thd",
                         cp_rank=cp_rank,
-                        cp_world_size=cp_group.size(),
+                        cp_world_size=cp_size,
                     )[0],
                     "labels": _split_batch_by_cp_rank(
                         cu_seqlens_padded=base_batch["cu_seq_lens_q_padded"],
                         input_ids_padded=base_batch["input_ids"],
                         labels_padded=base_batch["labels"],
                         qvk_format="thd",
                         cp_rank=cp_rank,
-                        cp_world_size=cp_group.size(),
+                        cp_world_size=cp_size,
                     )[1],
                 },
             )
-            for cp_rank in range(cp_group.size())
+            for cp_rank in range(cp_size)
         ]
-        loader_rank0 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_group, cp_rank=0)
-        loader_rank1 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_group, cp_rank=1)
+        cp_mesh_rank0 = _DummyDeviceMesh(size=cp_size, rank=0)
+        cp_mesh_rank1 = _DummyDeviceMesh(size=cp_size, rank=1)
+        loader_rank0 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_mesh_rank0)
+        loader_rank1 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_mesh_rank1)

         scatter_payload: Dict[str, List[Dict[str, torch.Tensor]]] = {}
         current_rank = {"value": None}
@@ -455,7 +469,7 @@ def test_dataloader_scatter_with_pad_between_seqs():
     CP0 | 1,<p>,5,<p> | 9, <p>, 13, <p>|
     CP1 | 2,3,6, <p> | 10, 11, 14, 15 |
     """
-    cp_group = _DummyCPGroup(size=2)
+    cp_size = 2

     def run_roundtrip(base_batch):
         combined_batch = [
@@ -468,22 +482,24 @@ def run_roundtrip(base_batch):
                         labels_padded=base_batch["labels"],
                         qvk_format="thd",
                         cp_rank=cp_rank,
-                        cp_world_size=cp_group.size(),
+                        cp_world_size=cp_size,
                     )[0],
                     "labels": _split_batch_by_cp_rank(
                         cu_seqlens_padded=base_batch["cu_seq_lens_q_padded"],
                         input_ids_padded=base_batch["input_ids"],
                         labels_padded=base_batch["labels"],
                         qvk_format="thd",
                         cp_rank=cp_rank,
-                        cp_world_size=cp_group.size(),
+                        cp_world_size=cp_size,
                     )[1],
                 },
             )
-            for cp_rank in range(cp_group.size())
+            for cp_rank in range(cp_size)
         ]
-        loader_rank0 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_group, cp_rank=0)
-        loader_rank1 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_group, cp_rank=1)
+        cp_mesh_rank0 = _DummyDeviceMesh(size=cp_size, rank=0)
+        cp_mesh_rank1 = _DummyDeviceMesh(size=cp_size, rank=1)
+        loader_rank0 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_mesh_rank0)
+        loader_rank1 = ContextParallelDataLoaderWrapper(_DummyLoader(combined_batch), cp_mesh_rank1)

         scatter_payload: Dict[str, List[Dict[str, torch.Tensor]]] = {}
         current_rank = {"value": None}
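
The dummy mesh above duck-types the three methods the dataloader wrapper now consumes, get_local_rank(), get_group(), and size(), which is the surface a torch.distributed device mesh exposes; the explicit cp_rank argument is gone. A hedged sketch of the real-world wiring under those assumptions (launch with torchrun on 8 GPUs; ContextParallelDataLoaderWrapper's import path and train_dataloader are placeholders not shown in this diff):

    import torch.distributed as dist
    from torch.distributed.device_mesh import init_device_mesh

    dist.init_process_group(backend="nccl")

    # Illustrative 8-GPU layout: 4 data-parallel replicas x 2 context-parallel ranks.
    mesh = init_device_mesh("cuda", (4, 2), mesh_dim_names=("dp", "cp"))
    cp_mesh = mesh["cp"]

    # The wrapper reads rank, process group, and world size from the sub-mesh:
    #   cp_mesh.get_local_rank(), cp_mesh.get_group(), cp_mesh.size()
    # loader = ContextParallelDataLoaderWrapper(train_dataloader, cp_mesh)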

bionemo-recipes/models/esm2/tests/test_fp8.py

Lines changed: 8 additions & 5 deletions

@@ -20,8 +20,9 @@
 from torch.distributed.checkpoint.state_dict import get_model_state_dict
 from transformer_engine.common import recipe as recipe_module
 from transformer_engine.pytorch import fp8
+from transformers import DataCollatorForLanguageModeling

-from esm.collator import MLMDataCollatorWithFlattening
+from esm.collator import DataCollatorWithFlattening
 from esm.modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM


@@ -87,11 +88,10 @@ def parametrize_recipes_with_support(recipes):

 @pytest.fixture
 def input_data_thd(tokenizer, tokenized_proteins):
-    data_collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
+    mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, seed=42)
+    data_collator = DataCollatorWithFlattening(
+        collator=mlm_collator,
         pad_to_multiple_of=32,  # MXFP8 requires the sequence length to be divisible by 32, regular FP8 requires 16.
-        seed=42,
     )

     return data_collator(tokenized_proteins)
@@ -139,6 +139,9 @@ def test_fp8_forward_and_backward_pass_thd(te_model_checkpoint, input_data_thd,
     if isinstance(fp8_recipe, recipe_module.NVFP4BlockScaling):
         atol = 0.2
         rtol = 0.05
+    elif isinstance(fp8_recipe, recipe_module.DelayedScaling):
+        atol = 0.1
+        rtol = 0.03
     else:
         atol = None
         rtol = None

bionemo-recipes/models/esm2/tests/test_thd.py

Lines changed: 10 additions & 7 deletions

@@ -20,8 +20,9 @@
 import torch
 from transformer_engine.pytorch.attention.dot_product_attention import _attention_backends
 from transformer_engine.pytorch.attention.dot_product_attention.context_parallel import pad_thd_sequences_for_cp
+from transformers import DataCollatorForLanguageModeling

-from esm.collator import MLMDataCollatorWithFlattening
+from esm.collator import DataCollatorWithFlattening
 from esm.modeling_esm_te import NVEsmConfig, NVEsmEmbeddings, NVEsmForMaskedLM


@@ -39,12 +40,14 @@

 @pytest.fixture
 def input_data_thd(tokenizer, tokenized_proteins):
-    data_collator = MLMDataCollatorWithFlattening(
-        tokenizer=tokenizer,
-        mlm_probability=0.15,
-        seed=42,
-        bshd_equivalent=True,
-        bshd_pad_to_multiple_of=32,
+    """The collator here needs to exactly match the one used in the `input_data` fixture for golden values to pass."""
+    data_collator = DataCollatorWithFlattening(
+        collator=DataCollatorForLanguageModeling(
+            tokenizer=tokenizer,
+            mlm_probability=0.15,
+            pad_to_multiple_of=32,
+            seed=42,
+        )
     )
     return data_collator(tokenized_proteins)


bionemo-recipes/recipes/esm2_native_te/README.md

Lines changed: 3 additions & 0 deletions

@@ -292,3 +292,6 @@ training configurations, allowing for easy modification of training hyper-parameters

 Configuration parameters can be overridden from the command line, e.g.
 `python train_fsdp2.py --config-name L0_sanity fp8_config.enabled=true`.
+
+For verbose logging, use the hydra command line override `hydra.verbose=true`, see
+https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ for more details.
