
Commit 1071105

adding lm-eval test harness (#1371)
Adds explicit testing for lm-eval, although these tests don't currently trigger in CI. We should find a faster way to run them.

Signed-off-by: Peter St. John <[email protected]>
1 parent 6d4e8bb commit 1071105


5 files changed (+130, -18 lines)


bionemo-recipes/models/llama3/modeling_llama_te.py

Lines changed: 13 additions & 9 deletions
@@ -30,12 +30,12 @@
 
 
 AUTO_MAP = {
-    "AutoConfig": "llama3_nv.NVLlamaConfig",
-    "AutoModel": "llama3_nv.NVLlamaModel",
-    "AutoModelForCausalLM": "llama3_nv.NVLlamaForCausalLM",
-    "AutoModelForSequenceClassification": "llama3_nv.NVLlamaForSequenceClassification",
-    "AutoModelForQuestionAnswering": "llama3_nv.NVLlamaForQuestionAnswering",
-    "AutoModelForTokenClassification": "llama3_nv.NVLlamaForTokenClassification",
+    "AutoConfig": "modeling_llama_te.NVLlamaConfig",
+    "AutoModel": "modeling_llama_te.NVLlamaModel",
+    "AutoModelForCausalLM": "modeling_llama_te.NVLlamaForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_llama_te.NVLlamaForSequenceClassification",
+    "AutoModelForQuestionAnswering": "modeling_llama_te.NVLlamaForQuestionAnswering",
+    "AutoModelForTokenClassification": "modeling_llama_te.NVLlamaForTokenClassification",
 }
 
 
@@ -191,11 +191,12 @@ def forward(
 
         # This might be slower for BSHD + padding with fused attention backend. But it should be faster for the flash
         # attention backend.
+        self_attn_mask_type = "padding_causal"
         if should_pack_inputs:
             # Left-side padding is not supported in TE layers, so to make generation work with TE we dynamically convert
             # to THD-style inputs in our forward pass, and then convert back to BSHD for the output. This lets the
             # entire transformer stack run in THD mode.
-            assert attention_mask is not None, "Attention mask is required when using BSHD inputs."
+            assert attention_mask is not None, "Attention mask is required when packing BSHD inputs."
             batch_size = hidden_states.size(0)
             hidden_states, indices, cu_seqlens, max_seqlen, _ = _unpad_input(hidden_states, attention_mask)
             cu_seq_lens_q = cu_seq_lens_k = cu_seqlens
@@ -213,8 +214,10 @@ def forward(
             max_length_k = kwargs["max_length_k"]
 
         else:
-            assert attention_mask is not None, "Attention mask is required when using BSHD inputs."
-            attention_mask = attention_mask[:, None, None, :] < -1
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, None, None, :] < -1
+            else:
+                self_attn_mask_type = "causal"
             cu_seq_lens_q = cu_seq_lens_k = None
             max_length_q = max_length_k = hidden_states.size(1)
 
@@ -243,6 +246,7 @@ def forward(
                 hidden_states,
                 attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
                 rotary_pos_emb=te_rope_emb,
+                self_attn_mask_type=self_attn_mask_type,
                 inference_params=past_key_values,
                 cu_seqlens_q=cu_seq_lens_q,
                 cu_seqlens_kv=cu_seq_lens_k,
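The `AUTO_MAP` rename above is what lets the new test work: the fixture copies `modeling_llama_te.py` next to the saved checkpoint and writes `AUTO_MAP` into `config.json`, so the Hugging Face auto classes resolve `"modeling_llama_te.NVLlamaForCausalLM"` from that file when `trust_remote_code=True` is set (which is how lm-eval's `hf` backend loads the model). A minimal sketch of that loading path, assuming a hypothetical local checkpoint directory:

# Sketch: loading a checkpoint whose config.json carries the AUTO_MAP entries above.
# "my_checkpoint/" is a hypothetical directory produced by model.save_pretrained(...)
# with modeling_llama_te.py copied alongside it, as the new test fixture does.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint = "my_checkpoint"

# auto_map in config.json points "AutoModelForCausalLM" at
# "modeling_llama_te.NVLlamaForCausalLM", so trust_remote_code=True makes
# from_pretrained import that class from the copied module file.
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)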

bionemo-recipes/models/llama3/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+lm-eval # For testing
 torch
 torchao!=0.14.0
 transformer_engine[pytorch]
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+from transformers import AutoTokenizer
+
+from modeling_llama_te import AUTO_MAP, NVLlamaConfig, NVLlamaForCausalLM
+
+
+@pytest.fixture
+def model_checkpoint(tmp_path: Path):
+    tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-8B-Instruct-FP8")
+    config = NVLlamaConfig.from_pretrained(
+        "nvidia/Llama-3.1-8B-Instruct-FP8", num_hidden_layers=2, attn_input_format="bshd"
+    )
+    model = NVLlamaForCausalLM(config)
+    model.save_pretrained(tmp_path / "checkpoint")
+
+    tokenizer = AutoTokenizer.from_pretrained("nucleotide_fast_tokenizer")
+    tokenizer.save_pretrained(tmp_path / "checkpoint")
+
+    # Patch the config
+    with open(tmp_path / "checkpoint" / "config.json", "r") as f:
+        config = json.load(f)
+
+    config["auto_map"] = AUTO_MAP
+
+    with open(tmp_path / "checkpoint" / "config.json", "w") as f:
+        json.dump(config, f, indent=2, sort_keys=True)
+
+    shutil.copy("modeling_llama_te.py", tmp_path / "checkpoint" / "modeling_llama_te.py")
+    return tmp_path / "checkpoint"
+
+
+@pytest.mark.skipif(os.getenv("CI", "false") == "true", reason="Skipping slow lm-eval test in CI.")
+def test_lm_eval(model_checkpoint: Path):
+    # Create a mock model checkpoint
+
+    cmd = [
+        "lm_eval",
+        "--model",
+        "hf",
+        "--model_args",
+        f"pretrained={model_checkpoint},tokenizer={model_checkpoint}",
+        "--trust_remote_code",
+        "--tasks",
+        "arc_easy",  # TODO(BIONEMO-3410): support other tasks that use inference, e.g. coqa
+        "--device",
+        "cuda:0",
+        "--batch_size",
+        "8",
+    ]
+
+    result = subprocess.run(
+        cmd,
+        check=False,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        timeout=240,
+    )
+
+    if result.returncode != 0:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Command failed with exit code {result.returncode}")
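For reference, the same evaluation the subprocess above launches can also be driven from Python. This is a rough sketch, not part of the commit: it assumes lm-eval's `lm_eval.simple_evaluate` entry point and its `hf` model backend, and a hypothetical `checkpoint_dir` prepared like the `model_checkpoint` fixture above.

# Sketch (assumption): programmatic equivalent of the lm_eval CLI call above.
# simple_evaluate and its argument names are assumed from lm-eval's Python API;
# checkpoint_dir is a hypothetical path prepared like the model_checkpoint fixture.
import lm_eval

checkpoint_dir = "/tmp/checkpoint"  # hypothetical

results = lm_eval.simple_evaluate(
    model="hf",
    model_args=f"pretrained={checkpoint_dir},tokenizer={checkpoint_dir},trust_remote_code=True",
    tasks=["arc_easy"],
    batch_size=8,
    device="cuda:0",
)

# Per-task metrics (e.g. accuracy on arc_easy) live under results["results"].
print(results["results"])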

bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py

Lines changed: 19 additions & 0 deletions
@@ -66,6 +66,25 @@ def test_llama_model_forward_pass(input_text, attn_input_format):
     assert len(outputs.hidden_states) == config.num_hidden_layers + 1
 
 
+def test_llama_model_forward_pass_no_attention_mask():
+    tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-8B-Instruct-FP8")
+    config = NVLlamaConfig.from_pretrained(
+        "nvidia/Llama-3.1-8B-Instruct-FP8", num_hidden_layers=2, attn_input_format="bshd"
+    )
+    model = NVLlamaForCausalLM(config)
+
+    input_text = ["Hello, world!"]
+    inputs = tokenizer(input_text, return_tensors="pt")
+    inputs = {k: v.to("cuda") for k, v in inputs.items() if k != "attention_mask"}
+    model.to("cuda")
+    with torch.no_grad():
+        outputs = model(**inputs, output_hidden_states=True)
+
+    assert outputs.logits is not None
+    assert outputs.hidden_states is not None
+    assert len(outputs.hidden_states) == config.num_hidden_layers + 1
+
+
 @pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
 def test_llama_model_backward_pass(input_text, attn_input_format):
     if attn_input_format == "thd" and torch.cuda.get_device_capability()[0] == 12:

bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py

Lines changed: 13 additions & 9 deletions
@@ -30,12 +30,12 @@
 
 
 AUTO_MAP = {
-    "AutoConfig": "llama3_nv.NVLlamaConfig",
-    "AutoModel": "llama3_nv.NVLlamaModel",
-    "AutoModelForCausalLM": "llama3_nv.NVLlamaForCausalLM",
-    "AutoModelForSequenceClassification": "llama3_nv.NVLlamaForSequenceClassification",
-    "AutoModelForQuestionAnswering": "llama3_nv.NVLlamaForQuestionAnswering",
-    "AutoModelForTokenClassification": "llama3_nv.NVLlamaForTokenClassification",
+    "AutoConfig": "modeling_llama_te.NVLlamaConfig",
+    "AutoModel": "modeling_llama_te.NVLlamaModel",
+    "AutoModelForCausalLM": "modeling_llama_te.NVLlamaForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_llama_te.NVLlamaForSequenceClassification",
+    "AutoModelForQuestionAnswering": "modeling_llama_te.NVLlamaForQuestionAnswering",
+    "AutoModelForTokenClassification": "modeling_llama_te.NVLlamaForTokenClassification",
 }
 
 
@@ -191,11 +191,12 @@ def forward(
 
         # This might be slower for BSHD + padding with fused attention backend. But it should be faster for the flash
        # attention backend.
+        self_attn_mask_type = "padding_causal"
         if should_pack_inputs:
             # Left-side padding is not supported in TE layers, so to make generation work with TE we dynamically convert
             # to THD-style inputs in our forward pass, and then convert back to BSHD for the output. This lets the
             # entire transformer stack run in THD mode.
-            assert attention_mask is not None, "Attention mask is required when using BSHD inputs."
+            assert attention_mask is not None, "Attention mask is required when packing BSHD inputs."
             batch_size = hidden_states.size(0)
             hidden_states, indices, cu_seqlens, max_seqlen, _ = _unpad_input(hidden_states, attention_mask)
             cu_seq_lens_q = cu_seq_lens_k = cu_seqlens
@@ -213,8 +214,10 @@ def forward(
             max_length_k = kwargs["max_length_k"]
 
         else:
-            assert attention_mask is not None, "Attention mask is required when using BSHD inputs."
-            attention_mask = attention_mask[:, None, None, :] < -1
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, None, None, :] < -1
+            else:
+                self_attn_mask_type = "causal"
             cu_seq_lens_q = cu_seq_lens_k = None
             max_length_q = max_length_k = hidden_states.size(1)
 
@@ -243,6 +246,7 @@ def forward(
                 hidden_states,
                 attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
                 rotary_pos_emb=te_rope_emb,
+                self_attn_mask_type=self_attn_mask_type,
                 inference_params=past_key_values,
                 cu_seqlens_q=cu_seq_lens_q,
                 cu_seqlens_kv=cu_seq_lens_k,
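Pulling the three hunks above together: the mask type defaults to "padding_causal" and only falls back to "causal" when no attention mask is supplied in the non-packed BSHD path, and that choice is forwarded to the decoder layers. A tiny standalone sketch of just that selection logic (the helper name is invented for illustration, not part of the model code):

# Sketch: the mask-type selection introduced above, isolated from the forward pass.
# _select_mask_type is a hypothetical helper name used only for illustration.
def _select_mask_type(attention_mask, should_pack_inputs: bool) -> str:
    # Default used for packed (THD) inputs and for BSHD inputs with padding.
    self_attn_mask_type = "padding_causal"
    if not should_pack_inputs and attention_mask is None:
        # No padding information at all: a plain causal mask is sufficient.
        self_attn_mask_type = "causal"
    return self_attn_mask_type


assert _select_mask_type(attention_mask=None, should_pack_inputs=False) == "causal"
assert _select_mask_type(attention_mask=object(), should_pack_inputs=False) == "padding_causal"
assert _select_mask_type(attention_mask=object(), should_pack_inputs=True) == "padding_causal"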
