@@ -1602,31 +1602,114 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    def test_auto_dtype(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
+        # Once max_batch_size is removed, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
     @skip_pre_ada
-    def test_reasoning_fp8_prequantized(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
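A minimal sketch (not part of the diff) of how the new parametrized cases might be selected in a local run. The test-file path and the node-id filter below are assumptions; only the class names, test names, and the "tp8"/"tp8ep4"/"tp8ep8" ids come from the diff above.

    import pytest

    # Run only the TP8/EP4 variant of the new 47B base-model accuracy test.
    # The file path is an assumed location, not stated in the diff.
    pytest.main([
        "tests/integration/defs/accuracy/test_llm_api_pytorch.py",
        "-k", "TestNemotronH_47B_Base and test_auto_dtype and tp8ep4",
        "-v",
    ])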