
Commit e78535c

add TestNemotronH_47B and TestNemotronH_56B
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent ddc35fc commit e78535c

4 files changed: +108 / -6 lines

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 10 additions & 0 deletions
@@ -96,6 +96,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 46.20
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 85.78
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 46.20
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 85.78
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 37.15
   - quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 10 additions & 0 deletions
@@ -185,6 +185,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 69.180
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 69.590
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 69.180
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 69.590
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 69.180
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
   # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
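Both reference files follow the same shape: the first list item under a model name is the auto-dtype baseline, and later items carry per-quantization overrides (here FP8 with an FP8 KV cache). A minimal sketch of how such an entry could be looked up; this is illustrative only, not the accuracy harness's actual code, and lookup_reference is a hypothetical helper:

# Illustrative sketch, not the actual test-harness implementation.
import yaml

def lookup_reference(path, model_name, quant_algo=None):
    """Return the reference accuracy for a model, optionally per quant algo."""
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs[model_name]:
        # Entries without a quant_algo key act as the auto-dtype baseline.
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    return None

# e.g. lookup_reference("mmlu.yaml", "nvidia/Nemotron-H-47B-Base-8K", "FP8")
# would return 69.180 with the entries added above.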

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 85 additions & 6 deletions
@@ -1605,32 +1605,111 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
+        # Once removed max_batch_size, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)

-    @skip_pre_hopper
+    @skip_pre_ada
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)


+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"

tests/integration/test_lists/qa/benchmark_test_list.txt

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
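Each entry above is a standard pytest node ID, so, assuming a checkout with the integration-test environment set up and model weights available under llm_models_root(), a single new case could also be run directly, for example:

# Sketch: invoke one newly listed case by its node ID from the repo root.
import pytest

pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_pytorch.py"
    "::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]",
    "-v",
])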
