
Commit 162176b (parent 2988bcc)

add TestNemotronH_47B and TestNemotronH_56B

Signed-off-by: Xin He (SW-GPU) <[email protected]>

File tree: 2 files changed, +81 -6 lines

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 78 additions & 6 deletions
@@ -1588,13 +1588,11 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
 
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
-        # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=128) as llm:
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -1606,15 +1604,89 @@ def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=256) as llm:
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
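
For orientation, the new 47B and 56B classes reuse the same evaluation pattern as the existing TestNemotronH tests above. Below is a minimal sketch of that shared shape; run_accuracy is a hypothetical helper (not part of this change), and the imports are assumed to match those already used at the top of test_llm_api_pytorch.py (LLM, KvCacheConfig, CudaGraphConfig, MMLU, GSM8K).

def run_accuracy(model_path, model_name, cuda_graph, **parallel_sizes):
    # Hypothetical helper illustrating the shared pattern from the diff above;
    # imports are assumed to match the ones already present in this test file.
    kv_cache_config = KvCacheConfig(enable_block_reuse=False,
                                    free_gpu_memory_fraction=0.6)
    with LLM(model_path,
             kv_cache_config=kv_cache_config,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             **parallel_sizes) as llm:  # e.g. tensor_parallel_size=8
        # Each accuracy task is evaluated against the live LLM instance.
        for task in (MMLU(model_name), GSM8K(model_name)):
            task.evaluate(llm)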

tests/integration/test_lists/qa/benchmark_test_list.txt

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
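
The entries added to benchmark_test_list.txt are plain pytest node IDs, so any of them can also be run directly. Below is a sketch using pytest's Python entry point, with the node ID copied verbatim from the list above; the file path is as written in the list and may need adjusting to your working directory.

# Sketch: invoke one of the newly listed cases through pytest's Python API.
# The node ID is copied from benchmark_test_list.txt; "-v" only adds verbosity.
import pytest

pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::"
    "test_auto_dtype[tp8ep4-cuda_graph=True]",
    "-v",
])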
