@@ -1602,31 +1602,114 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    def test_auto_dtype(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
+        # Once max_batch_size is removed, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
     @skip_pre_ada
-    def test_reasoning_fp8_prequantized(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
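A minimal sketch (not part of the diff) of how the new parametrized cases might be selected in a local run. The test-file path and the node-id filter below are assumptions; only the class names, test names, and the "tp8"/"tp8ep4"/"tp8ep8" ids come from the diff above.

    import pytest

    # Run only the TP8/EP4 variant of the new 47B base-model accuracy test.
    # The file path is an assumed location, not stated in the diff.
    pytest.main([
        "tests/integration/defs/accuracy/test_llm_api_pytorch.py",
        "-k", "TestNemotronH_47B_Base and test_auto_dtype and tp8ep4",
        "-v",
    ])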