@@ -1588,13 +1588,11 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
 
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
-        # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=128) as llm:
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -1606,15 +1604,89 @@ def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=256) as llm:
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"