@@ -1605,32 +1605,111 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
+        # Once max_batch_size is removed, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)

-    @skip_pre_hopper
+    @skip_pre_ada
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)


+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
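
Below is a minimal, standalone sketch of the construction pattern the new 47B/56B tests share, for trying the multi-GPU path outside the accuracy harness. It is illustrative only: the import locations (LLM from tensorrt_llm, KvCacheConfig and CudaGraphConfig from tensorrt_llm.llmapi) and the local model path are assumptions not taken from this diff, and the MMLU/GSM8K evaluation is elided.

# Standalone sketch of the pattern the new tests follow (not part of the diff).
# Assumptions: LLM is importable from tensorrt_llm, KvCacheConfig and
# CudaGraphConfig from tensorrt_llm.llmapi, and the model path below is a
# hypothetical local checkout.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig


def build_llm(model_path: str, use_cuda_graph: bool = True) -> LLM:
    # Mirror the new tests: no KV block reuse, capped KV-cache memory,
    # 8-way tensor parallelism, and CUDA graphs only when requested.
    kv_cache_config = KvCacheConfig(enable_block_reuse=False,
                                    free_gpu_memory_fraction=0.6)
    return LLM(model_path,
               tensor_parallel_size=8,
               pipeline_parallel_size=1,
               moe_expert_parallel_size=8,
               kv_cache_config=kv_cache_config,
               max_batch_size=256,
               cuda_graph_config=CudaGraphConfig() if use_cuda_graph else None)


if __name__ == "__main__":
    with build_llm("/models/Nemotron-H-47B-Base-8K") as llm:
        # generate() returns one result per prompt; .outputs[0].text holds the
        # generated continuation.
        result = llm.generate(["The capital of France is"])[0]
        print(result.outputs[0].text)

The tests vary ep_size across 1, 4, and 8 while keeping tp_size=8 and pp_size=1; the sketch hard-codes one of those points.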