diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index f93bae9593e..d5bee1e9a91 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -502,6 +502,7 @@ def test_chunked_prefill(self, attn_backend):
                                                   (8, 1, 8)],
                              ids=["tp8", "tp8ep4", "tp8ep8"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
@@ -509,6 +510,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
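
For reference, a minimal standalone sketch of the pattern the patch introduces, outside the test harness. It assumes the tensorrt_llm LLM API and uses a placeholder model path; KvCacheConfig.free_gpu_memory_fraction bounds the share of free GPU memory the KV cache manager may claim (0.75 here, lower than the library default).

    # Minimal sketch, not part of the patch; model path is a placeholder.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # Reserve at most 75% of free GPU memory for the KV cache.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)

    llm = LLM(model="/path/to/model",
              kv_cache_config=kv_cache_config)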