Commit cf00b40

crazydemo authored and dominicshanshan committed
[https://nvbugs/5512734][fix] Update kv cache config for maverick (NVIDIA#7710)
Signed-off-by: Ivy Zhang <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent 0272518 commit cf00b40

File tree

1 file changed: +2 additions, −0 deletions


tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 2 additions & 0 deletions
@@ -704,13 +704,15 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
         if get_device_count() != tp_size * pp_size:
             pytest.skip("Device count mismatch with world size")
 
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8

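For reference, a minimal standalone sketch of how a kv_cache_config like the one added in this diff is passed to the TensorRT-LLM LLM API. The import paths, checkpoint path, parallel sizes, and prompt below are illustrative assumptions, not taken from this commit:

# Minimal sketch; import paths, checkpoint path, and parallel sizes are
# illustrative assumptions, not taken from this commit.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig

# Cap the KV cache pool at 75% of free GPU memory (the value added in this diff)
# so warmup and CUDA graph capture keep some headroom instead of OOMing.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)

with LLM(
        "/path/to/Llama-4-Maverick-17B-128E-Instruct-FP8",  # placeholder checkpoint path
        tensor_parallel_size=8,            # placeholder parallel mapping
        moe_expert_parallel_size=8,
        max_seq_len=8192,                  # kept low in the test to avoid warmup OOM in CI
        kv_cache_config=kv_cache_config,
        cuda_graph_config=CudaGraphConfig()) as llm:
    # Run a single generation to confirm the engine comes up with the capped KV cache.
    output = llm.generate("Hello, Maverick!")
    print(output)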