@@ -54,14 +54,12 @@ def run_benchmark(
     backend: str = "_autodeploy",
     report_json_path: str = None,
     max_batch_size: int = 32,
+    num_hidden_layers: int = 2,
 ):
     """Run benchmark and capture KV cache metrics from log output."""

     # Read the test config to get free_mem_ratio
-    config_path = f"{temp_dir}/model_kwargs.yaml"
-    with open(config_path, "r") as f:
-        config = yaml.safe_load(f)
-    free_mem_ratio = config.get("free_mem_ratio", 0.8)  # Default to 0.8 if not specified
+    config_path = f"{temp_dir}/extra_llm_api_options.yaml"

     # Build the command to run the benchmark
     cmd = [
@@ -79,19 +77,27 @@ def run_benchmark(
         str(max_batch_size),
     ]

-    # Add extra_llm_api_options only for autodeploy backend
-    if backend == "_autodeploy":
-        cmd.extend(["--extra_llm_api_options", config_path])
-
     # Add report_json argument if path is provided
     if report_json_path:
         cmd.extend(["--report_json", report_json_path])

-    print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}")
-    print(f"📋 Using free_mem_ratio from config: {free_mem_ratio}")
+    if backend == "_autodeploy":
+        # Add extra_llm_api_options only for autodeploy backend
+        cmd.extend(["--extra_llm_api_options", config_path])
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        free_mem_ratio = config.get("free_mem_ratio", 0.0)  # Default to 0.0 if not specified
+        print(f"📋 Using free_mem_ratio from config: {free_mem_ratio}")

     # Run benchmark as subprocess to capture ALL output
-    result = subprocess.run(cmd, capture_output=True, text=True)
+    import os
+
+    env = os.environ.copy()
+    if backend == "pytorch":
+        env["TLLM_OVERRIDE_LAYER_NUM"] = str(num_hidden_layers)
+        print(f"📋 Using TLLM_OVERRIDE_LAYER_NUM from env: {env['TLLM_OVERRIDE_LAYER_NUM']}")
+    print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, env=env)

     # Check if the command succeeded
     assert result.returncode == 0, (
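The pytorch-backend path above depends on handing a modified environment to the subprocess. Below is a minimal, self-contained sketch of that pattern, not the test itself: the TLLM_OVERRIDE_LAYER_NUM name comes from the diff, while the child command is just a stand-in for the real trtllm-bench invocation.

import os
import subprocess
import sys

# Copy the parent environment and add the single override the test needs.
env = os.environ.copy()
env["TLLM_OVERRIDE_LAYER_NUM"] = str(2)

# Stand-in child process that simply echoes the variable back.
result = subprocess.run(
    [sys.executable, "-c", "import os; print(os.environ['TLLM_OVERRIDE_LAYER_NUM'])"],
    capture_output=True,
    text=True,
    env=env,
)
assert result.returncode == 0
assert result.stdout.strip() == "2"

Passing env= replaces the child's environment entirely, which is why the code copies os.environ first rather than passing only the override.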
@@ -272,14 +278,14 @@ def calculate_expected_kv_cache_metrics(free_mem_ratio: float):
     # For TinyLlama-1.1B, model should be 2.2GB
     estimated_model_size_mb = 2200  # Conservative estimate
     # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/6335 check why there is extra consumption
-    extra_consumption_mb = 800
+    extra_consumption_mb = 2200
     expected_free_mem_range = (
         total_mem_mb - estimated_model_size_mb - extra_consumption_mb,
         total_mem_mb - estimated_model_size_mb,
     )

-    # Current cache size is typically small initially (80MB range)
-    expected_current_cache_size = 83886080  # This is more stable across GPUs
+    # Current cache size is typically small initially (16MB range)
+    expected_current_cache_size = 16777216

     # Free memory values should be in reasonable range
     expected_free_mem_pre_range = expected_free_mem_range
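Both cache-size constants in this hunk are exact binary megabytes, which is what the comments mean by the "80MB range" and "16MB range". A quick check:

# 83886080 bytes is exactly 80 MiB; 16777216 bytes is exactly 16 MiB.
assert 80 * 1024 * 1024 == 83886080
assert 16 * 1024 * 1024 == 16777216

The 5x smaller expectation lines up with the default num_hidden_layers dropping from 10 to 2 elsewhere in this change, since the initial KV cache scales with the number of layers.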
@@ -444,12 +450,12 @@ def print_kv_cache_metrics(kv_cache_metrics):
 def trtllm_bench_unified_comparison(
     llm_root,  # noqa: F811
     comparison_mode="backend",
-    free_mem_ratio=0.9,
-    num_hidden_layers=10,
-    max_batch_size=32,
-    golden_tokens_per_sec=730,
-    backend_relative_tolerance=0.30,
-    backend_absolute_tolerance=15.0,
+    free_mem_ratio=0.1,
+    num_hidden_layers=2,
+    max_batch_size=32,  # below this value the kv cache resizing is skipped
+    golden_tokens_per_sec=1400,
+    backend_relative_tolerance=0.2,
+    backend_absolute_tolerance=250.0,
     golden_relative_tolerance=0.1,
     golden_absolute_tolerance=5.0,
 ):
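The backend tolerances are loosened here to a relative 0.2 and an absolute 250.0 tokens/sec/user. The comparison logic itself is outside this hunk, but a combined relative-plus-absolute tolerance check of this kind can be sketched with math.isclose, which passes if either bound is satisfied; the measured values below are hypothetical.

import math

pytorch_tps = 1400.0     # hypothetical measured value
autodeploy_tps = 1200.0  # hypothetical measured value

# Passes if the values differ by at most 20% of the larger one, or by 250 absolute.
assert math.isclose(pytorch_tps, autodeploy_tps, rel_tol=0.2, abs_tol=250.0)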
@@ -461,29 +467,28 @@ def trtllm_bench_unified_comparison(
     Args:
         llm_root: Root directory for LLM models (pytest fixture)
         comparison_mode: Either "backend" or "golden" to determine comparison type
-        free_mem_ratio: Ratio of free memory to use for KV cache (default: 0.9)
-        num_hidden_layers: Number of hidden layers for the model (default: 10)
-        max_batch_size: Maximum batch size for benchmarking (default: 32)
-        golden_tokens_per_sec: Golden performance value in tokens/sec/user (default: 730)
-        backend_relative_tolerance: Relative tolerance for backend comparison (default: 0.30)
-        backend_absolute_tolerance: Absolute tolerance for backend comparison (default: 15.0)
-        golden_relative_tolerance: Relative tolerance for golden comparison (default: 0.1)
-        golden_absolute_tolerance: Absolute tolerance for golden comparison (default: 5.0)
+        free_mem_ratio: Ratio of free memory to use for KV cache
+        num_hidden_layers: Number of hidden layers for the model
+        max_batch_size: Maximum batch size for benchmarking
+        golden_tokens_per_sec: Golden performance value in tokens/sec/user
+        backend_relative_tolerance: Relative tolerance for backend comparison
+        backend_absolute_tolerance: Absolute tolerance for backend comparison
+        golden_relative_tolerance: Relative tolerance for golden comparison
+        golden_absolute_tolerance: Absolute tolerance for golden comparison
     """
     model_name = _hf_model_dir_or_hub_id(
         f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     )

     with tempfile.TemporaryDirectory() as temp_dir:
-        with open(f"{temp_dir}/model_kwargs.yaml", "w") as f:
+        with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f:
             yaml.dump(
                 {
                     "model_kwargs": {"num_hidden_layers": num_hidden_layers},
                     # "cuda_graph_batch_sizes": [1, 2],
                     "compile_backend": "torch-opt",
                     "free_mem_ratio": free_mem_ratio,
                     "runtime": "trtllm",
-                    "skip_loading_weights": True,
                 },
                 f,
             )
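For reference, with the new defaults (num_hidden_layers=2, free_mem_ratio=0.1) the dict dumped above produces a small YAML file; yaml.dump sorts keys alphabetically by default, so the file that run_benchmark later reads back for the _autodeploy backend would look roughly like the commented output in this sketch.

import yaml

config = {
    "model_kwargs": {"num_hidden_layers": 2},
    "compile_backend": "torch-opt",
    "free_mem_ratio": 0.1,
    "runtime": "trtllm",
}
print(yaml.dump(config))
# compile_backend: torch-opt
# free_mem_ratio: 0.1
# model_kwargs:
#   num_hidden_layers: 2
# runtime: trtllm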
@@ -515,7 +520,13 @@ def trtllm_bench_unified_comparison(
         pytorch_report_path = f"{temp_dir}/pytorch_report.json"
         print("=== RUNNING PYTORCH BACKEND ===")
         pytorch_report = run_benchmark(
-            model_name, dataset_path, temp_dir, "pytorch", pytorch_report_path, max_batch_size
+            model_name,
+            dataset_path,
+            temp_dir,
+            "pytorch",
+            pytorch_report_path,
+            max_batch_size,
+            num_hidden_layers,
         )

         # Extract pytorch performance metrics