Commit f29a618

shrunk the model, and fixes

Signed-off-by: Eran Geva <[email protected]>
1 parent ba7a371 commit f29a618

1 file changed (+42, -31 lines)

tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py

Lines changed: 42 additions & 31 deletions
@@ -54,14 +54,12 @@ def run_benchmark(
     backend: str = "_autodeploy",
     report_json_path: str = None,
     max_batch_size: int = 32,
+    num_hidden_layers: int = 2,
 ):
     """Run benchmark and capture KV cache metrics from log output."""
 
     # Read the test config to get free_mem_ratio
-    config_path = f"{temp_dir}/model_kwargs.yaml"
-    with open(config_path, "r") as f:
-        config = yaml.safe_load(f)
-    free_mem_ratio = config.get("free_mem_ratio", 0.8)  # Default to 0.8 if not specified
+    config_path = f"{temp_dir}/extra_llm_api_options.yaml"
 
     # Build the command to run the benchmark
     cmd = [
@@ -79,19 +77,27 @@ def run_benchmark(
         str(max_batch_size),
     ]
 
-    # Add extra_llm_api_options only for autodeploy backend
-    if backend == "_autodeploy":
-        cmd.extend(["--extra_llm_api_options", config_path])
-
     # Add report_json argument if path is provided
     if report_json_path:
         cmd.extend(["--report_json", report_json_path])
 
-    print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}")
-    print(f"📋 Using free_mem_ratio from config: {free_mem_ratio}")
+    if backend == "_autodeploy":
+        # Add extra_llm_api_options only for autodeploy backend
+        cmd.extend(["--extra_llm_api_options", config_path])
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        free_mem_ratio = config.get("free_mem_ratio", 0.0)  # Default to 0.0 if not specified
+        print(f"📋 Using free_mem_ratio from config: {free_mem_ratio}")
 
     # Run benchmark as subprocess to capture ALL output
-    result = subprocess.run(cmd, capture_output=True, text=True)
+    import os
+
+    env = os.environ.copy()
+    if backend == "pytorch":
+        env["TLLM_OVERRIDE_LAYER_NUM"] = str(num_hidden_layers)
+        print(f"📋 Using TLLM_OVERRIDE_LAYER_NUM from env: {env['TLLM_OVERRIDE_LAYER_NUM']}")
+    print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, env=env)
 
     # Check if the command succeeded
     assert result.returncode == 0, (
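
A minimal standalone sketch of the env-override mechanism introduced above, showing only how the copied environment and the env= argument interact; the placeholder command stands in for the real trtllm-bench invocation and is not part of the test:

import os
import subprocess

# Copy the parent environment so the override affects only this one child process.
env = os.environ.copy()
env["TLLM_OVERRIDE_LAYER_NUM"] = str(2)  # e.g. shrink the model to 2 hidden layers

# subprocess.run(..., env=env) replaces the child's environment entirely,
# which is why the copy above is needed rather than setting a single variable.
cmd = ["python", "-c", "import os; print(os.environ['TLLM_OVERRIDE_LAYER_NUM'])"]
result = subprocess.run(cmd, capture_output=True, text=True, env=env)
print(result.stdout.strip())  # prints: 2
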
@@ -272,14 +278,14 @@ def calculate_expected_kv_cache_metrics(free_mem_ratio: float):
     # For TinyLlama-1.1B, model should be 2.2GB
     estimated_model_size_mb = 2200  # Conservative estimate
     # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/6335 check why there is extra consumption
-    extra_consumption_mb = 800
+    extra_consumption_mb = 2200
     expected_free_mem_range = (
         total_mem_mb - estimated_model_size_mb - extra_consumption_mb,
         total_mem_mb - estimated_model_size_mb,
     )
 
-    # Current cache size is typically small initially (80MB range)
-    expected_current_cache_size = 83886080  # This is more stable across GPUs
+    # Current cache size is typically small initially (16MB range)
+    expected_current_cache_size = 16777216
 
     # Free memory values should be in reasonable range
     expected_free_mem_pre_range = expected_free_mem_range
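
A quick check of the byte arithmetic behind the updated constant, which simply scales the old 80 MiB expectation down to 16 MiB, consistent with the much smaller 2-layer model used elsewhere in this commit:

# 16 MiB in bytes, matching the new expected_current_cache_size above.
assert 16 * 1024 * 1024 == 16777216
# The previous expectation corresponded to 80 MiB.
assert 80 * 1024 * 1024 == 83886080
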
@@ -444,12 +450,12 @@ def print_kv_cache_metrics(kv_cache_metrics):
 def trtllm_bench_unified_comparison(
     llm_root,  # noqa: F811
     comparison_mode="backend",
-    free_mem_ratio=0.9,
-    num_hidden_layers=10,
-    max_batch_size=32,
-    golden_tokens_per_sec=730,
-    backend_relative_tolerance=0.30,
-    backend_absolute_tolerance=15.0,
+    free_mem_ratio=0.1,
+    num_hidden_layers=2,
+    max_batch_size=32,  # below this value the kv cache resizing is skipped
+    golden_tokens_per_sec=1400,
+    backend_relative_tolerance=0.2,
+    backend_absolute_tolerance=250.0,
     golden_relative_tolerance=0.1,
     golden_absolute_tolerance=5.0,
 ):
@@ -461,29 +467,28 @@ def trtllm_bench_unified_comparison(
     Args:
         llm_root: Root directory for LLM models (pytest fixture)
         comparison_mode: Either "backend" or "golden" to determine comparison type
-        free_mem_ratio: Ratio of free memory to use for KV cache (default: 0.9)
-        num_hidden_layers: Number of hidden layers for the model (default: 10)
-        max_batch_size: Maximum batch size for benchmarking (default: 32)
-        golden_tokens_per_sec: Golden performance value in tokens/sec/user (default: 730)
-        backend_relative_tolerance: Relative tolerance for backend comparison (default: 0.30)
-        backend_absolute_tolerance: Absolute tolerance for backend comparison (default: 15.0)
-        golden_relative_tolerance: Relative tolerance for golden comparison (default: 0.1)
-        golden_absolute_tolerance: Absolute tolerance for golden comparison (default: 5.0)
+        free_mem_ratio: Ratio of free memory to use for KV cache
+        num_hidden_layers: Number of hidden layers for the model
+        max_batch_size: Maximum batch size for benchmarking
+        golden_tokens_per_sec: Golden performance value in tokens/sec/user
+        backend_relative_tolerance: Relative tolerance for backend comparison
+        backend_absolute_tolerance: Absolute tolerance for backend comparison
+        golden_relative_tolerance: Relative tolerance for golden comparison
+        golden_absolute_tolerance: Absolute tolerance for golden comparison
     """
     model_name = _hf_model_dir_or_hub_id(
         f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     )
 
     with tempfile.TemporaryDirectory() as temp_dir:
-        with open(f"{temp_dir}/model_kwargs.yaml", "w") as f:
+        with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f:
             yaml.dump(
                 {
                     "model_kwargs": {"num_hidden_layers": num_hidden_layers},
                     # "cuda_graph_batch_sizes": [1, 2],
                     "compile_backend": "torch-opt",
                     "free_mem_ratio": free_mem_ratio,
                     "runtime": "trtllm",
-                    "skip_loading_weights": True,
                 },
                 f,
             )
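
For reference, a sketch of what the extra_llm_api_options.yaml written above would contain with the new defaults (num_hidden_layers=2, free_mem_ratio=0.1); note that yaml.dump sorts keys alphabetically by default, so the on-disk order differs from the dict literal:

import yaml

config = {
    "model_kwargs": {"num_hidden_layers": 2},
    "compile_backend": "torch-opt",
    "free_mem_ratio": 0.1,
    "runtime": "trtllm",
}
print(yaml.dump(config))
# compile_backend: torch-opt
# free_mem_ratio: 0.1
# model_kwargs:
#   num_hidden_layers: 2
# runtime: trtllm
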
@@ -515,7 +520,13 @@ def trtllm_bench_unified_comparison(
         pytorch_report_path = f"{temp_dir}/pytorch_report.json"
         print("=== RUNNING PYTORCH BACKEND ===")
         pytorch_report = run_benchmark(
-            model_name, dataset_path, temp_dir, "pytorch", pytorch_report_path, max_batch_size
+            model_name,
+            dataset_path,
+            temp_dir,
+            "pytorch",
+            pytorch_report_path,
+            max_batch_size,
+            num_hidden_layers,
         )
 
         # Extract pytorch performance metrics
