
Commit 9d30ade

xinhe-nv and coderabbitai[bot] authored and committed
[TRTLLM-7048][feat] add benchmark TRT flow test for MIG (NVIDIA#6884)
Signed-off-by: Xin He (SW-GPU) <[email protected]>
Signed-off-by: xinhe-nv <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Signed-off-by: Wangshanshan <[email protected]>
1 parent 6870f42 commit 9d30ade

File tree

1 file changed: +109, -4 lines changed


tests/integration/defs/test_e2e.py

Lines changed: 109 additions & 4 deletions
@@ -451,7 +451,9 @@ def __init__(self,
                  skip_engine_build: bool = False,
                  quant: Optional[str] = None,
                  extra_llm_api_options: Optional[str] = None,
-                 use_mpirun: bool = False):
+                 use_mpirun: bool = False,
+                 concurrency: Optional[int] = None,
+                 num_requests: int = 10):
 
         llm_models = llm_models_root()
         assert llm_models is not None
@@ -476,12 +478,14 @@ def __init__(self,
         else:
             self.mpirun_cmd = ""
         self.engine_path = None
+        self.concurrency = concurrency
+        self.num_requests = num_requests
 
     def __call__(self):
         self.prepare_dataset()
         if not (self.skip_engine_build or self.use_pytorch_backend):
             self.build_engine()
-        self.run_bench()
+        return self.run_bench()
 
     def prepare_dataset(self):
         dataset_tool = Path(self.llm_root, "benchmarks", "cpp",
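With the two new constructor arguments and the `__call__` change above, invoking a `BenchRunner` now returns the parsed metrics dict rather than `None`. A hedged usage sketch (the `llm_root`/`llm_venv` pytest fixtures and the wrapper function are assumptions for illustration, not part of this diff):

```python
# Hypothetical helper; llm_root and llm_venv are assumed to be the suite's pytest fixtures.
def run_one_concurrency_point(llm_root, llm_venv, concurrency: int) -> dict:
    runner = BenchRunner(llm_root=llm_root,
                         llm_venv=llm_venv,
                         model_name="meta/Meta-Llama-3.1-8B",
                         model_subdir="llama-3.1-model/Meta-Llama-3.1-8B",
                         streaming=False,
                         use_pytorch_backend=False,
                         use_mpirun=False,
                         tp_size=1,
                         concurrency=concurrency,
                         num_requests=concurrency * 10)
    # __call__ now returns parse_benchmark_output()'s dict:
    # {'concurrency', 'num_requests', 'throughput', 'latency'}
    return runner()
```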
@@ -504,7 +508,7 @@ def prepare_dataset(self):
             "--output-stdev",
             "0",
             "--num-requests",
-            "10",
+            str(self.num_requests),
         ]
         print(f"Running command: {' '.join(command)}")
         dataset_output = self.llm_venv.run_cmd(
@@ -558,7 +562,47 @@ def run_bench(self):
 
         if self.extra_llm_api_options:
             benchmark_cmd += f" --extra_llm_api_options {self.extra_llm_api_options}"
-        check_call(benchmark_cmd, shell=True, env=self.llm_venv._new_env)
+        if self.concurrency:
+            benchmark_cmd += f" --concurrency {self.concurrency}"
+        if self.num_requests:
+            benchmark_cmd += f" --num_requests {self.num_requests}"
+
+        benchmark_output = check_output(benchmark_cmd,
+                                        shell=True,
+                                        env=self.llm_venv._new_env)
+        return self.parse_benchmark_output(benchmark_output)
+
+    def parse_benchmark_output(self, output):
+        """Parse the benchmark output to extract key metrics."""
+        result = {
+            'concurrency': self.concurrency,
+            'num_requests': self.num_requests,
+            'throughput': 0,
+            'latency': 0
+        }
+
+        lines = output.split('\n')
+        for line in lines:
+            line = line.strip()
+            if 'total token throughput' in line.lower(
+            ) and 'tokens/sec' in line.lower():
+                try:
+                    throughput = line.split(":")[1].strip()
+                    result['throughput'] = throughput
+                except (IndexError, ValueError) as e:
+                    print(
+                        f"Failed to parse throughput from line: {line}. Error: {e}"
+                    )
+            elif 'total latency' in line.lower() and 'ms' in line.lower():
+                try:
+                    latency = line.split(":")[1].strip()
+                    result['latency'] = latency
+                except (IndexError, ValueError) as e:
+                    print(
+                        f"Failed to parse latency from line: {line}. Error: {e}"
+                    )
+
+        return result
 
 
 @pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"],
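The new `parse_benchmark_output` method keys off substrings of trtllm-bench's report and splits each matching line on its first colon. A minimal standalone sketch of that logic, run against two illustrative report lines (the exact wording of the real output is an assumption here; only the substring-match-and-split behavior mirrors the diff):

```python
# Minimal sketch of the colon-split parsing used by parse_benchmark_output.
# The sample report lines below are illustrative, not captured from a real run.
sample_output = "\n".join([
    "Total Token Throughput (tokens/sec): 1234.5",
    "Total Latency (ms): 6789.0",
])

metrics = {"throughput": 0, "latency": 0}
for line in sample_output.split("\n"):
    line = line.strip()
    if "total token throughput" in line.lower() and "tokens/sec" in line.lower():
        metrics["throughput"] = line.split(":")[1].strip()
    elif "total latency" in line.lower() and "ms" in line.lower():
        metrics["latency"] = line.split(":")[1].strip()

print(metrics)  # {'throughput': '1234.5', 'latency': '6789.0'}
```

Because the parsed values stay strings, the MIG test below converts them with `float()` before asserting on them.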
@@ -581,6 +625,67 @@ def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name,
     runner()
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("model_name", ["meta/Meta-Llama-3.1-8B"],
+                         ids=["llama3_1-8b"])
+@pytest.mark.parametrize("model_subdir", ["llama-3.1-model/Meta-Llama-3.1-8B"],
+                         ids=["llama_v3_1"])
+@pytest.mark.parametrize("use_pytorch_backend", [False], ids=["trt_backend"])
+def test_trtllm_bench_mig_launch(llm_root, llm_venv, model_name, model_subdir,
+                                 use_pytorch_backend):
+    "run bench mark in MIG mode, check if the throughput is increasing by concurrency"
+    skip_engine_build = False
+    results = {}
+    concurrency_list = [1, 32, 64, 128]
+
+    for concurrency in concurrency_list:
+        num_requests = concurrency * 10
+        runner = BenchRunner(llm_root=llm_root,
+                             llm_venv=llm_venv,
+                             model_name=model_name,
+                             model_subdir=model_subdir,
+                             streaming=False,
+                             use_pytorch_backend=use_pytorch_backend,
+                             use_mpirun=False,
+                             tp_size=1,
+                             concurrency=concurrency,
+                             num_requests=num_requests,
+                             skip_engine_build=skip_engine_build)
+
+        output = runner()
+        results[concurrency] = output
+
+    print(f"\n=== Benchmark Results Comparison ===")
+    print(f"Model: {model_name}")
+    print(f"Backend: {'PyTorch' if use_pytorch_backend else 'TensorRT'}")
+    print(
+        f"{'Concurrency':<15} {'Throughput':<15} {'Latency':<15} {'Num Requests':<15}"
+    )
+    print("-" * 60)
+
+    for idx, val in enumerate(concurrency_list):
+        metrics = results.get(val)
+        if not isinstance(metrics, dict):
+            pytest.fail(
+                f"Unexpected benchmark result type for concurrency {val}: {type(metrics)}"
+            )
+        try:
+            throughput = float(metrics.get('throughput', 0))
+            latency = float(metrics.get('latency', 0))
+            num_requests = int(metrics.get('num_requests', 0))
+        except (ValueError, TypeError) as e:
+            pytest.fail(
+                f"Failed to parse benchmark results for concurrency {val}: {e}")
+        assert throughput > 0, f"Throughput is 0 for concurrency {val}"
+        assert latency > 0, f"Latency is 0 for concurrency {val}"
+        print(f"{val:<15} {throughput:<15} {latency:<15} {num_requests:<15}")
+        if idx > 0:
+            prev_throughput = float(results[concurrency_list[idx - 1]].get(
+                'throughput', 0))
+            assert throughput > prev_throughput * 1.3, f"Throughput is not increasing for concurrency {concurrency_list[idx]}"
+
+
 @pytest.mark.parametrize(
     "model_name, llama_model_root",
     [pytest.param("TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0")],
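The tail of the new MIG test asserts that throughput scales with concurrency: each level must exceed 1.3x the previous level's throughput. A self-contained sketch of just that pairwise check, using illustrative numbers (the throughput values are made up for demonstration):

```python
# Illustrative throughput values (tokens/sec) per concurrency level; not real measurements.
throughputs = {1: 500.0, 32: 4000.0, 64: 6000.0, 128: 9000.0}
concurrency_list = [1, 32, 64, 128]

for idx in range(1, len(concurrency_list)):
    cur = throughputs[concurrency_list[idx]]
    prev = throughputs[concurrency_list[idx - 1]]
    # Mirrors the test's requirement: each step must beat the previous by more than 30%.
    assert cur > prev * 1.3, (
        f"Throughput is not increasing for concurrency {concurrency_list[idx]}")
```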
