
Commit b88dae4

update mig tests

Signed-off-by: Xin He (SW-GPU) <[email protected]>

1 parent: b4167cc


tests/integration/defs/test_e2e.py (98 additions, 5 deletions)
@@ -408,7 +408,7 @@ def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
                                               trust_remote_code=True,
                                               use_fast=False)
 
-    message = r"<|begin▁of▁sentence|><|User|>The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.<|Assistant|>"
+    message = r"REDACTED_SPECIAL_TOKENREDACTED_SPECIAL_TOKENThe operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.REDACTED_SPECIAL_TOKEN"
 
     inputs = tokenizer(message, return_tensors='pt',
                        add_special_tokens=False)['input_ids']
@@ -449,7 +449,9 @@ def __init__(self,
                  skip_engine_build: bool = False,
                  quant: Optional[str] = None,
                  extra_llm_api_options: Optional[str] = None,
-                 use_mpirun: bool = False):
+                 use_mpirun: bool = False,
+                 concurrency: Optional[int] = None,
+                 num_requests: int = 10):
 
         llm_models = llm_models_root()
         assert llm_models is not None
@@ -474,12 +476,14 @@ def __init__(self,
         else:
             self.mpirun_cmd = ""
         self.engine_path = None
+        self.concurrency = concurrency
+        self.num_requests = num_requests
 
     def __call__(self):
         self.prepare_dataset()
         if not (self.skip_engine_build or self.use_pytorch_backend):
             self.build_engine()
-        self.run_bench()
+        return self.run_bench()
 
     def prepare_dataset(self):
         dataset_tool = Path(self.llm_root, "benchmarks", "cpp",
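For review context (not part of the commit): with the two new constructor arguments, calling a BenchRunner instance now returns the metrics dict produced by run_bench() instead of only executing the benchmark. A minimal Python sketch, reusing the argument values from the MIG test added later in this diff; llm_root and llm_venv stand for the usual test fixtures.

# Illustrative sketch only; not part of commit b88dae4.
# Values mirror test_trtllm_bench_mig_launch further down in this diff.
runner = BenchRunner(llm_root=llm_root,
                     llm_venv=llm_venv,
                     model_name="meta/Meta-Llama-3.1-8B",
                     model_subdir="llama-3.1-model/Meta-Llama-3.1-8B",
                     streaming=False,
                     use_pytorch_backend=False,
                     use_mpirun=False,
                     tp_size=1,
                     concurrency=32,     # new: forwarded to trtllm-bench as --concurrency
                     num_requests=320)   # new: sizes the synthetic dataset and --num_requests
metrics = runner()
# metrics is a dict like {'concurrency': 32, 'num_requests': 320, 'throughput': ..., 'latency': ...}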
@@ -502,7 +506,7 @@ def prepare_dataset(self):
             "--output-stdev",
             "0",
             "--num-requests",
-            "10",
+            str(self.num_requests),
         ]
         print(f"Running command: {' '.join(command)}")
         dataset_output = self.llm_venv.run_cmd(
@@ -556,7 +560,43 @@ def run_bench(self):
 
         if self.extra_llm_api_options:
             benchmark_cmd += f" --extra_llm_api_options {self.extra_llm_api_options}"
-        check_call(benchmark_cmd, shell=True, env=self.llm_venv._new_env)
+        if self.concurrency:
+            benchmark_cmd += f" --concurrency {self.concurrency}"
+        if self.num_requests:
+            benchmark_cmd += f" --num_requests {self.num_requests}"
+
+        benchmark_output = check_output(benchmark_cmd,
+                                        shell=True,
+                                        env=self.llm_venv._new_env)
+        return self.parse_benchmark_output(benchmark_output)
+
+    def parse_benchmark_output(self, output):
+        """Parse the benchmark output to extract key metrics."""
+        result = {
+            'concurrency': self.concurrency,
+            'num_requests': self.num_requests,
+            'throughput': 0,
+            'latency': 0
+        }
+
+        lines = output.split('\n')
+        for line in lines:
+            line = line.strip()
+            if 'total token throughput' in line.lower(
+            ) and 'tokens/sec' in line.lower():
+                try:
+                    throughput = line.split(":")[1].strip()
+                    result['throughput'] = throughput
+                except:
+                    pass
+            elif 'total latency' in line.lower() and 'ms' in line.lower():
+                try:
+                    latency = line.split(":")[1].strip()
+                    result['latency'] = latency
+                except:
+                    pass
+
+        return result
 
 
 @pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"],
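For context on the new parser (a sketch, not part of the commit): parse_benchmark_output() scans the trtllm-bench report line by line, keys on 'total token throughput' together with 'tokens/sec' and on 'total latency' together with 'ms' (all case-insensitive), and stores whatever follows the colon as a string. The report wording below is assumed purely for illustration; only the matching keywords come from the code above.

# Illustrative sketch only; not part of commit b88dae4.
# The report lines are assumed; real trtllm-bench wording may differ.
sample_report = ("Total Token Throughput (tokens/sec): 4321.0\n"
                 "Total Latency (ms): 1234.5\n")
metrics = runner.parse_benchmark_output(sample_report)
# -> {'concurrency': 32, 'num_requests': 320, 'throughput': '4321.0', 'latency': '1234.5'}

Note that the parsed values remain strings; the MIG test below converts them with float() before asserting on them.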
@@ -579,6 +619,59 @@ def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name,
     runner()
 
 
+@pytest.mark.parametrize("model_name", ["meta/Meta-Llama-3.1-8B"],
+                         ids=["llama3_1-8b"])
+@pytest.mark.parametrize("model_subdir", ["llama-3.1-model/Meta-Llama-3.1-8B"],
+                         ids=["llama_v3_1"])
+@pytest.mark.parametrize("use_pytorch_backend", [False], ids=["trt_backend"])
+def test_trtllm_bench_mig_launch(llm_root, llm_venv, model_name, model_subdir,
+                                 use_pytorch_backend):
+    "run bench mark in MIG mode, check if the throughput is increasing by concurrency"
+    skip_engine_build = False
+    results = {}
+    concurrency_list = [1, 32, 64, 128]
+
+    for concurrency in concurrency_list:
+        num_requests = concurrency * 10
+        runner = BenchRunner(llm_root=llm_root,
+                             llm_venv=llm_venv,
+                             model_name=model_name,
+                             model_subdir=model_subdir,
+                             streaming=False,
+                             use_pytorch_backend=use_pytorch_backend,
+                             use_mpirun=False,
+                             tp_size=1,
+                             concurrency=concurrency,
+                             num_requests=num_requests,
+                             skip_engine_build=skip_engine_build)
+
+        output = runner()
+        results[concurrency] = output
+
+    print(f"\n=== Benchmark Results Comparison ===")
+    print(f"Model: {model_name}")
+    print(f"Backend: {'PyTorch' if use_pytorch_backend else 'TensorRT'}")
+    print(
+        f"{'Concurrency':<15} {'Throughput':<15} {'Latency':<15} {'Num Requests':<15}"
+    )
+    print("-" * 60)
+
+    for idx, val in enumerate(concurrency_list):
+        if hasattr(results[val], 'get'):
+            throughput = float(results[val].get('throughput', 0))
+            latency = float(results[val].get('latency', 0))
+            num_requests = int(results[val].get('num_requests', 0))
+            assert throughput > 0, f"Throughput is 0 for concurrency {concurrency}"
+            assert latency > 0, f"Latency is 0 for concurrency {concurrency}"
+            print(
+                f"{concurrency:<15} {throughput:<15} {latency:<15} {num_requests:<15}"
+            )
+            if idx > 0:
+                assert throughput > float(
+                    results[concurrency_list[idx - 1]].get('throughput', 0)
+                ) * 1.3, f"Throughput is not increasing for concurrency {concurrency_list[idx]}"
+
+
 @pytest.mark.parametrize(
     "model_name, llama_model_root",
     [pytest.param("TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0")],
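A usage note, not part of the commit: the new test depends on llm_models_root() resolving to a local model cache (BenchRunner asserts this in __init__), and it can presumably be selected on its own with the repository's usual pytest entry point, e.g. pytest tests/integration/defs/test_e2e.py -k test_trtllm_bench_mig_launch (the exact invocation and required environment are assumptions, not taken from this diff). The pass criterion it adds: each concurrency step in [1, 32, 64, 128] must report more than 1.3x the total token throughput of the previous step, and both throughput and latency must parse to nonzero values.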
