@@ -408,7 +408,7 @@ def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
                                               trust_remote_code=True,
                                               use_fast=False)
 
-    message = r"<|begin▁of▁sentence|><|User|>The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.<|Assistant|> "
+    message = r"REDACTED_SPECIAL_TOKENREDACTED_SPECIAL_TOKENThe operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.REDACTED_SPECIAL_TOKEN "
 
     inputs = tokenizer(message, return_tensors='pt',
                        add_special_tokens=False)['input_ids']
@@ -449,7 +449,9 @@ def __init__(self,
                  skip_engine_build: bool = False,
                  quant: Optional[str] = None,
                  extra_llm_api_options: Optional[str] = None,
-                 use_mpirun: bool = False):
+                 use_mpirun: bool = False,
+                 concurrency: Optional[int] = None,
+                 num_requests: int = 10):
 
         llm_models = llm_models_root()
         assert llm_models is not None
@@ -474,12 +476,14 @@ def __init__(self,
         else:
             self.mpirun_cmd = ""
         self.engine_path = None
+        self.concurrency = concurrency
+        self.num_requests = num_requests
 
     def __call__(self):
         self.prepare_dataset()
         if not (self.skip_engine_build or self.use_pytorch_backend):
             self.build_engine()
-        self.run_bench()
+        return self.run_bench()
 
     def prepare_dataset(self):
         dataset_tool = Path(self.llm_root, "benchmarks", "cpp",
@@ -502,7 +506,7 @@ def prepare_dataset(self):
             "--output-stdev",
             "0",
             "--num-requests",
-            "10",
+            str(self.num_requests),
         ]
         print(f"Running command: {' '.join(command)}")
         dataset_output = self.llm_venv.run_cmd(
@@ -556,7 +560,43 @@ def run_bench(self):
 
         if self.extra_llm_api_options:
             benchmark_cmd += f" --extra_llm_api_options {self.extra_llm_api_options}"
-        check_call(benchmark_cmd, shell=True, env=self.llm_venv._new_env)
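+        # Append the optional concurrency and request-count flags to the benchmark command.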
+        if self.concurrency:
+            benchmark_cmd += f" --concurrency {self.concurrency}"
+        if self.num_requests:
+            benchmark_cmd += f" --num_requests {self.num_requests}"
+
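+        # Capture stdout so throughput and latency can be parsed from the benchmark report.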
+        benchmark_output = check_output(benchmark_cmd,
+                                        shell=True,
+                                        env=self.llm_venv._new_env)
+        return self.parse_benchmark_output(benchmark_output)
+
+    def parse_benchmark_output(self, output):
+        """Parse the benchmark output to extract key metrics."""
+        result = {
+            'concurrency': self.concurrency,
+            'num_requests': self.num_requests,
+            'throughput': 0,
+            'latency': 0
+        }
+
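+        # Scan the report for the total token throughput and total latency summary lines.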
+        lines = output.split('\n')
+        for line in lines:
+            line = line.strip()
+            if 'total token throughput' in line.lower(
+            ) and 'tokens/sec' in line.lower():
+                try:
+                    throughput = line.split(":")[1].strip()
+                    result['throughput'] = throughput
+                except IndexError:
+                    pass
+            elif 'total latency' in line.lower() and 'ms' in line.lower():
+                try:
+                    latency = line.split(":")[1].strip()
+                    result['latency'] = latency
+                except IndexError:
+                    pass
+
+        return result
 
 
 @pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"],
@@ -579,6 +619,59 @@ def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name,
     runner()
 
 
+@pytest.mark.parametrize("model_name", ["meta/Meta-Llama-3.1-8B"],
+                         ids=["llama3_1-8b"])
+@pytest.mark.parametrize("model_subdir", ["llama-3.1-model/Meta-Llama-3.1-8B"],
+                         ids=["llama_v3_1"])
+@pytest.mark.parametrize("use_pytorch_backend", [False], ids=["trt_backend"])
+def test_trtllm_bench_mig_launch(llm_root, llm_venv, model_name, model_subdir,
+                                 use_pytorch_backend):
+    """Run the benchmark in MIG mode and check that throughput increases with concurrency."""
+    skip_engine_build = False
+    results = {}
+    concurrency_list = [1, 32, 64, 128]
+
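+    # Run the benchmark once per concurrency level; the request count scales with concurrency.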
+    for concurrency in concurrency_list:
+        num_requests = concurrency * 10
+        runner = BenchRunner(llm_root=llm_root,
+                             llm_venv=llm_venv,
+                             model_name=model_name,
+                             model_subdir=model_subdir,
+                             streaming=False,
+                             use_pytorch_backend=use_pytorch_backend,
+                             use_mpirun=False,
+                             tp_size=1,
+                             concurrency=concurrency,
+                             num_requests=num_requests,
+                             skip_engine_build=skip_engine_build)
+
+        output = runner()
+        results[concurrency] = output
+
+    print("\n=== Benchmark Results Comparison ===")
+    print(f"Model: {model_name}")
+    print(f"Backend: {'PyTorch' if use_pytorch_backend else 'TensorRT'}")
+    print(
+        f"{'Concurrency':<15} {'Throughput':<15} {'Latency':<15} {'Num Requests':<15}"
+    )
+    print("-" * 60)
+
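+    # Each run must report non-zero metrics, and throughput must exceed 1.3x the previous level's throughput.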
+    for idx, val in enumerate(concurrency_list):
+        if hasattr(results[val], 'get'):
+            throughput = float(results[val].get('throughput', 0))
+            latency = float(results[val].get('latency', 0))
+            num_requests = int(results[val].get('num_requests', 0))
+            assert throughput > 0, f"Throughput is 0 for concurrency {val}"
+            assert latency > 0, f"Latency is 0 for concurrency {val}"
+            print(
+                f"{val:<15} {throughput:<15} {latency:<15} {num_requests:<15}"
+            )
+            if idx > 0:
+                assert throughput > float(
+                    results[concurrency_list[idx - 1]].get('throughput', 0)
+                ) * 1.3, f"Throughput did not increase by at least 1.3x at concurrency {val}"
+
+
 @pytest.mark.parametrize(
     "model_name, llama_model_root",
     [pytest.param("TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0")],