@@ -408,7 +408,7 @@ def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
                                               trust_remote_code=True,
                                               use_fast=False)
 
-    message = r"<|begin▁of▁sentence|><|User|>The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.<|Assistant|> "
+    message = r"REDACTED_SPECIAL_TOKENREDACTED_SPECIAL_TOKENThe operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.REDACTED_SPECIAL_TOKEN "
 
     inputs = tokenizer(message, return_tensors='pt',
                        add_special_tokens=False)['input_ids']
@@ -449,7 +449,9 @@ def __init__(self,
                  skip_engine_build: bool = False,
                  quant: Optional[str] = None,
                  extra_llm_api_options: Optional[str] = None,
-                 use_mpirun: bool = False):
+                 use_mpirun: bool = False,
+                 concurrency: Optional[int] = None,
+                 num_requests: int = 10):
 
         llm_models = llm_models_root()
         assert llm_models is not None
@@ -474,12 +476,14 @@ def __init__(self,
         else:
             self.mpirun_cmd = ""
         self.engine_path = None
+        self.concurrency = concurrency
+        self.num_requests = num_requests
 
     def __call__(self):
         self.prepare_dataset()
         if not (self.skip_engine_build or self.use_pytorch_backend):
             self.build_engine()
-        self.run_bench()
+        return self.run_bench()
 
     def prepare_dataset(self):
         dataset_tool = Path(self.llm_root, "benchmarks", "cpp",
@@ -502,7 +506,7 @@ def prepare_dataset(self):
             "--output-stdev",
             "0",
             "--num-requests",
-            "10",
+            str(self.num_requests),
         ]
         print(f"Running command: {' '.join(command)}")
         dataset_output = self.llm_venv.run_cmd(
@@ -556,7 +560,43 @@ def run_bench(self):
 
         if self.extra_llm_api_options:
             benchmark_cmd += f" --extra_llm_api_options {self.extra_llm_api_options}"
-        check_call(benchmark_cmd, shell=True, env=self.llm_venv._new_env)
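+        # Append the optional concurrency and request-count flags to the benchmark command.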
+        if self.concurrency:
+            benchmark_cmd += f" --concurrency {self.concurrency}"
+        if self.num_requests:
+            benchmark_cmd += f" --num_requests {self.num_requests}"
+
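+        # Capture stdout so throughput and latency can be parsed from the benchmark report.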
+        benchmark_output = check_output(benchmark_cmd,
+                                        shell=True,
+                                        env=self.llm_venv._new_env)
+        return self.parse_benchmark_output(benchmark_output)
+
+    def parse_benchmark_output(self, output):
+        """Parse the benchmark output to extract key metrics."""
+        result = {
+            'concurrency': self.concurrency,
+            'num_requests': self.num_requests,
+            'throughput': 0,
+            'latency': 0
+        }
+
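+        # Scan the report for the total token throughput and total latency summary lines.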
+        lines = output.split('\n')
+        for line in lines:
+            line = line.strip()
+            if 'total token throughput' in line.lower(
+            ) and 'tokens/sec' in line.lower():
+                try:
+                    throughput = line.split(":")[1].strip()
+                    result['throughput'] = throughput
+                except IndexError:
+                    pass
+            elif 'total latency' in line.lower() and 'ms' in line.lower():
+                try:
+                    latency = line.split(":")[1].strip()
+                    result['latency'] = latency
+                except IndexError:
+                    pass
+
+        return result
 
 
 @pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"],
@@ -579,6 +619,59 @@ def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name,
     runner()
 
 
+@pytest.mark.parametrize("model_name", ["meta/Meta-Llama-3.1-8B"],
+                         ids=["llama3_1-8b"])
+@pytest.mark.parametrize("model_subdir", ["llama-3.1-model/Meta-Llama-3.1-8B"],
+                         ids=["llama_v3_1"])
+@pytest.mark.parametrize("use_pytorch_backend", [False], ids=["trt_backend"])
+def test_trtllm_bench_mig_launch(llm_root, llm_venv, model_name, model_subdir,
+                                 use_pytorch_backend):
+    """Run the benchmark in MIG mode and check that throughput increases with concurrency."""
+    skip_engine_build = False
+    results = {}
+    concurrency_list = [1, 32, 64, 128]
+
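+    # Run the benchmark once per concurrency level; the request count scales with concurrency.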
+    for concurrency in concurrency_list:
+        num_requests = concurrency * 10
+        runner = BenchRunner(llm_root=llm_root,
+                             llm_venv=llm_venv,
+                             model_name=model_name,
+                             model_subdir=model_subdir,
+                             streaming=False,
+                             use_pytorch_backend=use_pytorch_backend,
+                             use_mpirun=False,
+                             tp_size=1,
+                             concurrency=concurrency,
+                             num_requests=num_requests,
+                             skip_engine_build=skip_engine_build)
+
+        output = runner()
+        results[concurrency] = output
+
+    print("\n=== Benchmark Results Comparison ===")
+    print(f"Model: {model_name}")
+    print(f"Backend: {'PyTorch' if use_pytorch_backend else 'TensorRT'}")
+    print(
+        f"{'Concurrency':<15} {'Throughput':<15} {'Latency':<15} {'Num Requests':<15}"
+    )
+    print("-" * 60)
+
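+    # Each run must report non-zero metrics, and throughput must exceed 1.3x the previous level's throughput.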
+    for idx, val in enumerate(concurrency_list):
+        if hasattr(results[val], 'get'):
+            throughput = float(results[val].get('throughput', 0))
+            latency = float(results[val].get('latency', 0))
+            num_requests = int(results[val].get('num_requests', 0))
+            assert throughput > 0, f"Throughput is 0 for concurrency {val}"
+            assert latency > 0, f"Latency is 0 for concurrency {val}"
+            print(
+                f"{val:<15} {throughput:<15} {latency:<15} {num_requests:<15}"
+            )
+            if idx > 0:
+                assert throughput > float(
+                    results[concurrency_list[idx - 1]].get('throughput', 0)
+                ) * 1.3, f"Throughput did not increase by at least 1.3x at concurrency {val}"
+
+
 @pytest.mark.parametrize(
     "model_name, llama_model_root",
     [pytest.param("TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0")],