Fix tests accordingly

LinPoly · LinPoly · commit 59f4d413ff7e · 2025-07-10T07:09:48.000Z
Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
@@ -322,7 +322,7 @@ def check_server_health(server_url: str,
 
 @pytest.mark.parametrize("test_mode", ["stress-test", "stress-stage-alone"],
                          ids=lambda x: x)
-@pytest.mark.parametrize("backend", ["trt", "pytorch"], ids=lambda x: x)
+@pytest.mark.parametrize("backend", ["tensorrt", "pytorch"], ids=lambda x: x)
 @pytest.mark.parametrize("capacity_scheduler_policy",
                          ["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
                          ids=lambda x: x)
@@ -358,18 +358,17 @@ def test_run_stress_test(config, stress_time_timeout, backend,
     Args:
         config: Model configuration for the test (injected by pytest.mark.parametrize)
         stress_time_timeout: Tuple of (stress_time, stress_timeout) in seconds
-        backend: Backend to use ("trt" or "pytorch")
+        backend: Backend to use ("tensorrt" or "pytorch")
         capacity_scheduler_policy: Scheduler policy ("GUARANTEED_NO_EVICT", "MAX_UTILIZATION")
         test_mode: Test mode ("stress-test" or "stress-stage-alone")
     """
     # Create a new ModelConfig with the backend parameter
     # Convert 'trt' to None as expected by the ModelConfig
-    backend_param = None if backend == "trt" else backend
 
     new_config = ModelConfig(model_dir=config.model_dir,
                              tp_size=config.tp_size,
                              memory_requirement=config.memory_requirement,
-                             backend=backend_param)
+                             backend=backend)
 
     # Extract stress_time and stress_timeout from the tuple
     stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
         str(config.tp_size),
         "--pp_size",
         str(test_server_config.pp_size),
+        "--backend",
+        config.backend,
     ]
 
     # Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@ def stress_test(config,
         extra_llm_options_path,
     ])
 
-    # Add backend option only if specified
-    # backend = None means trt backend
-    # backend = pytorch means pytorch backend
-    if config.backend:
-        server_cmd.extend(["--backend", config.backend])
-
     # Log the command we're about to run
     print_info(f"Running command: {' '.join(server_cmd)}")
 
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat.py b/tests/unittest/llmapi/apps/_test_openai_chat.py
@@ -20,9 +20,7 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
 def server(model_name: str, backend: str, extra_llm_api_options: bool,
            temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
     model_path = get_model_path(model_name)
-    if backend == "pytorch":
-        args = ["--backend", f"{backend}"]
-    else:
-        args = ["--max_beam_width", "4"]
+    args = ["--backend", f"{backend}"]
+    if backend == "tensorrt":
+        args.extend(["--max_beam_width", "4"])
     if extra_llm_api_options:
         args.extend(
             ["--extra_llm_api_options", temp_extra_llm_api_options_file])
diff --git a/tests/unittest/llmapi/apps/_test_openai_completions.py b/tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -14,7 +14,7 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module", params=[None, 'pytorch'])
+@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
 @pytest.fixture(scope="module")
 def server(model_name: str, backend: str, num_postprocess_workers: int):
     model_path = get_model_path(model_name)
-    if backend == "pytorch":
-        args = ["--backend", f"{backend}"]
-    else:
-        args = ["--max_beam_width", "4"]
+    args = ["--backend", f"{backend}"]
+    if backend == "tensorrt":
+        args.extend(["--max_beam_width", "4"])
     args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py
@@ -21,7 +21,6 @@ def client():
     llm = PyTorchLLM(model=llama_model_path,
                      build_config=build_config,
                      kv_cache_config=KvCacheConfig(),
-                     backend="pytorch",
                      enable_iter_perf_stats=True)
     hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
 
diff --git a/tests/unittest/llmapi/apps/_test_openai_misc.py b/tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -15,17 +15,17 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module", params=["trt", 'pytorch'])
+@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
 def backend(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=['8'])
+@pytest.fixture(scope="module", params=["8"])
 def max_batch_size(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=['80000'])
+@pytest.fixture(scope="module", params=["80000"])
 def max_seq_len(request):
     return request.param
 
@@ -34,19 +34,13 @@ def max_seq_len(request):
 def server(model_name: str, backend: str, max_batch_size: str,
            max_seq_len: str):
     model_path = get_model_path(model_name)
-    args = []
-    if backend == "pytorch":
-        args.append("--backend")
-        args.append(backend)
+    args = ["--backend", f"{backend}"]
     if backend != "pytorch":
-        args.append("--max_beam_width")
-        args.append("4")
+        args.extend(["--max_beam_width", "4"])
     if max_batch_size is not None:
-        args.append("--max_batch_size")
-        args.append(max_batch_size)
+        args.extend(["--max_batch_size", max_batch_size])
     if max_seq_len is not None:
-        args.append("--max_seq_len")
-        args.append(max_seq_len)
+        args.extend(["--max_seq_len", max_seq_len])
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
 
diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py b/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py
@@ -15,9 +15,7 @@ def model_name():
     return "llama-models-v3/llama-v3-8b-instruct-hf"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -55,13 +53,10 @@ def temp_extra_llm_api_options_file(request):
 def server(model_name: str, backend: str, extra_llm_api_options: bool,
            temp_extra_llm_api_options_file: str):
     model_path = get_model_path(model_name)
-    args = ["--tp_size", "2", "--max_beam_width", "1"]
-    if backend is not None:
-        args.append("--backend")
-        args.append(backend)
+    args = ["--tp_size", "2", "--max_beam_width", "1", "--backend", backend]
     if extra_llm_api_options:
-        args.append("--extra_llm_api_options")
-        args.append(temp_extra_llm_api_options_file)
+        args.extend(
+            ["--extra_llm_api_options", temp_extra_llm_api_options_file])
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
 
@@ -95,7 +90,7 @@ def test_chat_tp2(client: openai.OpenAI, model_name: str):
     assert len(chat_completion.choices) == 1
     assert chat_completion.usage.completion_tokens == 1
     message = chat_completion.choices[0].message
-    assert message.content == 'Two'
+    assert message.content == "Two"
 
 
 @skip_single_gpu
diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py b/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py
@@ -48,12 +48,17 @@ def server(model_name: str, backend: str, tp_pp_size: tuple):
     tp_size, pp_size = tp_pp_size
     device_count = torch.cuda.device_count()
     args = [
-        "--tp_size", f"{tp_size}", "--pp_size", f"{pp_size}", "--gpus_per_node",
-        f"{device_count}", "--kv_cache_free_gpu_memory_fraction", "0.95"
+        "--tp_size",
+        f"{tp_size}",
+        "--pp_size",
+        f"{pp_size}",
+        "--gpus_per_node",
+        f"{device_count}",
+        "--kv_cache_free_gpu_memory_fraction",
+        "0.95",
+        "--backend",
+        backend,
     ]
-    if backend is not None:
-        args.append("--backend")
-        args.append(backend)
     with RemoteOpenAIServer(model_path, args, llmapi_launch=True,
                             port=8001) as remote_server:
         yield remote_server
diff --git a/tests/unittest/llmapi/apps/_test_openai_reasoning.py b/tests/unittest/llmapi/apps/_test_openai_reasoning.py
@@ -14,19 +14,15 @@ def model_name() -> str:
     return "DeepSeek-R1-Distill-Qwen-1.5B"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
 def backend(request):
     return request.param
 
 
 @pytest.fixture(scope="module")
-def server(model_name: str, backend: str) -> RemoteOpenAIServer:
+def server(model_name: str, backend: str):
     model_path = get_model_path(model_name)
-    args = []
-    if backend == "pytorch":
-        args.extend(["--backend", f"{backend}"])
+    args = ["--backend", f"{backend}"]
     max_beam_width = 1 if backend == "pytorch" else 2
     args.extend(["--max_beam_width", str(max_beam_width)])
     args.extend(["--max_batch_size", "2", "--max_seq_len", "1024"])
@@ -68,7 +64,7 @@ def test_reasoning_parser(client: openai.OpenAI, model_name: str, backend: str):
 
 
 @pytest.fixture(scope="module")
-def oning_client(server: RemoteOpenAIServer) -> openai.OpenAI:
+def async_client(server: RemoteOpenAIServer) -> openai.AsyncOpenAI:
     return server.get_async_client()
 
 
@@ -90,10 +86,10 @@ async def process_stream(
 
 
 @pytest.mark.asyncio(loop_scope="module")
-async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
-                                          model_name: str, backend: str):
+async def test_reasoning_parser_streaming(async_client: openai.AsyncOpenAI,
+                                          model_name: str):
     messages = [{"role": "user", "content": "hi"}]
-    stream = await oning_client.chat.completions.create(
+    stream = await async_client.chat.completions.create(
         model=model_name,
         messages=messages,
         max_completion_tokens=1000,
@@ -106,7 +102,7 @@ async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
     assert len(content_chunks) > 0
     assert len(reasoning_content_chunks) > 0
 
-    stream = await oning_client.chat.completions.create(
+    stream = await async_client.chat.completions.create(
         model=model_name,
         messages=messages,
         max_completion_tokens=1,