Skip to content

Commit ba1b3f1

Browse files
LinPoly authored and Ransiki committed
[BREAKING CHANGE]: change default backend to PyTorch in trtllm-serve (NVIDIA#5717)
Signed-off-by: Pengyun Lin <[email protected]>
Signed-off-by: Ransiki Zhang <[email protected]>
1 parent 006d37d commit ba1b3f1

13 files changed

+47
-107
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def _signal_handler_cleanup_child(signum, frame):
7171

7272
def get_llm_args(model: str,
7373
tokenizer: Optional[str] = None,
74-
backend: Optional[str] = None,
74+
backend: str = "pytorch",
7575
max_beam_width: int = BuildConfig.max_beam_width,
7676
max_batch_size: int = BuildConfig.max_batch_size,
7777
max_num_tokens: int = BuildConfig.max_num_tokens,
@@ -165,8 +165,8 @@ def launch_server(host: str,
165165
help="Hostname of the server.")
166166
@click.option("--port", type=int, default=8000, help="Port of the server.")
167167
@click.option("--backend",
168-
type=click.Choice(["pytorch"]),
169-
default=None,
168+
type=click.Choice(["pytorch", "trt"]),
169+
default="pytorch",
170170
help="Set to 'pytorch' for pytorch path. Default is cpp path.")
171171
@click.option('--log_level',
172172
type=click.Choice(severity_map.keys()),

tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
44
free_gpu_memory_fraction: 0.25
5+
backend: "trt"
56
context_servers:
67
num_instances: 1
78
tensor_parallel_size: 2

tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4+
backend: "trt"
45
context_servers:
56
num_instances: 0
67
generation_servers:

tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
44
free_gpu_memory_fraction: 0.25
5+
backend: "trt"
56
context_servers:
67
num_instances: 1
78
tensor_parallel_size: 1

tests/integration/defs/stress_test/stress_test.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -364,12 +364,11 @@ def test_run_stress_test(config, stress_time_timeout, backend,
364364
"""
365365
# Create a new ModelConfig with the backend parameter
366366
# Convert 'trt' to None as expected by the ModelConfig
367-
backend_param = None if backend == "trt" else backend
368367

369368
new_config = ModelConfig(model_dir=config.model_dir,
370369
tp_size=config.tp_size,
371370
memory_requirement=config.memory_requirement,
372-
backend=backend_param)
371+
backend=backend)
373372

374373
# Extract stress_time and stress_timeout from the tuple
375374
stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
542541
str(config.tp_size),
543542
"--pp_size",
544543
str(test_server_config.pp_size),
544+
"--backend",
545+
config.backend,
545546
]
546547

547548
# Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@ def stress_test(config,
560561
extra_llm_options_path,
561562
])
562563

563-
# Add backend option only if specified
564-
# backend = None means trt backend
565-
# backend = pytorch means pytorch backend
566-
if config.backend:
567-
server_cmd.extend(["--backend", config.backend])
568-
569564
# Log the command we're about to run
570565
print_info(f"Running command: {' '.join(server_cmd)}")
571566

tests/integration/defs/test_e2e.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1407,13 +1407,7 @@ def test_openai_completions_example(llm_root, llm_venv, backend: str):
14071407

14081408
@pytest.mark.parametrize("backend", ["pytorch", "trt"])
14091409
def test_openai_chat_example(llm_root, llm_venv, backend: str):
1410-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14111410
test_root = unittest_path() / "llmapi" / "apps"
1412-
llm_venv.run_cmd([
1413-
"-m", "pip", "install", "-r",
1414-
os.path.join(example_root, "requirements.txt")
1415-
])
1416-
14171411
llm_venv.run_cmd([
14181412
"-m", "pytest",
14191413
str(test_root / "_test_openai_chat.py"), "-k", backend
@@ -1435,21 +1429,14 @@ def test_openai_lora(llm_root, llm_venv):
14351429

14361430

14371431
def test_openai_chat_multimodal_example(llm_root, llm_venv):
1438-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14391432
test_root = unittest_path() / "llmapi" / "apps"
1440-
llm_venv.run_cmd([
1441-
"-m", "pip", "install", "-r",
1442-
os.path.join(example_root, "requirements.txt")
1443-
])
1444-
14451433
llm_venv.run_cmd(
14461434
["-m", "pytest",
14471435
str(test_root / "_test_openai_chat_multimodal.py")])
14481436

14491437

14501438
def test_openai_chat_structural_tag_example(llm_venv):
14511439
test_root = unittest_path() / "llmapi" / "apps"
1452-
14531440
llm_venv.run_cmd([
14541441
"-m", "pytest",
14551442
str(test_root / "_test_openai_chat_structural_tag.py")
@@ -1459,13 +1446,7 @@ def test_openai_chat_structural_tag_example(llm_venv):
14591446
@pytest.mark.skip_less_device(2)
14601447
@pytest.mark.skip_less_device_memory(40000)
14611448
def test_openai_multi_chat_example(llm_root, llm_venv):
1462-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14631449
test_root = unittest_path() / "llmapi" / "apps"
1464-
llm_venv.run_cmd([
1465-
"-m", "pip", "install", "-r",
1466-
os.path.join(example_root, "requirements.txt")
1467-
])
1468-
14691450
llm_venv.run_cmd(
14701451
["-m", "pytest",
14711452
str(test_root / "_test_openai_multi_chat.py")])
@@ -1475,13 +1456,7 @@ def test_openai_multi_chat_example(llm_root, llm_venv):
14751456
@pytest.mark.skip_less_device(4)
14761457
@pytest.mark.skip_less_device_memory(80000)
14771458
def test_openai_consistent_chat(llm_root, llm_venv):
1478-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14791459
test_root = unittest_path() / "llmapi" / "apps"
1480-
llm_venv.run_cmd([
1481-
"-m", "pip", "install", "-r",
1482-
os.path.join(example_root, "requirements.txt")
1483-
])
1484-
14851460
llm_venv.run_cmd(
14861461
["-m", "pytest",
14871462
str(test_root / "_test_openai_consistent_chat.py")])
@@ -1491,13 +1466,7 @@ def test_openai_consistent_chat(llm_root, llm_venv):
14911466
@pytest.mark.skip_less_device(4)
14921467
@pytest.mark.skip_less_device_memory(80000)
14931468
def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
1494-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14951469
test_root = unittest_path() / "llmapi" / "apps"
1496-
llm_venv.run_cmd([
1497-
"-m", "pip", "install", "-r",
1498-
os.path.join(example_root, "requirements.txt")
1499-
])
1500-
15011470
llm_venv.run_cmd([
15021471
"-m", "pytest", "-k", "tp16pp1",
15031472
str(test_root / "_test_openai_multi_nodes.py")
@@ -1508,13 +1477,7 @@ def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
15081477
@pytest.mark.skip_less_device(4)
15091478
@pytest.mark.skip_less_device_memory(80000)
15101479
def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
1511-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
15121480
test_root = unittest_path() / "llmapi" / "apps"
1513-
llm_venv.run_cmd([
1514-
"-m", "pip", "install", "-r",
1515-
os.path.join(example_root, "requirements.txt")
1516-
])
1517-
15181481
llm_venv.run_cmd([
15191482
"-m", "pytest", "-k", "tp8pp2",
15201483
str(test_root / "_test_openai_multi_nodes.py")
@@ -1523,13 +1486,7 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
15231486

15241487
@pytest.mark.skip_less_device_memory(80000)
15251488
def test_trtllm_benchmark_serving(llm_root, llm_venv):
1526-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
15271489
test_root = unittest_path() / "llmapi" / "apps"
1528-
llm_venv.run_cmd([
1529-
"-m", "pip", "install", "-r",
1530-
os.path.join(example_root, "requirements.txt")
1531-
])
1532-
15331490
llm_venv.run_cmd(
15341491
["-m", "pytest",
15351492
str(test_root / "_test_trtllm_serve_benchmark.py")])

tests/unittest/llmapi/apps/_test_openai_chat.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ def model_name():
2020
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
2121

2222

23-
@pytest.fixture(scope="module",
24-
params=[None, 'pytorch'],
25-
ids=["trt", "pytorch"])
23+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
2624
def backend(request):
2725
return request.param
2826

@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
6765
def server(model_name: str, backend: str, extra_llm_api_options: bool,
6866
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
6967
model_path = get_model_path(model_name)
70-
if backend == "pytorch":
71-
args = ["--backend", f"{backend}"]
72-
else:
73-
args = ["--max_beam_width", "4"]
68+
args = ["--backend", f"{backend}"]
69+
if backend == "trt":
70+
args.extend(["--max_beam_width", "4"])
7471
if extra_llm_api_options:
7572
args.extend(
7673
["--extra_llm_api_options", temp_extra_llm_api_options_file])

tests/unittest/llmapi/apps/_test_openai_completions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def model_name():
1414
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1515

1616

17-
@pytest.fixture(scope="module", params=["trt", 'pytorch'])
17+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
1818
def backend(request):
1919
return request.param
2020

@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
2929
@pytest.fixture(scope="module")
3030
def server(model_name: str, backend: str, num_postprocess_workers: int):
3131
model_path = get_model_path(model_name)
32-
if backend == "pytorch":
33-
args = ["--backend", f"{backend}"]
34-
else:
35-
args = ["--max_beam_width", "4"]
32+
args = ["--backend", f"{backend}"]
33+
if backend == "trt":
34+
args.extend(["--max_beam_width", "4"])
3635
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
3736
with RemoteOpenAIServer(model_path, args) as remote_server:
3837
yield remote_server

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def client():
2121
llm = PyTorchLLM(model=llama_model_path,
2222
build_config=build_config,
2323
kv_cache_config=KvCacheConfig(),
24-
backend="pytorch",
2524
enable_iter_perf_stats=True)
2625
hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
2726

tests/unittest/llmapi/apps/_test_openai_misc.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,17 @@ def model_name():
1515
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1616

1717

18-
@pytest.fixture(scope="module", params=["trt", 'pytorch'])
18+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
1919
def backend(request):
2020
return request.param
2121

2222

23-
@pytest.fixture(scope="module", params=['8'])
23+
@pytest.fixture(scope="module", params=["8"])
2424
def max_batch_size(request):
2525
return request.param
2626

2727

28-
@pytest.fixture(scope="module", params=['80000'])
28+
@pytest.fixture(scope="module", params=["80000"])
2929
def max_seq_len(request):
3030
return request.param
3131

@@ -34,19 +34,13 @@ def max_seq_len(request):
3434
def server(model_name: str, backend: str, max_batch_size: str,
3535
max_seq_len: str):
3636
model_path = get_model_path(model_name)
37-
args = []
38-
if backend == "pytorch":
39-
args.append("--backend")
40-
args.append(backend)
37+
args = ["--backend", f"{backend}"]
4138
if backend != "pytorch":
42-
args.append("--max_beam_width")
43-
args.append("4")
39+
args.extend(["--max_beam_width", "4"])
4440
if max_batch_size is not None:
45-
args.append("--max_batch_size")
46-
args.append(max_batch_size)
41+
args.extend(["--max_batch_size", max_batch_size])
4742
if max_seq_len is not None:
48-
args.append("--max_seq_len")
49-
args.append(max_seq_len)
43+
args.extend(["--max_seq_len", max_seq_len])
5044
with RemoteOpenAIServer(model_path, args) as remote_server:
5145
yield remote_server
5246

0 commit comments

Comments (0)