Skip to content

Commit ba1b3f1

Browse files
LinPoly authored and Ransiki committed
[BREAKING CHANGE]: change default backend to PyTorch in trtllm-serve (NVIDIA#5717)
Signed-off-by: Pengyun Lin <[email protected]>
Signed-off-by: Ransiki Zhang <[email protected]>
1 parent 006d37d commit ba1b3f1

13 files changed

+47
-107
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def _signal_handler_cleanup_child(signum, frame):
7171

7272
def get_llm_args(model: str,
7373
tokenizer: Optional[str] = None,
74-
backend: Optional[str] = None,
74+
backend: str = "pytorch",
7575
max_beam_width: int = BuildConfig.max_beam_width,
7676
max_batch_size: int = BuildConfig.max_batch_size,
7777
max_num_tokens: int = BuildConfig.max_num_tokens,
@@ -165,8 +165,8 @@ def launch_server(host: str,
165165
help="Hostname of the server.")
166166
@click.option("--port", type=int, default=8000, help="Port of the server.")
167167
@click.option("--backend",
168-
type=click.Choice(["pytorch"]),
169-
default=None,
168+
type=click.Choice(["pytorch", "trt"]),
169+
default="pytorch",
170170
help="Set to 'pytorch' for pytorch path. Default is cpp path.")
171171
@click.option('--log_level',
172172
type=click.Choice(severity_map.keys()),

tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
44
free_gpu_memory_fraction: 0.25
5+
backend: "trt"
56
context_servers:
67
num_instances: 1
78
tensor_parallel_size: 2

tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4+
backend: "trt"
45
context_servers:
56
num_instances: 0
67
generation_servers:

tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ hostname: localhost
22
port: 8000
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
44
free_gpu_memory_fraction: 0.25
5+
backend: "trt"
56
context_servers:
67
num_instances: 1
78
tensor_parallel_size: 1

tests/integration/defs/stress_test/stress_test.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -364,12 +364,11 @@ def test_run_stress_test(config, stress_time_timeout, backend,
364364
"""
365365
# Create a new ModelConfig with the backend parameter
366366
# Convert 'trt' to None as expected by the ModelConfig
367-
backend_param = None if backend == "trt" else backend
368367

369368
new_config = ModelConfig(model_dir=config.model_dir,
370369
tp_size=config.tp_size,
371370
memory_requirement=config.memory_requirement,
372-
backend=backend_param)
371+
backend=backend)
373372

374373
# Extract stress_time and stress_timeout from the tuple
375374
stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
542541
str(config.tp_size),
543542
"--pp_size",
544543
str(test_server_config.pp_size),
544+
"--backend",
545+
config.backend,
545546
]
546547

547548
# Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@ def stress_test(config,
560561
extra_llm_options_path,
561562
])
562563

563-
# Add backend option only if specified
564-
# backend = None means trt backend
565-
# backend = pytorch means pytorch backend
566-
if config.backend:
567-
server_cmd.extend(["--backend", config.backend])
568-
569564
# Log the command we're about to run
570565
print_info(f"Running command: {' '.join(server_cmd)}")
571566

tests/integration/defs/test_e2e.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1407,13 +1407,7 @@ def test_openai_completions_example(llm_root, llm_venv, backend: str):
14071407

14081408
@pytest.mark.parametrize("backend", ["pytorch", "trt"])
14091409
def test_openai_chat_example(llm_root, llm_venv, backend: str):
1410-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14111410
test_root = unittest_path() / "llmapi" / "apps"
1412-
llm_venv.run_cmd([
1413-
"-m", "pip", "install", "-r",
1414-
os.path.join(example_root, "requirements.txt")
1415-
])
1416-
14171411
llm_venv.run_cmd([
14181412
"-m", "pytest",
14191413
str(test_root / "_test_openai_chat.py"), "-k", backend
@@ -1435,21 +1429,14 @@ def test_openai_lora(llm_root, llm_venv):
14351429

14361430

14371431
def test_openai_chat_multimodal_example(llm_root, llm_venv):
1438-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14391432
test_root = unittest_path() / "llmapi" / "apps"
1440-
llm_venv.run_cmd([
1441-
"-m", "pip", "install", "-r",
1442-
os.path.join(example_root, "requirements.txt")
1443-
])
1444-
14451433
llm_venv.run_cmd(
14461434
["-m", "pytest",
14471435
str(test_root / "_test_openai_chat_multimodal.py")])
14481436

14491437

14501438
def test_openai_chat_structural_tag_example(llm_venv):
14511439
test_root = unittest_path() / "llmapi" / "apps"
1452-
14531440
llm_venv.run_cmd([
14541441
"-m", "pytest",
14551442
str(test_root / "_test_openai_chat_structural_tag.py")
@@ -1459,13 +1446,7 @@ def test_openai_chat_structural_tag_example(llm_venv):
14591446
@pytest.mark.skip_less_device(2)
14601447
@pytest.mark.skip_less_device_memory(40000)
14611448
def test_openai_multi_chat_example(llm_root, llm_venv):
1462-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14631449
test_root = unittest_path() / "llmapi" / "apps"
1464-
llm_venv.run_cmd([
1465-
"-m", "pip", "install", "-r",
1466-
os.path.join(example_root, "requirements.txt")
1467-
])
1468-
14691450
llm_venv.run_cmd(
14701451
["-m", "pytest",
14711452
str(test_root / "_test_openai_multi_chat.py")])
@@ -1475,13 +1456,7 @@ def test_openai_multi_chat_example(llm_root, llm_venv):
14751456
@pytest.mark.skip_less_device(4)
14761457
@pytest.mark.skip_less_device_memory(80000)
14771458
def test_openai_consistent_chat(llm_root, llm_venv):
1478-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14791459
test_root = unittest_path() / "llmapi" / "apps"
1480-
llm_venv.run_cmd([
1481-
"-m", "pip", "install", "-r",
1482-
os.path.join(example_root, "requirements.txt")
1483-
])
1484-
14851460
llm_venv.run_cmd(
14861461
["-m", "pytest",
14871462
str(test_root / "_test_openai_consistent_chat.py")])
@@ -1491,13 +1466,7 @@ def test_openai_consistent_chat(llm_root, llm_venv):
14911466
@pytest.mark.skip_less_device(4)
14921467
@pytest.mark.skip_less_device_memory(80000)
14931468
def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
1494-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
14951469
test_root = unittest_path() / "llmapi" / "apps"
1496-
llm_venv.run_cmd([
1497-
"-m", "pip", "install", "-r",
1498-
os.path.join(example_root, "requirements.txt")
1499-
])
1500-
15011470
llm_venv.run_cmd([
15021471
"-m", "pytest", "-k", "tp16pp1",
15031472
str(test_root / "_test_openai_multi_nodes.py")
@@ -1508,13 +1477,7 @@ def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
15081477
@pytest.mark.skip_less_device(4)
15091478
@pytest.mark.skip_less_device_memory(80000)
15101479
def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
1511-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
15121480
test_root = unittest_path() / "llmapi" / "apps"
1513-
llm_venv.run_cmd([
1514-
"-m", "pip", "install", "-r",
1515-
os.path.join(example_root, "requirements.txt")
1516-
])
1517-
15181481
llm_venv.run_cmd([
15191482
"-m", "pytest", "-k", "tp8pp2",
15201483
str(test_root / "_test_openai_multi_nodes.py")
@@ -1523,13 +1486,7 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
15231486

15241487
@pytest.mark.skip_less_device_memory(80000)
15251488
def test_trtllm_benchmark_serving(llm_root, llm_venv):
1526-
example_root = Path(os.path.join(llm_root, "examples", "apps"))
15271489
test_root = unittest_path() / "llmapi" / "apps"
1528-
llm_venv.run_cmd([
1529-
"-m", "pip", "install", "-r",
1530-
os.path.join(example_root, "requirements.txt")
1531-
])
1532-
15331490
llm_venv.run_cmd(
15341491
["-m", "pytest",
15351492
str(test_root / "_test_trtllm_serve_benchmark.py")])

tests/unittest/llmapi/apps/_test_openai_chat.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ def model_name():
2020
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
2121

2222

23-
@pytest.fixture(scope="module",
24-
params=[None, 'pytorch'],
25-
ids=["trt", "pytorch"])
23+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
2624
def backend(request):
2725
return request.param
2826

@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
6765
def server(model_name: str, backend: str, extra_llm_api_options: bool,
6866
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
6967
model_path = get_model_path(model_name)
70-
if backend == "pytorch":
71-
args = ["--backend", f"{backend}"]
72-
else:
73-
args = ["--max_beam_width", "4"]
68+
args = ["--backend", f"{backend}"]
69+
if backend == "trt":
70+
args.extend(["--max_beam_width", "4"])
7471
if extra_llm_api_options:
7572
args.extend(
7673
["--extra_llm_api_options", temp_extra_llm_api_options_file])

tests/unittest/llmapi/apps/_test_openai_completions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def model_name():
1414
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1515

1616

17-
@pytest.fixture(scope="module", params=["trt", 'pytorch'])
17+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
1818
def backend(request):
1919
return request.param
2020

@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
2929
@pytest.fixture(scope="module")
3030
def server(model_name: str, backend: str, num_postprocess_workers: int):
3131
model_path = get_model_path(model_name)
32-
if backend == "pytorch":
33-
args = ["--backend", f"{backend}"]
34-
else:
35-
args = ["--max_beam_width", "4"]
32+
args = ["--backend", f"{backend}"]
33+
if backend == "trt":
34+
args.extend(["--max_beam_width", "4"])
3635
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
3736
with RemoteOpenAIServer(model_path, args) as remote_server:
3837
yield remote_server

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def client():
2121
llm = PyTorchLLM(model=llama_model_path,
2222
build_config=build_config,
2323
kv_cache_config=KvCacheConfig(),
24-
backend="pytorch",
2524
enable_iter_perf_stats=True)
2625
hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
2726

tests/unittest/llmapi/apps/_test_openai_misc.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,17 @@ def model_name():
1515
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1616

1717

18-
@pytest.fixture(scope="module", params=["trt", 'pytorch'])
18+
@pytest.fixture(scope="module", params=["trt", "pytorch"])
1919
def backend(request):
2020
return request.param
2121

2222

23-
@pytest.fixture(scope="module", params=['8'])
23+
@pytest.fixture(scope="module", params=["8"])
2424
def max_batch_size(request):
2525
return request.param
2626

2727

28-
@pytest.fixture(scope="module", params=['80000'])
28+
@pytest.fixture(scope="module", params=["80000"])
2929
def max_seq_len(request):
3030
return request.param
3131

@@ -34,19 +34,13 @@ def max_seq_len(request):
3434
def server(model_name: str, backend: str, max_batch_size: str,
3535
max_seq_len: str):
3636
model_path = get_model_path(model_name)
37-
args = []
38-
if backend == "pytorch":
39-
args.append("--backend")
40-
args.append(backend)
37+
args = ["--backend", f"{backend}"]
4138
if backend != "pytorch":
42-
args.append("--max_beam_width")
43-
args.append("4")
39+
args.extend(["--max_beam_width", "4"])
4440
if max_batch_size is not None:
45-
args.append("--max_batch_size")
46-
args.append(max_batch_size)
41+
args.extend(["--max_batch_size", max_batch_size])
4742
if max_seq_len is not None:
48-
args.append("--max_seq_len")
49-
args.append(max_seq_len)
43+
args.extend(["--max_seq_len", max_seq_len])
5044
with RemoteOpenAIServer(model_path, args) as remote_server:
5145
yield remote_server
5246

0 commit comments

Comments (0)