Skip to content

Commit 59f4d41

Browse files
committed
Fix tests accordingly
Signed-off-by: Pengyun Lin <[email protected]>
1 parent dfa67e9 commit 59f4d41

File tree

8 files changed, with 43 additions and 63 deletions.

tests/integration/defs/stress_test/stress_test.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ def check_server_health(server_url: str,
322322

323323
@pytest.mark.parametrize("test_mode", ["stress-test", "stress-stage-alone"],
324324
ids=lambda x: x)
325-
@pytest.mark.parametrize("backend", ["trt", "pytorch"], ids=lambda x: x)
325+
@pytest.mark.parametrize("backend", ["tensorrt", "pytorch"], ids=lambda x: x)
326326
@pytest.mark.parametrize("capacity_scheduler_policy",
327327
["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
328328
ids=lambda x: x)
@@ -358,18 +358,17 @@ def test_run_stress_test(config, stress_time_timeout, backend,
358358
Args:
359359
config: Model configuration for the test (injected by pytest.mark.parametrize)
360360
stress_time_timeout: Tuple of (stress_time, stress_timeout) in seconds
361-
backend: Backend to use ("trt" or "pytorch")
361+
backend: Backend to use ("tensorrt" or "pytorch")
362362
capacity_scheduler_policy: Scheduler policy ("GUARANTEED_NO_EVICT", "MAX_UTILIZATION")
363363
test_mode: Test mode ("stress-test" or "stress-stage-alone")
364364
"""
365365
# Create a new ModelConfig with the backend parameter
366366
# The backend name ("tensorrt" or "pytorch") is passed to ModelConfig unchanged
367-
backend_param = None if backend == "trt" else backend
368367

369368
new_config = ModelConfig(model_dir=config.model_dir,
370369
tp_size=config.tp_size,
371370
memory_requirement=config.memory_requirement,
372-
backend=backend_param)
371+
backend=backend)
373372

374373
# Extract stress_time and stress_timeout from the tuple
375374
stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
542541
str(config.tp_size),
543542
"--pp_size",
544543
str(test_server_config.pp_size),
544+
"--backend",
545+
config.backend,
545546
]
546547

547548
# Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@ def stress_test(config,
560561
extra_llm_options_path,
561562
])
562563

563-
# Add backend option only if specified
564-
# backend = None means trt backend
565-
# backend = pytorch means pytorch backend
566-
if config.backend:
567-
server_cmd.extend(["--backend", config.backend])
568-
569564
# Log the command we're about to run
570565
print_info(f"Running command: {' '.join(server_cmd)}")
571566

tests/unittest/llmapi/apps/_test_openai_chat.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ def model_name():
2020
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
2121

2222

23-
@pytest.fixture(scope="module",
24-
params=[None, 'pytorch'],
25-
ids=["trt", "pytorch"])
23+
@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
2624
def backend(request):
2725
return request.param
2826

@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
6765
def server(model_name: str, backend: str, extra_llm_api_options: bool,
6866
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
6967
model_path = get_model_path(model_name)
70-
if backend == "pytorch":
71-
args = ["--backend", f"{backend}"]
72-
else:
73-
args = ["--max_beam_width", "4"]
68+
args = ["--backend", f"{backend}"]
69+
if backend == "tensorrt":
70+
args.extend(["--max_beam_width", "4"])
7471
if extra_llm_api_options:
7572
args.extend(
7673
["--extra_llm_api_options", temp_extra_llm_api_options_file])

tests/unittest/llmapi/apps/_test_openai_completions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def model_name():
1414
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1515

1616

17-
@pytest.fixture(scope="module", params=[None, 'pytorch'])
17+
@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
1818
def backend(request):
1919
return request.param
2020

@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
2929
@pytest.fixture(scope="module")
3030
def server(model_name: str, backend: str, num_postprocess_workers: int):
3131
model_path = get_model_path(model_name)
32-
if backend == "pytorch":
33-
args = ["--backend", f"{backend}"]
34-
else:
35-
args = ["--max_beam_width", "4"]
32+
args = ["--backend", f"{backend}"]
33+
if backend == "tensorrt":
34+
args.extend(["--max_beam_width", "4"])
3635
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
3736
with RemoteOpenAIServer(model_path, args) as remote_server:
3837
yield remote_server

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def client():
2121
llm = PyTorchLLM(model=llama_model_path,
2222
build_config=build_config,
2323
kv_cache_config=KvCacheConfig(),
24-
backend="pytorch",
2524
enable_iter_perf_stats=True)
2625
hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
2726

tests/unittest/llmapi/apps/_test_openai_misc.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,17 @@ def model_name():
1515
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
1616

1717

18-
@pytest.fixture(scope="module", params=["trt", 'pytorch'])
18+
@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
1919
def backend(request):
2020
return request.param
2121

2222

23-
@pytest.fixture(scope="module", params=['8'])
23+
@pytest.fixture(scope="module", params=["8"])
2424
def max_batch_size(request):
2525
return request.param
2626

2727

28-
@pytest.fixture(scope="module", params=['80000'])
28+
@pytest.fixture(scope="module", params=["80000"])
2929
def max_seq_len(request):
3030
return request.param
3131

@@ -34,19 +34,13 @@ def max_seq_len(request):
3434
def server(model_name: str, backend: str, max_batch_size: str,
3535
max_seq_len: str):
3636
model_path = get_model_path(model_name)
37-
args = []
38-
if backend == "pytorch":
39-
args.append("--backend")
40-
args.append(backend)
37+
args = ["--backend", f"{backend}"]
4138
if backend != "pytorch":
42-
args.append("--max_beam_width")
43-
args.append("4")
39+
args.extend(["--max_beam_width", "4"])
4440
if max_batch_size is not None:
45-
args.append("--max_batch_size")
46-
args.append(max_batch_size)
41+
args.extend(["--max_batch_size", max_batch_size])
4742
if max_seq_len is not None:
48-
args.append("--max_seq_len")
49-
args.append(max_seq_len)
43+
args.extend(["--max_seq_len", max_seq_len])
5044
with RemoteOpenAIServer(model_path, args) as remote_server:
5145
yield remote_server
5246

tests/unittest/llmapi/apps/_test_openai_multi_gpu.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@ def model_name():
1515
return "llama-models-v3/llama-v3-8b-instruct-hf"
1616

1717

18-
@pytest.fixture(scope="module",
19-
params=[None, 'pytorch'],
20-
ids=["trt", "pytorch"])
18+
@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
2119
def backend(request):
2220
return request.param
2321

@@ -55,13 +53,10 @@ def temp_extra_llm_api_options_file(request):
5553
def server(model_name: str, backend: str, extra_llm_api_options: bool,
5654
temp_extra_llm_api_options_file: str):
5755
model_path = get_model_path(model_name)
58-
args = ["--tp_size", "2", "--max_beam_width", "1"]
59-
if backend is not None:
60-
args.append("--backend")
61-
args.append(backend)
56+
args = ["--tp_size", "2", "--max_beam_width", "1", "--backend", backend]
6257
if extra_llm_api_options:
63-
args.append("--extra_llm_api_options")
64-
args.append(temp_extra_llm_api_options_file)
58+
args.extend(
59+
["--extra_llm_api_options", temp_extra_llm_api_options_file])
6560
with RemoteOpenAIServer(model_path, args) as remote_server:
6661
yield remote_server
6762

@@ -95,7 +90,7 @@ def test_chat_tp2(client: openai.OpenAI, model_name: str):
9590
assert len(chat_completion.choices) == 1
9691
assert chat_completion.usage.completion_tokens == 1
9792
message = chat_completion.choices[0].message
98-
assert message.content == 'Two'
93+
assert message.content == "Two"
9994

10095

10196
@skip_single_gpu

tests/unittest/llmapi/apps/_test_openai_multi_nodes.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,17 @@ def server(model_name: str, backend: str, tp_pp_size: tuple):
4848
tp_size, pp_size = tp_pp_size
4949
device_count = torch.cuda.device_count()
5050
args = [
51-
"--tp_size", f"{tp_size}", "--pp_size", f"{pp_size}", "--gpus_per_node",
52-
f"{device_count}", "--kv_cache_free_gpu_memory_fraction", "0.95"
51+
"--tp_size",
52+
f"{tp_size}",
53+
"--pp_size",
54+
f"{pp_size}",
55+
"--gpus_per_node",
56+
f"{device_count}",
57+
"--kv_cache_free_gpu_memory_fraction",
58+
"0.95",
59+
"--backend",
60+
backend,
5361
]
54-
if backend is not None:
55-
args.append("--backend")
56-
args.append(backend)
5762
with RemoteOpenAIServer(model_path, args, llmapi_launch=True,
5863
port=8001) as remote_server:
5964
yield remote_server

tests/unittest/llmapi/apps/_test_openai_reasoning.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,15 @@ def model_name() -> str:
1414
return "DeepSeek-R1-Distill-Qwen-1.5B"
1515

1616

17-
@pytest.fixture(scope="module",
18-
params=[None, 'pytorch'],
19-
ids=["trt", "pytorch"])
17+
@pytest.fixture(scope="module", params=["tensorrt", "pytorch"])
2018
def backend(request):
2119
return request.param
2220

2321

2422
@pytest.fixture(scope="module")
25-
def server(model_name: str, backend: str) -> RemoteOpenAIServer:
23+
def server(model_name: str, backend: str):
2624
model_path = get_model_path(model_name)
27-
args = []
28-
if backend == "pytorch":
29-
args.extend(["--backend", f"{backend}"])
25+
args = ["--backend", f"{backend}"]
3026
max_beam_width = 1 if backend == "pytorch" else 2
3127
args.extend(["--max_beam_width", str(max_beam_width)])
3228
args.extend(["--max_batch_size", "2", "--max_seq_len", "1024"])
@@ -68,7 +64,7 @@ def test_reasoning_parser(client: openai.OpenAI, model_name: str, backend: str):
6864

6965

7066
@pytest.fixture(scope="module")
71-
def oning_client(server: RemoteOpenAIServer) -> openai.OpenAI:
67+
def async_client(server: RemoteOpenAIServer) -> openai.AsyncOpenAI:
7268
return server.get_async_client()
7369

7470

@@ -90,10 +86,10 @@ async def process_stream(
9086

9187

9288
@pytest.mark.asyncio(loop_scope="module")
93-
async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
94-
model_name: str, backend: str):
89+
async def test_reasoning_parser_streaming(async_client: openai.AsyncOpenAI,
90+
model_name: str):
9591
messages = [{"role": "user", "content": "hi"}]
96-
stream = await oning_client.chat.completions.create(
92+
stream = await async_client.chat.completions.create(
9793
model=model_name,
9894
messages=messages,
9995
max_completion_tokens=1000,
@@ -106,7 +102,7 @@ async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
106102
assert len(content_chunks) > 0
107103
assert len(reasoning_content_chunks) > 0
108104

109-
stream = await oning_client.chat.completions.create(
105+
stream = await async_client.chat.completions.create(
110106
model=model_name,
111107
messages=messages,
112108
max_completion_tokens=1,

0 commit comments

Comments
 (0)