Skip to content

Commit 2a6a059

Browse files
committed
[https://nvbugs/5509024][fix] Fix multimodal keyword matching tests
* Also print parsed_outputs for easy debugging.
Signed-off-by: Wanli Jiang <[email protected]>
1 parent aaa381d commit 2a6a059

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

tests/integration/defs/test_e2e.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,7 +2334,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
23342334
],
23352335
"mixture_text_image":
23362336
[["invention", "person", "scientists", "Lick", "engineers"],
2337-
["landscape", "trees", "road", "natural", "rock"]]
2337+
["landscape", "trees", "road", "depicts", "scenic"]]
23382338
},
23392339
"gemma-3-27b-it": {
23402340
"image": [
@@ -2378,13 +2378,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
23782378
if model_name == "qwen2-vl-7b-instruct" and modality == "image":
23792379
match_ratio = 4.0 / 6
23802380

2381+
parsed_outputs = parse_output(output)
23812382
for prompt_output, prompt_keywords in zip(
2382-
parse_output(output), expected_keywords[model_name][modality]):
2383+
parsed_outputs, expected_keywords[model_name][modality]):
23832384
matches = [
23842385
keyword in prompt_output.lower() for keyword in prompt_keywords
23852386
]
23862387
obs_match_ratio = 1. * sum(matches) / len(matches)
2387-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2388+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
23882389

23892390
print("All answers are correct!")
23902391

@@ -2517,13 +2518,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
25172518
output = llm_venv.run_cmd(cmd, caller=check_output)
25182519

25192520
match_ratio = 0.6
2520-
for prompt_output, prompt_keywords in zip(parse_output(output),
2521+
parsed_outputs = parse_output(output)
2522+
for prompt_output, prompt_keywords in zip(parsed_outputs,
25212523
expected_keywords[modality]):
25222524
matches = [
25232525
keyword in prompt_output.lower() for keyword in prompt_keywords
25242526
]
25252527
obs_match_ratio = 1. * sum(matches) / len(matches)
2526-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2528+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
25272529

25282530
print("All answers are correct!")
25292531

@@ -2623,13 +2625,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
26232625
match_ratio = 0.6
26242626

26252627
# Check output accuracy
2628+
parsed_outputs = parse_output(output)
26262629
for prompt_output, prompt_keywords in zip(
2627-
parse_output(output), expected_keywords[model_name]["image"]):
2630+
parsed_outputs, expected_keywords[model_name]["image"]):
26282631
matches = [
26292632
keyword in prompt_output.lower() for keyword in prompt_keywords
26302633
]
26312634
obs_match_ratio = 1. * sum(matches) / len(matches)
2632-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2635+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
26332636

26342637
print("All answers are correct!")
26352638

@@ -2666,20 +2669,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
26662669
expected_keywords = {
26672670
"gemma-3-27b-it": {
26682671
"image": [
2669-
["half", "dome", "yosemite", "landmark", "rounded"],
2670-
["atmosphere", "peaceful", "majestic", "calm", "quiet"],
2672+
["description", "image", "half", "dome", "park"],
2673+
["atmosphere", "peaceful", "majestic", "scene", "sky"],
26712674
],
26722675
},
26732676
"mistral-small-3.1-24b-instruct": {
26742677
"image": [
2675-
["depicts", "landscape", "rock", "sky", "high", "altitude"],
2676-
["atmosphere", "serene", "majestic", "sense", "tranquility"],
2678+
[
2679+
"depicts", "scenic", "landscape", "rock", "formation",
2680+
"background"
2681+
],
2682+
["atmosphere", "serene", "majestic", "clear", "sky", "trees"],
26772683
],
26782684
},
26792685
"Phi-4-multimodal-instruct": {
26802686
"image": [
26812687
["depicts", "landscape", "mountain", "half", "dome"],
2682-
["atmosphere", "serene", "sense", "tranquility", "peace."],
2688+
["atmosphere", "serene", "sense", "scene", "majestic"],
26832689
],
26842690
},
26852691
}
@@ -2722,8 +2728,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
27222728
match_ratio = 0.6
27232729

27242730
# Check output accuracy
2731+
parsed_outputs = parse_output(output)
27252732
for prompt_output, prompt_keywords in zip(
2726-
parse_output(output), expected_keywords[model_name]["image"]):
2733+
parsed_outputs, expected_keywords[model_name]["image"]):
27272734
matches = [
27282735
keyword in prompt_output.lower() for keyword in prompt_keywords
27292736
]
@@ -2732,7 +2739,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
27322739
print("prompt_keywords:", prompt_keywords)
27332740
print("matches:", matches)
27342741
print("obs_match_ratio:", obs_match_ratio)
2735-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2742+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
27362743

27372744
print("All answers are correct!")
27382745

0 commit comments

Comments
 (0)