Skip to content

Commit 9b3944b

Browse files
Wanli-Jiang authored and dominicshanshan committed
[https://nvbugs/5509024][fix] Print full parsed outputs and update keywords for multimodal model (NVIDIA#7670)
Signed-off-by: Wanli Jiang <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent 7d4d5c9 commit 9b3944b

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

tests/integration/defs/test_e2e.py

Lines changed: 21 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -2510,7 +2510,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25102510
],
25112511
"mixture_text_image":
25122512
[["invention", "person", "scientists", "Lick", "engineers"],
2513-
["landscape", "trees", "road", "natural", "rock"]]
2513+
["landscape", "trees", "road", "depicts", "scenic"]]
25142514
},
25152515
"gemma-3-27b-it": {
25162516
"image": [
@@ -2563,13 +2563,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25632563
if model_name == "qwen2-vl-7b-instruct" and modality == "image":
25642564
match_ratio = 4.0 / 6
25652565

2566+
parsed_outputs = parse_output(output)
25662567
for prompt_output, prompt_keywords in zip(
2567-
parse_output(output), expected_keywords[model_name][modality]):
2568+
parsed_outputs, expected_keywords[model_name][modality]):
25682569
matches = [
25692570
keyword in prompt_output.lower() for keyword in prompt_keywords
25702571
]
25712572
obs_match_ratio = 1. * sum(matches) / len(matches)
2572-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2573+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
25732574

25742575
print("All answers are correct!")
25752576

@@ -2970,13 +2971,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
29702971
output = llm_venv.run_cmd(cmd, caller=check_output)
29712972

29722973
match_ratio = 0.6
2973-
for prompt_output, prompt_keywords in zip(parse_output(output),
2974+
parsed_outputs = parse_output(output)
2975+
for prompt_output, prompt_keywords in zip(parsed_outputs,
29742976
expected_keywords[modality]):
29752977
matches = [
29762978
keyword in prompt_output.lower() for keyword in prompt_keywords
29772979
]
29782980
obs_match_ratio = 1. * sum(matches) / len(matches)
2979-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2981+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
29802982

29812983
print("All answers are correct!")
29822984

@@ -3081,13 +3083,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
30813083
match_ratio = 0.6
30823084

30833085
# Check output accuracy
3086+
parsed_outputs = parse_output(output)
30843087
for prompt_output, prompt_keywords in zip(
3085-
parse_output(output), expected_keywords[model_name]["image"]):
3088+
parsed_outputs, expected_keywords[model_name]["image"]):
30863089
matches = [
30873090
keyword in prompt_output.lower() for keyword in prompt_keywords
30883091
]
30893092
obs_match_ratio = 1. * sum(matches) / len(matches)
3090-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
3093+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
30913094

30923095
print("All answers are correct!")
30933096

@@ -3124,20 +3127,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31243127
expected_keywords = {
31253128
"gemma-3-27b-it": {
31263129
"image": [
3127-
["half", "dome", "yosemite", "landmark", "rounded"],
3128-
["atmosphere", "peaceful", "majestic", "calm", "quiet"],
3130+
["description", "image", "half", "dome", "park"],
3131+
["atmosphere", "peaceful", "majestic", "scene", "sky"],
31293132
],
31303133
},
31313134
"mistral-small-3.1-24b-instruct": {
31323135
"image": [
3133-
["depicts", "landscape", "rock", "sky", "high", "altitude"],
3134-
["atmosphere", "serene", "majestic", "sense", "tranquility"],
3136+
[
3137+
"depicts", "scenic", "landscape", "rock", "formation",
3138+
"background"
3139+
],
3140+
["atmosphere", "serene", "majestic", "clear", "sky", "trees"],
31353141
],
31363142
},
31373143
"Phi-4-multimodal-instruct": {
31383144
"image": [
31393145
["depicts", "landscape", "mountain", "half", "dome"],
3140-
["atmosphere", "serene", "sense", "tranquility", "peace."],
3146+
["atmosphere", "serene", "sense", "scene", "majestic"],
31413147
],
31423148
},
31433149
}
@@ -3187,8 +3193,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31873193
match_ratio = 0.6
31883194

31893195
# Check output accuracy
3196+
parsed_outputs = parse_output(output)
31903197
for prompt_output, prompt_keywords in zip(
3191-
parse_output(output), expected_keywords[model_name]["image"]):
3198+
parsed_outputs, expected_keywords[model_name]["image"]):
31923199
matches = [
31933200
keyword in prompt_output.lower() for keyword in prompt_keywords
31943201
]
@@ -3197,7 +3204,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31973204
print("prompt_keywords:", prompt_keywords)
31983205
print("matches:", matches)
31993206
print("obs_match_ratio:", obs_match_ratio)
3200-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
3207+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
32013208

32023209
print("All answers are correct!")
32033210

0 commit comments

Comments (0)