@@ -2510,7 +2510,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25102510 ],
25112511 "mixture_text_image" :
25122512 [["invention" , "person" , "scientists" , "Lick" , "engineers" ],
2513- ["landscape" , "trees" , "road" , "natural " , "rock " ]]
2513+ ["landscape" , "trees" , "road" , "depicts " , "scenic " ]]
25142514 },
25152515 "gemma-3-27b-it" : {
25162516 "image" : [
@@ -2563,13 +2563,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25632563 if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
25642564 match_ratio = 4.0 / 6
25652565
2566+ parsed_outputs = parse_output (output )
25662567 for prompt_output , prompt_keywords in zip (
2567- parse_output ( output ) , expected_keywords [model_name ][modality ]):
2568+ parsed_outputs , expected_keywords [model_name ][modality ]):
25682569 matches = [
25692570 keyword in prompt_output .lower () for keyword in prompt_keywords
25702571 ]
25712572 obs_match_ratio = 1. * sum (matches ) / len (matches )
2572- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2573+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
25732574
25742575 print ("All answers are correct!" )
25752576
@@ -2970,13 +2971,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
29702971 output = llm_venv .run_cmd (cmd , caller = check_output )
29712972
29722973 match_ratio = 0.6
2973- for prompt_output , prompt_keywords in zip (parse_output (output ),
2974+ parsed_outputs = parse_output (output )
2975+ for prompt_output , prompt_keywords in zip (parsed_outputs ,
29742976 expected_keywords [modality ]):
29752977 matches = [
29762978 keyword in prompt_output .lower () for keyword in prompt_keywords
29772979 ]
29782980 obs_match_ratio = 1. * sum (matches ) / len (matches )
2979- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2981+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
29802982
29812983 print ("All answers are correct!" )
29822984
@@ -3081,13 +3083,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
30813083 match_ratio = 0.6
30823084
30833085 # Check output accuracy
3086+ parsed_outputs = parse_output (output )
30843087 for prompt_output , prompt_keywords in zip (
3085- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
3088+ parsed_outputs , expected_keywords [model_name ]["image" ]):
30863089 matches = [
30873090 keyword in prompt_output .lower () for keyword in prompt_keywords
30883091 ]
30893092 obs_match_ratio = 1. * sum (matches ) / len (matches )
3090- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
3093+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
30913094
30923095 print ("All answers are correct!" )
30933096
@@ -3124,20 +3127,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31243127 expected_keywords = {
31253128 "gemma-3-27b-it" : {
31263129 "image" : [
3127- ["half " , "dome " , "yosemite " , "landmark " , "rounded " ],
3128- ["atmosphere" , "peaceful" , "majestic" , "calm " , "quiet " ],
3130+ ["description " , "image " , "half " , "dome " , "park " ],
3131+ ["atmosphere" , "peaceful" , "majestic" , "scene " , "sky " ],
31293132 ],
31303133 },
31313134 "mistral-small-3.1-24b-instruct" : {
31323135 "image" : [
3133- ["depicts" , "landscape" , "rock" , "sky" , "high" , "altitude" ],
3134- ["atmosphere" , "serene" , "majestic" , "sense" , "tranquility" ],
3136+ [
3137+ "depicts" , "scenic" , "landscape" , "rock" , "formation" ,
3138+ "background"
3139+ ],
3140+ ["atmosphere" , "serene" , "majestic" , "clear" , "sky" , "trees" ],
31353141 ],
31363142 },
31373143 "Phi-4-multimodal-instruct" : {
31383144 "image" : [
31393145 ["depicts" , "landscape" , "mountain" , "half" , "dome" ],
3140- ["atmosphere" , "serene" , "sense" , "tranquility " , "peace. " ],
3146+ ["atmosphere" , "serene" , "sense" , "scene " , "majestic " ],
31413147 ],
31423148 },
31433149 }
@@ -3187,8 +3193,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31873193 match_ratio = 0.6
31883194
31893195 # Check output accuracy
3196+ parsed_outputs = parse_output (output )
31903197 for prompt_output , prompt_keywords in zip (
3191- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
3198+ parsed_outputs , expected_keywords [model_name ]["image" ]):
31923199 matches = [
31933200 keyword in prompt_output .lower () for keyword in prompt_keywords
31943201 ]
@@ -3197,7 +3204,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31973204 print ("prompt_keywords:" , prompt_keywords )
31983205 print ("matches:" , matches )
31993206 print ("obs_match_ratio:" , obs_match_ratio )
3200- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
3207+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
32013208
32023209 print ("All answers are correct!" )
32033210
0 commit comments