@@ -2457,7 +2457,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
24572457 ],
24582458 "mixture_text_image" :
24592459 [["invention" , "person" , "scientists" , "Lick" , "engineers" ],
2460- ["landscape" , "trees" , "road" , "natural " , "rock " ]]
2460+ ["landscape" , "trees" , "road" , "depicts " , "scenic " ]]
24612461 },
24622462 "gemma-3-27b-it" : {
24632463 "image" : [
@@ -2503,13 +2503,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25032503 if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
25042504 match_ratio = 4.0 / 6
25052505
2506+ parsed_outputs = parse_output (output )
25062507 for prompt_output , prompt_keywords in zip (
2507- parse_output ( output ) , expected_keywords [model_name ][modality ]):
2508+ parsed_outputs , expected_keywords [model_name ][modality ]):
25082509 matches = [
25092510 keyword in prompt_output .lower () for keyword in prompt_keywords
25102511 ]
25112512 obs_match_ratio = 1. * sum (matches ) / len (matches )
2512- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2513+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
25132514
25142515 print ("All answers are correct!" )
25152516
@@ -2870,13 +2871,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
28702871 output = llm_venv .run_cmd (cmd , caller = check_output )
28712872
28722873 match_ratio = 0.6
2873- for prompt_output , prompt_keywords in zip (parse_output (output ),
2874+ parsed_outputs = parse_output (output )
2875+ for prompt_output , prompt_keywords in zip (parsed_outputs ,
28742876 expected_keywords [modality ]):
28752877 matches = [
28762878 keyword in prompt_output .lower () for keyword in prompt_keywords
28772879 ]
28782880 obs_match_ratio = 1. * sum (matches ) / len (matches )
2879- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2881+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
28802882
28812883 print ("All answers are correct!" )
28822884
@@ -2981,13 +2983,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
29812983 match_ratio = 0.6
29822984
29832985 # Check output accuracy
2986+ parsed_outputs = parse_output (output )
29842987 for prompt_output , prompt_keywords in zip (
2985- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
2988+ parsed_outputs , expected_keywords [model_name ]["image" ]):
29862989 matches = [
29872990 keyword in prompt_output .lower () for keyword in prompt_keywords
29882991 ]
29892992 obs_match_ratio = 1. * sum (matches ) / len (matches )
2990- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2993+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
29912994
29922995 print ("All answers are correct!" )
29932996
@@ -3024,20 +3027,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30243027 expected_keywords = {
30253028 "gemma-3-27b-it" : {
30263029 "image" : [
3027- ["half " , "dome " , "yosemite " , "landmark " , "rounded " ],
3028- ["atmosphere" , "peaceful" , "majestic" , "calm " , "quiet " ],
3030+ ["description " , "image " , "half " , "dome " , "park " ],
3031+ ["atmosphere" , "peaceful" , "majestic" , "scene " , "sky " ],
30293032 ],
30303033 },
30313034 "mistral-small-3.1-24b-instruct" : {
30323035 "image" : [
3033- ["depicts" , "landscape" , "rock" , "sky" , "high" , "altitude" ],
3034- ["atmosphere" , "serene" , "majestic" , "sense" , "tranquility" ],
3036+ [
3037+ "depicts" , "scenic" , "landscape" , "rock" , "formation" ,
3038+ "background"
3039+ ],
3040+ ["atmosphere" , "serene" , "majestic" , "clear" , "sky" , "trees" ],
30353041 ],
30363042 },
30373043 "Phi-4-multimodal-instruct" : {
30383044 "image" : [
30393045 ["depicts" , "landscape" , "mountain" , "half" , "dome" ],
3040- ["atmosphere" , "serene" , "sense" , "tranquility " , "peace. " ],
3046+ ["atmosphere" , "serene" , "sense" , "scene " , "majestic " ],
30413047 ],
30423048 },
30433049 }
@@ -3087,8 +3093,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30873093 match_ratio = 0.6
30883094
30893095 # Check output accuracy
3096+ parsed_outputs = parse_output (output )
30903097 for prompt_output , prompt_keywords in zip (
3091- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
3098+ parsed_outputs , expected_keywords [model_name ]["image" ]):
30923099 matches = [
30933100 keyword in prompt_output .lower () for keyword in prompt_keywords
30943101 ]
@@ -3097,7 +3104,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30973104 print ("prompt_keywords:" , prompt_keywords )
30983105 print ("matches:" , matches )
30993106 print ("obs_match_ratio:" , obs_match_ratio )
3100- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
3107+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
31013108
31023109 print ("All answers are correct!" )
31033110
0 commit comments