@@ -2334,7 +2334,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
23342334 ],
23352335 "mixture_text_image" :
23362336 [["invention" , "person" , "scientists" , "Lick" , "engineers" ],
2337- ["landscape" , "trees" , "road" , "natural " , "rock " ]]
2337+ ["landscape" , "trees" , "road" , "depicts " , "scenic " ]]
23382338 },
23392339 "gemma-3-27b-it" : {
23402340 "image" : [
@@ -2378,13 +2378,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
23782378 if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
23792379 match_ratio = 4.0 / 6
23802380
2381+ parsed_outputs = parse_output (output )
23812382 for prompt_output , prompt_keywords in zip (
2382- parse_output ( output ) , expected_keywords [model_name ][modality ]):
2383+ parsed_outputs , expected_keywords [model_name ][modality ]):
23832384 matches = [
23842385 keyword in prompt_output .lower () for keyword in prompt_keywords
23852386 ]
23862387 obs_match_ratio = 1. * sum (matches ) / len (matches )
2387- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2388+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
23882389
23892390 print ("All answers are correct!" )
23902391
@@ -2517,13 +2518,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
25172518 output = llm_venv .run_cmd (cmd , caller = check_output )
25182519
25192520 match_ratio = 0.6
2520- for prompt_output , prompt_keywords in zip (parse_output (output ),
2521+ parsed_outputs = parse_output (output )
2522+ for prompt_output , prompt_keywords in zip (parsed_outputs ,
25212523 expected_keywords [modality ]):
25222524 matches = [
25232525 keyword in prompt_output .lower () for keyword in prompt_keywords
25242526 ]
25252527 obs_match_ratio = 1. * sum (matches ) / len (matches )
2526- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2528+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
25272529
25282530 print ("All answers are correct!" )
25292531
@@ -2623,13 +2625,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
26232625 match_ratio = 0.6
26242626
26252627 # Check output accuracy
2628+ parsed_outputs = parse_output (output )
26262629 for prompt_output , prompt_keywords in zip (
2627- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
2630+ parsed_outputs , expected_keywords [model_name ]["image" ]):
26282631 matches = [
26292632 keyword in prompt_output .lower () for keyword in prompt_keywords
26302633 ]
26312634 obs_match_ratio = 1. * sum (matches ) / len (matches )
2632- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2635+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
26332636
26342637 print ("All answers are correct!" )
26352638
@@ -2666,20 +2669,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
26662669 expected_keywords = {
26672670 "gemma-3-27b-it" : {
26682671 "image" : [
2669- ["half " , "dome " , "yosemite " , "landmark " , "rounded " ],
2670- ["atmosphere" , "peaceful" , "majestic" , "calm " , "quiet " ],
2672+ ["description " , "image " , "half " , "dome " , "park " ],
2673+ ["atmosphere" , "peaceful" , "majestic" , "scene " , "sky " ],
26712674 ],
26722675 },
26732676 "mistral-small-3.1-24b-instruct" : {
26742677 "image" : [
2675- ["depicts" , "landscape" , "rock" , "sky" , "high" , "altitude" ],
2676- ["atmosphere" , "serene" , "majestic" , "sense" , "tranquility" ],
2678+ [
2679+ "depicts" , "scenic" , "landscape" , "rock" , "formation" ,
2680+ "background"
2681+ ],
2682+ ["atmosphere" , "serene" , "majestic" , "clear" , "sky" , "trees" ],
26772683 ],
26782684 },
26792685 "Phi-4-multimodal-instruct" : {
26802686 "image" : [
26812687 ["depicts" , "landscape" , "mountain" , "half" , "dome" ],
2682- ["atmosphere" , "serene" , "sense" , "tranquility " , "peace. " ],
2688+ ["atmosphere" , "serene" , "sense" , "scene " , "majestic " ],
26832689 ],
26842690 },
26852691 }
@@ -2722,8 +2728,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
27222728 match_ratio = 0.6
27232729
27242730 # Check output accuracy
2731+ parsed_outputs = parse_output (output )
27252732 for prompt_output , prompt_keywords in zip (
2726- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
2733+ parsed_outputs , expected_keywords [model_name ]["image" ]):
27272734 matches = [
27282735 keyword in prompt_output .lower () for keyword in prompt_keywords
27292736 ]
@@ -2732,7 +2739,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
27322739 print ("prompt_keywords:" , prompt_keywords )
27332740 print ("matches:" , matches )
27342741 print ("obs_match_ratio:" , obs_match_ratio )
2735- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2742+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
27362743
27372744 print ("All answers are correct!" )
27382745
0 commit comments