     "LiquidAI/LFM2-1.2B",
 ]
 
-HF_UNSUPPORTED_MODELS = [
-    # The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
-    # doesn't compare vLLM output with HF output.
-    # See https://github.com/huggingface/transformers/pull/35943
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    # transformers 4.55 is still producing garbage for this model
-    # TODO(tdoublep): follow-up on transformers side
-    "ibm-granite/granite-4.0-tiny-preview"
-]
-
 V1_SUPPORTED_MODELS = [
     "state-spaces/mamba-130m-hf",
     "ai21labs/Jamba-tiny-dev",
@@ -90,20 +79,13 @@ def test_models(
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
-        hf_version_check = model_info.check_transformers_version(
-            on_fail="return")
+        model_info.check_transformers_version(on_fail="skip")
     except ValueError:
-        hf_version_check = None
-
-    if hf_version_check is not None:
-        print(f"Skipping transformers comparison because: {hf_version_check}")
+        pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -121,7 +103,7 @@ def test_models(
     else:
         vllm_v1_outputs = None
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
@@ -130,12 +112,10 @@ def test_models(
         )
 
     if model in V1_SUPPORTED_MODELS:
-        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-        assert ref_outputs is not None
         check_logprobs_close(
-            outputs_0_lst=ref_outputs,
+            outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v1_outputs,
-            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_0="hf",
             name_1="vllm-v1",
         )
 
@@ -402,11 +382,8 @@ def test_full_cuda_graph(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -421,20 +398,18 @@ def test_full_cuda_graph(
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
             name_0="hf",
             name_1="vllm-v0",
         )
 
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-    assert ref_outputs is not None
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
     )
 
@@ -460,11 +435,8 @@ def test_fp32_state(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -480,18 +452,16 @@ def test_fp32_state(
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_v0_outputs,
+        name_0="hf",
+        name_1="vllm-v0",
+    )
 
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
    )
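
A minimal sketch of the HF-versus-vLLM comparison pattern these tests exercise, assuming the hf_runner/vllm_runner fixtures and the check_logprobs_close helper referenced in the diff above (the import path, prompt list, and sampling limits are illustrative assumptions, not the exact file contents):

    # Hypothetical, condensed version of the comparison performed by these tests.
    from tests.models.utils import check_logprobs_close  # assumed helper location

    def compare_hf_and_vllm(hf_runner, vllm_runner, model, example_prompts,
                            max_tokens=32, num_logprobs=5):
        # Greedy decoding with per-token logprobs from the HF reference.
        with hf_runner(model) as hf_model:
            hf_outputs = hf_model.generate_greedy_logprobs_limit(
                example_prompts, max_tokens, num_logprobs)

        # Same prompts and sampling settings through vLLM.
        with vllm_runner(model) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

        # Fails the test if the two runs diverge beyond the top-num_logprobs tolerance.
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )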