@@ -16,7 +16,12 @@
 import unittest

 from transformers import AddedToken, AutoModelForCausalLM, AutoTokenizer
-from transformers.testing_utils import require_gguf, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import (
+    require_gguf,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_torch_available


@@ -33,6 +38,7 @@ class GgufIntegrationTests(unittest.TestCase):
     imatrix_model_id = "duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF"
     mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
     qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
+    qwen2_moe_model_id = "RichardErkhov/Qwen_-_Qwen1.5-MoE-A2.7B-Chat-gguf"
     llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
     tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF"

@@ -59,6 +65,7 @@ class GgufIntegrationTests(unittest.TestCase):

     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
     q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
+    q4_0_qwen2_moe_model_id = "Qwen1.5-MoE-A2.7B-Chat.Q4_0.gguf"
     q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"
     f16_tinyllama_model_id = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf"

@@ -298,7 +305,10 @@ def test_f16(self):
     def test_mistral_q4_0(self):
         tokenizer = AutoTokenizer.from_pretrained(self.mistral_model_id, gguf_file=self.q4_0_mistral_model_id)
         model = AutoModelForCausalLM.from_pretrained(
-            self.mistral_model_id, gguf_file=self.q4_0_mistral_model_id, device_map="auto", torch_dtype=torch.float16
+            self.mistral_model_id,
+            gguf_file=self.q4_0_mistral_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
         )

         text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
@@ -310,7 +320,10 @@ def test_mistral_q4_0(self):
     def test_qwen2_q4_0(self):
         tokenizer = AutoTokenizer.from_pretrained(self.qwen2_model_id, gguf_file=self.q4_0_qwen2_model_id)
         model = AutoModelForCausalLM.from_pretrained(
-            self.qwen2_model_id, gguf_file=self.q4_0_qwen2_model_id, device_map="auto", torch_dtype=torch.float16
+            self.qwen2_model_id,
+            gguf_file=self.q4_0_qwen2_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
         )

         text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
@@ -319,6 +332,21 @@ def test_qwen2_q4_0(self):
         EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

+    def test_qwen2_moe_q4_0(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.qwen2_moe_model_id, gguf_file=self.q4_0_qwen2_moe_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.qwen2_moe_model_id,
+            gguf_file=self.q4_0_qwen2_moe_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello everyone, I'm a newbie here and would like"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
     def test_llama3_q4_0_tokenizer(self):
         tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -331,7 +359,10 @@ def test_llama3_q4_0_tokenizer(self):
     def test_llama3_q4_0(self):
         tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
         model = AutoModelForCausalLM.from_pretrained(
-            self.llama3_model_id, gguf_file=self.q4_llama3_model_id, device_map="auto", torch_dtype=torch.float16
+            self.llama3_model_id,
+            gguf_file=self.q4_llama3_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
         )

         text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
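
A minimal standalone sketch of the loading path the new test exercises, assuming the gguf package is installed and there is enough memory for the dequantized MoE checkpoint; the repo id and file name are the ones introduced in the diff above:

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

# Model id and GGUF file name as added in this change
model_id = "RichardErkhov/Qwen_-_Qwen1.5-MoE-A2.7B-Chat-gguf"
gguf_file = "Qwen1.5-MoE-A2.7B-Chat.Q4_0.gguf"

# Tokenizer and (dequantized) model are both built directly from the GGUF file
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=gguf_file,
    device_map="auto",
    torch_dtype=torch.float16,
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))

device_map="auto" and torch.float16 mirror the other GGUF integration tests in this file.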