
Commit 42c1306

casinca and rasbt authored

Qwen3Tokenizer fix for Qwen3 Base models and generation mismatch with HF (#828)

* prevent `self.apply_chat_template` being applied for base Qwen models
* added no chat template comparison in `test_chat_wrap_and_equivalence`; removed duplicate comparison
* Revert "added no chat template comparison in `test_chat_wrap_and_equivalence`" (reverts commit 3a5ee8c)
* Revert "prevent `self.apply_chat_template` being applied for base Qwen models" (reverts commit df50439)
* copied `download_file` in `utils` from https://github.com/rasbt/reasoning-from-scratch/blob/main/reasoning_from_scratch/utils.py
* added copy of test `def test_tokenizer_equivalence()` from `reasoning-from-scratch` in `test_qwen3.py`
* removed duplicate code fragment in `test_chat_wrap_and_equivalence`
* use apply_chat_template
* add toggle for instruct model
* Update tokenizer usage

Co-authored-by: rasbt <[email protected]>

1 parent bfc6389 · commit 42c1306

File tree

7 files changed: +125 -15 lines changed


ch05/11_qwen3/README.md

Lines changed: 11 additions & 4 deletions

@@ -45,8 +45,14 @@ pip install llms_from_scratch tokenizers
 Specify which model to use:

 ```python
-USE_REASONING_MODEL = False  # The base model
-USE_REASONING_MODEL = True   # The "thinking" model
+USE_REASONING_MODEL = True
+# Uses the base model if USE_REASONING_MODEL = False
+
+USE_INSTRUCT_MODEL = False
+# Uses the instruct mode (without reasoning) if
+# USE_REASONING_MODEL = True
+# USE_INSTRUCT_MODEL = False
+# This setting has no effect if USE_REASONING_MODEL = False


 # Use
@@ -187,10 +193,11 @@ else:
     tok_filename = "tokenizer-base.json"

 tokenizer = Qwen3Tokenizer(
-    tokenizer_file_path=tok_filename,
+    tokenizer_file_path=tokenizer_file_path,
     repo_id=repo_id,
+    apply_chat_template=USE_REASONING_MODEL,
     add_generation_prompt=USE_REASONING_MODEL,
-    add_thinking=USE_REASONING_MODEL
+    add_thinking=not USE_INSTRUCT_MODEL
 )
 ```
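Note on the new toggles: `USE_REASONING_MODEL` selects between the base model and the chat-capable models, and `USE_INSTRUCT_MODEL` only matters when `USE_REASONING_MODEL = True`. A minimal sketch of how the flags map onto the `Qwen3Tokenizer` arguments in the diff above (the helper function below is illustrative and not part of the commit):

```python
# Illustrative only: how the README toggles feed Qwen3Tokenizer
# (derived from the diff above; this helper is not in the commit).
def tokenizer_settings(use_reasoning_model, use_instruct_model):
    return dict(
        apply_chat_template=use_reasoning_model,  # base model: no chat template
        add_generation_prompt=use_reasoning_model,
        add_thinking=not use_instruct_model,      # instruct mode: no thinking block
    )

print(tokenizer_settings(True, False))   # reasoning ("thinking") model
print(tokenizer_settings(True, True))    # instruct mode, reasoning disabled
print(tokenizer_settings(False, False))  # base model; the second flag has no effect here
```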

ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb

Lines changed: 1 addition & 0 deletions

@@ -1064,6 +1064,7 @@
 "tokenizer = Qwen3Tokenizer(\n",
 "    tokenizer_file_path=tokenizer_file_path,\n",
 "    repo_id=repo_id,\n",
+"    apply_chat_template=True,\n",
 "    add_generation_prompt=True,\n",
 "    add_thinking=True\n",
 ")"

ch05/11_qwen3/standalone-qwen3-moe.ipynb

Lines changed: 1 addition & 0 deletions

@@ -1006,6 +1006,7 @@
 "tokenizer = Qwen3Tokenizer(\n",
 "    tokenizer_file_path=tokenizer_file_path,\n",
 "    repo_id=repo_id,\n",
+"    apply_chat_template=True,\n",
 "    add_generation_prompt=True,\n",
 "    add_thinking=True\n",
 ")"

ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb

Lines changed: 10 additions & 2 deletions

@@ -115,7 +115,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"USE_REASONING_MODEL = True"
+"USE_REASONING_MODEL = True\n",
+"# Uses the base model if USE_REASONING_MODEL = False\n",
+"\n",
+"USE_INSTRUCT_MODEL = False\n",
+"# Uses the instruct mode (without reasoning) if\n",
+"# USE_REASONING_MODEL = True\n",
+"# USE_INSTRUCT_MODEL = False\n",
+"# This setting has no effect if USE_REASONING_MODEL = False"
 ]
 },
 {
@@ -1060,8 +1067,9 @@
 "tokenizer = Qwen3Tokenizer(\n",
 "    tokenizer_file_path=tokenizer_file_path,\n",
 "    repo_id=repo_id,\n",
+"    apply_chat_template=USE_REASONING_MODEL,\n",
 "    add_generation_prompt=USE_REASONING_MODEL,\n",
-"    add_thinking=USE_REASONING_MODEL\n",
+"    add_thinking=not USE_INSTRUCT_MODEL\n",
 ")"
 ]
 },

ch05/11_qwen3/standalone-qwen3.ipynb

Lines changed: 10 additions & 2 deletions

@@ -113,7 +113,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"USE_REASONING_MODEL = True"
+"USE_REASONING_MODEL = True\n",
+"# Uses the base model if USE_REASONING_MODEL = False\n",
+"\n",
+"USE_INSTRUCT_MODEL = False\n",
+"# Uses the instruct mode (without reasoning) if\n",
+"# USE_REASONING_MODEL = True\n",
+"# USE_INSTRUCT_MODEL = False\n",
+"# This setting has no effect if USE_REASONING_MODEL = False"
 ]
 },
 {
@@ -1002,8 +1009,9 @@
 "tokenizer = Qwen3Tokenizer(\n",
 "    tokenizer_file_path=tokenizer_file_path,\n",
 "    repo_id=repo_id,\n",
+"    apply_chat_template=USE_REASONING_MODEL,\n",
 "    add_generation_prompt=USE_REASONING_MODEL,\n",
-"    add_thinking=USE_REASONING_MODEL\n",
+"    add_thinking=not USE_INSTRUCT_MODEL\n",
 ")"
 ]
 },

pkg/llms_from_scratch/tests/test_qwen3.py

Lines changed: 71 additions & 7 deletions

@@ -20,7 +20,12 @@
 from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model as Qwen3ModelKVBatched
 from llms_from_scratch.kv_cache_batched.generate import generate_text_simple as generate_text_simple_batched

+from llms_from_scratch.utils import download_file
+
 import importlib
+import os
+import shutil
+import tempfile
 import platform
 import pytest
 import torch
@@ -465,13 +470,6 @@ def test_chat_wrap_and_equivalence(add_gen, add_think):
         add_generation_prompt=add_gen,
         enable_thinking=add_think,
     )
-    ours = qt.encode(prompt)
-    ref = hf_tok.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=add_gen,
-        enable_thinking=add_think,
-    )

     if add_gen and not add_think:
         pass  # skip edge case as this is not something we use in practice
@@ -534,6 +532,72 @@ def test_multiturn_equivalence(repo_id, tok_file, add_gen, add_think):
     assert ours_dec == ref_dec


+@pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
+def test_tokenizer_equivalence():
+    from transformers import AutoTokenizer
+
+    prompt = "Give me a short introduction to large language models."
+    messages = [
+        {"role": "user", "content": prompt},
+    ]
+
+    for apply_chat_template in (True, False):
+        for s in ("-Base", ""):
+            repo_id = f"Qwen/Qwen3-0.6B{s}"
+            tokenizer_ref = AutoTokenizer.from_pretrained(repo_id)
+            tokenizer_url = f"https://huggingface.co/Qwen/Qwen3-0.6B{s}/resolve/main/tokenizer.json"
+            download_file(tokenizer_url, out_dir=".")
+
+            old_name = "tokenizer.json"
+
+            if not s:
+                new_name = "tokenizer-reasoning.json"
+            else:
+                new_name = "tokenizer-base.json"
+
+            try:
+                shutil.move(old_name, new_name)
+            except Exception:
+                with tempfile.NamedTemporaryFile(delete=False, dir=".") as tmp_file:
+                    shutil.copyfile(old_name, tmp_file.name)
+                os.replace(tmp_file.name, new_name)
+                os.remove(old_name)
+
+            for states in ((True, True), (False, False)):
+                tokenizer = Qwen3Tokenizer(
+                    tokenizer_file_path=new_name,
+                    repo_id=repo_id,
+                    apply_chat_template=apply_chat_template,
+                    add_generation_prompt=states[0],
+                    add_thinking=states[1]
+                )
+                input_token_ids = tokenizer.encode(prompt)
+
+                if apply_chat_template:
+                    input_token_ids_ref = tokenizer_ref.apply_chat_template(
+                        messages,
+                        tokenize=True,
+                        add_generation_prompt=states[0],
+                        enable_thinking=states[1],
+                    )
+                else:
+                    input_token_ids_ref = input_token_ids
+
+                assert input_token_ids == input_token_ids_ref, states
+
+                output_text = tokenizer.decode(input_token_ids)
+                out_text_ref = tokenizer_ref.decode(input_token_ids_ref)
+                assert output_text == out_text_ref, states
+
+            assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]]
+            assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]]
+
+            expected_eos_token = "<|im_end|>" if "base" not in new_name else "<|endoftext|>"
+            expected_pad_token = "<|endoftext|>"
+            assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token
+            assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token
+
+
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
 @pytest.mark.parametrize("repo_id, tok_file", [
     ("Qwen/Qwen3-0.6B", "Qwen3-0.6B/tokenizer.json"),

pkg/llms_from_scratch/utils.py

Lines changed: 21 additions & 0 deletions

@@ -9,6 +9,8 @@
 import re
 import types
 from pathlib import Path
+import urllib.request
+import urllib.parse

 import nbformat

@@ -122,3 +124,22 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr

     exec(src, mod.__dict__)
     return mod
+
+def download_file(url, out_dir="."):
+    """Simple file download utility for tests."""
+    from pathlib import Path
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    filename = Path(urllib.parse.urlparse(url).path).name
+    dest = out_dir / filename
+
+    if dest.exists():
+        return dest
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            with open(dest, 'wb') as f:
+                f.write(response.read())
+        return dest
+    except Exception as e:
+        raise RuntimeError(f"Failed to download {url}: {e}")
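The helper is deliberately minimal: it caches by filename (the early `dest.exists()` return), so repeated test runs do not re-download. A usage sketch matching how the new test calls it:

```python
# Sketch: fetching the Qwen3 tokenizer file with the new helper.
from llms_from_scratch.utils import download_file

url = "https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main/tokenizer.json"
dest = download_file(url, out_dir=".")  # no-op if tokenizer.json already exists
print(dest)  # tokenizer.json
```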
