Skip to content

Commit b8c8237

Browse files
authored
Qwen3 tokenizer sanity checks (#730)
1 parent 21c4172 commit b8c8237

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

pkg/llms_from_scratch/tests/test_qwen3.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -279,3 +279,11 @@ def test_tokenizer_equivalence():
279 279

280 280
assert tokenizer_ref.eos_token_id == tokenizer.eos_token_id
281 281
assert tokenizer_ref.pad_token_id == tokenizer.pad_token_id
282+
283+
assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]]
284+
assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]]
285+
286+
expected_eos_token = "<|im_end|>" if "Base" not in repo_id else "<|endoftext|>"
287+
expected_pad_token = "<|endoftext|>"
288+
assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token
289+
assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token

0 commit comments

Comments (0)