Skip to content

Commit 6e50261

Browse files
Mark Obozov
authored and committed
fixes
1 parent 146cac3 commit 6e50261

File tree

9 files changed

+100087
-121
lines changed

9 files changed

+100087
-121
lines changed

tests/assets/merges.txt

Lines changed: 100001 additions & 0 deletions
Large diffs are not rendered by default.

tests/assets/vocab.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/torchtune/models/phi4/test_phi4_tokenizer.py

Lines changed: 14 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,14 @@ def tokenizer(self):
1919
# Pretrained tiktoken model generated via the script in
2020
# https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4
2121
return phi4_tokenizer(
22-
path=str(ASSETS / "tiktoken_small.model"),
22+
vocab_path=(ASSETS / "vocab.json"),
23+
merges_path=(ASSETS / "merges.txt"),
2324
)
2425

2526
@pytest.fixture
2627
def expected_tokens(self):
2728
# fmt: off
28-
tokens = [100257, 100264, 115, 121, 322, 398, 100265, 10, 1539, 470, 258, 1444, 933, 1940, 511, 446, 100266, 10, 100264,
29-
477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445, 98, 300, 258, 256, 281,
30-
107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827, 512, 1080, 116, 300, 262, 1249,
31-
524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534, 258, 1759, 511, 355, 285, 875, 550, 102,
32-
1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112, 279, 316, 1037, 100266, 10, 100264, 520, 511, 446,
33-
100265, 10, 73, 776, 362, 425, 1978, 274, 284, 1528, 319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115,
34-
284, 1749, 803, 46, 270, 776, 1341, 258, 1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913,
35-
1402, 97, 356, 446, 115, 284, 1229, 1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283,
36-
262, 739, 1886, 284, 783, 1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591,
37-
630, 346, 531, 476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335,
38-
531, 476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
39-
100265] # noqa
29+
tokens = [100257, 100264, 9125, 100265, 198, 2675, 527, 264, 11190, 18328, 100266, 198, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
4030
# fmt: on
4131
return tokens
4232

@@ -45,67 +35,41 @@ def test_tokenize_messages(self, tokenizer, expected_tokens):
4535
Message(role="system", content="You are a helpful assistant", masked=True),
4636
Message(
4737
role="user",
48-
content="Below is an instruction that describes a task. Write a response "
49-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
50-
"a realistic dating profile bio.\n\n### Response:\n",
38+
content="Pytorch is the best library!",
5139
masked=True,
5240
),
5341
Message(
5442
role="assistant",
55-
content="I'm an outgoing and friendly person who loves spending time with "
56-
"friends and family. I'm also a big-time foodie and love trying out new "
57-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
58-
"going to museums and galleries. I'm looking for someone who shares my "
59-
"interest in exploring new places, as well as someone who appreciates a "
60-
"good conversation over coffee.",
43+
content="Yes, it is!",
6144
),
6245
]
6346
tokens, mask = tokenizer.tokenize_messages(messages, add_eos=True)
6447

65-
expected_mask = [True] * 101 + [False] * 131
48+
expected_mask = [True] * 24 + [False] * 10
6649
assert expected_tokens == tokens
6750
assert expected_mask == mask
6851

6952
def test_tokenize_messages_no_system_prompt(self, tokenizer):
7053
messages = [
71-
Message(role="system", content="You are a helpful assistant", masked=True),
7254
Message(
7355
role="user",
74-
content="Below is an instruction that describes a task. Write a response "
75-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
76-
"a realistic dating profile bio.\n\n### Response:\n",
56+
content="Pytorch is the best library!",
7757
masked=True,
7858
),
7959
Message(
8060
role="assistant",
81-
content="I'm an outgoing and friendly person who loves spending time with "
82-
"friends and family. I'm also a big-time foodie and love trying out new "
83-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
84-
"going to museums and galleries. I'm looking for someone who shares my "
85-
"interest in exploring new places, as well as someone who appreciates a "
86-
"good conversation over coffee.",
61+
content="Yes, it is!",
8762
),
8863
]
8964
tokens, mask = tokenizer.tokenize_messages(
9065
messages, ignore_system_prompt=True, add_eos=True
9166
)
9267

9368
# fmt: off
94-
expected_tokens = [100257, 100264, 477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445,
95-
98, 300, 258, 256, 281, 107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827,
96-
512, 1080, 116, 300, 262, 1249, 524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534,
97-
258, 1759, 511, 355, 285, 875, 550, 102, 1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112,
98-
279, 316, 1037, 100266, 10, 100264, 520, 511, 446, 100265, 10, 73, 776, 362, 425, 1978, 274, 284,
99-
1528, 319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115, 284, 1749, 803, 46, 270, 776, 1341,
100-
258, 1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913, 1402, 97, 356, 446, 115, 284,
101-
1229, 1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283, 262, 739, 1886, 284,
102-
783, 1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591, 630, 346,
103-
531, 476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335,
104-
531, 476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
105-
100265] # noqa
69+
expected_tokens = [100257, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
10670
# fmt: on
10771

108-
expected_mask = [True] * 84 + [False] * 131
72+
expected_mask = [True] * 13 + [False] * 10
10973
assert expected_tokens == tokens
11074
assert expected_mask == mask
11175

@@ -118,41 +82,22 @@ def test_tokenize_message_drop_eos(self, tokenizer, expected_tokens):
11882
Message(role="system", content="You are a helpful assistant", masked=True),
11983
Message(
12084
role="user",
121-
content="Below is an instruction that describes a task. Write a response "
122-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
123-
"a realistic dating profile bio.\n\n### Response:\n",
85+
content="Pytorch is the best library!",
12486
masked=True,
12587
),
12688
Message(
12789
role="assistant",
128-
content="I'm an outgoing and friendly person who loves spending time with "
129-
"friends and family. I'm also a big-time foodie and love trying out new "
130-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
131-
"going to museums and galleries. I'm looking for someone who shares my "
132-
"interest in exploring new places, as well as someone who appreciates a "
133-
"good conversation over coffee.",
90+
content="Yes, it is!",
13491
),
13592
]
13693

13794
tokens, mask = tokenizer.tokenize_messages(messages, add_eos=False)
13895

13996
# fmt: off
140-
expected_tokens = [100257, 100264, 115, 121, 322, 398, 100265, 10, 1539, 470, 258, 1444, 933, 1940, 511, 446, 100266,
141-
10, 100264, 477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445, 98,
142-
300, 258, 256, 281, 107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827, 512,
143-
1080, 116, 300, 262, 1249, 524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534, 258,
144-
1759, 511, 355, 285, 875, 550, 102, 1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112, 279,
145-
316, 1037, 100266, 10, 100264, 520, 511, 446, 100265, 10, 73, 776, 362, 425, 1978, 274, 284, 1528,
146-
319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115, 284, 1749, 803, 46, 270, 776, 1341, 258,
147-
1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913, 1402, 97, 356, 446, 115, 284, 1229,
148-
1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283, 262, 739, 1886, 284, 783,
149-
1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591, 630, 346, 531,
150-
476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335, 531,
151-
476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
152-
100265] # noqa
97+
expected_tokens = [100257, 100264, 9125, 100265, 198, 2675, 527, 264, 11190, 18328, 100266, 198, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
15398
# fmt: on
15499

155-
expected_mask = [True] * 101 + [False] * 130
100+
expected_mask = [True] * 24 + [False] * 9
156101
# Drop eos token.
157102
assert expected_tokens[:-1] == tokens
158103
assert expected_mask == mask

tests/torchtune/modules/tokenizers/test_gpt2.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88

99
from tests.common import ASSETS
10-
from torchtune.modules.tokenizers import GPT2BaseTokenizer
10+
from torchtune.modules.transforms.tokenizers import GPT2BaseTokenizer
1111

1212

1313
class TestGPT2BaseTokenizer:
@@ -54,6 +54,22 @@ def test_encode(self, tokenizer):
5454
5,
5555
]
5656

57+
def test_decode(self, tokenizer):
58+
tokens = [2, 3, 4, 5]
59+
60+
assert tokenizer.decode(tokens) == ["H", "ell", "o", "Ġworld"]
61+
assert tokenizer.decode(
62+
tokenizer.encode("Hello world!", add_eos=False, add_bos=False)
63+
) == ["H", "ell", "o", "Ġworld"]
64+
assert tokenizer.decode(tokenizer.encode("Hello world!")) == [
65+
None,
66+
"H",
67+
"ell",
68+
"o",
69+
"Ġworld",
70+
None,
71+
]
72+
5773
def test_token_ids(self, tokenizer):
5874
assert tokenizer.eos_id == 1
5975
assert tokenizer.pad_id == 1

torchtune/models/phi4/_model_builders.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def phi4_tokenizer(vocab_path: str = None, merges_path: str = None, path: str =
5555
"""
5656
special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None
5757
template = _get_prompt_template(prompt_template) if prompt_template is not None else None
58-
return Phi4MiniTokenizer(vocab_path=vocab_path, merges_path=merges_path, path=path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template)
58+
return Phi4MiniTokenizer(vocab_path=vocab_path, merges_path=merges_path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template)
5959

6060

6161
def lora_phi4(

torchtune/models/phi4/_tokenizer.py

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,9 @@
99
from torchtune.data._messages import Message
1010
from torchtune.data._prompt_templates import PromptTemplate
1111
from torchtune.data._utils import truncate
12-
from torchtune.modules.tokenizers import (
13-
GPT2BaseTokenizer,
14-
ModelTokenizer,
15-
TikTokenBaseTokenizer,
16-
)
12+
from torchtune.modules.tokenizers import ModelTokenizer
1713
from torchtune.modules.transforms import Transform
14+
from torchtune.modules.transforms.tokenizers import GPT2BaseTokenizer
1815

1916
PHI4_SPECIAL_TOKENS = {
2017
"<|dummy_0|>": 100256,
@@ -46,7 +43,6 @@ class Phi4MiniTokenizer(ModelTokenizer, Transform):
4643
TikToken tokenizer configured with Phi4 (14B) special tokens.
4744
4845
Args:
49-
path (str): Path to tokenizer.model file.
5046
merges_path (str): Path to merges.txt file.
5147
vocab_path (str): Path to vocab.json file.
5248
special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
@@ -73,7 +69,6 @@ class Phi4MiniTokenizer(ModelTokenizer, Transform):
7369

7470
def __init__(
7571
self,
76-
path: str = None,
7772
merges_path: str = None,
7873
vocab_path: str = None,
7974
special_tokens: Optional[Dict[str, int]] = None,
@@ -87,7 +82,7 @@ def __init__(
8782
# Use custom EOS, BOS and pad ids instead of TikToken's
8883
self.eos_id = self.special_tokens["<|im_end|>"]
8984
self.bos_id = self.special_tokens["<|endoftext|>"]
90-
self.pad_id = self.special_tokens["<|dummy_87|>"]
85+
self.pad_id = self.special_tokens["<|dummy_85|>"]
9186

9287
# During generation, stop when eos_id is encountered
9388
self.stop_tokens = [self.eos_id]
@@ -96,26 +91,15 @@ def __init__(
9691

9792
self.prompt_template = prompt_template
9893

99-
if path is None:
100-
self.tt_model = GPT2BaseTokenizer(
101-
vocab_path,
102-
merges_path,
103-
"replace",
104-
self.eos_id,
105-
self.bos_id,
106-
self.eos_id,
107-
self.pad_id,
108-
)
109-
else:
110-
# To be still flexible at the point of tokenizer.model
111-
self.tt_model = TikTokenBaseTokenizer(
112-
path,
113-
"phi4_tiktoken",
114-
CL100K_PATTERN,
115-
bos_id=self.bos_id,
116-
eos_id=self.eos_id,
117-
special_tokens=self.special_tokens,
118-
)
94+
self.tokenizer_model = GPT2BaseTokenizer(
95+
vocab_path,
96+
merges_path,
97+
"replace",
98+
self.eos_id,
99+
self.bos_id,
100+
self.eos_id,
101+
self.pad_id,
102+
)
119103

120104
@property
121105
def vocab_size(self):
@@ -126,9 +110,8 @@ def encode(
126110
text: str,
127111
add_bos: bool = True,
128112
add_eos: bool = True,
129-
trim_leading_whitespace: bool = False,
130113
) -> List[int]:
131-
return self.tt_model.encode(
114+
return self.tokenizer_model.encode(
132115
text=text,
133116
add_bos=add_bos,
134117
add_eos=add_eos,
@@ -155,14 +138,13 @@ def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
155138
ids_for_decode.append(token_id)
156139
return self.tt_model.decode(ids_for_decode)
157140

158-
def _tokenize_header(self, role: str):
141+
def _tokenize_header(self, role: str) -> list:
159142
tokenized_messages = []
160143
tokenized_messages.append(self.special_tokens["<|im_start|>"])
161144
encoded = self.encode(
162145
role,
163146
add_bos=False,
164147
add_eos=False,
165-
trim_leading_whitespace=True,
166148
)
167149

168150
tokenized_messages.extend(encoded)
@@ -256,7 +238,6 @@ def tokenize_messages(
256238
item["content"].rstrip(" "),
257239
add_bos=False,
258240
add_eos=False,
259-
trim_leading_whitespace=True, # Always trim whitespace (just to match HF tokenizer implementation)
260241
)
261242
else:
262243
raise RuntimeError(

torchtune/modules/tokenizers/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._gpt2 import GPT2BaseTokenizer
87
from ._sentencepiece import SentencePieceBaseTokenizer
98
from ._tiktoken import TikTokenBaseTokenizer
109
from ._utils import (
@@ -17,7 +16,6 @@
1716
__all__ = [
1817
"SentencePieceBaseTokenizer",
1918
"TikTokenBaseTokenizer",
20-
"GPT2BaseTokenizer",
2119
"ModelTokenizer",
2220
"BaseTokenizer",
2321
"tokenize_messages_no_special_tokens",
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from torchtune.modules.transforms.tokenizers._gpt2 import GPT2BaseTokenizer
8+
9+
__all__ = ["GPT2BaseTokenizer"]

0 commit comments

Comments (0)