Skip to content

Commit 6e50261

Browse files
Mark Obozov
authored and committed
fixes
1 parent 146cac3 commit 6e50261

File tree

9 files changed

+100087
-121
lines changed

9 files changed

+100087
-121
lines changed

tests/assets/merges.txt

Lines changed: 100001 additions & 0 deletions
Large diffs are not rendered by default.

tests/assets/vocab.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/torchtune/models/phi4/test_phi4_tokenizer.py

Lines changed: 14 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,14 @@ def tokenizer(self):
1919
# Pretrained tiktoken model generated via the script in
2020
# https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4
2121
return phi4_tokenizer(
22-
path=str(ASSETS / "tiktoken_small.model"),
22+
vocab_path=(ASSETS / "vocab.json"),
23+
merges_path=(ASSETS / "merges.txt"),
2324
)
2425

2526
@pytest.fixture
2627
def expected_tokens(self):
2728
# fmt: off
28-
tokens = [100257, 100264, 115, 121, 322, 398, 100265, 10, 1539, 470, 258, 1444, 933, 1940, 511, 446, 100266, 10, 100264,
29-
477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445, 98, 300, 258, 256, 281,
30-
107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827, 512, 1080, 116, 300, 262, 1249,
31-
524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534, 258, 1759, 511, 355, 285, 875, 550, 102,
32-
1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112, 279, 316, 1037, 100266, 10, 100264, 520, 511, 446,
33-
100265, 10, 73, 776, 362, 425, 1978, 274, 284, 1528, 319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115,
34-
284, 1749, 803, 46, 270, 776, 1341, 258, 1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913,
35-
1402, 97, 356, 446, 115, 284, 1229, 1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283,
36-
262, 739, 1886, 284, 783, 1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591,
37-
630, 346, 531, 476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335,
38-
531, 476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
39-
100265] # noqa
29+
tokens = [100257, 100264, 9125, 100265, 198, 2675, 527, 264, 11190, 18328, 100266, 198, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
4030
# fmt: on
4131
return tokens
4232

@@ -45,67 +35,41 @@ def test_tokenize_messages(self, tokenizer, expected_tokens):
4535
Message(role="system", content="You are a helpful assistant", masked=True),
4636
Message(
4737
role="user",
48-
content="Below is an instruction that describes a task. Write a response "
49-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
50-
"a realistic dating profile bio.\n\n### Response:\n",
38+
content="Pytorch is the best library!",
5139
masked=True,
5240
),
5341
Message(
5442
role="assistant",
55-
content="I'm an outgoing and friendly person who loves spending time with "
56-
"friends and family. I'm also a big-time foodie and love trying out new "
57-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
58-
"going to museums and galleries. I'm looking for someone who shares my "
59-
"interest in exploring new places, as well as someone who appreciates a "
60-
"good conversation over coffee.",
43+
content="Yes, it is!",
6144
),
6245
]
6346
tokens, mask = tokenizer.tokenize_messages(messages, add_eos=True)
6447

65-
expected_mask = [True] * 101 + [False] * 131
48+
expected_mask = [True] * 24 + [False] * 10
6649
assert expected_tokens == tokens
6750
assert expected_mask == mask
6851

6952
def test_tokenize_messages_no_system_prompt(self, tokenizer):
7053
messages = [
71-
Message(role="system", content="You are a helpful assistant", masked=True),
7254
Message(
7355
role="user",
74-
content="Below is an instruction that describes a task. Write a response "
75-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
76-
"a realistic dating profile bio.\n\n### Response:\n",
56+
content="Pytorch is the best library!",
7757
masked=True,
7858
),
7959
Message(
8060
role="assistant",
81-
content="I'm an outgoing and friendly person who loves spending time with "
82-
"friends and family. I'm also a big-time foodie and love trying out new "
83-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
84-
"going to museums and galleries. I'm looking for someone who shares my "
85-
"interest in exploring new places, as well as someone who appreciates a "
86-
"good conversation over coffee.",
61+
content="Yes, it is!",
8762
),
8863
]
8964
tokens, mask = tokenizer.tokenize_messages(
9065
messages, ignore_system_prompt=True, add_eos=True
9166
)
9267

9368
# fmt: off
94-
expected_tokens = [100257, 100264, 477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445,
95-
98, 300, 258, 256, 281, 107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827,
96-
512, 1080, 116, 300, 262, 1249, 524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534,
97-
258, 1759, 511, 355, 285, 875, 550, 102, 1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112,
98-
279, 316, 1037, 100266, 10, 100264, 520, 511, 446, 100265, 10, 73, 776, 362, 425, 1978, 274, 284,
99-
1528, 319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115, 284, 1749, 803, 46, 270, 776, 1341,
100-
258, 1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913, 1402, 97, 356, 446, 115, 284,
101-
1229, 1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283, 262, 739, 1886, 284,
102-
783, 1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591, 630, 346,
103-
531, 476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335,
104-
531, 476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
105-
100265] # noqa
69+
expected_tokens = [100257, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
10670
# fmt: on
10771

108-
expected_mask = [True] * 84 + [False] * 131
72+
expected_mask = [True] * 13 + [False] * 10
10973
assert expected_tokens == tokens
11074
assert expected_mask == mask
11175

@@ -118,41 +82,22 @@ def test_tokenize_message_drop_eos(self, tokenizer, expected_tokens):
11882
Message(role="system", content="You are a helpful assistant", masked=True),
11983
Message(
12084
role="user",
121-
content="Below is an instruction that describes a task. Write a response "
122-
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
123-
"a realistic dating profile bio.\n\n### Response:\n",
85+
content="Pytorch is the best library!",
12486
masked=True,
12587
),
12688
Message(
12789
role="assistant",
128-
content="I'm an outgoing and friendly person who loves spending time with "
129-
"friends and family. I'm also a big-time foodie and love trying out new "
130-
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
131-
"going to museums and galleries. I'm looking for someone who shares my "
132-
"interest in exploring new places, as well as someone who appreciates a "
133-
"good conversation over coffee.",
90+
content="Yes, it is!",
13491
),
13592
]
13693

13794
tokens, mask = tokenizer.tokenize_messages(messages, add_eos=False)
13895

13996
# fmt: off
140-
expected_tokens = [100257, 100264, 115, 121, 322, 398, 100265, 10, 1539, 470, 258, 1444, 933, 1940, 511, 446, 100266,
141-
10, 100264, 477, 273, 100265, 10, 66, 478, 299, 351, 362, 292, 1160, 117, 807, 334, 958, 99, 445, 98,
142-
300, 258, 256, 281, 107, 46, 411, 114, 561, 258, 1156, 279, 316, 334, 604, 337, 112, 445, 1827, 512,
143-
1080, 116, 300, 262, 1249, 524, 340, 10, 35, 35, 35, 828, 1160, 117, 807, 1037, 71, 1414, 534, 258,
144-
1759, 511, 355, 285, 875, 550, 102, 1546, 265, 105, 111, 340, 10, 35, 35, 35, 408, 300, 112, 279,
145-
316, 1037, 100266, 10, 100264, 520, 511, 446, 100265, 10, 73, 776, 362, 425, 1978, 274, 284, 1528,
146-
319, 995, 505, 944, 874, 903, 1585, 616, 345, 1528, 115, 284, 1749, 803, 46, 270, 776, 1341, 258,
147-
1279, 641, 563, 275, 469, 573, 284, 944, 320, 526, 962, 425, 913, 1402, 97, 356, 446, 115, 284, 1229,
148-
1581, 282, 117, 276, 259, 300, 46, 270, 776, 258, 1279, 275, 288, 283, 262, 739, 1886, 284, 783,
149-
1803, 636, 277, 268, 117, 316, 485, 115, 284, 302, 416, 273, 900, 46, 270, 776, 591, 630, 346, 531,
150-
476, 505, 768, 1233, 342, 1923, 292, 522, 662, 280, 274, 913, 601, 359, 300, 44, 335, 834, 335, 531,
151-
476, 505, 604, 264, 509, 1456, 258, 771, 543, 1719, 405, 710, 665, 668, 1280, 46, 100266, 10,
152-
100265] # noqa
97+
expected_tokens = [100257, 100264, 9125, 100265, 198, 2675, 527, 264, 11190, 18328, 100266, 198, 100264, 882, 100265, 198, 14149, 28514, 374, 279, 1888, 6875, 100266, 198, 100264, 78191, 100265, 198, 9642, 433, 374, 100266, 198, 100265]
15398
# fmt: on
15499

155-
expected_mask = [True] * 101 + [False] * 130
100+
expected_mask = [True] * 24 + [False] * 9
156101
# Drop eos token.
157102
assert expected_tokens[:-1] == tokens
158103
assert expected_mask == mask

tests/torchtune/modules/tokenizers/test_gpt2.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88

99
from tests.common import ASSETS
10-
from torchtune.modules.tokenizers import GPT2BaseTokenizer
10+
from torchtune.modules.transforms.tokenizers import GPT2BaseTokenizer
1111

1212

1313
class TestGPT2BaseTokenizer:
@@ -54,6 +54,22 @@ def test_encode(self, tokenizer):
5454
5,
5555
]
5656

57+
def test_decode(self, tokenizer):
58+
tokens = [2, 3, 4, 5]
59+
60+
assert tokenizer.decode(tokens) == ["H", "ell", "o", "Ġworld"]
61+
assert tokenizer.decode(
62+
tokenizer.encode("Hello world!", add_eos=False, add_bos=False)
63+
) == ["H", "ell", "o", "Ġworld"]
64+
assert tokenizer.decode(tokenizer.encode("Hello world!")) == [
65+
None,
66+
"H",
67+
"ell",
68+
"o",
69+
"Ġworld",
70+
None,
71+
]
72+
5773
def test_token_ids(self, tokenizer):
5874
assert tokenizer.eos_id == 1
5975
assert tokenizer.pad_id == 1

torchtune/models/phi4/_model_builders.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def phi4_tokenizer(vocab_path: str = None, merges_path: str = None, path: str =
5555
"""
5656
special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None
5757
template = _get_prompt_template(prompt_template) if prompt_template is not None else None
58-
return Phi4MiniTokenizer(vocab_path=vocab_path, merges_path=merges_path, path=path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template)
58+
return Phi4MiniTokenizer(vocab_path=vocab_path, merges_path=merges_path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template)
5959

6060

6161
def lora_phi4(

torchtune/models/phi4/_tokenizer.py

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,9 @@
99
from torchtune.data._messages import Message
1010
from torchtune.data._prompt_templates import PromptTemplate
1111
from torchtune.data._utils import truncate
12-
from torchtune.modules.tokenizers import (
13-
GPT2BaseTokenizer,
14-
ModelTokenizer,
15-
TikTokenBaseTokenizer,
16-
)
12+
from torchtune.modules.tokenizers import ModelTokenizer
1713
from torchtune.modules.transforms import Transform
14+
from torchtune.modules.transforms.tokenizers import GPT2BaseTokenizer
1815

1916
PHI4_SPECIAL_TOKENS = {
2017
"<|dummy_0|>": 100256,
@@ -46,7 +43,6 @@ class Phi4MiniTokenizer(ModelTokenizer, Transform):
4643
TikToken tokenizer configured with Phi4 (14B) special tokens.
4744
4845
Args:
49-
path (str): Path to tokenizer.model file.
5046
merges_path (str): Path to merges.txt file.
5147
vocab_path (str): Path to vocab.json file.
5248
special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
@@ -73,7 +69,6 @@ class Phi4MiniTokenizer(ModelTokenizer, Transform):
7369

7470
def __init__(
7571
self,
76-
path: str = None,
7772
merges_path: str = None,
7873
vocab_path: str = None,
7974
special_tokens: Optional[Dict[str, int]] = None,
@@ -87,7 +82,7 @@ def __init__(
8782
# Use custom EOS, BOS and pad ids instead of TikToken's
8883
self.eos_id = self.special_tokens["<|im_end|>"]
8984
self.bos_id = self.special_tokens["<|endoftext|>"]
90-
self.pad_id = self.special_tokens["<|dummy_87|>"]
85+
self.pad_id = self.special_tokens["<|dummy_85|>"]
9186

9287
# During generation, stop when eos_id is encountered
9388
self.stop_tokens = [self.eos_id]
@@ -96,26 +91,15 @@ def __init__(
9691

9792
self.prompt_template = prompt_template
9893

99-
if path is None:
100-
self.tt_model = GPT2BaseTokenizer(
101-
vocab_path,
102-
merges_path,
103-
"replace",
104-
self.eos_id,
105-
self.bos_id,
106-
self.eos_id,
107-
self.pad_id,
108-
)
109-
else:
110-
# To be still flexible at the point of tokenizer.model
111-
self.tt_model = TikTokenBaseTokenizer(
112-
path,
113-
"phi4_tiktoken",
114-
CL100K_PATTERN,
115-
bos_id=self.bos_id,
116-
eos_id=self.eos_id,
117-
special_tokens=self.special_tokens,
118-
)
94+
self.tokenizer_model = GPT2BaseTokenizer(
95+
vocab_path,
96+
merges_path,
97+
"replace",
98+
self.eos_id,
99+
self.bos_id,
100+
self.eos_id,
101+
self.pad_id,
102+
)
119103

120104
@property
121105
def vocab_size(self):
@@ -126,9 +110,8 @@ def encode(
126110
text: str,
127111
add_bos: bool = True,
128112
add_eos: bool = True,
129-
trim_leading_whitespace: bool = False,
130113
) -> List[int]:
131-
return self.tt_model.encode(
114+
return self.tokenizer_model.encode(
132115
text=text,
133116
add_bos=add_bos,
134117
add_eos=add_eos,
@@ -155,14 +138,13 @@ def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
155138
ids_for_decode.append(token_id)
156139
return self.tt_model.decode(ids_for_decode)
157140

158-
def _tokenize_header(self, role: str):
141+
def _tokenize_header(self, role: str) -> list:
159142
tokenized_messages = []
160143
tokenized_messages.append(self.special_tokens["<|im_start|>"])
161144
encoded = self.encode(
162145
role,
163146
add_bos=False,
164147
add_eos=False,
165-
trim_leading_whitespace=True,
166148
)
167149

168150
tokenized_messages.extend(encoded)
@@ -256,7 +238,6 @@ def tokenize_messages(
256238
item["content"].rstrip(" "),
257239
add_bos=False,
258240
add_eos=False,
259-
trim_leading_whitespace=True, # Always trim whitespace (just to match HF tokenizer implementation)
260241
)
261242
else:
262243
raise RuntimeError(

torchtune/modules/tokenizers/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._gpt2 import GPT2BaseTokenizer
87
from ._sentencepiece import SentencePieceBaseTokenizer
98
from ._tiktoken import TikTokenBaseTokenizer
109
from ._utils import (
@@ -17,7 +16,6 @@
1716
__all__ = [
1817
"SentencePieceBaseTokenizer",
1918
"TikTokenBaseTokenizer",
20-
"GPT2BaseTokenizer",
2119
"ModelTokenizer",
2220
"BaseTokenizer",
2321
"tokenize_messages_no_special_tokens",
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from torchtune.modules.transforms.tokenizers._gpt2 import GPT2BaseTokenizer
8+
9+
__all__ = ["GPT2BaseTokenizer"]

0 commit comments

Comments (0)