meta-pytorch
diff --git a/‎tests/test_utils.py‎
Lines changed: 5 additions & 2 deletions b/‎tests/test_utils.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎tests/torchtune/config/test_config_utils.py‎
Lines changed: 0 additions & 32 deletions b/‎tests/torchtune/config/test_config_utils.py‎
Lines changed: 0 additions & 32 deletions
diff --git a/‎tests/torchtune/data/test_chat_formats.py‎
Lines changed: 130 additions & 0 deletions b/‎tests/torchtune/data/test_chat_formats.py‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎tests/torchtune/data/test_data_utils.py‎
Lines changed: 10 additions & 73 deletions b/‎tests/torchtune/data/test_data_utils.py‎
Lines changed: 10 additions & 73 deletions
@@ -18,7 +18,7 @@
 
 import torch
 from torch import nn
-
+from torchtune.modules import Tokenizer
 
 skip_if_cuda_not_available = unittest.skipIf(
     not torch.cuda.is_available(), "CUDA is not available"
@@ -31,8 +31,11 @@
     "llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt",
 }
 
+# Inherit from the Tokenizer class to reuse its tokenize_messages method.
+class DummyTokenizer(Tokenizer):
+    def __init__(self):
+        self.encodes_whitespace = False
 
-class DummyTokenizer:
     def encode(self, text, add_bos=True, add_eos=True, **kwargs):
         words = text.split()
         tokens = [len(word) for word in words]
 
@@ -9,11 +9,9 @@
 import pytest
 from torchtune.config._utils import (
     _get_component_from_path,
-    _get_template,
     _merge_yaml_and_cli_args,
     InstantiationError,
 )
-from torchtune.data import AlpacaInstructTemplate
 from torchtune.utils.argparse import TuneRecipeArgumentParser
 
 _CONFIG = {
@@ -109,33 +107,3 @@ def test_merge_yaml_and_cli_args(self, mock_load):
             ValueError, match="Command-line overrides must be in the form of key=value"
         ):
             _ = _merge_yaml_and_cli_args(yaml_args, cli_args)
-
-    def test_get_template(self):
-        # Test valid template class
-        template = _get_template("AlpacaInstructTemplate")
-        assert isinstance(template, AlpacaInstructTemplate)
-
-        # Test invalid template class
-        with pytest.raises(
-            ValueError,
-            match="Must be a PromptTemplate class or a string with placeholders.",
-        ):
-            _ = _get_template("InvalidTemplate")
-
-        # Test valid template strings
-        valid_templates = [
-            "Instruction: {instruction}\nInput: {input}",
-            "Instruction: {instruction}",
-            "{a}",
-        ]
-        for template in valid_templates:
-            assert _get_template(template) == template
-
-        # Test invalid template strings
-        invalid_templates = ["hello", "{}", "a}{b"]
-        for template in invalid_templates:
-            with pytest.raises(
-                ValueError,
-                match="Must be a PromptTemplate class or a string with placeholders.",
-            ):
-                _ = _get_template(template)
@@ -0,0 +1,130 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+from torchtune.data import ChatMLFormat, Llama2ChatFormat, Message, MistralChatFormat
+
+# Taken from Open-Orca/SlimOrca-Dedup on HuggingFace:
+# https://huggingface.co/datasets/Open-Orca/SlimOrca-Dedup
+CHAT_SAMPLE = [
+    Message(
+        role="system",
+        content="You are an AI assistant. User will you give you a task. "
+        "Your goal is to complete the task as faithfully as you can. "
+        "While performing the task think step-by-step and justify your steps.",
+    ),
+    Message(
+        role="user",
+        content="Please briefly summarize this news article:\n\nAOL.com Video - "
+        "Father Lets 8-Year-Old Drive On Icy Road\n\nDescription:Would you let your "
+        "8-year-old drive your car? How about on an icy road? Well one father in "
+        "Russia did just that, and recorded the entire thing. To her credit, the "
+        "child seemed to be doing a great job. (0:44)\n\nTags: 8-year-old driver , "
+        "caught on camera , child driver , pix11\n\nSummary:",
+    ),
+    Message(
+        role="assistant",
+        content="A father in Russia allowed his 8-year-old child to drive his car "
+        "on an icy road and recorded the event. The child appeared to be handling the "
+        "situation well, showcasing their driving skills despite the challenging conditions.",
+    ),
+]
+
+
+def _assert_dialogue_equal(actual, expected):  # Assert two message sequences match item by item.
+    assert len(actual) == len(expected)  # checked first so the index loop cannot overrun `expected`
+    for i in range(len(actual)):
+        assert actual[i].role == expected[i].role
+        assert actual[i].content == expected[i].content  # only role and content are compared
+
+
+class TestLlama2ChatFormat:  # Checks Llama2ChatFormat.format against CHAT_SAMPLE.
+    expected_dialogue = [  # two messages expected: the system text is embedded in the user message
+        Message(
+            role="user",  # content below: "[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST] "
+            content="[INST] <<SYS>>\nYou are an AI assistant. User will you give you a task. "
+            "Your goal is to complete the task as faithfully as you can. While performing "
+            "the task think step-by-step and justify your steps.\n<</SYS>>\n\nPlease "
+            "briefly summarize this news article:\n\nAOL.com Video - Father Lets 8-Year-Old "
+            "Drive On Icy Road\n\nDescription:Would you let your 8-year-old drive your car? "
+            "How about on an icy road? Well one father in Russia did just that, and recorded "
+            "the entire thing. To her credit, the child seemed to be doing a great job. "
+            "(0:44)\n\nTags: 8-year-old driver , caught on camera , child driver , pix11\n\n"
+            "Summary: [/INST] ",
+        ),
+        Message(
+            role="assistant",  # assistant content is expected to match CHAT_SAMPLE's text unchanged
+            content="A father in Russia allowed his 8-year-old child to drive his car on an "
+            "icy road and recorded the event. The child appeared to be handling the situation well, "
+            "showcasing their driving skills despite the challenging conditions.",
+        ),
+    ]
+
+    def test_format(self):  # format() is invoked on the class, not on an instance
+        actual = Llama2ChatFormat.format(CHAT_SAMPLE)
+        _assert_dialogue_equal(actual, self.expected_dialogue)
+
+
+class TestMistralChatFormat:  # Checks MistralChatFormat.format with and without a system message.
+    expected_dialogue = [  # user content wrapped as "[INST] {user} [/INST] "; no system text present
+        Message(
+            role="user",
+            content="[INST] Please briefly summarize this news article:\n\nAOL.com Video - Father Lets 8-Year-Old "
+            "Drive On Icy Road\n\nDescription:Would you let your 8-year-old drive your car? "
+            "How about on an icy road? Well one father in Russia did just that, and recorded "
+            "the entire thing. To her credit, the child seemed to be doing a great job. "
+            "(0:44)\n\nTags: 8-year-old driver , caught on camera , child driver , pix11\n\n"
+            "Summary: [/INST] ",
+        ),
+        Message(
+            role="assistant",  # assistant content is expected to match CHAT_SAMPLE's text unchanged
+            content="A father in Russia allowed his 8-year-old child to drive his car on an "
+            "icy road and recorded the event. The child appeared to be handling the situation well, "
+            "showcasing their driving skills despite the challenging conditions.",
+        ),
+    ]
+
+    def test_format(self):
+        no_system_sample = CHAT_SAMPLE[1:]  # drop the leading system message (CHAT_SAMPLE[0])
+        actual = MistralChatFormat.format(no_system_sample)
+        _assert_dialogue_equal(actual, self.expected_dialogue)
+
+    def test_format_with_system_prompt_raises(self):  # the full sample includes a system message
+        with pytest.raises(
+            ValueError, match="System prompts are not supported in MistralChatFormat"
+        ):
+            _ = MistralChatFormat.format(CHAT_SAMPLE)
+
+
+class TestChatMLFormat:  # Checks ChatMLFormat.format against CHAT_SAMPLE.
+    expected_dialogue = [  # three messages expected, each wrapped as "<|im_start|>{role}\n...<|im_end|>"
+        Message(
+            role="system",  # unlike the Llama2 expectation, the system message stays a separate entry
+            content="<|im_start|>system\nYou are an AI assistant. User will you give you a task. "
+            "Your goal is to complete the task as faithfully as you can. While performing "
+            "the task think step-by-step and justify your steps.<|im_end|>\n",
+        ),
+        Message(
+            role="user",
+            content="<|im_start|>user\nPlease "
+            "briefly summarize this news article:\n\nAOL.com Video - Father Lets 8-Year-Old "
+            "Drive On Icy Road\n\nDescription:Would you let your 8-year-old drive your car? "
+            "How about on an icy road? Well one father in Russia did just that, and recorded "
+            "the entire thing. To her credit, the child seemed to be doing a great job. "
+            "(0:44)\n\nTags: 8-year-old driver , caught on camera , child driver , pix11\n\n"
+            "Summary:<|im_end|>\n",
+        ),
+        Message(
+            role="assistant",
+            content="<|im_start|>assistant\nA father in Russia allowed his 8-year-old child to drive his car on an "
+            "icy road and recorded the event. The child appeared to be handling the situation well, "
+            "showcasing their driving skills despite the challenging conditions.<|im_end|>",  # no trailing "\n" here, unlike the earlier turns
+        ),
+    ]
+
+    def test_format(self):  # format() is invoked on the class, not on an instance
+        actual = ChatMLFormat.format(CHAT_SAMPLE)
+        _assert_dialogue_equal(actual, self.expected_dialogue)
@@ -4,84 +4,21 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from tests.test_utils import DummyTokenizer
-from torchtune.data import tokenize_prompt_and_response, truncate
-from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
-
-
-def test_tokenize_prompt_and_response():
-    tokenizer = DummyTokenizer()
-    prompt = "Instruction:\nThis is an instruction.\n\nInput:\nThis is an input.\n\nResponse: "
-    response = "I always know what I'm doing, do you?"
-    prompt_length = 12
-    expected_tokenized_prompt = [
-        0,
-        12,
-        4,
-        2,
-        2,
-        12,
-        6,
-        4,
-        2,
-        2,
-        6,
-        9,
-        1,
-        6,
-        4,
-        4,
-        3,
-        6,
-        2,
-        4,
-        -1,
-    ]
-    expected_tokenized_label = [CROSS_ENTROPY_IGNORE_IDX] * prompt_length + [
-        1,
-        6,
-        4,
-        4,
-        3,
-        6,
-        2,
-        4,
-        -1,
-    ]
-
-    tokenized_prompt, tokenized_label = tokenize_prompt_and_response(
-        tokenizer, prompt, response
-    )
-    assert tokenized_prompt == expected_tokenized_prompt
-    assert tokenized_label == expected_tokenized_label
-
-    tokenized_prompt, tokenized_label = tokenize_prompt_and_response(
-        tokenizer, prompt, response, train_on_input=True
-    )
-    assert tokenized_prompt == expected_tokenized_prompt
-    assert tokenized_label == expected_tokenized_prompt
+from torchtune.data import truncate
 
 
 def test_truncate():
-    prompt_tokens = [1, 2, 3, 4, -1]
-    label_tokens = [1, 2, 3, 4, -1]
+    tokens = [1, 2, 3, 4, -1]  # the final element (-1) equals the eos_id passed below
 
     # Test no truncation
-    truncated_prompt_tokens, truncated_label_tokens = truncate(
-        tokenizer=DummyTokenizer(),
-        prompt_tokens=prompt_tokens,
-        label_tokens=label_tokens,
+    truncated_tokens = truncate(  # len(tokens) == max_seq_len, so the input is expected back unchanged
+        tokens=tokens,
         max_seq_len=5,
+        eos_id=-1,
     )
-    assert truncated_prompt_tokens == prompt_tokens
-    assert truncated_label_tokens == label_tokens
+    assert truncated_tokens == tokens
 
-    # Test truncated
-    truncated_prompt_tokens, truncated_label_tokens = truncate(
-        tokenizer=DummyTokenizer(),
-        prompt_tokens=prompt_tokens,
-        label_tokens=label_tokens,
-        max_seq_len=4,
-    )
-    assert truncated_prompt_tokens == [1, 2, 3, -1]
-    assert truncated_label_tokens == [1, 2, 3, -1]
+    masks = [True, True, False, True, False]  # five entries, one more than max_seq_len below
+    # Test truncated mask
+    truncated_masks = truncate(tokens=masks, max_seq_len=4, eos_id=False)  # eos_id doubles as the mask fill value
+    assert truncated_masks == [True, True, False, False]  # first three kept, fourth replaced by eos_id