 # Not ideal to import this type here but it's needed for the transform function
 from torchtune.modules import Tokenizer
 
-_CROSS_ENTROPY_IGNORE_IDX = -100
+
+class Llama2ChatFormatConstants:
+    CROSS_ENTROPY_IGNORE_IDX = -100
+    B_INST, E_INST = "[INST]", "[/INST]"
+    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
 
 
 class SlimOrcaDataset(Dataset):
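
For reference, here is a minimal sketch (not part of the diff) of what these tags render to when composed into a Llama2 Chat Format prompt; the system and user strings are made up:

```python
# Illustration only: compose a Llama2 Chat Format prompt from the new constants.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

system, user = "You are a helpful assistant.", "What is 2 + 2?"
prompt = f"{B_INST} {B_SYS}{system}{E_SYS}{user} {E_INST}"
print(prompt)
# [INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# What is 2 + 2? [/INST]
```
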
@@ -22,10 +26,11 @@ class SlimOrcaDataset(Dataset):
 
     The data is formatted to adhere to Llama2 Chat Format.
     This format is required if the base model is Llama2 Chat Model.
+    The base Llama2 Model doesn't prescribe a particular format.
 
     The returned data is a tuple of input token id list and label token id
     list. If `max_token_length` keyword argument is provided, the returned
-    input token id list is ensured (by truncation if necssary) to be within
+    input token id list is ensured (by truncation if necessary) to be within
     that length.
 
     Args:
@@ -35,7 +40,7 @@ class SlimOrcaDataset(Dataset):
             max sequence length accepted by the model.
 
     Keyword Arguments:
-        max_token_length (int): Maximum number of tokens in the returned.
+        max_token_length (int): Maximum number of tokens in the returned input and label token id lists.
             Default is 1024.
 
     Data input format:
@@ -51,15 +56,16 @@ class SlimOrcaDataset(Dataset):
         their funeral." } ]
 
     Example:
-        >>> slimorca_ds = SlimOrcaDataset(tokenizer=tokenizer)
-        >>> for batch in Dataloader(slimorca_ds, batch_size=8):
-            print(f"Batch size: {len(batch)}")
-        Batch size: 8
+        >>> ds = SlimOrcaDataset(tokenizer=tokenizer, max_token_length=10)
+        >>> for input, label in ds:
+            print(input)
+            print(label)
+
+        Sample Output:
+        [1, 351, 82, 391, 221, 220, 193, 12, 471, ..., 2]
+        [-100, -100, -100, -100, -100, -100, -100, -100, 471, ..., 2]
     """
5968
60- B_INST , E_INST = "[INST]" , "[/INST]"
61- B_SYS , E_SYS = "<<SYS>>\n " , "\n <</SYS>>\n \n "
62-
6369 def __init__ (self , tokenizer : Tokenizer , ** kwargs ) -> None :
6470 self ._data = load_dataset ("Open-Orca/SlimOrca-Dedup" , split = "train" )
6571 self ._tokenizer = tokenizer
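
Since `__getitem__` returns variable-length Python lists, batching with a `DataLoader` needs a collate function that pads inputs and labels to a common length. A hypothetical sketch, not part of this diff (the function name and pad id of 0 are assumptions):

```python
import torch

def pad_collate(batch, pad_id=0, ignore_idx=-100):
    # Pad inputs with the tokenizer's pad id (assumed 0 here) and labels
    # with the cross entropy ignore index so padding adds nothing to the loss.
    max_len = max(len(input_ids) for input_ids, _ in batch)
    inputs = torch.tensor([ids + [pad_id] * (max_len - len(ids)) for ids, _ in batch])
    labels = torch.tensor([ids + [ignore_idx] * (max_len - len(ids)) for _, ids in batch])
    return inputs, labels

# loader = torch.utils.data.DataLoader(ds, batch_size=8, collate_fn=pad_collate)
```
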
@@ -72,18 +78,25 @@ def __init__(self, tokenizer: Tokenizer, **kwargs) -> None:
     def __len__(self):
         return len(self._data)
 
-    def prompt_with_system(self, content: str) -> str:
-        return f"{self.B_INST} {self.B_SYS}{content}{self.E_SYS} {self.E_INST}"
-
-    def prompt_without_system(self, content: str) -> str:
-        return f"{self.B_INST} {content} {self.E_INST}"
-
     def __getitem__(self, index: int) -> Tuple[List[int], List[int]]:
         data = self._data[index]["conversations"]
-        prompt, label = self.generate_prompt_label(data)
-        return self.generate_tokens(prompt, label)
+        prompt, label = self._generate_prompt_label(data)
+        return self._generate_tokens(prompt, label)
+
+    def _generate_tokens(self, prompt: str, label: str) -> Tuple[List[int], List[int]]:
+        """
+        Given a prompt string and label string, generate input and label token id lists.
+
+        The tokenizer is used to tokenize both strings.
+        The prompt token list is truncated to `max_token_length` - 2
+        (so that there is at least one label token, as EOS takes one token).
+
+        The label token list is truncated to `max_token_length` - len(prompt_token_list).
+
+        Finally, the input token list is the concatenation of the prompt and label token lists.
 
-    def generate_tokens(self, prompt: str, label: str) -> Tuple[List[int], List[int]]:
+        The label token list is padded with the cross entropy ignore idx value to match the length of the input token list.
+        """
         prompt_tokens = self._tokenizer.encode(prompt, add_bos=True, add_eos=False)
         # Truncate to max token length - 2 (so that there is at least one label token)
         prompt_tokens = prompt_tokens[: self._max_token_length - 2]
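
To make the truncation arithmetic in the docstring concrete, here is a standalone sketch with made-up token ids and `max_token_length=10`, matching the docstring's sample output (the real method uses the tokenizer; this only traces the lengths):

```python
IGNORE_IDX = -100
max_token_length = 10

prompt_tokens = [1, 351, 82, 391, 221, 220, 193, 12, 471, 405]  # 10 ids incl. BOS
label_tokens = [471, 2]                                         # response + EOS

# Keep at most max_token_length - 2 prompt tokens -> 8 remain here.
prompt_tokens = prompt_tokens[: max_token_length - 2]
# The label gets whatever room is left -> 10 - 8 = 2 tokens remain.
label_tokens = label_tokens[: max_token_length - len(prompt_tokens)]

input_ids = prompt_tokens + label_tokens
labels = [IGNORE_IDX] * len(prompt_tokens) + label_tokens
assert len(input_ids) == len(labels) == max_token_length
```
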
@@ -99,12 +112,16 @@ def generate_tokens(self, prompt: str, label: str) -> Tuple[List[int], List[int]]:
 
         input = prompt_tokens + label_tokens
         label = [
-            _CROSS_ENTROPY_IGNORE_IDX for _ in range(len(prompt_tokens))
+            Llama2ChatFormatConstants.CROSS_ENTROPY_IGNORE_IDX
+            for _ in range(len(prompt_tokens))
         ] + label_tokens
-        assert len(input) == len(label)
         return input, label
 
-    def generate_prompt_label(self, data: List[Dict[str, str]]) -> Tuple[str, str]:
+    def _generate_prompt_label(self, data: List[Dict[str, str]]) -> Tuple[str, str]:
+        """
+        Construct prompt and label strings adhering to Llama2 Chat Format.
+        This method supports only a single back-and-forth conversation per sample (as that is sufficient for the SlimOrca dataset).
+        """
         agent_text_dict = {}
         # agents can be {system, human, gpt}
         for conversation in data:
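
Why -100: it is the default `ignore_index` of `torch.nn.CrossEntropyLoss`, so the prompt positions filled with `CROSS_ENTROPY_IGNORE_IDX` in the hunk above are simply skipped when computing the loss. A minimal check:

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 32)                 # 4 positions, vocab size 32
targets = torch.tensor([-100, -100, 7, 2])  # first two positions masked

loss_fn = nn.CrossEntropyLoss()             # ignore_index defaults to -100
loss = loss_fn(logits, targets)             # averaged over the 2 unmasked positions
```
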
@@ -113,10 +130,10 @@ def generate_prompt_label(self, data: List[Dict[str, str]]) -> Tuple[str, str]:
             agent_text_dict[agent] = text
 
         # Llama2 Chat Format - https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L284
-        if len(agent_text_dict["system"]) > 0:
-            prompt = f"{self.B_INST} {self.B_SYS}{agent_text_dict['system']}{self.E_SYS}{agent_text_dict['human']} {self.E_INST}"
+        if "system" in agent_text_dict:
+            prompt = f"{Llama2ChatFormatConstants.B_INST} {Llama2ChatFormatConstants.B_SYS}{agent_text_dict['system']}{Llama2ChatFormatConstants.E_SYS}{agent_text_dict['human']} {Llama2ChatFormatConstants.E_INST}"  # noqa: B950
         else:
-            prompt = f"{self.B_INST} {agent_text_dict['human']} {self.E_INST}"
+            prompt = f"{Llama2ChatFormatConstants.B_INST} {agent_text_dict['human']} {Llama2ChatFormatConstants.E_INST}"
 
         response = f" {agent_text_dict['gpt']} "
         return prompt, response
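
Putting the pieces together, a standalone trace (not from the diff) of what `_generate_prompt_label` produces for a made-up SlimOrca-style sample, including the branch taken when no system message is present:

```python
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

data = [
    {"from": "system", "value": "You are an assistant."},
    {"from": "human", "value": "How are rainbows formed?"},
    {"from": "gpt", "value": "Sunlight refracts through water droplets."},
]
agent_text_dict = {c["from"]: c["value"] for c in data}

if "system" in agent_text_dict:
    prompt = f"{B_INST} {B_SYS}{agent_text_dict['system']}{E_SYS}{agent_text_dict['human']} {E_INST}"
else:  # samples without a system message use the bare instruction wrapper
    prompt = f"{B_INST} {agent_text_dict['human']} {E_INST}"
response = f" {agent_text_dict['gpt']} "
# prompt   -> "[INST] <<SYS>>\nYou are an assistant.\n<</SYS>>\n\nHow are rainbows formed? [/INST]"
# response -> " Sunlight refracts through water droplets. "
```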