Commit 73647e2

Configure max_seq_len in InstructDataset (#620)
1 parent 6d9368f · commit 73647e2

6 files changed: +82 -12 lines changed

tests/test_utils.py

Lines changed: 11 additions & 2 deletions

@@ -33,14 +33,23 @@
 
 
 class DummyTokenizer:
-    def encode(self, text, **kwargs):
+    def encode(self, text, add_bos=True, add_eos=True, **kwargs):
         words = text.split()
-        return [len(word) for word in words]
+        tokens = [len(word) for word in words]
+        if add_bos:
+            tokens = [self.bos_id] + tokens
+        if add_eos:
+            tokens = tokens + [self.eos_id]
+        return tokens
 
     @property
     def eos_id(self):
         return -1
 
+    @property
+    def bos_id(self):
+        return 0
+
 
 def get_assets_path():
     return Path(__file__).parent / "assets"
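
To make the new behavior concrete, here is what the updated `DummyTokenizer.encode` returns (an illustrative snippet using the class above, not part of the diff):

```python
tokenizer = DummyTokenizer()  # as defined above: bos_id == 0, eos_id == -1

# Each "token" is the length of a whitespace-split word,
# now wrapped in BOS/EOS ids by default:
print(tokenizer.encode("hello world"))                                # [0, 5, 5, -1]
print(tokenizer.encode("hello world", add_bos=False, add_eos=False))  # [5, 5]
```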

tests/torchtune/data/test_data_utils.py

Lines changed: 4 additions & 1 deletion

@@ -13,8 +13,9 @@ def test_tokenize_prompt_and_response():
     tokenizer = DummyTokenizer()
     prompt = "Instruction:\nThis is an instruction.\n\nInput:\nThis is an input.\n\nResponse: "
     response = "I always know what I'm doing, do you?"
-    prompt_length = 11
+    prompt_length = 12
     expected_tokenized_prompt = [
+        0,
         12,
         4,
         2,
@@ -34,6 +35,7 @@ def test_tokenize_prompt_and_response():
         6,
         2,
         4,
+        -1,
     ]
     expected_tokenized_label = [CROSS_ENTROPY_IGNORE_IDX] * prompt_length + [
         1,
@@ -44,6 +46,7 @@ def test_tokenize_prompt_and_response():
         6,
         2,
         4,
+        -1,
     ]
 
     tokenized_prompt, tokenized_label = tokenize_prompt_and_response(
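
The updated expectations follow directly from the tokenizer change: the prompt splits into 11 words (11 length-tokens), the prepended BOS id brings `prompt_length` to 12, and the appended EOS id (-1) now closes both the prompt and the label lists. A quick check (illustrative, not part of the diff):

```python
prompt = "Instruction:\nThis is an instruction.\n\nInput:\nThis is an input.\n\nResponse: "
assert len(prompt.split()) == 11  # 11 word tokens + 1 BOS -> prompt_length of 12
```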

tests/torchtune/datasets/test_chat_dataset.py

Lines changed: 7 additions & 2 deletions

@@ -126,6 +126,7 @@ def test_get_item(self, mock_load_dataset, template, dialogue):
         mock_load_dataset.return_value = dialogue
         expected_tokenized_prompts = [
             [
+                0,
                 7,
                 3,
                 3,
@@ -146,15 +147,18 @@ def test_get_item(self, mock_load_dataset, template, dialogue):
                 4,
                 2,
                 3,
+                -1,
+                0,
                 5,
                 6,
                 11,
                 10,
                 1,
+                6,
                 -1,
             ]
         ]
-        prompt_lengths = (14, 4)
+        prompt_lengths = (15, 5)
         expected_labels = [
             [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0]
             + [
@@ -164,9 +168,10 @@ def test_get_item(self, mock_load_dataset, template, dialogue):
                 4,
                 2,
                 3,
+                -1,
             ]
             + [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1]
-            + [1, -1]
+            + [1, 6, -1]
         ]
 
         ds = ChatDataset(
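
The same BOS/EOS accounting applies per message here: each of the two user prompts gains a leading 0 (hence `prompt_lengths` moving from (14, 4) to (15, 5)), and each assistant response now ends in the EOS id -1. A small illustrative check of the second turn, assuming `CROSS_ENTROPY_IGNORE_IDX` is PyTorch's default ignore index of -100:

```python
IGNORE = -100  # assumed value of CROSS_ENTROPY_IGNORE_IDX

# Per the diff, the second response's label segment is now [1, 6, -1] (was [1, -1]):
second_turn_labels = [IGNORE] * 5 + [1, 6, -1]
assert len(second_turn_labels) == 5 + 3  # masked prompt + EOS-terminated response
```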

tests/torchtune/datasets/test_instruct_dataset.py

Lines changed: 33 additions & 5 deletions

@@ -32,8 +32,34 @@ class TestInstructDataset:
         "Instruction:\n{instruction}\n\nInput:\n{input}\n\nResponse: "
     )
     expected_tokenized_prompts = [
-        [12, 4, 2, 3, 2, 12, 10, 6, 4, 2, 3, 2, 6, 10, 9, 1, 5, 4, 4, 3, 6, 2, 4],
-        [12, 4, 2, 2, 12, 10, 6, 4, 2, 2, 6, 10, 9, 1, 6, 4, 4, 3, 6, 2, 4],
+        [
+            0,
+            12,
+            4,
+            2,
+            3,
+            2,
+            12,
+            10,
+            6,
+            4,
+            2,
+            3,
+            2,
+            6,
+            10,
+            9,
+            1,
+            5,
+            4,
+            4,
+            3,
+            6,
+            2,
+            4,
+            -1,
+        ],
+        [0, 12, 4, 2, 2, 12, 10, 6, 4, 2, 2, 6, 10, 9, 1, 6, 4, 4, 3, 6, 2, 4, -1],
     ]
 
     def get_samples(self):
@@ -53,10 +79,12 @@ def get_samples(self):
     @mock.patch("torchtune.datasets._instruct.load_dataset")
     def test_get_item_no_train_on_input(self, mock_load_dataset):
         mock_load_dataset.return_value = self.get_samples()
-        prompt_lengths = (15, 13)
+        prompt_lengths = (16, 14)
         expected_labels = [
-            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0] + [1, 5, 4, 4, 3, 6, 2, 4],
-            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1] + [1, 6, 4, 4, 3, 6, 2, 4],
+            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0]
+            + [1, 5, 4, 4, 3, 6, 2, 4, -1],
+            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1]
+            + [1, 6, 4, 4, 3, 6, 2, 4, -1],
        ]
 
         dataset = InstructDataset(
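
The arithmetic behind the first expected prompt is BOS + 15 word tokens for the templated prompt, then the 8-token response, then EOS; a standalone check reusing the values from the diff:

```python
expected = [0, 12, 4, 2, 3, 2, 12, 10, 6, 4, 2, 3, 2, 6, 10, 9,  # prompt: BOS + 15 word tokens
            1, 5, 4, 4, 3, 6, 2, 4,                              # response word tokens
            -1]                                                  # EOS
assert len(expected) == 16 + 8 + 1  # prompt_lengths[0] == 16 after the BOS change
```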

torchtune/datasets/_alpaca.py

Lines changed: 6 additions & 0 deletions

@@ -13,6 +13,7 @@ def alpaca_dataset(
     tokenizer: Tokenizer,
     train_on_input: bool = True,
     use_clean: bool = False,
+    max_seq_len: int = 512,
 ) -> InstructDataset:
     """
     Support for the Alpaca dataset and its variants from Hugging Face Datasets.
@@ -39,6 +40,10 @@ def alpaca_dataset(
         tokenizer (Tokenizer): Tokenizer used to encode data. Tokenize must implement an `encode` and `decode` method.
         train_on_input (bool): Whether the model is trained on the prompt or not. Default is True.
         use_clean (bool): Whether to use the cleaned version of the dataset or not. Default is False.
+        max_seq_len (int): Maximum number of tokens in the returned input and label token id lists.
+            Default is 512, as set by Stanford Alpaca (https://github.com/tatsu-lab/stanford_alpaca?tab=readme-ov-file#fine-tuning),
+            but we recommend setting this to the highest you can fit in memory and is supported by the model.
+            For example, llama2-7B supports up to 4096 for sequence length.
 
     Returns:
         InstructDataset: dataset configured with Alpaca source data and template
@@ -56,5 +61,6 @@ def alpaca_dataset(
         source="yahma/alpaca-cleaned" if use_clean else "tatsu-lab/alpaca",
         template=AlpacaInstructTemplate(),
         train_on_input=train_on_input,
+        max_seq_len=max_seq_len,
         split="train",
     )
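
A usage sketch for the new parameter (the tokenizer construction is assumed to happen elsewhere; only `max_seq_len` is new in this commit):

```python
from torchtune.datasets import alpaca_dataset

# `tokenizer` is any torchtune Tokenizer built elsewhere (e.g. a llama2 tokenizer).
ds = alpaca_dataset(
    tokenizer=tokenizer,
    train_on_input=True,
    use_clean=False,
    max_seq_len=4096,  # raise from the 512 default when memory and the model allow
)
input_ids, labels = ds[0]
assert len(input_ids) == len(labels) <= 4096
```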

torchtune/datasets/_instruct.py

Lines changed: 21 additions & 2 deletions

@@ -11,7 +11,7 @@
 
 from torchtune.config._utils import _get_template
 
-from torchtune.data import PromptTemplate, tokenize_prompt_and_response
+from torchtune.data import PromptTemplate, tokenize_prompt_and_response, truncate
 from torchtune.modules import Tokenizer
 
 
@@ -43,6 +43,9 @@ class InstructDataset(Dataset):
         column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template
             to the column/key names in the sample. If None, assume these are identical.
         train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
+        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
+            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
+            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`.
     """
 
@@ -54,6 +57,7 @@ def __init__(
         transform: Optional[Callable] = None,
         column_map: Optional[Dict[str, str]] = None,
         train_on_input: bool = False,
+        max_seq_len: Optional[int] = None,
         **load_dataset_kwargs: Dict[str, Any],
     ) -> None:
         self._tokenizer = tokenizer
@@ -62,6 +66,7 @@ def __init__(
         self._transform = transform
         self._column_map = column_map
         self.train_on_input = train_on_input
+        self.max_seq_len = max_seq_len
 
     def __len__(self):
         return len(self._data)
@@ -80,20 +85,30 @@ def _prepare_sample(self, sample: Mapping[str, Any]) -> Tuple[List[int], List[int]]:
             else "output"
         )
 
-        return tokenize_prompt_and_response(
+        prompt_tokens, label_tokens = tokenize_prompt_and_response(
             tokenizer=self._tokenizer,
             prompt=prompt,
             response=transformed_sample[key_output],
             train_on_input=self.train_on_input,
         )
 
+        if self.max_seq_len is not None:
+            prompt_tokens, label_tokens = truncate(
+                self._tokenizer, prompt_tokens, label_tokens, self.max_seq_len
+            )
+
+        assert len(prompt_tokens) == len(label_tokens)
+
+        return prompt_tokens, label_tokens
+
 
 def instruct_dataset(
     tokenizer: Tokenizer,
     source: str,
     template: str,
     column_map: Optional[Dict[str, str]] = None,
     train_on_input: bool = False,
+    max_seq_len: Optional[int] = None,
     **load_dataset_kwargs: Dict[str, Any],
 ) -> InstructDataset:
     """
@@ -110,6 +125,9 @@ def instruct_dataset(
         column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template
             to the column/key names in the sample. If None, assume these are identical.
         train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
+        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
+            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
+            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`.
 
     Returns:
@@ -121,5 +139,6 @@ def instruct_dataset(
         template=_get_template(template),
         column_map=column_map,
         train_on_input=train_on_input,
+        max_seq_len=max_seq_len,
         **load_dataset_kwargs,
     )
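
The `truncate` helper is imported from `torchtune.data` but its implementation is not part of this diff. A minimal sketch consistent with the call site and the equal-length assertion above could look like the following; the actual torchtune implementation may differ, particularly in how it re-terminates clipped sequences:

```python
from typing import List, Tuple

CROSS_ENTROPY_IGNORE_IDX = -100  # assumed: PyTorch's default cross-entropy ignore_index

def truncate(
    tokenizer,
    prompt_tokens: List[int],
    label_tokens: List[int],
    max_seq_len: int,
) -> Tuple[List[int], List[int]]:
    # Clip both sequences to the cap, preserving the equal-length invariant
    # asserted in _prepare_sample.
    prompt_tokens = prompt_tokens[:max_seq_len]
    label_tokens = label_tokens[:max_seq_len]
    # If truncation cut off the EOS, re-terminate so the model still sees a
    # properly ended sequence (an assumption about torchtune's intent here).
    if prompt_tokens[-1] != tokenizer.eos_id:
        prompt_tokens[-1] = tokenizer.eos_id
    if label_tokens[-1] != CROSS_ENTROPY_IGNORE_IDX:
        label_tokens[-1] = tokenizer.eos_id
    return prompt_tokens, label_tokens
```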
