
Commit aac3ace

joecummings authored and maximegmd committed
Add 'on-the-fly' sample packing (meta-pytorch#1109)
1 parent efbab6f commit aac3ace

File tree: 5 files changed (+196, −146 lines)


tests/torchtune/datasets/test_packed_dataset.py

Lines changed: 41 additions & 19 deletions
@@ -77,9 +77,29 @@ def _get_expected_mask_and_input_pos(
 
         return mask[:max_seq_len, :max_seq_len], torch.tensor(input_pos[:max_seq_len])
 
+    def _calculate_num_packs(
+        self, dataset_size, max_seq_len, sample_size, split_across_pack, max_packs
+    ):
+        # First see how many samples we can fit in a single pack
+        num_samples_per_pack, remainder = divmod(max_seq_len, sample_size)
+
+        # If we split across pack (and the samples don't fit perfectly in max_seq_len), we can fit more
+        if split_across_pack and remainder > 0:
+            # Now we need the fractional to see how many we can partially fit in each pack
+            num_samples_per_pack = max_seq_len / sample_size
+
+        # If we don't split across pack, we will need more packs
+        num_packs, remainder = divmod(dataset_size, num_samples_per_pack)
+
+        # If there's leftover, we need to add one more pack
+        if remainder > 0:
+            num_packs += 1
+
+        return num_packs if num_packs < max_packs else max_packs
+
     @pytest.mark.parametrize("max_seq_len", [25])
     @pytest.mark.parametrize("sample_size", [2, 5])
-    @pytest.mark.parametrize("max_packs", [5])
+    @pytest.mark.parametrize("max_packs", [5, 200])
     @pytest.mark.parametrize("split_across_pack", [True, False])
     def test_packed_dataset(
         self, max_seq_len, sample_size, max_packs, split_across_pack
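For intuition, here is a worked instance of the pack-count arithmetic that the _calculate_num_packs helper above encodes. It is a standalone sketch with hypothetical sizes (dataset_size=100, max_seq_len=25, sample_size=2), not part of the commit:

def calculate_num_packs(dataset_size, max_seq_len, sample_size, split_across_pack, max_packs):
    # Whole samples that fit in one pack
    num_samples_per_pack, remainder = divmod(max_seq_len, sample_size)
    if split_across_pack and remainder > 0:
        # Samples may straddle pack boundaries, so the per-pack count can be fractional
        num_samples_per_pack = max_seq_len / sample_size
    num_packs, remainder = divmod(dataset_size, num_samples_per_pack)
    if remainder > 0:
        num_packs += 1  # leftover samples spill into one more pack
    return num_packs if num_packs < max_packs else max_packs

# 25 // 2 = 12 whole samples per pack -> ceil(100 / 12) = 9 packs
assert calculate_num_packs(100, 25, 2, split_across_pack=False, max_packs=200) == 9
# splitting lets 25 / 2 = 12.5 samples fit -> 100 / 12.5 = exactly 8 packs
assert calculate_num_packs(100, 25, 2, split_across_pack=True, max_packs=200) == 8
# the cap always wins when it is smaller
assert calculate_num_packs(100, 25, 2, split_across_pack=False, max_packs=5) == 5
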
@@ -91,8 +111,13 @@ def test_packed_dataset(
             max_packs=max_packs,
             split_across_pack=split_across_pack,
         )
+
         # Check we get right number of packs
-        assert len(packed) == max_packs
+        correct_num_packs = self._calculate_num_packs(
+            len(dataset), max_seq_len, sample_size, split_across_pack, max_packs
+        )
+        assert len(packed) == correct_num_packs
+
         # Check all fields are same length
         assert (
             len(packed[0]["tokens"])
@@ -105,15 +130,15 @@ def test_packed_dataset(
         if split_across_pack:
             # If we split samples, we'll know how many samples by taking the
             # full length and dividing by sample size
-            last_index, remainder = divmod(max_packs * max_seq_len, sample_size)
+            last_index, remainder = divmod(len(packed) * max_seq_len, sample_size)
             # Account for remaining sample that didn't fit in window
             last_index = last_index if remainder > 0 else last_index - 1
         else:
            # If we don't split samples, we know how many samples by taking
            # how much fits in a single window and multiplying by max rows.
            # If there is a remainder, this will end up being a pad token.
            last_index = (
-                (max_seq_len // sample_size) * max_packs - 1
+                (max_seq_len // sample_size) * len(packed) - 1
                 if max_seq_len % sample_size == 0
                 else 0
            )
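As a concrete check of the last_index bookkeeping above, using the parametrized max_seq_len=25 and sample_size=2 and a hypothetical pack count of 8:

max_seq_len, sample_size, num_packs = 25, 2, 8  # num_packs is hypothetical

# split_across_pack=True: every token slot is filled, so samples consumed = total tokens / sample_size
last_index, remainder = divmod(num_packs * max_seq_len, sample_size)  # (100, 0)
last_index = last_index if remainder > 0 else last_index - 1          # -> 99

# split_across_pack=False: 25 % 2 != 0, so the test falls back to checking index 0
last_index = (max_seq_len // sample_size) * num_packs - 1 if max_seq_len % sample_size == 0 else 0  # -> 0
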
@@ -207,11 +232,11 @@ def test_packed_dataset_real_data(self):
 
     def test_pad_pack(self):
         padding_idx = -8
-        ignore_idx = -9
+        ignore_idx = -100  # Same as CROSS_ENTROPY_IGNORE_IDX
         pack = {
             "tokens": [2, 5],
             "labels": [3, 7],
-            "mask": torch.tensor([[True, False], [True, True]]),
+            "seq_lens": [1, 1],
             # Let the first token be the end of the previous sample (pos 8),
             # and the second token the start of the next sample (pos 0). Collate
             # should continue from 0 -> 1, 2, ...
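The pack no longer carries an explicit attention mask; it records per-sample lengths in "seq_lens" instead. As a rough illustration of why that is sufficient (and not necessarily how PackedDataset itself builds its mask), a block-diagonal causal mask can be reconstructed from those lengths:

import torch

def mask_from_seq_lens(seq_lens):
    # Each sample attends causally within itself and never across sample boundaries.
    total = sum(seq_lens)
    mask = torch.zeros(total, total, dtype=torch.bool)
    start = 0
    for n in seq_lens:
        mask[start : start + n, start : start + n] = torch.tril(torch.ones(n, n, dtype=torch.bool))
        start += n
    return mask

mask_from_seq_lens([1, 1])  # -> [[True, False], [False, True]]
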
@@ -224,11 +249,11 @@ def test_pad_pack(self):
             max_seq_len=4,
         )
 
-        padded = packed._pad_pack(pack, padding_idx=padding_idx, ignore_idx=ignore_idx)
+        pack = packed._convert_to_tensors(pack)
+        padded = packed._pad_pack(pack, padding_idx=padding_idx)
 
         padded_input = padded["tokens"]
         padded_label = padded["labels"]
-        padded_mask = padded["mask"]
         padded_input_pos = padded["input_pos"]
 
         torch.testing.assert_close(
@@ -237,15 +262,12 @@
         torch.testing.assert_close(
             padded_label, torch.tensor([3, 7, ignore_idx, ignore_idx])
         )
-        assert torch.equal(
-            padded_mask,
-            torch.tensor(
-                [
-                    [True, False, False, False],
-                    [True, True, False, False],
-                    [False, False, True, False],
-                    [False, False, False, True],
-                ]
-            ),
-        )
         torch.testing.assert_close(padded_input_pos, torch.tensor([8, 0, 1, 2]))
+
+    def test_pack_errors_if_sample_too_long(self):
+        dataset = DummyDataset(8)
+        with pytest.raises(ValueError, match="Dataset sample is too long"):
+            PackedDataset(
+                dataset,
+                max_seq_len=4,
+            )
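A minimal sketch of the padding behaviour these assertions exercise, reusing the toy pack from test_pad_pack; pad_pack below is an illustrative stand-in, not the library's _pad_pack:

import torch
import torch.nn.functional as F

CROSS_ENTROPY_IGNORE_IDX = -100

def pad_pack(pack, padding_idx, max_seq_len):
    num_pad = max_seq_len - len(pack["tokens"])
    tokens = F.pad(pack["tokens"], (0, num_pad), value=padding_idx)
    labels = F.pad(pack["labels"], (0, num_pad), value=CROSS_ENTROPY_IGNORE_IDX)
    # Positions keep counting up from wherever the last (possibly split) sample stopped.
    last_pos = int(pack["input_pos"][-1])
    input_pos = torch.cat([pack["input_pos"], torch.arange(last_pos + 1, last_pos + 1 + num_pad)])
    return {"tokens": tokens, "labels": labels, "input_pos": input_pos}

padded = pad_pack(
    {
        "tokens": torch.tensor([2, 5]),
        "labels": torch.tensor([3, 7]),
        "input_pos": torch.tensor([8, 0]),
    },
    padding_idx=-8,
    max_seq_len=4,
)
# tokens -> [2, 5, -8, -8]; labels -> [3, 7, -100, -100]; input_pos -> [8, 0, 1, 2]

Padded label positions get CROSS_ENTROPY_IGNORE_IDX so they drop out of the loss, while input_pos keeps counting so position ids stay monotonically increasing within the pack.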

torchtune/datasets/_chat.py

Lines changed: 5 additions & 1 deletion
@@ -185,4 +185,8 @@ def chat_dataset(
         train_on_input=train_on_input,
         **load_dataset_kwargs,
     )
-    return PackedDataset(ds, max_seq_len=max_seq_len) if packed else ds
+    return (
+        PackedDataset(ds, max_seq_len=max_seq_len, padding_idx=tokenizer.pad_id)
+        if packed
+        else ds
+    )

torchtune/datasets/_instruct.py

Lines changed: 5 additions & 1 deletion
@@ -177,4 +177,8 @@ def instruct_dataset(
         max_seq_len=max_seq_len,
         **load_dataset_kwargs,
     )
-    return PackedDataset(ds, max_seq_len=max_seq_len) if packed else ds
+    return (
+        PackedDataset(ds, max_seq_len=max_seq_len, padding_idx=tokenizer.pad_id)
+        if packed
+        else ds
+    )
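Both builders now forward the tokenizer's pad id, so packs are padded with the model's actual pad token rather than PackedDataset's default. A hedged sketch of that shared packed path (ds, tokenizer, max_seq_len, and packed are placeholders; the import path is assumed):

from torchtune.datasets import PackedDataset  # import path assumed

def maybe_pack(ds, tokenizer, max_seq_len, packed):
    # Mirrors the return statements now used by chat_dataset and instruct_dataset.
    return (
        PackedDataset(ds, max_seq_len=max_seq_len, padding_idx=tokenizer.pad_id)
        if packed
        else ds
    )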
