Address PR comments

gokulavasan · gokulavasan · commit 97f536bd74d2 · 2024-01-23T01:52:43.000-08:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,3 +2,6 @@
 style = 'google'
 check-return-types = 'False'
 exclude = 'tests/torchtune/models/llama2/scripts/'
+
+[tool.pytest.ini_options]
+addopts = ["--showlocals"] # show local variables in tracebacks
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -7,6 +7,7 @@
 import math
 import unittest
 import uuid
+from pathlib import Path
 from typing import Any, Union
 
 import torch
@@ -19,6 +20,10 @@
 )
 
 
+def get_assets_path():
+    return Path(__file__).parent / "assets"
+
+
 def init_weights_with_constant(model: nn.Module, constant: float = 1.0) -> None:
     for p in model.parameters():
         nn.init.constant_(p, constant)
diff --git a/tests/torchtune/datasets/test_slimorca_dataset.py b/tests/torchtune/datasets/test_slimorca_dataset.py
@@ -4,26 +4,21 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import random
-from pathlib import Path
 
 import pytest
 
 from torchtune import datasets
 from torchtune.modules.tokenizer import Tokenizer
 
-ASSETS = Path(__file__).parent.parent.parent / "assets"
+from tests.test_utils import get_assets_path
 
 
 class TestSlimOrcaDataset:
     @pytest.fixture
     def tokenizer(self):
         # m.model is a pretrained Sentencepiece model using the following command:
         # spm.SentencePieceTrainer.train('--input=<TRAIN_FILE> --model_prefix=m --vocab_size=2000')
-        return Tokenizer.from_file(str(ASSETS / "m.model"))
-
-    def test_slim_orca_dataset(self, tokenizer):
-        dataset = datasets.get_dataset("slimorca", tokenizer=tokenizer)
-        assert len(dataset) == 363_491
+        return Tokenizer.from_file(str(get_assets_path() / "m.model"))
 
     def test_prompt_label_generation(self, tokenizer):
         dataset = datasets.get_dataset("slimorca", tokenizer=tokenizer)
diff --git a/torchtune/datasets/slimorca.py b/torchtune/datasets/slimorca.py
@@ -47,7 +47,7 @@ class SlimOrcaDataset(Dataset):
         **kwargs: Additional keyword arguments to pass to the SlimOrca Dataset.
 
     Keyword Arguments:
-        max_token_length (int): Maximum number of tokens in the returned input and label token id lists. This value needs to be at least 4 though it is generally set it to max sequence length accepted by the model.  Default is 1024.
+        max_token_length (int): Maximum number of tokens in the returned input and label token id lists. This value needs to be at least 4, though it is generally set to the max sequence length accepted by the model. Default is 1024.
 
     Raises:
         ValueError: If `max_token_length` is less than 4.