
Commit 55d7ae0

Mark Obozov authored and committed
fix test
1 parent 6e50261 · commit 55d7ae0

File tree: 1 file changed (+20 −27 lines)

tests/torchtune/modules/tokenizers/test_gpt2.py

Lines changed: 20 additions & 27 deletions
@@ -14,8 +14,8 @@ class TestGPT2BaseTokenizer:
     @pytest.fixture
     def tokenizer(self):
         tokenizer = GPT2BaseTokenizer(
-            ASSETS / "tiny_vocab.json",
-            ASSETS / "tiny_bpe_merges.txt",
+            ASSETS / "vocab.json",
+            ASSETS / "merges.txt",
             "replace",
             1,
             1,
@@ -27,47 +27,40 @@ def tokenizer(self):
     def test_encode(self, tokenizer):
         assert tokenizer.encode("Hello world!") == [
             tokenizer.bos_id,
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
             tokenizer.eos_id,
         ]
         assert tokenizer.encode("Hello world!", add_eos=False) == [
             tokenizer.bos_id,
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
         ]
         assert tokenizer.encode("Hello world!", add_bos=False) == [
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
             tokenizer.eos_id,
         ]
         assert tokenizer.encode("Hello world!", add_eos=False, add_bos=False) == [
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
         ]

     def test_decode(self, tokenizer):
-        tokens = [2, 3, 4, 5]
+        tokens = [
+            9906,
+            1917,
+        ]

-        assert tokenizer.decode(tokens) == ["H", "ell", "o", "Ġworld"]
+        assert tokenizer.decode(tokens) == ["Hello", "Ġworld"]
         assert tokenizer.decode(
             tokenizer.encode("Hello world!", add_eos=False, add_bos=False)
-        ) == ["H", "ell", "o", "Ġworld"]
+        ) == ["Hello", "Ġworld"]
         assert tokenizer.decode(tokenizer.encode("Hello world!")) == [
-            None,
-            "H",
-            "ell",
-            "o",
+            '"',
+            "Hello",
             "Ġworld",
-            None,
+            '"',
         ]

     def test_token_ids(self, tokenizer):
@@ -77,4 +70,4 @@ def test_token_ids(self, tokenizer):
         assert tokenizer.unk_id == 1

     def test_tokenizer_vocab_size(self, tokenizer):
-        assert tokenizer.vocab_size == 4
+        assert tokenizer.vocab_size == 63668
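
For readers skimming the change, the sketch below simply restates what the updated assertions expect. It assumes the `tokenizer` fixture built above from the vocab.json / merges.txt assets (the remaining constructor arguments are not visible in this hunk, so they are not reproduced); it is illustrative only, not additional test code.

# Illustrative sketch only: restates the expectations asserted in the diff above.
# `tokenizer` is assumed to be the GPT2BaseTokenizer fixture built from the
# vocab.json / merges.txt assets shown in the fixture.

# With these vocab/merges files, "Hello world!" encodes to the two BPE ids
# 9906 ("Hello") and 1917 ("Ġworld"), wrapped in bos/eos ids by default.
assert tokenizer.encode("Hello world!") == [
    tokenizer.bos_id,
    9906,
    1917,
    tokenizer.eos_id,
]

# decode returns the list of BPE token strings rather than a joined string.
assert tokenizer.decode([9906, 1917]) == ["Hello", "Ġworld"]

# The new assets also change the reported vocabulary size.
assert tokenizer.vocab_size == 63668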

0 commit comments