@@ -14,8 +14,8 @@ class TestGPT2BaseTokenizer:
     @pytest.fixture
     def tokenizer(self):
         tokenizer = GPT2BaseTokenizer(
-            ASSETS / "tiny_vocab.json",
-            ASSETS / "tiny_bpe_merges.txt",
+            ASSETS / "vocab.json",
+            ASSETS / "merges.txt",
             "replace",
             1,
             1,
@@ -27,47 +27,40 @@ def tokenizer(self):
     def test_encode(self, tokenizer):
         assert tokenizer.encode("Hello world!") == [
             tokenizer.bos_id,
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
             tokenizer.eos_id,
         ]
         assert tokenizer.encode("Hello world!", add_eos=False) == [
             tokenizer.bos_id,
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
         ]
         assert tokenizer.encode("Hello world!", add_bos=False) == [
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
             tokenizer.eos_id,
         ]
         assert tokenizer.encode("Hello world!", add_eos=False, add_bos=False) == [
-            2,
-            3,
-            4,
-            5,
+            9906,
+            1917,
         ]

     def test_decode(self, tokenizer):
-        tokens = [2, 3, 4, 5]
+        tokens = [
+            9906,
+            1917,
+        ]

-        assert tokenizer.decode(tokens) == ["H", "ell", "o", "Ġworld"]
+        assert tokenizer.decode(tokens) == ["Hello", "Ġworld"]
         assert tokenizer.decode(
             tokenizer.encode("Hello world!", add_eos=False, add_bos=False)
-        ) == ["H", "ell", "o", "Ġworld"]
+        ) == ["Hello", "Ġworld"]
         assert tokenizer.decode(tokenizer.encode("Hello world!")) == [
-            None,
-            "H",
-            "ell",
-            "o",
+            '"',
+            "Hello",
             "Ġworld",
-            None,
+            '"',
         ]

     def test_token_ids(self, tokenizer):
@@ -77,4 +70,4 @@ def test_token_ids(self, tokenizer):
         assert tokenizer.unk_id == 1

     def test_tokenizer_vocab_size(self, tokenizer):
-        assert tokenizer.vocab_size == 4
+        assert tokenizer.vocab_size == 63668
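For context, a minimal sketch of the behavior the updated assertions encode. This is not the repo's test code: it assumes GPT2BaseTokenizer is importable from the package under test (the import is not shown in the diff), the ASSETS path is a stand-in, and the constructor arguments are copied positionally from the fixture. The token IDs and decoded pieces simply mirror the expectations asserted above.

    from pathlib import Path

    ASSETS = Path("tests/assets")  # assumed asset location; the real test defines ASSETS elsewhere

    tokenizer = GPT2BaseTokenizer(
        ASSETS / "vocab.json",   # full vocab, replacing the old tiny_vocab.json
        ASSETS / "merges.txt",   # full BPE merges, replacing tiny_bpe_merges.txt
        "replace",               # per the fixture; presumably the unicode-errors policy
        1,                       # per the fixture; which special-token IDs these set
        1,                       # is not shown in the diff
    )

    ids = tokenizer.encode("Hello world!", add_bos=False, add_eos=False)
    assert ids == [9906, 1917]
    # decode returns the list of BPE pieces, with Ġ marking a leading
    # space, rather than a single joined string:
    assert tokenizer.decode(ids) == ["Hello", "Ġworld"]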