@@ -19,24 +19,14 @@ def tokenizer(self):
1919 # Pretrained tiktoken model generated via the script in
2020 # https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4
2121 return phi4_tokenizer (
22- path = str (ASSETS / "tiktoken_small.model" ),
22+ vocab_path = (ASSETS / "vocab.json" ),
23+ merges_path = (ASSETS / "merges.txt" ),
2324 )
2425
2526 @pytest .fixture
2627 def expected_tokens (self ):
2728 # fmt: off
28- tokens = [100257 , 100264 , 115 , 121 , 322 , 398 , 100265 , 10 , 1539 , 470 , 258 , 1444 , 933 , 1940 , 511 , 446 , 100266 , 10 , 100264 ,
29- 477 , 273 , 100265 , 10 , 66 , 478 , 299 , 351 , 362 , 292 , 1160 , 117 , 807 , 334 , 958 , 99 , 445 , 98 , 300 , 258 , 256 , 281 ,
30- 107 , 46 , 411 , 114 , 561 , 258 , 1156 , 279 , 316 , 334 , 604 , 337 , 112 , 445 , 1827 , 512 , 1080 , 116 , 300 , 262 , 1249 ,
31- 524 , 340 , 10 , 35 , 35 , 35 , 828 , 1160 , 117 , 807 , 1037 , 71 , 1414 , 534 , 258 , 1759 , 511 , 355 , 285 , 875 , 550 , 102 ,
32- 1546 , 265 , 105 , 111 , 340 , 10 , 35 , 35 , 35 , 408 , 300 , 112 , 279 , 316 , 1037 , 100266 , 10 , 100264 , 520 , 511 , 446 ,
33- 100265 , 10 , 73 , 776 , 362 , 425 , 1978 , 274 , 284 , 1528 , 319 , 995 , 505 , 944 , 874 , 903 , 1585 , 616 , 345 , 1528 , 115 ,
34- 284 , 1749 , 803 , 46 , 270 , 776 , 1341 , 258 , 1279 , 641 , 563 , 275 , 469 , 573 , 284 , 944 , 320 , 526 , 962 , 425 , 913 ,
35- 1402 , 97 , 356 , 446 , 115 , 284 , 1229 , 1581 , 282 , 117 , 276 , 259 , 300 , 46 , 270 , 776 , 258 , 1279 , 275 , 288 , 283 ,
36- 262 , 739 , 1886 , 284 , 783 , 1803 , 636 , 277 , 268 , 117 , 316 , 485 , 115 , 284 , 302 , 416 , 273 , 900 , 46 , 270 , 776 , 591 ,
37- 630 , 346 , 531 , 476 , 505 , 768 , 1233 , 342 , 1923 , 292 , 522 , 662 , 280 , 274 , 913 , 601 , 359 , 300 , 44 , 335 , 834 , 335 ,
38- 531 , 476 , 505 , 604 , 264 , 509 , 1456 , 258 , 771 , 543 , 1719 , 405 , 710 , 665 , 668 , 1280 , 46 , 100266 , 10 ,
39- 100265 ] # noqa
29+ tokens = [100257 , 100264 , 9125 , 100265 , 198 , 2675 , 527 , 264 , 11190 , 18328 , 100266 , 198 , 100264 , 882 , 100265 , 198 , 14149 , 28514 , 374 , 279 , 1888 , 6875 , 100266 , 198 , 100264 , 78191 , 100265 , 198 , 9642 , 433 , 374 , 100266 , 198 , 100265 ]
4030 # fmt: on
4131 return tokens
4232
@@ -45,67 +35,41 @@ def test_tokenize_messages(self, tokenizer, expected_tokens):
4535 Message (role = "system" , content = "You are a helpful assistant" , masked = True ),
4636 Message (
4737 role = "user" ,
48- content = "Below is an instruction that describes a task. Write a response "
49- "that appropriately completes the request.\n \n ### Instruction:\n Generate "
50- "a realistic dating profile bio.\n \n ### Response:\n " ,
38+ content = "Pytorch is the best library!" ,
5139 masked = True ,
5240 ),
5341 Message (
5442 role = "assistant" ,
55- content = "I'm an outgoing and friendly person who loves spending time with "
56- "friends and family. I'm also a big-time foodie and love trying out new "
57- "restaurants and different cuisines. I'm a big fan of the arts and enjoy "
58- "going to museums and galleries. I'm looking for someone who shares my "
59- "interest in exploring new places, as well as someone who appreciates a "
60- "good conversation over coffee." ,
43+ content = "Yes, it is!" ,
6144 ),
6245 ]
6346 tokens , mask = tokenizer .tokenize_messages (messages , add_eos = True )
6447
65- expected_mask = [True ] * 101 + [False ] * 131
48+ expected_mask = [True ] * 24 + [False ] * 10
6649 assert expected_tokens == tokens
6750 assert expected_mask == mask
6851
6952 def test_tokenize_messages_no_system_prompt (self , tokenizer ):
7053 messages = [
71- Message (role = "system" , content = "You are a helpful assistant" , masked = True ),
7254 Message (
7355 role = "user" ,
74- content = "Below is an instruction that describes a task. Write a response "
75- "that appropriately completes the request.\n \n ### Instruction:\n Generate "
76- "a realistic dating profile bio.\n \n ### Response:\n " ,
56+ content = "Pytorch is the best library!" ,
7757 masked = True ,
7858 ),
7959 Message (
8060 role = "assistant" ,
81- content = "I'm an outgoing and friendly person who loves spending time with "
82- "friends and family. I'm also a big-time foodie and love trying out new "
83- "restaurants and different cuisines. I'm a big fan of the arts and enjoy "
84- "going to museums and galleries. I'm looking for someone who shares my "
85- "interest in exploring new places, as well as someone who appreciates a "
86- "good conversation over coffee." ,
61+ content = "Yes, it is!" ,
8762 ),
8863 ]
8964 tokens , mask = tokenizer .tokenize_messages (
9065 messages , ignore_system_prompt = True , add_eos = True
9166 )
9267
9368 # fmt: off
94- expected_tokens = [100257 , 100264 , 477 , 273 , 100265 , 10 , 66 , 478 , 299 , 351 , 362 , 292 , 1160 , 117 , 807 , 334 , 958 , 99 , 445 ,
95- 98 , 300 , 258 , 256 , 281 , 107 , 46 , 411 , 114 , 561 , 258 , 1156 , 279 , 316 , 334 , 604 , 337 , 112 , 445 , 1827 ,
96- 512 , 1080 , 116 , 300 , 262 , 1249 , 524 , 340 , 10 , 35 , 35 , 35 , 828 , 1160 , 117 , 807 , 1037 , 71 , 1414 , 534 ,
97- 258 , 1759 , 511 , 355 , 285 , 875 , 550 , 102 , 1546 , 265 , 105 , 111 , 340 , 10 , 35 , 35 , 35 , 408 , 300 , 112 ,
98- 279 , 316 , 1037 , 100266 , 10 , 100264 , 520 , 511 , 446 , 100265 , 10 , 73 , 776 , 362 , 425 , 1978 , 274 , 284 ,
99- 1528 , 319 , 995 , 505 , 944 , 874 , 903 , 1585 , 616 , 345 , 1528 , 115 , 284 , 1749 , 803 , 46 , 270 , 776 , 1341 ,
100- 258 , 1279 , 641 , 563 , 275 , 469 , 573 , 284 , 944 , 320 , 526 , 962 , 425 , 913 , 1402 , 97 , 356 , 446 , 115 , 284 ,
101- 1229 , 1581 , 282 , 117 , 276 , 259 , 300 , 46 , 270 , 776 , 258 , 1279 , 275 , 288 , 283 , 262 , 739 , 1886 , 284 ,
102- 783 , 1803 , 636 , 277 , 268 , 117 , 316 , 485 , 115 , 284 , 302 , 416 , 273 , 900 , 46 , 270 , 776 , 591 , 630 , 346 ,
103- 531 , 476 , 505 , 768 , 1233 , 342 , 1923 , 292 , 522 , 662 , 280 , 274 , 913 , 601 , 359 , 300 , 44 , 335 , 834 , 335 ,
104- 531 , 476 , 505 , 604 , 264 , 509 , 1456 , 258 , 771 , 543 , 1719 , 405 , 710 , 665 , 668 , 1280 , 46 , 100266 , 10 ,
105- 100265 ] # noqa
69+ expected_tokens = [100257 , 100264 , 882 , 100265 , 198 , 14149 , 28514 , 374 , 279 , 1888 , 6875 , 100266 , 198 , 100264 , 78191 , 100265 , 198 , 9642 , 433 , 374 , 100266 , 198 , 100265 ]
10670 # fmt: on
10771
108- expected_mask = [True ] * 84 + [False ] * 131
72+ expected_mask = [True ] * 13 + [False ] * 10
10973 assert expected_tokens == tokens
11074 assert expected_mask == mask
11175
@@ -118,41 +82,22 @@ def test_tokenize_message_drop_eos(self, tokenizer, expected_tokens):
11882 Message (role = "system" , content = "You are a helpful assistant" , masked = True ),
11983 Message (
12084 role = "user" ,
121- content = "Below is an instruction that describes a task. Write a response "
122- "that appropriately completes the request.\n \n ### Instruction:\n Generate "
123- "a realistic dating profile bio.\n \n ### Response:\n " ,
85+ content = "Pytorch is the best library!" ,
12486 masked = True ,
12587 ),
12688 Message (
12789 role = "assistant" ,
128- content = "I'm an outgoing and friendly person who loves spending time with "
129- "friends and family. I'm also a big-time foodie and love trying out new "
130- "restaurants and different cuisines. I'm a big fan of the arts and enjoy "
131- "going to museums and galleries. I'm looking for someone who shares my "
132- "interest in exploring new places, as well as someone who appreciates a "
133- "good conversation over coffee." ,
90+ content = "Yes, it is!" ,
13491 ),
13592 ]
13693
13794 tokens , mask = tokenizer .tokenize_messages (messages , add_eos = False )
13895
13996 # fmt: off
140- expected_tokens = [100257 , 100264 , 115 , 121 , 322 , 398 , 100265 , 10 , 1539 , 470 , 258 , 1444 , 933 , 1940 , 511 , 446 , 100266 ,
141- 10 , 100264 , 477 , 273 , 100265 , 10 , 66 , 478 , 299 , 351 , 362 , 292 , 1160 , 117 , 807 , 334 , 958 , 99 , 445 , 98 ,
142- 300 , 258 , 256 , 281 , 107 , 46 , 411 , 114 , 561 , 258 , 1156 , 279 , 316 , 334 , 604 , 337 , 112 , 445 , 1827 , 512 ,
143- 1080 , 116 , 300 , 262 , 1249 , 524 , 340 , 10 , 35 , 35 , 35 , 828 , 1160 , 117 , 807 , 1037 , 71 , 1414 , 534 , 258 ,
144- 1759 , 511 , 355 , 285 , 875 , 550 , 102 , 1546 , 265 , 105 , 111 , 340 , 10 , 35 , 35 , 35 , 408 , 300 , 112 , 279 ,
145- 316 , 1037 , 100266 , 10 , 100264 , 520 , 511 , 446 , 100265 , 10 , 73 , 776 , 362 , 425 , 1978 , 274 , 284 , 1528 ,
146- 319 , 995 , 505 , 944 , 874 , 903 , 1585 , 616 , 345 , 1528 , 115 , 284 , 1749 , 803 , 46 , 270 , 776 , 1341 , 258 ,
147- 1279 , 641 , 563 , 275 , 469 , 573 , 284 , 944 , 320 , 526 , 962 , 425 , 913 , 1402 , 97 , 356 , 446 , 115 , 284 , 1229 ,
148- 1581 , 282 , 117 , 276 , 259 , 300 , 46 , 270 , 776 , 258 , 1279 , 275 , 288 , 283 , 262 , 739 , 1886 , 284 , 783 ,
149- 1803 , 636 , 277 , 268 , 117 , 316 , 485 , 115 , 284 , 302 , 416 , 273 , 900 , 46 , 270 , 776 , 591 , 630 , 346 , 531 ,
150- 476 , 505 , 768 , 1233 , 342 , 1923 , 292 , 522 , 662 , 280 , 274 , 913 , 601 , 359 , 300 , 44 , 335 , 834 , 335 , 531 ,
151- 476 , 505 , 604 , 264 , 509 , 1456 , 258 , 771 , 543 , 1719 , 405 , 710 , 665 , 668 , 1280 , 46 , 100266 , 10 ,
152- 100265 ] # noqa
97+ expected_tokens = [100257 , 100264 , 9125 , 100265 , 198 , 2675 , 527 , 264 , 11190 , 18328 , 100266 , 198 , 100264 , 882 , 100265 , 198 , 14149 , 28514 , 374 , 279 , 1888 , 6875 , 100266 , 198 , 100264 , 78191 , 100265 , 198 , 9642 , 433 , 374 , 100266 , 198 , 100265 ]
15398 # fmt: on
15499
155- expected_mask = [True ] * 101 + [False ] * 130
100+ expected_mask = [True ] * 24 + [False ] * 9
156101 # Drop eos token.
157102 assert expected_tokens [:- 1 ] == tokens
158103 assert expected_mask == mask
0 commit comments