Skip to content

Commit 37d0be0

Browse files
authored
Fix extra start token in phi3 tokenizer. Address #1063 (#1065)
1 parent deac041 commit 37d0be0

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

torchtune/models/phi3/_sentencepiece.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,10 @@ def tokenize_messages(
171 171
new_line_token_id = self.encode("\n", add_bos=False, add_eos=False)
172 172

173 173
for message in messages:
174+
# Skip system prompt
175+
if ignore_system_prompts and message.role == "system":
176+
continue
177+
174 178
# Prepend BOS on start of new turns
175 179
if start_of_turn:
176 180
tokenized_messages.append(self.bos_id)
@@ -186,11 +190,8 @@ def tokenize_messages(
186 190
end_of_turn = True
187 191
mask.append(message.masked)
188 192
elif message.role == "system":
189-
if ignore_system_prompts:
190-
continue
191-
else:
192-
tokenized_messages.append(self.special_tokens["<|system|>"])
193-
mask.append(message.masked)
193+
tokenized_messages.append(self.special_tokens["<|system|>"])
194+
mask.append(message.masked)
194 195
else:
195 196
raise ValueError(
196 197
f"Unknown role '{message.role}' for message: '{message.content}'"

0 commit comments

Comments (0)