Fix stop string matching for byte-fragment tokens

Incheonkirin · Rocketknight1 · commit c27e608c5a88 · 2026-06-11T14:29:30.000+01:00
diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py
@@ -1,3 +1,4 @@
+import json
 import time
 import warnings
 from abc import ABC
@@ -241,29 +242,41 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: str | list[
         if isinstance(stop_strings, str):
             stop_strings = [stop_strings]
         self.stop_strings: tuple[str, ...] = tuple(stop_strings)
+        self._stop_string_matching_mode = self._get_stop_string_matching_mode(tokenizer)
+        self._stop_strings_for_matching = self._get_stop_strings_for_matching(
+            self.stop_strings, self._stop_string_matching_mode
+        )
         vocab = tokenizer.get_vocab()
         token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
         self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
             token_list, token_indices, tokenizer
         )
 
-        self.maximum_token_len = max(len(stop_string) for stop_string in self.stop_strings)
+        self.maximum_token_len = max(len(stop_string) for stop_string in self._stop_strings_for_matching)
         self.num_stop_strings = len(self.stop_strings)
-        self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32)
+        self.target_lens = torch.tensor(
+            [len(stop_string) for stop_string in self._stop_strings_for_matching], dtype=torch.int32
+        )
 
     def clean_and_embed_tokens_with_cache(self, token_list, token_indices, tokenizer):
         # We don't use the tokenizer in the cache key, because I don't trust it to have well-behaved equality
-        if (token_list, token_indices, self.stop_strings) in STOP_STRING_EMBEDDING_CACHE:
-            embedding_vec, max_valid_positions, max_valid_end_lens = STOP_STRING_EMBEDDING_CACHE[
-                (token_list, token_indices, self.stop_strings)
-            ]
-            STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, self.stop_strings))
+        cache_key = (
+            token_list,
+            token_indices,
+            self._stop_strings_for_matching,
+            self._stop_string_matching_mode,
+        )
+        if cache_key in STOP_STRING_EMBEDDING_CACHE:
+            embedding_vec, max_valid_positions, max_valid_end_lens = STOP_STRING_EMBEDDING_CACHE[cache_key]
+            STOP_STRING_EMBEDDING_CACHE.move_to_end(cache_key)
         else:
-            clean_token_list, clean_token_indices = self.clean_tokenizer_vocab(tokenizer)
+            clean_token_list, clean_token_indices = self.clean_tokenizer_vocab(
+                tokenizer, stop_string_matching_mode=self._stop_string_matching_mode
+            )
             embedding_vec, max_valid_positions, max_valid_end_lens = self._stop_string_create_embedding_vec(
-                clean_token_list, clean_token_indices, self.stop_strings
+                clean_token_list, clean_token_indices, self._stop_strings_for_matching
             )
-            STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, self.stop_strings)] = (
+            STOP_STRING_EMBEDDING_CACHE[cache_key] = (
                 embedding_vec,
                 max_valid_positions,
                 max_valid_end_lens,
@@ -273,30 +286,101 @@ def clean_and_embed_tokens_with_cache(self, token_list, token_indices, tokenizer
         return embedding_vec, max_valid_positions, max_valid_end_lens
 
     @staticmethod
-    def clean_tokenizer_vocab(tokenizer, static_prefix="abcdef"):
+    def _get_stop_string_matching_mode(tokenizer):
+        decoder = getattr(getattr(tokenizer, "backend_tokenizer", None), "decoder", None)
+        if decoder is None:
+            return None
+
+        decoder_state = getattr(decoder, "__getstate__", lambda: None)()
+        if isinstance(decoder_state, str):
+            decoder_state = decoder_state.encode()
+        decoder_config = None
+        if isinstance(decoder_state, bytes):
+            try:
+                decoder_config = json.loads(decoder_state)
+            except json.JSONDecodeError:
+                decoder_config = None
+
+        # Some decoders do not expose a JSON state, so also recognize ByteLevel by its class name.
+        if decoder.__class__.__name__ == "ByteLevel":
+            return "byte_level"
+        if decoder_config is not None:
+            # Prefer explicit "<0xNN>" byte-fallback tokens if both markers appear.
+            if StopStringCriteria._decoder_has_type(decoder_config, "ByteFallback"):
+                return "byte_fallback"
+            if StopStringCriteria._decoder_has_type(decoder_config, "ByteLevel"):
+                return "byte_level"
+        return None
+
+    @staticmethod
+    def _decoder_has_type(decoder_config, decoder_type):
+        if isinstance(decoder_config, dict):
+            if decoder_config.get("type") == decoder_type:
+                return True
+            return any(StopStringCriteria._decoder_has_type(value, decoder_type) for value in decoder_config.values())
+        if isinstance(decoder_config, list):
+            return any(StopStringCriteria._decoder_has_type(value, decoder_type) for value in decoder_config)
+        return False
+
+    @staticmethod
+    def _get_stop_strings_for_matching(stop_strings, matching_mode):
+        if matching_mode is None:
+            return stop_strings
+        return tuple(stop_string.encode("utf-8") for stop_string in stop_strings)
+
+    @staticmethod
+    def _byte_level_decoder():
+        from ..convert_slow_tokenizer import bytes_to_unicode
+
+        return {unicode_char: byte for byte, unicode_char in bytes_to_unicode().items()}
+
+    @staticmethod
+    def _token_to_bytes(token, stop_string_matching_mode, byte_decoder):
+        if stop_string_matching_mode == "byte_level":
+            if byte_decoder is not None and all(char in byte_decoder for char in token):
+                return bytes(byte_decoder[char] for char in token)
+            return None
+        if stop_string_matching_mode == "byte_fallback":
+            if (
+                len(token) == 6
+                and token.startswith("<0x")
+                and token.endswith(">")
+                and all(char in "0123456789abcdefABCDEF" for char in token[3:5])
+            ):
+                return bytes([int(token[3:5], 16)])
+        return None
+
+    @staticmethod
+    def clean_tokenizer_vocab(tokenizer, static_prefix="abcdef", stop_string_matching_mode=None):
         """
         This method turns a tokenizer vocab into a "clean" vocab where each token represents the actual string
         it will yield, without any special prefixes like "##" or "Ġ". This is trickier than it looks - the method
         tokenizer.convert_tokens_to_string() does not always return the correct string because of issues with prefix
         space addition/removal. To work around this, we add a static prefix to the start of the token, then remove
-        it (and any prefix that may have been introduced with it) after calling convert_tokens_to_string().
+        it (and any prefix that may have been introduced with it) after calling convert_tokens_to_string(). For
+        byte-level and byte-fallback vocabularies, every token is returned as `bytes` so UTF-8 fragments can match.
         """
         vocab = tokenizer.get_vocab()
         clean_token_list = []
         clean_token_indices = []
+        byte_decoder = StopStringCriteria._byte_level_decoder() if stop_string_matching_mode == "byte_level" else None
         sentence_base = tokenizer(static_prefix, add_special_tokens=False)["input_ids"]
         tokens_base = [tokenizer._convert_id_to_token(tok) for tok in sentence_base]
         for token, token_idx in vocab.items():
-            token_string = tokenizer.convert_tokens_to_string(tokens_base + [token])
-            token_string = token_string[token_string.index(static_prefix) + len(static_prefix) :]
+            token_string = StopStringCriteria._token_to_bytes(token, stop_string_matching_mode, byte_decoder)
+            if token_string is None:
+                token_string = tokenizer.convert_tokens_to_string(tokens_base + [token])
+                token_string = token_string[token_string.index(static_prefix) + len(static_prefix) :]
+                if stop_string_matching_mode is not None:
+                    token_string = token_string.encode("utf-8")
             clean_token_list.append(token_string)
             clean_token_indices.append(token_idx)
         return tuple(clean_token_list), tuple(clean_token_indices)
 
     @staticmethod
     def _stop_string_get_matching_positions(
         token_list, token_indices, stop_strings
-    ) -> tuple[dict[str, dict[str, list[int]]], dict[str, dict[str, list[int]]]]:
+    ) -> tuple[dict[str | bytes, dict[str, list[int]]], dict[str | bytes, dict[str, list[int]]]]:
         """This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can
         validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the
         token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters
diff --git a/tests/generation/test_stopping_criteria.py b/tests/generation/test_stopping_criteria.py
@@ -33,6 +33,7 @@
         StopStringCriteria,
         validate_stopping_criteria,
     )
+    from transformers.generation.stopping_criteria import STOP_STRING_EMBEDDING_CACHE
 
 
 @require_torch
@@ -45,6 +46,14 @@ def _get_tensors(self, length):
         scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length
         return input_ids, scores
 
+    def _assert_isolated_token_decode_loses_stop_string(self, tokenizer, text, stop_string):
+        input_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
+        tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        isolated_text = "".join(tokenizer.convert_tokens_to_string([token]) for token in tokens)
+
+        self.assertTrue(tokenizer.decode(input_ids, skip_special_tokens=False).endswith(stop_string))
+        self.assertNotIn(stop_string, isolated_text)
+
     def test_list_criteria(self):
         input_ids, scores = self._get_tensors(5)
 
@@ -175,6 +184,118 @@ def test_stop_string_criteria(self):
         for i in range(len(false_strings)):
             self.assertFalse(criteria(false_input_ids["input_ids"][i : i + 1], scores))
 
+    def test_stop_string_criteria_byte_fragments(self):
+        STOP_STRING_EMBEDDING_CACHE.clear()
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+        self.assertEqual(StopStringCriteria._get_stop_string_matching_mode(tokenizer), "byte_level")
+        self._assert_isolated_token_decode_loses_stop_string(tokenizer, "대화 끝", "끝")
+        self._assert_isolated_token_decode_loses_stop_string(tokenizer, "작업 완료", "완료")
+
+        cases = [
+            ("대화 끝", "끝", True),
+            ("작업 완료", "완료", True),
+            ("대화 끝 다음", "끝", False),
+        ]
+
+        for text, stop_string, expected in cases:
+            input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"]
+            criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=[stop_string])
+            self.assertEqual(bool(criteria(input_ids, scores=None)[0]), expected)
+
+    def test_stop_string_criteria_byte_fallback_fragments(self):
+        STOP_STRING_EMBEDDING_CACHE.clear()
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", use_fast=True)
+        self.assertEqual(StopStringCriteria._get_stop_string_matching_mode(tokenizer), "byte_fallback")
+        self._assert_isolated_token_decode_loses_stop_string(tokenizer, "대화 끝", "끝")
+        self._assert_isolated_token_decode_loses_stop_string(tokenizer, "abc 끝!", "끝!")
+
+        cases = [
+            ("대화 끝", "끝", True),
+            ("abc 끝!", "끝!", True),
+            ("대화 끝 다음", "끝", False),
+            ("완료 후속", "완료", False),
+        ]
+
+        for text, stop_string, expected in cases:
+            input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"]
+            criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=[stop_string])
+            self.assertEqual(bool(criteria(input_ids, scores=None)[0]), expected)
+
+    def test_stop_string_criteria_byte_fragment_compile(self):
+        if not hasattr(torch, "compile"):
+            self.skipTest("torch.compile is not available")
+
+        STOP_STRING_EMBEDDING_CACHE.clear()
+        cases = [
+            ("Qwen/Qwen2-0.5B-Instruct", "대화 끝", "끝"),
+            ("hf-internal-testing/llama-tokenizer", "abc 끝!", "끝!"),
+        ]
+        for tokenizer_name, text, stop_string in cases:
+            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
+            input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"]
+            criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=[stop_string])
+            compiled_criteria = torch.compile(criteria, backend="eager", fullgraph=True)
+            self.assertTrue(bool(compiled_criteria(input_ids, scores=None)[0]))
+
+    def test_stop_string_criteria_byte_level_ascii(self):
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        self.assertEqual(StopStringCriteria._get_stop_string_matching_mode(tokenizer), "byte_level")
+
+        true_input_ids = tokenizer("the end", return_tensors="pt", add_special_tokens=False)["input_ids"]
+        false_input_ids = tokenizer("end of", return_tensors="pt", add_special_tokens=False)["input_ids"]
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=["end"])
+        self.assertTrue(bool(criteria(true_input_ids, scores=None)[0]))
+        self.assertFalse(bool(criteria(false_input_ids, scores=None)[0]))
+
+    def test_stop_string_criteria_non_byte_level_tokenizer(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+        self.assertIsNone(StopStringCriteria._get_stop_string_matching_mode(tokenizer))
+
+        true_input_ids = tokenizer("the end", return_tensors="pt", add_special_tokens=False)["input_ids"]
+        false_input_ids = tokenizer("end of", return_tensors="pt", add_special_tokens=False)["input_ids"]
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=["end"])
+        self.assertTrue(bool(criteria(true_input_ids, scores=None)[0]))
+        self.assertFalse(bool(criteria(false_input_ids, scores=None)[0]))
+
+    def test_stop_string_matching_mode_helpers(self):
+        class Decoder:
+            def __init__(self, state):
+                self.state = state
+
+            def __getstate__(self):
+                return self.state
+
+        class BackendTokenizer:
+            def __init__(self, decoder):
+                self.decoder = decoder
+
+        class Tokenizer:
+            def __init__(self, decoder):
+                self.backend_tokenizer = BackendTokenizer(decoder)
+
+        self.assertEqual(
+            StopStringCriteria._get_stop_string_matching_mode(
+                Tokenizer(Decoder(b'{"type":"Sequence","decoders":[{"type":"ByteLevel"}]}'))
+            ),
+            "byte_level",
+        )
+        self.assertEqual(
+            StopStringCriteria._get_stop_string_matching_mode(
+                Tokenizer(Decoder(b'{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"ByteLevel"}]}'))
+            ),
+            "byte_fallback",
+        )
+        self.assertIsNone(
+            StopStringCriteria._get_stop_string_matching_mode(
+                Tokenizer(Decoder(b'{"type":"Replace","content":"ByteFallback"}'))
+            )
+        )
+
+        self.assertEqual(StopStringCriteria._token_to_bytes("<0xEB>", "byte_fallback", None), b"\xeb")
+        self.assertEqual(StopStringCriteria._token_to_bytes("<0xeb>", "byte_fallback", None), b"\xeb")
+        for token in ["<0x+1>", "<0xG1>", "<0x 1>", "<0x1>", "<0x100>", "<0xeb", "hello"]:
+            self.assertIsNone(StopStringCriteria._token_to_bytes(token, "byte_fallback", None))
+
     def test_stop_string_criteria_vocab_size_mismatch(self):
         """Test that StopStringCriteria handles tokens above len(tokenizer) correctly."""
         tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")