huggingface · BillionClaw · Mar 17, 2026 · zucchini-nlp · Mar 17, 2026 · zucchini-nlp
diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py
@@ -1285,7 +1285,7 @@ def generate(
         input_ids, model_kwargs = self._expand_inputs_for_generation(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            expand_size=generation_config.num_return_sequences,
+            expand_size=generation_config.num_return_sequences or 1,
             **model_kwargs,
         )
 
@@ -1298,6 +1298,17 @@ def generate(
         attention_mask = attention_mask.repeat(2, 1)
         model_kwargs["attention_mask"] = attention_mask
 
+        # Ensure generation_kwargs exists with boi_token_id
+        if not hasattr(generation_config, "generation_kwargs") or generation_config.generation_kwargs is None:
+            generation_config.generation_kwargs = {}
+        if "boi_token_id" not in generation_config.generation_kwargs:
+            # Default boi_token_id - usually the image_token_id from config
+            generation_config.generation_kwargs["boi_token_id"] = getattr(self.config, "image_token_id", 0)
+
+        # Ensure pad_token_id is set
+        if generation_config.pad_token_id is None:
+            generation_config.pad_token_id = getattr(self.config, "pad_token_id", 0)
+
         # Mask all the tokens that are neither BOS nor BOI with pad token in the unconditional logits.
         mask = (input_tokens[batch_size:, :] != generation_config.bos_token_id) & (
             input_tokens[batch_size:, :] != generation_config.generation_kwargs["boi_token_id"]
@@ -1310,12 +1321,18 @@ def generate(
 
         if model_kwargs.get("past_key_values", None) is None:
             # Prepare cache if not provided.
+            # Need enough space for: input sequence + num_image_tokens iterations + safety margin
+            # The loop runs num_image_tokens times, starting from seq_len position
+            max_length = generation_config.max_length
+            min_cache_len = seq_len + num_image_tokens + 100  # Ensure enough buffer
+            if max_length is None:
+                max_length = min_cache_len
             model_kwargs["past_key_values"] = self._prepare_static_cache(
                 cache_implementation=generation_config.cache_implementation or "static",
                 # batch_size should account for both conditional/unconditional input; hence multiplied by 2.
                 batch_size=batch_size * 2,
-                # we should have at least a cache len of seq_len + num_image_tokens.
-                max_cache_len=max(generation_config.max_length, num_image_tokens + seq_len),
+                # we should have at least a cache len of seq_len + num_image_tokens + buffer.
+                max_cache_len=max(max_length, min_cache_len),
                 model_kwargs=model_kwargs,
             )
 

diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py
@@ -1060,7 +1060,7 @@ def generate(
         input_ids, model_kwargs = self._expand_inputs_for_generation(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            expand_size=generation_config.num_return_sequences,
+            expand_size=generation_config.num_return_sequences or 1,
             **model_kwargs,
         )
 
@@ -1073,6 +1073,17 @@ def generate(
         attention_mask = attention_mask.repeat(2, 1)
         model_kwargs["attention_mask"] = attention_mask
 
+        # Ensure generation_kwargs exists with boi_token_id
+        if not hasattr(generation_config, "generation_kwargs") or generation_config.generation_kwargs is None:
+            generation_config.generation_kwargs = {}
+        if "boi_token_id" not in generation_config.generation_kwargs:
+            # Default boi_token_id - usually the image_token_id from config
+            generation_config.generation_kwargs["boi_token_id"] = getattr(self.config, "image_token_id", 0)
+
+        # Ensure pad_token_id is set
+        if generation_config.pad_token_id is None:
+            generation_config.pad_token_id = getattr(self.config, "pad_token_id", 0)
+
         # Mask all the tokens that are neither BOS nor BOI with pad token in the unconditional logits.
         mask = (input_tokens[batch_size:, :] != generation_config.bos_token_id) & (
             input_tokens[batch_size:, :] != generation_config.generation_kwargs["boi_token_id"]
@@ -1085,12 +1096,18 @@ def generate(
 
         if model_kwargs.get("past_key_values", None) is None:
             # Prepare cache if not provided.
+            # Need enough space for: input sequence + num_image_tokens iterations + safety margin
+            # The loop runs num_image_tokens times, starting from seq_len position
+            max_length = generation_config.max_length
+            min_cache_len = seq_len + num_image_tokens + 100  # Ensure enough buffer
+            if max_length is None:
+                max_length = min_cache_len
             model_kwargs["past_key_values"] = self._prepare_static_cache(
                 cache_implementation=generation_config.cache_implementation or "static",
                 # batch_size should account for both conditional/unconditional input; hence multiplied by 2.
                 batch_size=batch_size * 2,
-                # we should have at least a cache len of seq_len + num_image_tokens.
-                max_cache_len=max(generation_config.max_length, num_image_tokens + seq_len),
+                # we should have at least a cache len of seq_len + num_image_tokens + buffer.
+                max_cache_len=max(max_length, min_cache_len),
                 model_kwargs=model_kwargs,
             )