Support gradient checkpointing in DiffusionGemma

kashif · kashif · commit 6ac3d9d0acef · 2026-06-11T22:05:16.000+02:00
diff --git a/src/transformers/models/diffusion_gemma/modeling_diffusion_gemma.py b/src/transformers/models/diffusion_gemma/modeling_diffusion_gemma.py
@@ -341,6 +341,9 @@ def forward(
         value_states = self.v_norm(value_states)
         value_states = value_states.transpose(1, 2)
 
+        # CHANGED: the local KV states are also returned, so that under gradient checkpointing (where the layer
+        # runs cache-less) the calling model can write the cache outside the checkpointed call.
+        local_key_states, local_value_states = key_states, value_states
         if past_key_values is not None:
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
         # CHANGED: removed the `if self.store_full_length_kv` branch
@@ -364,7 +367,7 @@ def forward(
 
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
+        return attn_output, attn_weights, local_key_states, local_value_states
 
 
 class DiffusionGemmaDecoderTextAttention(nn.Module):
@@ -418,7 +421,8 @@ def forward(
         hidden_states: torch.Tensor,
         position_embeddings: torch.Tensor,
         attention_mask: torch.Tensor | None,
-        past_key_values: Cache | None = None,
+        encoder_key_states: torch.Tensor | None = None,
+        encoder_value_states: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
         # The code in this function is adapted from Gemma4TextAttention. ** The modified parts are clearly indicated **
@@ -443,11 +447,9 @@ def forward(
         value_states = self.v_norm(value_states)
         value_states = value_states.transpose(1, 2)
 
-        if past_key_values is not None:
-            # CHANGED: instead of calling `past_key_values.update()` which updates the KV cache in-place and returns
-            # the full KV states, we first obtain the encoder cache contents, and then append the current KV states.
-            encoder_key_states = past_key_values.layers[self.layer_idx].keys
-            encoder_value_states = past_key_values.layers[self.layer_idx].values
+        if encoder_key_states is not None:
+            # CHANGED: the encoder KV states are passed as plain tensors (extracted from the encoder cache by the
+            # calling model) so they survive gradient checkpointing; the canvas KV states are appended to them.
             key_states = torch.cat([encoder_key_states, key_states], dim=2)
             value_states = torch.cat([encoder_value_states, value_states], dim=2)
         # CHANGED: removed the `if self.store_full_length_kv` branch
@@ -602,11 +604,11 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         residual = hidden_states
 
         hidden_states = self.input_layernorm(hidden_states)
-        hidden_states, _ = self.self_attn(
+        hidden_states, _, key_states, value_states = self.self_attn(
             hidden_states=hidden_states,
             position_embeddings=position_embeddings,
             attention_mask=attention_mask,
@@ -638,7 +640,7 @@ def forward(
         hidden_states = residual + hidden_states
 
         hidden_states *= self.layer_scalar
-        return hidden_states
+        return hidden_states, key_states, value_states
 
 
 class DiffusionGemmaDecoderTextLayer(GradientCheckpointingLayer):
@@ -675,7 +677,8 @@ def forward(
         position_embeddings: torch.Tensor = None,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
+        encoder_key_states: torch.Tensor | None = None,
+        encoder_value_states: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         residual = hidden_states
@@ -686,7 +689,8 @@ def forward(
             position_embeddings=position_embeddings,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            past_key_values=past_key_values,
+            encoder_key_states=encoder_key_states,
+            encoder_value_states=encoder_value_states,
             **kwargs,
         )
         hidden_states = self.post_attention_layernorm(hidden_states)
@@ -798,7 +802,7 @@ def forward(self, inputs_embeds, self_conditioning_signal: torch.Tensor) -> torc
 class DiffusionGemmaPreTrainedModel(PreTrainedModel):
     config: DiffusionGemmaConfig
     base_model_prefix = "model"
-    supports_gradient_checkpointing = False
+    supports_gradient_checkpointing = True
     _no_split_modules = [
         "DiffusionGemmaDecoderTextLayer",
         "DiffusionGemmaEncoderTextLayer",
@@ -940,14 +944,20 @@ def forward(
 
         # decoder layers
         for i, encoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
-            hidden_states = encoder_layer(
+            # Under gradient checkpointing the layer runs cache-less and the cache write happens outside the
+            # checkpointed call instead: the returned KV states, as checkpoint outputs, keep the gradient path
+            # from the decoder open (an in-layer write would be lost and double-applied on recomputation).
+            checkpointing = encoder_layer.gradient_checkpointing and self.training
+            hidden_states, key_states, value_states = encoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings[self.config.layer_types[i]],
                 attention_mask=causal_mask_mapping[self.config.layer_types[i]],
                 position_ids=position_ids,
-                past_key_values=past_key_values,
+                past_key_values=None if checkpointing else past_key_values,
                 **kwargs,
             )
+            if checkpointing:
+                past_key_values.update(key_states, value_states, i)
 
         hidden_states = self.norm(hidden_states)
 
@@ -1289,7 +1299,8 @@ def forward(
                 position_embeddings=position_embeddings[self.text_config.layer_types[i]],
                 attention_mask=mask_mapping[self.text_config.layer_types[i]],
                 position_ids=decoder_position_ids,
-                past_key_values=past_key_values,
+                encoder_key_states=past_key_values.layers[i].keys if past_key_values is not None else None,
+                encoder_value_states=past_key_values.layers[i].values if past_key_values is not None else None,
                 **kwargs,
             )
 
diff --git a/src/transformers/models/diffusion_gemma/modular_diffusion_gemma.py b/src/transformers/models/diffusion_gemma/modular_diffusion_gemma.py
@@ -269,6 +269,9 @@ def forward(
         value_states = self.v_norm(value_states)
         value_states = value_states.transpose(1, 2)
 
+        # CHANGED: the local KV states are also returned, so that under gradient checkpointing (where the layer
+        # runs cache-less) the calling model can write the cache outside the checkpointed call.
+        local_key_states, local_value_states = key_states, value_states
         if past_key_values is not None:
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
         # CHANGED: removed the `if self.store_full_length_kv` branch
@@ -292,7 +295,7 @@ def forward(
 
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
+        return attn_output, attn_weights, local_key_states, local_value_states
 
 
 class DiffusionGemmaDecoderTextAttention(nn.Module):
@@ -346,7 +349,8 @@ def forward(
         hidden_states: torch.Tensor,
         position_embeddings: torch.Tensor,
         attention_mask: torch.Tensor | None,
-        past_key_values: Cache | None = None,
+        encoder_key_states: torch.Tensor | None = None,
+        encoder_value_states: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
         # The code in this function is adapted from Gemma4TextAttention. ** The modified parts are clearly indicated **
@@ -371,11 +375,9 @@ def forward(
         value_states = self.v_norm(value_states)
         value_states = value_states.transpose(1, 2)
 
-        if past_key_values is not None:
-            # CHANGED: instead of calling `past_key_values.update()` which updates the KV cache in-place and returns
-            # the full KV states, we first obtain the encoder cache contents, and then append the current KV states.
-            encoder_key_states = past_key_values.layers[self.layer_idx].keys
-            encoder_value_states = past_key_values.layers[self.layer_idx].values
+        if encoder_key_states is not None:
+            # CHANGED: the encoder KV states are passed as plain tensors (extracted from the encoder cache by the
+            # calling model) so they survive gradient checkpointing; the canvas KV states are appended to them.
             key_states = torch.cat([encoder_key_states, key_states], dim=2)
             value_states = torch.cat([encoder_value_states, value_states], dim=2)
         # CHANGED: removed the `if self.store_full_length_kv` branch
@@ -478,11 +480,11 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         residual = hidden_states
 
         hidden_states = self.input_layernorm(hidden_states)
-        hidden_states, _ = self.self_attn(
+        hidden_states, _, key_states, value_states = self.self_attn(
             hidden_states=hidden_states,
             position_embeddings=position_embeddings,
             attention_mask=attention_mask,
@@ -514,7 +516,7 @@ def forward(
         hidden_states = residual + hidden_states
 
         hidden_states *= self.layer_scalar
-        return hidden_states
+        return hidden_states, key_states, value_states
 
 
 class DiffusionGemmaDecoderTextLayer(Gemma4TextDecoderLayer):
@@ -551,7 +553,8 @@ def forward(
         position_embeddings: torch.Tensor = None,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
+        encoder_key_states: torch.Tensor | None = None,
+        encoder_value_states: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         residual = hidden_states
@@ -562,7 +565,8 @@ def forward(
             position_embeddings=position_embeddings,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            past_key_values=past_key_values,
+            encoder_key_states=encoder_key_states,
+            encoder_value_states=encoder_value_states,
             **kwargs,
         )
         hidden_states = self.post_attention_layernorm(hidden_states)
@@ -644,7 +648,7 @@ def forward(self, inputs_embeds, self_conditioning_signal: torch.Tensor) -> torc
 class DiffusionGemmaPreTrainedModel(PreTrainedModel):
     config: DiffusionGemmaConfig
     base_model_prefix = "model"
-    supports_gradient_checkpointing = False
+    supports_gradient_checkpointing = True
     _no_split_modules = [
         "DiffusionGemmaDecoderTextLayer",
         "DiffusionGemmaEncoderTextLayer",
@@ -786,14 +790,20 @@ def forward(
 
         # decoder layers
         for i, encoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
-            hidden_states = encoder_layer(
+            # Under gradient checkpointing the layer runs cache-less and the cache write happens outside the
+            # checkpointed call instead: the returned KV states, as checkpoint outputs, keep the gradient path
+            # from the decoder open (an in-layer write would be lost and double-applied on recomputation).
+            checkpointing = encoder_layer.gradient_checkpointing and self.training
+            hidden_states, key_states, value_states = encoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings[self.config.layer_types[i]],
                 attention_mask=causal_mask_mapping[self.config.layer_types[i]],
                 position_ids=position_ids,
-                past_key_values=past_key_values,
+                past_key_values=None if checkpointing else past_key_values,
                 **kwargs,
             )
+            if checkpointing:
+                past_key_values.update(key_states, value_states, i)
 
         hidden_states = self.norm(hidden_states)
 
@@ -1097,7 +1107,8 @@ def forward(
                 position_embeddings=position_embeddings[self.text_config.layer_types[i]],
                 attention_mask=mask_mapping[self.text_config.layer_types[i]],
                 position_ids=decoder_position_ids,
-                past_key_values=past_key_values,
+                encoder_key_states=past_key_values.layers[i].keys if past_key_values is not None else None,
+                encoder_value_states=past_key_values.layers[i].values if past_key_values is not None else None,
                 **kwargs,
             )