huggingface · Rocketknight1 · Jun 8, 2026 · Jun 8, 2026
diff --git a/src/transformers/models/nemotron_h/modeling_nemotron_h.py b/src/transformers/models/nemotron_h/modeling_nemotron_h.py
@@ -277,7 +277,10 @@ def cuda_kernels_forward(
             )
             hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
             hidden_states = self.norm(hidden_states, gate)
-            out = self.out_proj(hidden_states)[:, None, ...]
+            # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+            # projection weight dtype so the matmul does not fail when out_proj.weight is a
+            # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+            out = self.out_proj(hidden_states.to(self.out_proj.weight.dtype))[:, None, ...]
         # if no cache is found, calling the kernel
         else:
             if attention_mask is not None and not torch.all(attention_mask == 1):
@@ -369,7 +372,10 @@ def cuda_kernels_forward(
                 scan_output = scan_output.view(batch_size, seq_len, -1)
                 # Multiply "gate" branch and apply extra normalization layer
                 scan_output = self.norm(scan_output, gate)
-                out = self.out_proj(scan_output)
+                # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+                # projection weight dtype so the matmul does not fail when out_proj.weight is a
+                # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+                out = self.out_proj(scan_output.to(self.out_proj.weight.dtype))
         return out
 
     # fmt: off

diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py
@@ -565,7 +565,10 @@ def cuda_kernels_forward(
             )
             hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
             hidden_states = self.norm(hidden_states, gate)
-            out = self.out_proj(hidden_states)[:, None, ...]
+            # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+            # projection weight dtype so the matmul does not fail when out_proj.weight is a
+            # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+            out = self.out_proj(hidden_states.to(self.out_proj.weight.dtype))[:, None, ...]
         # if no cache is found, calling the kernel
         else:
             if attention_mask is not None and not torch.all(attention_mask == 1):
@@ -657,7 +660,10 @@ def cuda_kernels_forward(
                 scan_output = scan_output.view(batch_size, seq_len, -1)
                 # Multiply "gate" branch and apply extra normalization layer
                 scan_output = self.norm(scan_output, gate)
-                out = self.out_proj(scan_output)
+                # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+                # projection weight dtype so the matmul does not fail when out_proj.weight is a
+                # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+                out = self.out_proj(scan_output.to(self.out_proj.weight.dtype))
         return out
 
     # fmt: off

diff --git a/src/transformers/models/zamba2/modular_zamba2.py b/src/transformers/models/zamba2/modular_zamba2.py
@@ -353,7 +353,10 @@ def cuda_kernels_forward(
             )
             hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
             hidden_states = self.norm(hidden_states, gate)
-            out = self.out_proj(hidden_states)[:, None, ...]
+            # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+            # projection weight dtype so the matmul does not fail when out_proj.weight is a
+            # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+            out = self.out_proj(hidden_states.to(self.out_proj.weight.dtype))[:, None, ...]
         # if no cache is found, calling the kernel
         else:
             if attention_mask is not None and not torch.all(attention_mask == 1):
@@ -445,7 +448,10 @@ def cuda_kernels_forward(
                 scan_output = scan_output.view(batch_size, seq_len, -1)
                 # Multiply "gate" branch and apply extra normalization layer
                 scan_output = self.norm(scan_output, gate)
-                out = self.out_proj(scan_output)
+                # The SSM kernels return fp32 regardless of the module dtype; cast back to the
+                # projection weight dtype so the matmul does not fail when out_proj.weight is a
+                # narrower dtype than the activations (e.g. fp32 activations with a bf16 out_proj).
+                out = self.out_proj(scan_output.to(self.out_proj.weight.dtype))
         return out
 
     # fmt: off