[Model] Support post-norm architecture for EAGLE-3 speculators (#42764)

Dogacel · web-flow · commit 12421962955a · 2026-05-19T13:39:00.000-07:00
Signed-off-by: Doğaç Eldenk <dogacel@gmail.com>
diff --git a/vllm/model_executor/models/deepseek_eagle3.py b/vllm/model_executor/models/deepseek_eagle3.py
@@ -199,11 +199,18 @@ def __init__(
             ]
         )
 
-        # fc layer for combining auxiliary hidden states (3x hidden size input)
-        if hasattr(self.config, "target_hidden_size"):
-            fc_input_size = self.config.target_hidden_size * 3
-        else:
-            fc_input_size = self.config.hidden_size * 3
+        # fc layer for combining auxiliary hidden states
+        num_aux_hidden_states = getattr(self.config, "num_aux_hidden_states", None)
+        if num_aux_hidden_states is None:
+            eagle_config = getattr(self.config, "eagle_config", None) or {}
+            layer_ids = eagle_config.get("eagle_aux_hidden_state_layer_ids")
+            num_aux_hidden_states = len(layer_ids) if layer_ids else 3
+        self.num_aux_hidden_states = num_aux_hidden_states
+
+        target_hidden_size = getattr(
+            self.config, "target_hidden_size", self.config.hidden_size
+        )
+        fc_input_size = target_hidden_size * num_aux_hidden_states
 
         self.fc = ReplicatedLinear(
             input_size=fc_input_size,
@@ -215,6 +222,18 @@ def __init__(
             return_bias=False,
         )
 
+        use_fc_norm = getattr(self.config, "fc_norm", False)
+        if use_fc_norm:
+            self.fc_norm = nn.ModuleList(
+                [
+                    RMSNorm(target_hidden_size, eps=self.config.rms_norm_eps)
+                    for _ in range(self.num_aux_hidden_states)
+                ]
+            )
+        else:
+            self.fc_norm = None
+
+        self.norm_output = getattr(self.config, "norm_output", False)
         self.norm = RMSNorm(
             self.config.hidden_size,
             eps=self.config.rms_norm_eps,
@@ -242,8 +261,13 @@ def forward(
                 hidden_states=hidden_states,
                 residual=residual,
             )
+
         hidden_states, hidden_prenorm = self.norm(hidden_states, residual)
-        return hidden_states, hidden_prenorm
+
+        # norm_output variant uses the post-norm hidden states.
+        aux_output = hidden_states if self.norm_output else hidden_prenorm
+
+        return hidden_states, aux_output
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
@@ -172,26 +172,49 @@ def __init__(
             ]
         )
         if self.use_aux_hidden_state:
-            if hasattr(self.config, "target_hidden_size"):
-                fc_input_size = self.config.target_hidden_size * 3
-            else:
-                fc_input_size = self.config.hidden_size * 3
+            self.num_aux_hidden_states = getattr(
+                self.config, "num_aux_hidden_states", None
+            )
+            if self.num_aux_hidden_states is None:
+                eagle_config = getattr(self.config, "eagle_config", None) or {}
+                layer_ids = eagle_config.get("eagle_aux_hidden_state_layer_ids")
+                self.num_aux_hidden_states = len(layer_ids) if layer_ids else 3
+
+            target_hidden_size = getattr(
+                self.config, "target_hidden_size", self.config.hidden_size
+            )
+            self.fc_input_size = target_hidden_size * self.num_aux_hidden_states
+
             if self.norm_before_fc:
                 self.input_norm = RMSNorm(
-                    fc_input_size,
+                    self.fc_input_size,
                     eps=self.config.rms_norm_eps,
                 )
             else:
                 self.input_norm = None
+
+            use_fc_norm = getattr(self.config, "fc_norm", False)
+            if use_fc_norm:
+                self.fc_norm = nn.ModuleList(
+                    [
+                        RMSNorm(target_hidden_size, eps=self.config.rms_norm_eps)
+                        for _ in range(self.num_aux_hidden_states)
+                    ]
+                )
+            else:
+                self.fc_norm = None
+
             self.fc = ReplicatedLinear(
-                input_size=fc_input_size,
+                input_size=self.fc_input_size,
                 output_size=self.config.hidden_size,
                 bias=False,
                 params_dtype=vllm_config.model_config.dtype,
                 quant_config=self.quant_config,
                 prefix=maybe_prefix(prefix, "fc"),
                 return_bias=False,
             )
+
+        self.norm_output = getattr(self.config, "norm_output", False)
         self.norm = RMSNorm(
             self.config.hidden_size,
             eps=self.config.rms_norm_eps,
@@ -220,7 +243,11 @@ def forward(
                 residual=residual,
             )
         hidden_states, hidden_prenorm = self.norm(hidden_states, residual)
-        return hidden_states, hidden_prenorm
+
+        # norm_output variant uses the post-norm hidden states.
+        aux_output = hidden_states if self.norm_output else hidden_prenorm
+
+        return hidden_states, aux_output
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
@@ -312,11 +339,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if self.use_parallel_drafting:
             self.register_buffer(
                 "mask_hidden",
-                torch.zeros(
-                    1,
-                    (3 if self.model.use_aux_hidden_state else 1)
-                    * self.config.hidden_size,
-                ),
+                torch.zeros(1, self.model.fc_input_size),
                 persistent=False,
             )
 
@@ -371,6 +394,16 @@ def combine_hidden_states(
 
         if self.model.norm_before_fc:
             hidden_states = self.model.input_norm(hidden_states)
+
+        # `norm_before_fc` adds a single RMSNorm before the FC layer, whereas `fc_norm`
+        # applies separate RMSNorms to each chunk of the hidden states.
+        if self.model.fc_norm is not None:
+            chunks = hidden_states.chunk(self.model.num_aux_hidden_states, dim=-1)
+            hidden_states = torch.cat(
+                [norm(chunk) for norm, chunk in zip(self.model.fc_norm, chunks)],
+                dim=-1,
+            )
+
         return self.model.fc(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -5210,9 +5210,14 @@ def _get_eagle3_aux_layers_from_config(self) -> tuple[int, ...] | None:
         layer_ids = getattr(hf_config, "eagle_aux_hidden_state_layer_ids", None)
         if not layer_ids:
             dflash_config = getattr(hf_config, "dflash_config", None)
+            eagle_config = getattr(hf_config, "eagle_config", None)
+
             if dflash_config and isinstance(dflash_config, dict):
                 layer_ids = dflash_config.get("target_layer_ids")
 
+            if eagle_config and isinstance(eagle_config, dict):
+                layer_ids = eagle_config.get("eagle_aux_hidden_state_layer_ids")
+
         if layer_ids and isinstance(layer_ids, (list, tuple)):
             return tuple(layer_ids)