Commit 2b90698

support gated delta net
Parent: 876a046

17 files changed (+1502, −245 lines)

gpt_builders.py

Lines changed: 6 additions & 1 deletion
@@ -8,6 +8,9 @@
     get_gpt_layer_with_inference_spec,
     get_gpt_mtp_block_spec,
 )
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    is_linear_attention_variant,
+)
 from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import (
     get_gpt_heterogeneous_layer_spec,
 )
@@ -42,7 +45,7 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
     else:
         use_te = args.transformer_impl == "transformer_engine"
 
-    if args.num_experts:
+    if args.num_experts or is_linear_attention_variant(args.experimental_attention_variant):
         assert not (config.transformer_impl == "inference_optimized")
         # Define the decoder block spec
         transformer_layer_spec = get_gpt_decoder_block_spec(
@@ -117,6 +120,7 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             qk_l2_norm=args.qk_l2_norm,
             use_kitchen=config.use_kitchen,
@@ -135,6 +139,7 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             normalization=args.normalization,
             use_kitchen=config.use_kitchen,
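
The rewritten guard sends any linear-attention variant down the same decoder-block-spec path that MoE models already use. A minimal sketch of how the new predicate behaves, assuming --experimental-attention-variant defaults to None; only is_linear_attention_variant and the "gated_delta_net" value come from this commit, the other string is hypothetical:

    # Sketch: behaviour of the new guard for a few flag values.
    from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
        is_linear_attention_variant,
    )

    assert is_linear_attention_variant("gated_delta_net")      # routed through get_gpt_decoder_block_spec
    assert not is_linear_attention_variant("sliding_window")   # hypothetical variant name: plain layer spec
    assert not is_linear_attention_variant(None)               # assumed default: dense models are unchanged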

megatron/core/jit.py

Lines changed: 21 additions & 6 deletions
@@ -7,12 +7,27 @@
 jit_fuser = torch.jit.script
 # nvFuser is deprecated in PyTorch JIT starting from 2.2
 
-try:
-    if is_torch_min_version("2.2.0a0"):
-        jit_fuser = torch.compile
-except ImportError:
 
-    def noop_decorator(func):
-        return func
+def noop_decorator(func):
+    '''No-op decorator'''
+    return func
 
+
+def enable_jit_fuser():
+    '''Enable the JIT fuser'''
+    global jit_fuser
+    try:
+        if is_torch_min_version("2.2.0a0"):
+            jit_fuser = torch.compile
+    except ImportError:
+
+        jit_fuser = noop_decorator
+
+
+def disable_jit_fuser():
+    '''Disable the JIT fuser'''
+    global jit_fuser
     jit_fuser = noop_decorator
+
+
+enable_jit_fuser()
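
Because jit_fuser is now a module-level name that enable_jit_fuser() / disable_jit_fuser() rebind, decorators pick up whichever fuser is active at decoration time. A small usage sketch, assuming megatron.core.jit imports as shown and PyTorch >= 2.2 is installed; the decorated functions are illustrative only:

    import torch
    from megatron.core import jit

    jit.disable_jit_fuser()          # jit_fuser becomes the no-op decorator

    @jit.jit_fuser                   # resolved at decoration time, so this stays eager
    def scaled_add(x, y, alpha: float):
        return x + alpha * y

    jit.enable_jit_fuser()           # back to torch.compile on PyTorch >= 2.2.0a0

    @jit.jit_fuser                   # this one gets compiled
    def tanh_gelu(x):
        return 0.5 * x * (1.0 + torch.tanh(0.7978845608 * (x + 0.044715 * x * x * x)))

Accessing the decorator as jit.jit_fuser (rather than from megatron.core.jit import jit_fuser) matters here: the toggle functions rebind the module global, and a name imported before the toggle would keep pointing at the old object.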
megatron/core/models/gpt/experimental_attention_variant_module_specs.py (new file)

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Optional
+
+from megatron.core.models.backends import BackendSpecProvider
+from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules
+from megatron.core.transformer.spec_utils import ModuleSpec
+
+
+def is_linear_attention_variant(experimental_attention_variant: str) -> bool:
+    """Check if the experimental attention variant is a linear attention variant."""
+    linear_attention_variants = ["gated_delta_net"]
+    return experimental_attention_variant in linear_attention_variants
+
+
+def get_gated_delta_net_module_spec_for_backend(
+    backend: BackendSpecProvider, normalization: Optional[str] = None
+) -> ModuleSpec:
+    """Helper function to get module spec for Linear Attention"""
+    rms_norm = normalization == "RMSNorm"
+    attention = ModuleSpec(
+        module=GatedDeltaNet,
+        submodules=GatedDeltaNetSubmodules(
+            in_proj=backend.column_parallel_layer_norm_linear(),
+            out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False),
+            out_proj=backend.row_parallel_linear(),
+        ),
+        metainfo={"fuse_input_layernorm": True},
+    )
+    return attention
+
+
+def get_experimental_attention_variant_module_spec_for_backend(
+    backend: BackendSpecProvider,
+    sharded_state_dict_keys_map: dict,
+    experimental_attention_variant: Optional[str] = None,
+    qk_layernorm: Optional[bool] = False,
+    qk_l2_norm: Optional[bool] = False,
+    multi_latent_attention: Optional[bool] = False,
+    mla_down_proj_use_column_parallel: Optional[bool] = False,
+    normalization: Optional[str] = None,
+    fallback_to_eager_attn: Optional[bool] = False,
+) -> ModuleSpec:
+    """Helper function to get module spec for Attention"""
+    if experimental_attention_variant == "gated_delta_net":
+        return get_gated_delta_net_module_spec_for_backend(
+            backend=backend, normalization=normalization
+        )
+    else:
+        raise ValueError(
+            f"Invalid experimental attention variant: {experimental_attention_variant}"
+        )
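
A rough sketch of how the new helper might be wired into a layer spec. Only the helper's signature and the backend methods it calls come from this file; the TESpecProvider backend class and its import path are assumptions:

    # Hypothetical wiring: build the GatedDeltaNet attention spec for a TE-style backend.
    from megatron.core.models.backends import TESpecProvider  # assumed concrete BackendSpecProvider
    from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
        get_experimental_attention_variant_module_spec_for_backend,
        is_linear_attention_variant,
    )

    variant = "gated_delta_net"
    if is_linear_attention_variant(variant):
        attention_spec = get_experimental_attention_variant_module_spec_for_backend(
            backend=TESpecProvider(),
            sharded_state_dict_keys_map={},
            experimental_attention_variant=variant,
            normalization="RMSNorm",  # selects the RMSNorm out_norm inside GatedDeltaNet
        )
    # Any other variant string raises ValueError, so callers gate on
    # is_linear_attention_variant() first, exactly as gpt_builders.py now does.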
