Skip to content

Commit cdc0290

Browse files
feat(turbo): Primus-Torchtitan supports the Primus-Turbo backend. (#118)
Co-authored-by: Xiaoming-AMD <xiaoming.peng@amd.com>
1 parent 9dfb128 commit cdc0290

File tree

19 files changed

+331
-41
lines changed

19 files changed

+331
-41
lines changed

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,4 @@ jobs:
137137
rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo
138138
- name: Clean Primus
139139
run: |
140-
rm -rf ${PRIMUS_WORKDIR}/Primus
140+
rm -rf ${PRIMUS_WORKDIR}/Primus

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ repos:
1414
- id: check-added-large-files
1515
- id: check-merge-conflict
1616
- repo: https://github.com/pycqa/isort
17-
rev: 5.11.5
17+
rev: 5.13.2
1818
hooks:
1919
- id: isort
2020
args: ["--profile", "black"]

examples/torchtitan/configs/llama3.1_8B-pretrain.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,9 @@ modules:
1414
sink_level: null
1515
file_sink_level: DEBUG
1616
stderr_sink_level: INFO
17+
18+
# model:
19+
# converters: ["mx"]
20+
primus_turbo:
21+
enable_primus_turbo: false
22+
enable_attention_float8: false

primus/backends/torchtitan/__init__.py

Whitespace-only changes.

primus/backends/torchtitan/components/__init__.py

Whitespace-only changes.

primus/backends/torchtitan/components/quantization/__init__.py

Whitespace-only changes.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
###############################################################################
2+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3+
#
4+
# See LICENSE for license information.
5+
###############################################################################
6+
7+
import torch
8+
import torch.nn as nn
9+
from primus_turbo.pytorch.core.float8 import MXQuantConfig
10+
from primus_turbo.pytorch.modules import MXLinear
11+
from torchtitan.config_manager import JobConfig
12+
from torchtitan.distributed import ParallelDims
13+
from torchtitan.protocols.model_converter import (
14+
ModelConverter,
15+
register_model_converter,
16+
)
17+
from torchtitan.tools.logging import logger
18+
19+
20+
def replace_turbo_mxlinear_modules(model: nn.Module, config: MXQuantConfig):
    """Recursively swap every plain ``nn.Linear`` under *model* for ``MXLinear``.

    Children that are already ``MXLinear`` are not re-wrapped (they fall into
    the recursive branch instead); every other container is descended into
    depth-first and rewritten in place.

    Args:
        model: Root module whose direct and nested children are mutated.
        config: Quantization settings forwarded to ``MXLinear.from_float``.
    """
    for child_name, child in model.named_children():
        is_plain_linear = isinstance(child, torch.nn.Linear) and not isinstance(child, MXLinear)
        if not is_plain_linear:
            # Not a convertible leaf — keep walking the subtree.
            replace_turbo_mxlinear_modules(child, config)
            continue
        setattr(model, child_name, MXLinear.from_float(child, config))
27+
28+
29+
class PrimusTurboMXConverter(ModelConverter):
    """Model converter that swaps ``nn.Linear`` layers for Primus-Turbo ``MXLinear``.

    Registered under the converter name ``"primus_turbo_mx"`` so it can be
    enabled through torchtitan's ``model.converters`` configuration.
    """

    def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
        # NOTE(review): job_config and parallel_dims are currently unused —
        # the converter is unconditionally enabled and uses default settings.
        self.enabled = True
        # TODO: derive the quantization config from job_config.
        self.config = MXQuantConfig()

    def convert(self, model: nn.Module):
        """Replace eligible linear layers of *model* in place with MXLinear."""
        if not self.enabled:
            return

        replace_turbo_mxlinear_modules(model, self.config)

        logger.info("Swapped to MXLinear layers")

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]):
        """
        MXFP8 doesn't require any post-optimizer hooks at the moment
        """
        return


# Backward-compatible alias: the class was first published under a misspelled
# name ("Tubro"); keep the old spelling importable for existing callers.
PrimusTubroMXConverter = PrimusTurboMXConverter

register_model_converter(PrimusTurboMXConverter, "primus_turbo_mx")

primus/backends/torchtitan/models/__init__.py

Whitespace-only changes.

primus/backends/torchtitan/models/llama3/__init__.py

Whitespace-only changes.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
###############################################################################
2+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3+
#
4+
# See LICENSE for license information.
5+
###############################################################################
6+
7+
import torch
8+
from torchtitan.models.llama3.model import Attention as TTAttention
9+
from torchtitan.models.llama3.model import apply_rotary_emb
10+
11+
12+
class Attention(TTAttention):
    """Llama3 attention variant that hands un-expanded KV heads to ``self.sdpa``.

    Matches the upstream torchtitan ``Attention.forward`` except that the
    k/v heads are NOT repeated to ``n_heads`` before the SDPA call —
    presumably the backend's ``self.sdpa`` handles grouped-query attention
    natively (TODO: confirm against the Primus-Turbo sdpa contract).
    """

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ):
        batch, seq_len, _ = x.shape

        # Project, then split into per-head views. The head count is inferred
        # with -1 because tensor parallelism may already have sharded the
        # projection outputs along the head dimension.
        queries = self.wq(x).view(batch, seq_len, -1, self.head_dim)
        keys = self.wk(x).view(batch, seq_len, -1, self.head_dim)
        values = self.wv(x).view(batch, seq_len, -1, self.head_dim)

        queries, keys = apply_rotary_emb(queries, keys, freqs_cis=freqs_cis)

        # Unlike upstream, k/v heads are passed as-is (no repeat_kv expansion
        # when n_kv_heads < n_heads); the backend sdpa receives them directly.
        attn_out = self.sdpa(queries, keys, values)

        return self.wo(attn_out.view(batch, seq_len, -1))

0 commit comments

Comments
 (0)