@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import argparse
+import contextlib
 import os
 import random
 import time
@@ -298,12 +299,8 @@ def main(args):
             use_seq_device_map=args.use_seq_device_map,
             attn_implementation=args.attn_implementation,
         )
-    else:
-        assert args.qformat in QUANT_CFG_CHOICES, (
-            f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
-        )
-        quant_cfg = QUANT_CFG_CHOICES[args.qformat]
 
+    quant_cfg = QUANT_CFG_CHOICES[args.qformat]
     # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
     if "qkv_disabled" in args.qformat:
         # Disable q_proj, k_proj, v_proj quantizers
@@ -325,6 +322,11 @@ def main(args):
             quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
         for i in range(total_layers - n_layers_to_disable, total_layers):
             quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
+    else:
+        assert args.qformat in QUANT_CFG_CHOICES, (
+            f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
+        )
+        quant_cfg = QUANT_CFG_CHOICES[args.qformat]
 
     if args.kv_cache_qformat != "none":
         quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
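Note on the per-layer patterns in this hunk: a minimal sketch of how such wildcard overrides disable quantizers in a modelopt config dict. The base config (FP8_DEFAULT_CFG) and the layer counts below are illustrative assumptions, not values from this PR.

import copy

import modelopt.torch.quantization as mtq

# Start from a stock config and patch it; deepcopy avoids mutating the shared default.
quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG)

total_layers = 36        # illustrative thinker depth (assumption)
n_layers_to_disable = 2  # keep the first/last N layers unquantized (assumption)

for i in range(n_layers_to_disable):
    quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
for i in range(total_layers - n_layers_to_disable, total_layers):
    quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}

# mtq.quantize(model, quant_cfg, forward_loop) would then leave the matched
# layers in their original precision.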
@@ -357,6 +359,8 @@ def main(args):
     # since parameters are distributed. Force cuda:0 for input tensors.
     if device is None or str(device) in ("meta", "cpu"):
         device = "cuda"
+        print(f"Overriding device to {device}")
+
     processor = None
     tokenizer = None
 
@@ -646,11 +650,6 @@ def main(args):
         print("Updating full_model with quantized language_model...")
         language_model_lineage[-2].language_model = model
 
-        # if args.verbose:
-        #     mtq.print_quant_summary(full_model)
-
-        import contextlib
-
         if args.verbose:
             with open("./quant_summary.txt", "w") as f, contextlib.redirect_stdout(f):
                 mtq.print_quant_summary(full_model)
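The summary dump above works because mtq.print_quant_summary writes to stdout, so contextlib.redirect_stdout captures that output into a file. A standalone sketch of the same pattern; the helper name and default path are hypothetical.

import contextlib

import modelopt.torch.quantization as mtq


def dump_quant_summary(model, path="./quant_summary.txt"):
    # Redirect stdout for the duration of the call so the summary lands in `path`.
    with open(path, "w") as f, contextlib.redirect_stdout(f):
        mtq.print_quant_summary(model)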