
Commit cb34335

Registry interface for custom quantization functional backend (#683)
## What does this PR do?

**Type of change:** New feature

**Overview:** Add a registry interface for custom quantization functional backends.

## Usage

See `tests/unit/torch/quantization/test_custom_backend.py` for a usage example (a rough sketch is also included below).

## Testing

Covered by the new unit test `tests/unit/torch/quantization/test_custom_backend.py`.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: Yes
- **Did you add or update any necessary documentation?**: Yes
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes

## Additional Information

Signed-off-by: realAsma <[email protected]>
1 parent 3fd8b80 commit cb34335
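As a rough end-to-end sketch of the workflow this PR enables (register an entrypoint, then select it by name per quantizer): the backend name, the rounding logic, and the exact config contents below are illustrative assumptions, not taken from the referenced test.

```python
import torch

from modelopt.torch.quantization.nn.modules.tensor_quantizer import (
    TensorQuantizer,
    register_quant_backend,
)


def my_int8_backend(inputs: torch.Tensor, quantizer: TensorQuantizer) -> torch.Tensor:
    """Hypothetical emulated int8 backend: scale to [-128, 127], round, scale back."""
    amax = inputs.abs().amax().clamp(min=1e-8)
    scale = 127.0 / amax
    return (inputs * scale).round_().clamp_(-128, 127) / scale


# Register the entrypoint under a custom name.
register_quant_backend("my_int8", my_int8_backend)

# Select it by name per quantizer; layout follows the usual modelopt quant_cfg pattern.
MY_INT8_CFG = {
    "quant_cfg": {
        "*weight_quantizer": {"num_bits": "my_int8_fmt", "backend": "my_int8"},
        "*input_quantizer": {"enable": False},
    },
    "algorithm": "max",
}

# import modelopt.torch.quantization as mtq
# model = mtq.quantize(model, MY_INT8_CFG, forward_loop=calib_loop)
```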

File tree

14 files changed: +328 -77 lines

.github/workflows/example_tests.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -93,7 +93,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [llm_ptq, vlm_ptq]
+        example: [llm_ptq] # vlm_ptq temporarily disabled due to pipeline error
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
```

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions

```diff
@@ -12,6 +12,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add support for KV Cache Quantization for vLLM FakeQuant PTQ script. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#Calibrate-and-serve-fake-quant-model-in-vLLM>`__ for more details.
 - Add support for subgraphs in ONNX autocast.
 - Add support for parallel draft heads in Eagle speculative decoding.
+- Add support for enabling a custom emulated quantization backend. See :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>` for more details and ``tests/unit/torch/quantization/test_custom_backend.py`` for an example.
 
 **Deprecations**
 
```

modelopt/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -15,6 +15,7 @@
 
 """Nvidia Model Optimizer (modelopt)."""
 
+import warnings as _warnings
 from importlib.metadata import version as _version
 
 __version__ = _version("nvidia-modelopt")
```

modelopt/torch/__init__.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -39,3 +39,9 @@
     )
 except ImportError:
     pass
+
+# Initialize modelopt_internal if available
+with utils.import_plugin(
+    "modelopt_internal", success_msg="modelopt_internal successfully initialized", verbose=True
+):
+    import modelopt_internal
```

modelopt/torch/quantization/config.py

Lines changed: 32 additions & 3 deletions

```diff
@@ -667,7 +667,7 @@ class QuantizerAttributeConfig(ModeloptBaseConfig):
         description="""If True, enables the quantizer. If False, by-pass the quantizer and returns the input tensor.""",
     )
 
-    num_bits: int | tuple[int, int] = ModeloptField(
+    num_bits: int | tuple[int, int] | str = ModeloptField(
         default=8,
         title="An integer or a tuple of two integers specifying the number of quantization bits.",
         description="""`num_bits` can be:
@@ -677,7 +677,9 @@ class QuantizerAttributeConfig(ModeloptBaseConfig):
 
         #. Constant integer tuple (E,M) for floating point quantization emulating
            Nvidia's FPx quantization. E is the number of exponent bits and M is the number
-           of mantissa bits. Supported FPx quantization formats: FP8 (E4M3, E5M2), FP6(E3M2, E2M3), FP4(E2M1).""",
+           of mantissa bits. Supported FPx quantization formats: FP8 (E4M3, E5M2), FP6(E3M2, E2M3), FP4(E2M1).
+
+        #. String specifying the quantization format. This is currently used only for custom backends.""",
     )
 
     @model_validator(mode="before")
@@ -709,10 +711,16 @@ def _validate_recursive(value):
     @model_validator(mode="after")
     def validate_num_bits(self):
         """Validate `num_bits`."""
+        if self.backend is not None:
+            # For custom backends, we don't need to validate num_bits
+            return self
+
         num_bits = self.num_bits
 
         if isinstance(num_bits, int) and num_bits < 1:
-            raise ValueError("num_bits must be a positive integer or a tuple of positive integers.")
+            raise ValueError(
+                f"num_bits must be a positive integer or a tuple of positive integers. {num_bits}"
+            )
 
         if not isinstance(num_bits, tuple):
             return self
@@ -954,6 +962,27 @@ def validate_calibrator(cls, v, info: ValidationInfo):
         """,
     )
 
+    backend: str | None = ModeloptField(
+        default=None,
+        title="Name of custom quantization functional backend.",
+        description="""
+        Selects a non-default quantization functional backend by name. See
+        :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>`
+        for more details on how to register a custom quantization backend.
+        """,
+    )
+    backend_extra_args: dict | None = ModeloptField(
+        default=None,
+        title="Extra arguments for the selected backend.",
+        description="""The extra arguments are saved onto the quantizer instance - they are not
+        passed directly to the backend entrypoint. Can be any serializable dictionary.
+
+        Please use `backend_extra_args` to pass arguments that are not already supported by
+        `QuantizerAttributeConfig`. This ensures maximum compatibility with other modelopt
+        features such as modelopt's calibration algorithms.
+        """,
+    )
+
 
 class QuantizeAlgorithmConfig(ModeloptBaseConfig):
     """Calibration algorithm config base."""
```

modelopt/torch/quantization/model_quant.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -229,7 +229,7 @@ def forward_loop(model) -> None:
     Returns: A pytorch model which has been quantized and calibrated.
     """
     model = apply_mode(model, mode=[("quantize", config)], registry=QuantizeModeRegistry)
-    return calibrate(model, config["algorithm"], forward_loop=forward_loop)
+    return calibrate(model, config.get("algorithm"), forward_loop=forward_loop)
 
 
 # TODO: create a config interface for auto_quantize and expose setting
```
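The switch from `config["algorithm"]` to `config.get("algorithm")` means a config dict that omits the `algorithm` key no longer raises a `KeyError`; `calibrate` simply receives `None`. A small illustrative sketch (the config contents are assumptions):

```python
# Hypothetical config with no "algorithm" entry: quantize() no longer fails on the lookup.
cfg_without_algorithm = {
    "quant_cfg": {"*weight_quantizer": {"num_bits": 8, "backend": "my_int8"}},
}
# model = mtq.quantize(model, cfg_without_algorithm, forward_loop=None)
```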

modelopt/torch/quantization/nn/modules/tensor_quantizer.py

Lines changed: 97 additions & 13 deletions

```diff
@@ -18,7 +18,8 @@
 import contextlib
 import math
 import warnings
-from typing import TYPE_CHECKING, Any
+from collections.abc import Callable
+from typing import Any, Protocol
 
 import torch
 import torch.distributed as dist
@@ -36,7 +37,7 @@
 import torch.nn.functional as F
 from torch import nn
 
-from modelopt.torch.utils import standardize_constructor_args
+from modelopt.torch.utils import same_device_as, standardize_constructor_args
 from modelopt.torch.utils.distributed import DistributedProcessGroup
 
 from ... import calib
@@ -56,10 +57,63 @@
 from ...utils import is_torch_export_mode
 from ..functional import normalized_hadamard_transform
 
-if TYPE_CHECKING:
-    from collections.abc import Callable
+__all__ = [
+    "SequentialQuantizer",
+    "TensorQuantizer",
+    "TensorQuantizerCache",
+    "is_registered_quant_backend",
+    "register_quant_backend",
+    "unregister_quant_backend",
+]
 
-__all__ = ["SequentialQuantizer", "TensorQuantizer"]
+
+QuantBackendEntrypoint = Callable[[torch.Tensor, "TensorQuantizer"], torch.Tensor]
+
+_QUANT_FUNCTIONAL_BACKENDS: dict[str, QuantBackendEntrypoint] = {}
+
+
+def register_quant_backend(name: str, entrypoint: QuantBackendEntrypoint) -> None:
+    """Register a custom quantization backend.
+
+    Args:
+        name: The name of the backend.
+        entrypoint: The entrypoint of the backend. The entrypoint should be a callable that takes in
+            the inputs and the tensor quantizer as arguments and returns the quantized tensor.
+            See :class:`modelopt.torch.quantization.config.QuantizerAttributeConfig`
+            for details on choosing from the registered backends via the ``backend`` and
+            ``backend_extra_args`` fields.
+    """
+    if not isinstance(name, str) or not name:
+        raise ValueError("Backend name must be a non-empty string.")
+    if not callable(entrypoint):
+        raise TypeError("Entrypoint must be callable.")
+    if name in _QUANT_FUNCTIONAL_BACKENDS:
+        warnings.warn(f"Overwriting existing backend: {name}")
+    _QUANT_FUNCTIONAL_BACKENDS[name] = entrypoint
+
+
+def unregister_quant_backend(name: str) -> None:
+    """Unregister a custom quantization backend.
+
+    Args:
+        name: The name of the backend to unregister.
+    """
+    if not isinstance(name, str) or not name:
+        raise ValueError("Backend name must be a non-empty string.")
+    _QUANT_FUNCTIONAL_BACKENDS.pop(name, None)
+
+
+def is_registered_quant_backend(name: str) -> bool:
+    """Check if a custom quantization backend is registered.
+
+    Args:
+        name: The name of the backend to check.
+    """
+    return name in _QUANT_FUNCTIONAL_BACKENDS
+
+
+class TensorQuantizerCache(Protocol):
+    """A protocol for a cache interface for TensorQuantizer."""
 
 
 class TensorQuantizer(nn.Module):
@@ -104,6 +158,8 @@ class TensorQuantizer(nn.Module):
         "ds_grads_remaining",
         "ds_id",
         "pre_bwd_fn",
+        # quantizer cache for custom backends, like luts
+        "_quantizer_cache",
     }
 
     def __init__(
@@ -132,6 +188,9 @@ def __init__(
         # Lazy initialize the bias calibrator for KV cache quantization
        self._bias_calibrator = None
 
+        # Optional quantizer cache for caching quantizer related encoding or tensors.
+        self._quantizer_cache = None
+
     def set_from_attribute_config(self, attribute_cfg: QuantizerAttributeConfig | dict):
         """Set quantizer attributes from attribute_dict.
 
@@ -153,6 +212,8 @@ def _calibrator_setter(val):
             "enable": ("_disabled", lambda val: val is False),
             "type": ("_dynamic", lambda val: val == "dynamic"),
             "calibrator": ("_calibrator", _calibrator_setter),
+            "backend": ("backend", lambda val: val),
+            "backend_extra_args": ("backend_extra_args", lambda val: val or {}),
         }
 
         for attribute, val in attribute_cfg.items():
@@ -632,6 +693,12 @@ def _real_quantize(self, inputs):
 
     def _fake_quantize(self, inputs):
         """Fake quantization."""
+        if self.backend is not None:
+            if self.backend not in _QUANT_FUNCTIONAL_BACKENDS:
+                raise KeyError(f"Quant backend '{self.backend}' is not registered.")
+            entrypoint = _QUANT_FUNCTIONAL_BACKENDS[self.backend]
+            return entrypoint(inputs, self)
+
         amax = None
         if not self.is_mx_format:
             amax = self._get_amax(inputs)
@@ -934,7 +1001,8 @@ def forward(self, inputs):
         if hasattr(inputs, "is_contiguous") and not inputs.is_contiguous():
             inputs.data = inputs.data.contiguous()
         if self.fake_quant:
-            outputs = self._fake_quantize(inputs)
+            with same_device_as(inputs):
+                outputs = self._fake_quantize(inputs)
         elif not self._dequantize:
             outputs = self._real_quantize(inputs)
         else:
@@ -964,16 +1032,23 @@ def _short_amax(self, fmt=".4f"):
             return "None"
         if self._amax.is_meta:
             return "meta"
-        if self._amax.numel() == 1:
-            return f"{self._amax.item():{fmt}}"
-        return (
-            f"[{self._amax.min().item():{fmt}},"
-            f" {self._amax.max().item():{fmt}}]({self._amax.numel()})"
-        )
+        return self._short_tensor(self._amax, fmt)
+
+    def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"):
+        """Short description of tensor."""
+        if tensor.numel() == 1:
+            return f"{tensor.item():{fmt}}"
+        return f"[{tensor.min().item():{fmt}}, {tensor.max().item():{fmt}}]({tensor.numel()})"
 
     def extra_repr(self):
         """Set the extra information about this module."""
         if self._disabled:
+            s = "disabled"
+            s += (
+                f" pre_quant_scale={self._short_tensor(self.pre_quant_scale)}"
+                if self.pre_quant_scale is not None
+                else ""
+            )
             return "disabled"
         s = f"{'unsigned ' if self._unsigned else ''}{self._num_bits} bit"
         s += " narrow" if (self._narrow_range) else ""
@@ -983,7 +1058,11 @@ def extra_repr(self):
         else:
             s += f" axis={self._axis}" if self._axis is not None else " per-tensor"
         s += f" amax={self._short_amax()}"
-        s += " pre_quant_scale" if self.pre_quant_scale is not None else ""
+        s += (
+            f" pre_quant_scale={self._short_tensor(self.pre_quant_scale)}"
+            if self.pre_quant_scale is not None
+            else ""
+        )
         s += " rotated" if self._rotate else ""
         s += (
             f" calibrator={self._calibrator.__class__.__name__}"
@@ -995,6 +1074,11 @@ def extra_repr(self):
 
         s += " quant" if (self._if_quant) else ""
         s += " calib" if (self._if_calib) else ""
+        s += (
+            f" backend={self.backend}, extra_args={self.backend_extra_args}"
+            if self.backend is not None
+            else ""
+        )
         return s
 
     def _get_properties_for_modelopt_state(self):
```
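A sketch of how a backend entrypoint can use the state that this file wires up for it: `backend_extra_args` populated via `set_from_attribute_config`, and the new `_quantizer_cache` slot. The lookup-table logic and names below are hypothetical, not part of the PR:

```python
import torch

from modelopt.torch.quantization.nn.modules.tensor_quantizer import (
    TensorQuantizer,
    is_registered_quant_backend,
    register_quant_backend,
    unregister_quant_backend,
)


def lut_backend(inputs: torch.Tensor, quantizer: TensorQuantizer) -> torch.Tensor:
    """Hypothetical LUT-style emulation using backend_extra_args and _quantizer_cache."""
    levels = (getattr(quantizer, "backend_extra_args", None) or {}).get("codebook_size", 16)
    if quantizer._quantizer_cache is None:
        # Memoize the codebook on the quantizer so it is only built once.
        quantizer._quantizer_cache = torch.linspace(-1.0, 1.0, levels)
    codebook = quantizer._quantizer_cache.to(inputs.device)
    scale = inputs.abs().amax().clamp(min=1e-8)
    # Snap each normalized value to its nearest codebook entry.
    idx = torch.argmin((inputs.unsqueeze(-1) / scale - codebook).abs(), dim=-1)
    return codebook[idx] * scale


register_quant_backend("lut_emulator", lut_backend)
assert is_registered_quant_backend("lut_emulator")
unregister_quant_backend("lut_emulator")  # unregistering an unknown name is a no-op
```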

modelopt/torch/quantization/tensor_quant.py

Lines changed: 28 additions & 37 deletions

```diff
@@ -79,14 +79,11 @@ def scaled_e4m3_impl(
     if cuda_ext_fp8 is None:
         return fp8_eager(inputs, amax)
 
-    with torch.cuda.device(
-        None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
-    ):
-        if amax.numel() == 1:
-            outputs = cuda_ext_fp8.fake_e4m3fy(inputs, amax)
-        elif amax.squeeze().ndim == 1:
-            axis = amax.shape.index(amax.numel())
-            outputs = cuda_ext_fp8.fake_e4m3fy_with_axis(inputs, amax.squeeze(), axis)
+    if amax.numel() == 1:
+        outputs = cuda_ext_fp8.fake_e4m3fy(inputs, amax)
+    elif amax.squeeze().ndim == 1:
+        axis = amax.shape.index(amax.numel())
+        outputs = cuda_ext_fp8.fake_e4m3fy_with_axis(inputs, amax.squeeze(), axis)
     return outputs
 
 
@@ -100,17 +97,14 @@ def fake_quant_impl(
     """Implementation of fake quantizing input according to number of bits."""
     cuda_ext = get_cuda_ext()
 
-    with torch.cuda.device(
-        None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
-    ):
-        if amax.numel() == 1:
-            outputs = cuda_ext.fake_tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
-        else:
-            axis = amax.shape.index(amax.numel())
-            outputs = cuda_ext.fake_tensor_quant_with_axis(
-                inputs, amax.squeeze(), axis, num_bits, unsigned, narrow_range
-            )
-    return outputs
+    if amax.numel() == 1:
+        outputs = cuda_ext.fake_tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
+    else:
+        axis = amax.shape.index(amax.numel())
+        outputs = cuda_ext.fake_tensor_quant_with_axis(
+            inputs, amax.squeeze(), axis, num_bits, unsigned, narrow_range
+        )
+    return outputs
 
 
 def _quantize_impl(
@@ -173,25 +167,22 @@ def _dynamic_block_quantize_impl(
         assert amax.is_cuda, "amax must be a CUDA tensor for dynamic block quantization."
         if amax.numel() != 1:
            amax = amax.amax()
-        with torch.cuda.device(
-            None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
+        if (
+            num_bits == (2, 1)  # type: ignore[comparison-overlap]
+            and scale_bits == (4, 3)
+            and triton_kernel.IS_AVAILABLE
+            and not DISABLE_TRITON_KERNEL
+            and amax is not None
         ):
-            if (
-                num_bits == (2, 1)  # type: ignore[comparison-overlap]
-                and scale_bits == (4, 3)
-                and triton_kernel.IS_AVAILABLE
-                and not DISABLE_TRITON_KERNEL
-                and amax is not None
-            ):
-                return triton_kernel.fp4_fake_quant_block(inputs, amax)
-            cuda_ext_mx = get_cuda_ext_mx(raise_if_failed=True)
-            return cuda_ext_mx.fused_amax_convert(
-                inputs,
-                block_size,
-                getattr(cuda_ext_mx.Types, mx_format_map[num_bits]),
-                getattr(cuda_ext_mx.Types, mx_format_map[scale_bits]),
-                amax,
-            )
+            return triton_kernel.fp4_fake_quant_block(inputs, amax)
+        cuda_ext_mx = get_cuda_ext_mx(raise_if_failed=True)
+        return cuda_ext_mx.fused_amax_convert(
+            inputs,
+            block_size,
+            getattr(cuda_ext_mx.Types, mx_format_map[num_bits]),
+            getattr(cuda_ext_mx.Types, mx_format_map[scale_bits]),
+            amax,
+        )
     else:
         raise NotImplementedError(
             f"Unsupported num_bits: {num_bits}, scale_bits: {scale_bits} for dynamic block quantization."
```