
Commit 6a74e34

fix float8 rowwise inference perf with torch.compile (#2672)

In #2379, logic was added which prevented torchinductor from fusing the activation quantization for float8 inference. This PR reverts most of #2379, and adds a test to ensure we see the correct number of GPU kernels for float8 tensorwise and rowwise quantization. We'll have to re-do #2379 without breaking this test.

Test Plan:

```bash
TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 pytest test/dtypes/test_affine_quantized_float.py -s -k expected_kernels_on_gpu
```
1 parent 5f3ab63 commit 6a74e34
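
The kernel-count check described in the test plan boils down to profiling one compiled forward pass and counting events with nonzero CUDA time. Below is a minimal standalone sketch of that technique; the model and shapes are illustrative, not the exact test code, which lives in test/dtypes/test_affine_quantized_float.py.

```python
import torch
from torch.profiler import ProfilerActivity, profile

from torchao.quantization.quant_api import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

# illustrative model and shapes (the real test parametrizes granularity and compile mode)
m = torch.nn.Sequential(torch.nn.Linear(256, 512, device="cuda", dtype=torch.bfloat16))
quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
m = torch.compile(m)
x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)

with torch.no_grad():
    _ = m(x)  # warm up / trigger compilation
    # profile a single compiled forward pass
    with profile(activities=[ProfilerActivity.CUDA]) as prof:
        _ = m(x)

# with rowwise scaling, the expectation after this fix is two kernels:
# one fused activation-quantization kernel plus the gemm
cuda_kernels = [e for e in prof.key_averages() if e.cuda_time > 0]
print("num CUDA kernels:", len(cuda_kernels))
```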

File tree

3 files changed, +109 -49 lines

Lines changed: 40 additions & 0 deletions

```diff
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import fire
+import torch
+import torch.nn as nn
+from torch._inductor.utils import do_bench_using_profiling
+
+from torchao.quantization.quant_api import (
+    Float8DynamicActivationFloat8WeightConfig,
+    PerRow,
+    quantize_,
+)
+
+
+def benchmark_fn_in_usec(f, *args, **kwargs):
+    no_args = lambda: f(*args, **kwargs)
+    time = do_bench_using_profiling(no_args)
+    return time * 1e3
+
+
+def run(torch_compile_mode: str = "default"):
+    M, K, N = 1024, 2048, 4096
+    x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    m = nn.Sequential(nn.Linear(K, N, device="cuda", dtype=torch.bfloat16))
+    quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
+    m = torch.compile(m, mode=torch_compile_mode)
+    # warm up
+    with torch.no_grad():
+        _ = m(x)
+    # measure
+    with torch.no_grad():
+        time_us = benchmark_fn_in_usec(m, x)
+    print("time_us", time_us)
+
+
+if __name__ == "__main__":
+    fire.Fire(run)
```
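
A hypothetical way to run this benchmark from a shell (the file name below is an assumption, since this view does not show the new file's path; Fire exposes `torch_compile_mode` as a CLI flag):

```bash
# file name is assumed for illustration; substitute wherever this new script lives
python bench_float8_inference.py --torch_compile_mode default
python bench_float8_inference.py --torch_compile_mode reduce-overhead
```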

test/dtypes/test_affine_quantized_float.py

Lines changed: 69 additions & 39 deletions

```diff
@@ -23,6 +23,7 @@
 import pytest
 import torch
 from torch._inductor.test_case import TestCase as InductorTestCase
+from torch.profiler import ProfilerActivity, profile
 from torch.testing._internal import common_utils
 
 from torchao.dtypes.floatx.float8_layout import Float8AQTTensorImpl, preprocess_scale
@@ -718,45 +719,74 @@ def test_preprocess_scale_3d_reshape(self):
         expected_shape = (8, 1)  # Flattened (2*2*2, 1)
         self.assertEqual(result.shape, expected_shape)
 
-    @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
-    @common_utils.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
-    def test_quantize_dequantize_fp8_inductor(self, float8_dtype, hp_dtype):
-        quantize_affine_float8 = torch.ops.torchao.quantize_affine_float8
-        dequantize_affine_float8 = torch.ops.torchao.dequantize_affine_float8
-        input = torch.randn(10, 10)
-        with torch.no_grad():
-            torch._dynamo.reset()
-            expected_scale = torch.tensor(2.0)
-            expected_quantized = quantize_affine_float8(
-                input,
-                expected_scale,
-                float8_dtype=float8_dtype,
-            )
-            expected_dequantized = dequantize_affine_float8(
-                expected_quantized,
-                expected_scale,
-                output_dtype=hp_dtype,
-            )
-            test_q, (code_q,) = torch._inductor.utils.run_and_get_code(
-                torch.compile(quantize_affine_float8),
-                input,
-                expected_scale,
-                float8_dtype=float8_dtype,
-            )
-            torch.testing.FileCheck().check(
-                "torch.ops.torchao.quantize_affine_float8.default"
-            ).run(code_q)
-            test_dq, (code_dq,) = torch._inductor.utils.run_and_get_code(
-                torch.compile(dequantize_affine_float8),
-                test_q,
-                expected_scale,
-                hp_dtype,
-            )
-            torch.testing.FileCheck().check(
-                "torch.ops.torchao.dequantize_affine_float8.default"
-            ).run(code_dq)
-            torch.testing.assert_close(expected_quantized, test_q)
-            torch.testing.assert_close(expected_dequantized, test_dq)
+    @torch.no_grad()
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0"
+    )
+    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    @common_utils.parametrize(
+        "torch_compile_mode",
+        [
+            "default",
+            "reduce-overhead",
+        ],
+    )
+    def test_expected_kernels_on_gpu(self, granularity, torch_compile_mode):
+        """
+        Verify that float8 quantization + torch.compile results in the
+        expected number of kernels in the GPU trace.
+        """
+
+        M, K, N = 128, 256, 512
+        m = torch.nn.Sequential(
+            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
+        )
+        quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=granularity))
+        m = torch.compile(m, mode=torch_compile_mode)
+        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+        # warm up
+        _ = m(x)
+        # capture trace
+        with profile(activities=[ProfilerActivity.CUDA]) as prof:
+            _ = m(x)
+
+        cuda_kernel_events = [x for x in prof.key_averages() if x.cuda_time > 0]
+
+        if granularity == PerTensor():
+            # kernel 1: x_max_tmp = max(x, ...)
+            # kernel 2: x_max = max(x_max_tmp)
+            # kernel 3: x_float8 = to_float8(x, x_max)
+            # kernel 4: gemm
+            if torch_compile_mode == "default":
+                assert len(cuda_kernel_events) == 4, (
+                    f"too many cuda kernels: {cuda_kernel_events}"
+                )
+            elif torch_compile_mode == "reduce-overhead":
+                # two extra kernels with reduce-overhead:
+                # void at::native::(anonymous namespace)::multi_tensor...
+                # void at::native::vectorized_elementwise_kernel<2, at...
+                # TODO(future): debug and remove these
+                assert len(cuda_kernel_events) == 6, (
+                    f"too many cuda kernels: {cuda_kernel_events}"
+                )
+        else:
+            assert granularity == PerRow()
+            # kernel 1: x_float8 = to_float8(x)
+            # kernel 2: gemm
+            if torch_compile_mode == "default":
+                assert len(cuda_kernel_events) == 2, (
+                    f"too many cuda kernels: {cuda_kernel_events}"
+                )
+            elif torch_compile_mode == "reduce-overhead":
+                # two extra kernels with reduce-overhead:
+                # void at::native::(anonymous namespace)::multi_tensor...
+                # void at::native::vectorized_elementwise_kernel<2, at...
+                # TODO(future): debug and remove these
+                assert len(cuda_kernel_events) == 4, (
+                    f"too many cuda kernels: {cuda_kernel_events}"
+                )
 
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
```
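
If the kernel-count assertions above start failing, printing the profiled kernels is a quick way to see what changed. A small hypothetical debugging aid, reusing `prof` and `cuda_kernel_events` from the test body (not part of the test itself):

```python
# list the CUDA kernel events the profiler saw, largest first;
# cuda_time is reported in microseconds
for evt in sorted(cuda_kernel_events, key=lambda e: e.cuda_time, reverse=True):
    print(f"{evt.cuda_time:10.1f} us  {evt.key}")
```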

torchao/quantization/quant_primitives.py

Lines changed: 0 additions & 10 deletions

```diff
@@ -2279,7 +2279,6 @@ def _expand_scale_to_tensor_shape(
     return expanded_scale
 
 
-@_register_custom_op(quant_lib, False)
 def _quantize_affine_float8(
     tensor: torch.Tensor,
     scale: torch.Tensor,
@@ -2300,15 +2299,6 @@ def _quantize_affine_float8(
     return fp8_tensor
 
 
-@_register_meta_op(quant_lib, "quantize_affine_float8")
-def _quantize_affine_float8_meta(
-    tensor: torch.Tensor,
-    scale: torch.Tensor,
-    float8_dtype: torch.dtype = torch.float8_e4m3fn,
-) -> torch.Tensor:
-    return torch.empty_like(tensor, dtype=float8_dtype)
-
-
 @_register_custom_op(quant_lib, False)
 def _dequantize_affine_float8(
     tensor: torch.Tensor,
```