@@ -6,31 +6,22 @@
 
 from tensorrt_llm._torch.utils import (fp4_utils,
                                        get_last_power_of_2_num_tokens_buckets,
-                                       last_positive_power_of_2)
+                                       last_positive_power_of_2,
+                                       next_positive_power_of_2)
 
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
 
 
 def calculate_tile_tokens_dim(num_tokens: int, num_experts: int,
                               top_k: int) -> int:
+    # Guess tokens per expert assuming perfect expert distribution first.
     num_tokens_per_expert = num_tokens * top_k // num_experts
 
-    # Equivalent to the following:
-    # tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-    #
-    # Torch dynamo cannot correctly track next_positive_power_of_2. Each shape
-    # passed to next_positive_power_of_2 will trigger a new recompile.
-    # Following code still triggers recompile. But it at most produces 4 additional recompiles.
-    if num_tokens_per_expert <= 8:
-        tile_tokens_dim = 8
-    elif num_tokens_per_expert <= 16:
-        tile_tokens_dim = 16
-    elif num_tokens_per_expert <= 32:
-        tile_tokens_dim = 32
-    else:
-        tile_tokens_dim = 64
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
 
     return tile_tokens_dim
 
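For reference, a minimal, self-contained sketch of the updated tiling logic. The local _next_positive_power_of_2 helper below is a hypothetical stand-in for the next_positive_power_of_2 utility imported from tensorrt_llm._torch.utils in the actual change, so the real implementation may differ; the formula and the 8-64 clamp mirror the diff above.

def _next_positive_power_of_2(x: int) -> int:
    # Hypothetical stand-in for next_positive_power_of_2 from
    # tensorrt_llm._torch.utils: smallest power of 2 that is >= x (at least 1).
    if x <= 1:
        return 1
    return 1 << (x - 1).bit_length()


def calculate_tile_tokens_dim(num_tokens: int, num_experts: int,
                              top_k: int) -> int:
    # Tokens per expert under the perfectly uniform routing assumption.
    num_tokens_per_expert = num_tokens * top_k // num_experts
    # Round up to a power of 2, then clamp to the 8-64 range the kernel supports.
    tile_tokens_dim = _next_positive_power_of_2(num_tokens_per_expert)
    return min(max(tile_tokens_dim, 8), 64)


# 128 tokens routed over 8 experts with top_k=2 -> 32 tokens/expert -> tile of 32.
assert calculate_tile_tokens_dim(128, 8, 2) == 32
# Very small workloads clamp up to 8; very large ones cap at 64.
assert calculate_tile_tokens_dim(4, 64, 1) == 8
assert calculate_tile_tokens_dim(4096, 8, 2) == 64

For tokens-per-expert counts up to 64 this reproduces the branchy version removed by the diff, as the old inline comment already noted the two forms were equivalent.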