
Commit 2c81f8a

MrGeva authored and lucaslie committed
TRTLLM-6142: set torch recompile_limit based on cuda_graph_batch_sizes and refactored (#119)
* refactored compile_limit
  Signed-off-by: Eran Geva <[email protected]>
* removed changes made to TorchCompileCompiler
  Signed-off-by: Eran Geva <[email protected]>
* set cache_size_limit in TorchCompileCompiler
  Signed-off-by: Eran Geva <[email protected]>
---------
Signed-off-by: Eran Geva <[email protected]>
1 parent b075c27 commit 2c81f8a

File tree

5 files changed, +52 -29 lines changed


examples/auto_deploy/build_and_run_ad.py

Lines changed: 0 additions & 3 deletions
@@ -23,9 +23,6 @@
 from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams
 
-# Global torch config, set the torch compile cache to fix up to llama 405B
-torch._dynamo.config.cache_size_limit = 20
-
 
 class PromptConfig(BaseModel):
     """Prompt configuration.

tensorrt_llm/_torch/auto_deploy/compile/backends/torch_compile.py

Lines changed: 8 additions & 0 deletions
@@ -3,11 +3,19 @@
 import torch
 import torch.nn as nn
 
+from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
+
 from ..compiler import BackendCompiler, BackendRegistry
 
 
 @BackendRegistry.register("torch-compile")
 class TorchCompileCompiler(BackendCompiler):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Global torch config, set the torch compile cache to fix up to llama 405B
+        torch._dynamo.config.cache_size_limit = 20
+        ad_logger.info(f"Setting cache size limit to {torch._dynamo.config.cache_size_limit}")
+
     def compile(self) -> nn.Module:
         """Compile the model using torch.compile."""
         return torch.compile(self.gm, dynamic=True)
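Note on the knob this backend now sets: torch._dynamo.config.cache_size_limit caps how many compiled specializations Dynamo keeps per compiled function before it stops recompiling and falls back to eager execution. A minimal sketch (not part of this diff) of the behavior the setting governs:

import torch

torch._dynamo.config.cache_size_limit = 20  # same value TorchCompileCompiler now sets

@torch.compile(dynamic=True)
def double(x):
    return x * 2

# Distinct input shapes can trigger fresh specializations; once the limit is
# reached, Dynamo falls back to eager for that function instead of recompiling.
for bs in (1, 2, 4, 8):
    double(torch.randn(bs, 16))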

tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py

Lines changed: 26 additions & 24 deletions
@@ -18,27 +18,23 @@ def __init__(
         model: nn.Module,
         in_spec: TreeSpec,
         out_spec: TreeSpec,
-        max_batch_size: int,
-        cuda_graph_batch_sizes: List[int] = None,
+        cuda_graph_batch_sizes: List[int],
         num_batched_inputs: Optional[int] = 1,  # number of batched, dynamic inputs...
     ):
         super().__init__()
         self._in_spec = in_spec
         self._out_spec = out_spec
         self.model = model
-        self.max_batch_size = max_batch_size
+        self.max_batch_size = max(cuda_graph_batch_sizes)
+        ad_logger.info(f"Setting max batch size to {self.max_batch_size}")
         self.num_batched_inputs = num_batched_inputs if num_batched_inputs is not None else 1
         self.graphs: Dict[Tuple[int, ...], CUDAGraph] = {}
         self._input_buffers: List[torch.Tensor] = [
             torch.empty(0, 1) for _ in range(self.num_batched_inputs)
         ]
         self._out_buffer_flat: List[torch.Tensor] = None
         self._args_hash: Optional[Tuple[int, ...]] = None
-        self.cuda_graph_batch_sizes = (
-            sorted(cuda_graph_batch_sizes, reverse=True)
-            if cuda_graph_batch_sizes is not None
-            else self._get_graph_batch_sizes(self.max_batch_size)
-        )
+        self.cuda_graph_batch_sizes = sorted(cuda_graph_batch_sizes, reverse=True)
         self._cuda_graph_mem_pool = None
 
     def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]:

@@ -77,20 +73,6 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:
         self._cuda_graph_mem_pool = self._cuda_graph_mem_pool or graph.pool()
         return graph
 
-    @staticmethod
-    def _get_graph_batch_sizes(
-        max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
-    ) -> List[int]:
-        """Heuristic to set batch sizes for graph capture."""
-        # do 1, max_bs, and extra as special batch sizes
-        batch_sizes = {1, max_bs, *(extra or [])}
-
-        # add all multiples of multiplier up to max_bs
-        batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
-
-        # return as sorted list
-        return sorted(batch_sizes, reverse=True)
-
     def capture_graph(self, *args, **kwargs):
         """Capture and pre-fetch the graph for variable batch size."""
         # flatten args, kwargs

@@ -177,15 +159,21 @@ def forward(self, *args, **kwargs) -> Any:
 class TorchCudagraphCompiler(BackendCompiler):
     """Compiler that uses only CUDA graphs."""
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.cuda_graph_batch_sizes = self.compiler_kwargs.get("cuda_graph_batch_sizes")
+        if not self.cuda_graph_batch_sizes:
+            self.cuda_graph_batch_sizes = self._get_graph_batch_sizes(self.max_batch_size)
+        ad_logger.info(f"Setting cuda_graph_batch_sizes to {self.cuda_graph_batch_sizes}")
+
     def _init_captured_graph(
         self, gm: nn.Module, in_spec: TreeSpec, out_spec: TreeSpec
     ) -> CapturedGraph:
         return CapturedGraph(
             gm,
             in_spec=in_spec,
             out_spec=out_spec,
-            max_batch_size=self.max_batch_size,
-            cuda_graph_batch_sizes=self.compiler_kwargs.get("cuda_graph_batch_sizes"),
+            cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
             num_batched_inputs=self.compiler_kwargs.get("num_batched_inputs"),
         )
 
@@ -198,3 +186,17 @@ def compile(self) -> CapturedGraph:
         captured_model.capture_graph(*self.args, **self.kwargs)
 
         return captured_model
+
+    @staticmethod
+    def _get_graph_batch_sizes(
+        max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
+    ) -> List[int]:
+        """Heuristic to set batch sizes for graph capture."""
+        # do 1, max_bs, and extra as special batch sizes
+        batch_sizes = {1, max_bs, *(extra or [])}
+
+        # add all multiples of multiplier up to max_bs
+        batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
+
+        # return as sorted list
+        return sorted(batch_sizes, reverse=True)
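With this refactor the batch-size heuristic lives on TorchCudagraphCompiler, which resolves the list once (from compiler_kwargs or the heuristic) and passes it to CapturedGraph; CapturedGraph now derives max_batch_size from that list instead of taking it as a separate argument. A standalone sketch of the same heuristic, with hypothetical max batch sizes, to show what it produces:

from typing import List, Optional

def get_graph_batch_sizes(
    max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
) -> List[int]:
    # 1, max_bs, and any extras are always captured ...
    batch_sizes = {1, max_bs, *(extra or [])}
    # ... plus every multiple of `multiplier` up to max_bs
    batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
    return sorted(batch_sizes, reverse=True)

print(get_graph_batch_sizes(300))       # [300, 256, 128, 1]
print(get_graph_batch_sizes(128))       # [128, 1]
print(get_graph_batch_sizes(64, [32]))  # [64, 32, 1]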

tensorrt_llm/_torch/auto_deploy/compile/backends/torch_opt.py

Lines changed: 13 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 import torch
 
+from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
+
 from ..compiler import BackendRegistry
 from .torch_cudagraph import CapturedGraph, TorchCudagraphCompiler
 
@@ -10,6 +12,17 @@
 class TorchOptCompiler(TorchCudagraphCompiler):
     """Compiler that uses both torch.compile and CUDA graphs."""
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        torch._dynamo.config.recompile_limit = max(
+            len(self.cuda_graph_batch_sizes), torch._dynamo.config.recompile_limit
+        )
+        ad_logger.info(f"Setting recompile limit to {torch._dynamo.config.recompile_limit}")
+
+        # Global torch config, set the torch compile cache to fix up to llama 405B
+        torch._dynamo.config.cache_size_limit = 20
+        ad_logger.info(f"Setting cache size limit to {torch._dynamo.config.cache_size_limit}")
+
     def _init_captured_graph(self, gm, in_spec, out_spec) -> CapturedGraph:
         gm = torch.compile(gm, dynamic=True)
         return super()._init_captured_graph(gm, in_spec, out_spec)
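This is the core TRTLLM-6142 change: torch-opt runs torch.compile on the graph and then captures one CUDA graph per batch size, so each capture size can cost a Dynamo recompile. Raising recompile_limit to at least the number of capture sizes avoids silently falling back to eager partway through capture. A small sketch of the arithmetic with hypothetical values (recompile_limit is the name used by recent PyTorch; older releases expose cache_size_limit instead):

import torch

# hypothetical batch-size list resolved by TorchCudagraphCompiler.__init__
cuda_graph_batch_sizes = [128, 64, 32, 16, 8, 4, 2, 1]

# never lower the limit below whatever it already is
torch._dynamo.config.recompile_limit = max(
    len(cuda_graph_batch_sizes), torch._dynamo.config.recompile_limit
)
print(torch._dynamo.config.recompile_limit)  # 8, unless the default was already higher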

tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py

Lines changed: 5 additions & 2 deletions
@@ -2,6 +2,7 @@
 import tempfile
 from pathlib import Path
 
+import pytest
 import yaml
 from _model_test_utils import _hf_model_dir_or_hub_id
 from click.testing import CliRunner

@@ -64,7 +65,8 @@ def run_benchmark(model_name: str, dataset_path: str, temp_dir: str):
     assert result.exit_code == 0
 
 
-def test_trtllm_bench(llm_root):  # noqa: F811
+@pytest.mark.parametrize("compile_backend", ["torch-compile", "torch-opt", "torch-cudagraph"])
+def test_trtllm_bench(llm_root, compile_backend):  # noqa: F811
     model_name = _hf_model_dir_or_hub_id(
         f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     )

@@ -74,8 +76,9 @@ def test_trtllm_bench(llm_root):  # noqa: F811
         yaml.dump(
             {
                 "model_kwargs": {"num_hidden_layers": 2},
-                "cuda_graph_batch_sizes": [1, 2],
+                "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
                 "max_batch_size": 128,
+                "compile_backend": compile_backend,
             },
             f,
         )
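For reference, the benchmark test now exercises all three backends and a larger capture list; roughly, the extra-options dict it serializes looks like this (variable name is illustrative, values are taken from the diff above):

import yaml

# what the parametrized test writes to its temporary YAML config
extra_options = {
    "model_kwargs": {"num_hidden_layers": 2},
    "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
    "max_batch_size": 128,
    "compile_backend": "torch-opt",  # parametrized: also torch-compile, torch-cudagraph
}
print(yaml.dump(extra_options))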
