Commit 5e06672 (1 parent: 2c81f8a)

Revert "TRTLLM-6142: set torch recompile_limit based on cuda_graph_batch_sizes and refactored (#119)"

This reverts commit 2c81f8a.

File tree: 5 files changed, +29 -52 lines
examples/auto_deploy/build_and_run_ad.py (3 additions, 0 deletions)

@@ -23,6 +23,9 @@
 from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams
 
+# Global torch config, set the torch compile cache to fix up to llama 405B
+torch._dynamo.config.cache_size_limit = 20
+
 
 class PromptConfig(BaseModel):
     """Prompt configuration.

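For context, torch._dynamo keeps a cache of compiled entries per code object, and cache_size_limit caps how many entries it keeps before it stops compiling and falls back to eager; the revert moves this global bump back into the example script. A minimal, illustrative sketch (toy module, not from the repo):

    # Illustrative only: raise the Dynamo cache limit, then compile a toy module.
    # Shape changes below may trigger recompilations that count against the limit.
    import torch
    import torch.nn as nn

    torch._dynamo.config.cache_size_limit = 20  # same global the script now sets

    model = torch.compile(nn.Linear(8, 8))
    for bs in (1, 2, 3, 4):
        model(torch.randn(bs, 8))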
tensorrt_llm/_torch/auto_deploy/compile/backends/torch_compile.py (0 additions, 8 deletions)

@@ -3,19 +3,11 @@
 import torch
 import torch.nn as nn
 
-from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-
 from ..compiler import BackendCompiler, BackendRegistry
 
 
 @BackendRegistry.register("torch-compile")
 class TorchCompileCompiler(BackendCompiler):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # Global torch config, set the torch compile cache to fix up to llama 405B
-        torch._dynamo.config.cache_size_limit = 20
-        ad_logger.info(f"Setting cache size limit to {torch._dynamo.config.cache_size_limit}")
-
     def compile(self) -> nn.Module:
         """Compile the model using torch.compile."""
         return torch.compile(self.gm, dynamic=True)

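With the per-backend config removed, the torch-compile backend reduces to a single dynamic-shape torch.compile call. A self-contained usage sketch (the module below is an illustrative stand-in for self.gm):

    # Sketch: compile once with dynamic=True so the same compiled module can
    # serve different batch sizes; module and shapes are illustrative.
    import torch
    import torch.nn as nn

    gm = nn.Sequential(nn.Linear(16, 16), nn.ReLU())  # stand-in for self.gm
    compiled = torch.compile(gm, dynamic=True)
    print(compiled(torch.randn(2, 16)).shape)
    print(compiled(torch.randn(8, 16)).shape)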
tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py (24 additions, 26 deletions)

@@ -18,23 +18,27 @@ def __init__(
         model: nn.Module,
         in_spec: TreeSpec,
         out_spec: TreeSpec,
-        cuda_graph_batch_sizes: List[int],
+        max_batch_size: int,
+        cuda_graph_batch_sizes: List[int] = None,
         num_batched_inputs: Optional[int] = 1,  # number of batched, dynamic inputs...
     ):
         super().__init__()
         self._in_spec = in_spec
         self._out_spec = out_spec
         self.model = model
-        self.max_batch_size = max(cuda_graph_batch_sizes)
-        ad_logger.info(f"Setting max batch size to {self.max_batch_size}")
+        self.max_batch_size = max_batch_size
         self.num_batched_inputs = num_batched_inputs if num_batched_inputs is not None else 1
         self.graphs: Dict[Tuple[int, ...], CUDAGraph] = {}
         self._input_buffers: List[torch.Tensor] = [
             torch.empty(0, 1) for _ in range(self.num_batched_inputs)
         ]
         self._out_buffer_flat: List[torch.Tensor] = None
         self._args_hash: Optional[Tuple[int, ...]] = None
-        self.cuda_graph_batch_sizes = sorted(cuda_graph_batch_sizes, reverse=True)
+        self.cuda_graph_batch_sizes = (
+            sorted(cuda_graph_batch_sizes, reverse=True)
+            if cuda_graph_batch_sizes is not None
+            else self._get_graph_batch_sizes(self.max_batch_size)
+        )
         self._cuda_graph_mem_pool = None
 
     def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]:

@@ -73,6 +77,20 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:
         self._cuda_graph_mem_pool = self._cuda_graph_mem_pool or graph.pool()
         return graph
 
+    @staticmethod
+    def _get_graph_batch_sizes(
+        max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
+    ) -> List[int]:
+        """Heuristic to set batch sizes for graph capture."""
+        # do 1, max_bs, and extra as special batch sizes
+        batch_sizes = {1, max_bs, *(extra or [])}
+
+        # add all multiples of multiplier up to max_bs
+        batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
+
+        # return as sorted list
+        return sorted(batch_sizes, reverse=True)
+
     def capture_graph(self, *args, **kwargs):
         """Capture and pre-fetch the graph for variable batch size."""
         # flatten args, kwargs

@@ -159,21 +177,15 @@ def forward(self, *args, **kwargs) -> Any:
 class TorchCudagraphCompiler(BackendCompiler):
     """Compiler that uses only CUDA graphs."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.cuda_graph_batch_sizes = self.compiler_kwargs.get("cuda_graph_batch_sizes")
-        if not self.cuda_graph_batch_sizes:
-            self.cuda_graph_batch_sizes = self._get_graph_batch_sizes(self.max_batch_size)
-        ad_logger.info(f"Setting cuda_graph_batch_sizes to {self.cuda_graph_batch_sizes}")
-
     def _init_captured_graph(
         self, gm: nn.Module, in_spec: TreeSpec, out_spec: TreeSpec
     ) -> CapturedGraph:
         return CapturedGraph(
             gm,
             in_spec=in_spec,
             out_spec=out_spec,
-            cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
+            max_batch_size=self.max_batch_size,
+            cuda_graph_batch_sizes=self.compiler_kwargs.get("cuda_graph_batch_sizes"),
             num_batched_inputs=self.compiler_kwargs.get("num_batched_inputs"),
         )
 

@@ -186,17 +198,3 @@ def compile(self) -> CapturedGraph:
         captured_model.capture_graph(*self.args, **self.kwargs)
 
         return captured_model
-
-    @staticmethod
-    def _get_graph_batch_sizes(
-        max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
-    ) -> List[int]:
-        """Heuristic to set batch sizes for graph capture."""
-        # do 1, max_bs, and extra as special batch sizes
-        batch_sizes = {1, max_bs, *(extra or [])}
-
-        # add all multiples of multiplier up to max_bs
-        batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
-
-        # return as sorted list
-        return sorted(batch_sizes, reverse=True)

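To make the restored heuristic concrete, here is a standalone copy of _get_graph_batch_sizes with a worked example (the max_bs value of 300 is illustrative):

    # Standalone copy of the heuristic shown in the diff above.
    from typing import List, Optional

    def _get_graph_batch_sizes(
        max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128
    ) -> List[int]:
        """Heuristic to set batch sizes for graph capture."""
        # always include 1, max_bs, and any extras
        batch_sizes = {1, max_bs, *(extra or [])}
        # plus all multiples of `multiplier` up to max_bs
        batch_sizes.update(range(multiplier, max_bs + 1, multiplier))
        return sorted(batch_sizes, reverse=True)

    print(_get_graph_batch_sizes(300))  # -> [300, 256, 128, 1]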
tensorrt_llm/_torch/auto_deploy/compile/backends/torch_opt.py (0 additions, 13 deletions)

@@ -2,8 +2,6 @@
 
 import torch
 
-from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-
 from ..compiler import BackendRegistry
 from .torch_cudagraph import CapturedGraph, TorchCudagraphCompiler
 

@@ -12,17 +10,6 @@
 class TorchOptCompiler(TorchCudagraphCompiler):
     """Compiler that uses both torch.compile and CUDA graphs."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        torch._dynamo.config.recompile_limit = max(
-            len(self.cuda_graph_batch_sizes), torch._dynamo.config.recompile_limit
-        )
-        ad_logger.info(f"Setting recompile limit to {torch._dynamo.config.recompile_limit}")
-
-        # Global torch config, set the torch compile cache to fix up to llama 405B
-        torch._dynamo.config.cache_size_limit = 20
-        ad_logger.info(f"Setting cache size limit to {torch._dynamo.config.cache_size_limit}")
-
     def _init_captured_graph(self, gm, in_spec, out_spec) -> CapturedGraph:
         gm = torch.compile(gm, dynamic=True)
         return super()._init_captured_graph(gm, in_spec, out_spec)

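For reference, the removed __init__ logic amounted to the sketch below; recompile_limit is the attribute name taken from the diff above (present only in recent PyTorch), and the batch-size list is illustrative. After the revert, torch-opt relies on the global cache_size_limit set in build_and_run_ad.py instead.

    # Sketch of the removed behavior: allow at least one compiled variant per
    # CUDA-graph batch size before Dynamo's recompile guard kicks in.
    import torch

    cuda_graph_batch_sizes = [256, 128, 1]  # example only
    torch._dynamo.config.recompile_limit = max(
        len(cuda_graph_batch_sizes), torch._dynamo.config.recompile_limit
    )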
tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py (2 additions, 5 deletions)

@@ -2,7 +2,6 @@
 import tempfile
 from pathlib import Path
 
-import pytest
 import yaml
 from _model_test_utils import _hf_model_dir_or_hub_id
 from click.testing import CliRunner

@@ -65,8 +64,7 @@ def run_benchmark(model_name: str, dataset_path: str, temp_dir: str):
     assert result.exit_code == 0
 
 
-@pytest.mark.parametrize("compile_backend", ["torch-compile", "torch-opt", "torch-cudagraph"])
-def test_trtllm_bench(llm_root, compile_backend):  # noqa: F811
+def test_trtllm_bench(llm_root):  # noqa: F811
     model_name = _hf_model_dir_or_hub_id(
         f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     )

@@ -76,9 +74,8 @@ def test_trtllm_bench(llm_root, compile_backend):  # noqa: F811
         yaml.dump(
             {
                 "model_kwargs": {"num_hidden_layers": 2},
-                "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
+                "cuda_graph_batch_sizes": [1, 2],
                 "max_batch_size": 128,
             },
             f,
         )

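After the revert, the extra-options YAML written by the test boils down to the dict from the diff above; the compile_backend key and the backend parametrization are gone, so whichever backend is the default gets exercised. A small reproduction sketch:

    # Sketch reproducing the config the reverted test dumps to YAML.
    import yaml

    config = {
        "model_kwargs": {"num_hidden_layers": 2},
        "cuda_graph_batch_sizes": [1, 2],
        "max_batch_size": 128,
    }
    print(yaml.dump(config))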