5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -34,6 +34,7 @@ def __init__(
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
use_mrope: bool = False,
lora_params: Optional[dict] = None,
) -> None:
"""
Stores a CUDA graph and its associated input buffers.
@@ -68,6 +69,7 @@ def __init__(

self.attn_metadata = attn_metadata
self.spec_metadata = spec_metadata
self.lora_params = lora_params
self._output = None
self._graph = None
self.optional_extra_model_inputs = ["mrope_position_deltas"]
@@ -90,6 +92,9 @@ def capture(
"mrope_position_deltas": self.mrope_position_deltas,
}

if self.lora_params is not None:
inputs["lora_params"] = self.lora_params

# We have to do warm up runs to initialize PyTorch's
# internal states according to the docs:
# https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
103 changes: 99 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -23,7 +23,7 @@
torch_dtype_to_str, trace_func)
from tensorrt_llm.inputs.multimodal import MultimodalParams
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraConfig, LoraModelConfig
from tensorrt_llm.lora_manager import LoraConfig, LoraManager, LoraModelConfig

🛠️ Refactor suggestion

Follow import style guideline; avoid symbol imports and name collisions.

Import the module and use qualified names to avoid colliding with bindings’ LoraConfig.

-from tensorrt_llm.lora_manager import LoraConfig, LoraManager, LoraModelConfig
+import tensorrt_llm.lora_manager as lora_mgr

And update usages, e.g.:

- self.lora_manager: Optional[LoraManager] = None
+ self.lora_manager: Optional[lora_mgr.LoraManager] = None

Suggested change
from tensorrt_llm.lora_manager import LoraConfig, LoraManager, LoraModelConfig
import tensorrt_llm.lora_manager as lora_mgr
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around line 26, replace the
symbol import "from tensorrt_llm.lora_manager import LoraConfig, LoraManager,
LoraModelConfig" with a module import (e.g. "import tensorrt_llm.lora_manager as
lora_manager") and update all references in this file to use qualified names
(e.g. lora_manager.LoraConfig, lora_manager.LoraManager,
lora_manager.LoraModelConfig) to avoid symbol collisions and follow the import
style guideline.
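
As a rough sketch of how the module-import style would look inside the engine (the _EngineSketch class below is a made-up stand-in for the real engine class; only the import pattern is the point):

import tensorrt_llm.lora_manager as lora_mgr
from tensorrt_llm.bindings import executor as tllm
from typing import Optional

class _EngineSketch:
    def __init__(self, lora_config: Optional[lora_mgr.LoraConfig] = None) -> None:
        # The qualified name refers unambiguously to the Python-side manager.
        self.lora_manager: Optional[lora_mgr.LoraManager] = None
        if lora_config is not None:
            self.lora_manager = lora_mgr.LoraManager()
        # tllm.LoraConfig (the bindings class) can still be referenced elsewhere
        # without colliding with lora_mgr.LoraConfig.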

from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantAlgo
from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2
@@ -282,6 +282,15 @@ def __init__(
)

attn_backend = pytorch_backend_config.attn_backend
self.lora_manager: Optional[LoraManager] = None
if lora_config is not None:
self.lora_manager = LoraManager()

self.lora_prefetch_requests_list = None # TODO smor - fix "LoRARequest" import
if lora_config is not None and lora_config.lora_request is not None:
self.lora_prefetch_requests_list = lora_config.lora_request
self.has_lora_prefetched = False

Comment on lines +285 to +293

⚠️ Potential issue

Initialize has_lora_prefetched unconditionally; avoid AttributeError.

has_lora_prefetched is currently set only when lora_request is not None, but it is read later without a guard.

-        self.lora_manager: Optional[LoraManager] = None
+        self.lora_manager: Optional[lora_mgr.LoraManager] = None
         if lora_config is not None:
-            self.lora_manager = LoraManager()
+            self.lora_manager = lora_mgr.LoraManager()
 
-        self.lora_prefetch_requests_list = None  # TODO smor - fix "LoRARequest" import
-        if lora_config is not None and lora_config.lora_request is not None:
-            self.lora_prefetch_requests_list = lora_config.lora_request
-            self.has_lora_prefetched = False
+        self.lora_prefetch_requests_list = None  # LoRA prefetch requests (bindings executor side)
+        self.has_lora_prefetched = False
+        if lora_config is not None and getattr(lora_config, "lora_request", None):
+            self.lora_prefetch_requests_list = lora_config.lora_request

Suggested change
self.lora_manager: Optional[LoraManager] = None
if lora_config is not None:
self.lora_manager = LoraManager()
self.lora_prefetch_requests_list = None # TODO smor - fix "LoRARequest" import
if lora_config is not None and lora_config.lora_request is not None:
self.lora_prefetch_requests_list = lora_config.lora_request
self.has_lora_prefetched = False
self.lora_manager: Optional[lora_mgr.LoraManager] = None
if lora_config is not None:
self.lora_manager = lora_mgr.LoraManager()
self.lora_prefetch_requests_list = None # LoRA prefetch requests (bindings executor side)
self.has_lora_prefetched = False
if lora_config is not None and getattr(lora_config, "lora_request", None):
self.lora_prefetch_requests_list = lora_config.lora_request
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 285-293,
has_lora_prefetched is only set when lora_config.lora_request is not None which
leads to AttributeError when accessed later; initialize self.has_lora_prefetched
= False unconditionally (e.g., immediately after setting self.lora_manager) and
keep setting self.lora_prefetch_requests_list = lora_config.lora_request only
when present so the attribute always exists and is safely readable thereafter.

self.model = self._load_model(
model_path,
mapping=self.mapping,
@@ -427,6 +436,27 @@ def set_lora_model_config(self, lora_target_modules: list[str],
hidden_size=self.model.config.hidden_size,
dtype=torch_dtype_to_str(self.model.config.torch_dtype))

def set_lora_manager_cpp_peft_cache_manager(
self, resource_manager: ResourceManager):
cpp_peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)
if cpp_peft_cache_manager is not None and self.lora_manager is not None:
self.lora_manager.set_cpp_peft_cache_manager(
cpp_peft_cache_manager.impl)

Comment on lines +439 to +446

🛠️ Refactor suggestion

Don’t reach into .impl; use the manager’s public API consistently.

This helper should set the cpp manager via a public setter, but consumers must then call the Python PeftCacheManager methods, not impl.

No change here; see refactor below in _maybe_get_cuda_graph to stop using impl.

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 439 to 446, the
helper currently reaches into cpp_peft_cache_manager.impl; update it to pass the
ResourceManager-returned object through the public API instead of accessing
.impl. Replace the .impl usage by calling
self.lora_manager.set_cpp_peft_cache_manager(cpp_peft_cache_manager) (keeping
the existing None checks) and add a short comment that consumers must interact
with the Python PeftCacheManager methods rather than its internal impl to avoid
direct implementation coupling.
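
A minimal sketch of that direction, assuming set_cpp_peft_cache_manager is (or is updated to be) willing to receive the Python-level PeftCacheManager rather than its internal impl:

def set_lora_manager_cpp_peft_cache_manager(
        self, resource_manager: ResourceManager) -> None:
    """Wire the LoRA manager to the PEFT cache manager via its public API."""
    peft_cache_manager = resource_manager.get_resource_manager(
        ResourceManagerType.PEFT_CACHE_MANAGER)
    if peft_cache_manager is not None and self.lora_manager is not None:
        # Hand over the Python-level manager; consumers should then call its
        # public add_request_peft/ensure_batch methods instead of .impl.
        self.lora_manager.set_cpp_peft_cache_manager(peft_cache_manager)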

def prefetch_lora_dirs(self):
if self.lora_prefetch_requests_list is None:
return

for request in self.lora_prefetch_requests_list:
self.lora_manager.load_from_ckpt(
[request.path],
model_config=self.lora_model_config,
runtime_mapping=None,
uids=[request.adapter_id])

self.has_lora_prefetched = True

@property
def use_mrope(self):
use_mrope = False
@@ -481,6 +511,16 @@ def warmup(self, resource_manager: ResourceManager) -> None:
self.cuda_graph_dummy_request = None

def get_cuda_graph_warmup_request(batch_size):
lora_config = None
if self.has_lora_prefetched:
# TODO smor currently I assume a single adapter with uid 0, change this
uid = 0
from tensorrt_llm.bindings import executor as tllm
lora_config = tllm.LoraConfig(
task_id=uid,
weights=self.lora_manager.cpp_lora_weights[uid],
config=self.lora_manager.cpp_lora_config[uid])

Comment on lines +514 to +523

🛠️ Refactor suggestion

Avoid shadowing LoraConfig; remove hard-coded uid=0.

Use a distinct name for the bindings object and derive uid from prefetch requests.

-            lora_config = tllm.LoraConfig(
-                task_id=uid,
-                weights=self.lora_manager.cpp_lora_weights[uid],
-                config=self.lora_manager.cpp_lora_config[uid])
+            lora_binding = tllm.LoraConfig(
+                task_id=uid,
+                weights=self.lora_manager.cpp_lora_weights[uid],
+                config=self.lora_manager.cpp_lora_config[uid])

And when passing to add_dummy_requests:

-                    lora_request=
-                    lora_config,  # TODO smor- tests assume BS1 then this will be ignored for now, need to resolve
+                    lora_request=lora_binding,

Also, compute uid:

-                uid = 0
+                # Prefer the first prefetched adapter id
+                uid = getattr(self.lora_prefetch_requests_list[0], "adapter_id", 0)

Suggested change
lora_config = None
if self.has_lora_prefetched:
# TODO smor currently I assume a single adapter with uid 0, change this
uid = 0
from tensorrt_llm.bindings import executor as tllm
lora_config = tllm.LoraConfig(
task_id=uid,
weights=self.lora_manager.cpp_lora_weights[uid],
config=self.lora_manager.cpp_lora_config[uid])
lora_config = None
if self.has_lora_prefetched:
# Prefer the first prefetched adapter id
uid = getattr(self.lora_prefetch_requests_list[0], "adapter_id", 0)
from tensorrt_llm.bindings import executor as tllm
lora_binding = tllm.LoraConfig(
task_id=uid,
weights=self.lora_manager.cpp_lora_weights[uid],
config=self.lora_manager.cpp_lora_config[uid])
# later, when enqueuing the dummy request:
self.add_dummy_requests(
# ... other parameters ...
lora_request=lora_binding,
# ... remaining parameters ...
)

available_blocks = kv_cache_manager.get_num_free_blocks()
if available_blocks >= batch_size:
result = ScheduledRequests()
@@ -492,6 +532,8 @@ def get_cuda_graph_warmup_request(batch_size):
is_gen=True,
max_num_draft_tokens=self.max_draft_len,
use_mrope=use_mrope,
lora_request=
lora_config, # TODO smor- tests assume BS1 then this will be ignored for now, need to resolve
)
available_tokens = kv_cache_manager.get_num_available_tokens(
self.max_draft_len)
@@ -505,6 +547,7 @@ def get_cuda_graph_warmup_request(batch_size):
is_gen=True,
max_num_draft_tokens=self.max_draft_len,
use_mrope=use_mrope,
lora_request=lora_config,
)[0]
# Add the longest request before all other seq_len=1 request to simulate the padding CUDA graph case.
# This batch contains both the longest request and the shortest requests,
@@ -863,7 +906,8 @@ def _round_up_batch_size(self, batch_size: int) -> int:
def _maybe_get_cuda_graph(
self,
batch: ScheduledRequests,
spec_config: Optional["DecodingBaseConfig"] = None
spec_config: Optional["DecodingBaseConfig"] = None,
resource_manager: Optional[ResourceManager] = None
) -> Optional[DecodingCUDAGraphRunner]:
Comment on lines 906 to 911

⚠️ Potential issue

Fix Ruff F821: DecodingBaseConfig undefined (even in quotes).

Gate a type-only import to satisfy static analysis without adding a runtime dependency.

 from typing import Any, Dict, Optional, Tuple
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from ..speculative.decoding_config import DecodingBaseConfig

Suggested change
def _maybe_get_cuda_graph(
self,
batch: ScheduledRequests,
spec_config: Optional["DecodingBaseConfig"] = None
spec_config: Optional["DecodingBaseConfig"] = None,
resource_manager: Optional[ResourceManager] = None
) -> Optional[DecodingCUDAGraphRunner]:
from typing import Any, Dict, Optional, Tuple
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ..speculative.decoding_config import DecodingBaseConfig
🧰 Tools
🪛 Ruff (0.12.2)

909-909: Undefined name DecodingBaseConfig

(F821)

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 906 to 911, Ruff
F821 is raised because DecodingBaseConfig is referenced in type annotations but
not imported (even as a string); add a type-only import to satisfy static
analysis without introducing a runtime dependency: import TYPE_CHECKING from
typing at top of the file and then, inside an if TYPE_CHECKING: block, import
DecodingBaseConfig from the module where it is defined (replace with the correct
module path), leaving the runtime code unchanged so the annotation remains only
for type checking.
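
In isolation, the type-only import pattern looks roughly like this (the module path is the one from the suggestion above and should be verified against where DecodingBaseConfig is actually defined):

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only evaluated by type checkers; adds no runtime dependency.
    from ..speculative.decoding_config import DecodingBaseConfig  # assumed path

def _maybe_get_cuda_graph_sketch(
        spec_config: Optional["DecodingBaseConfig"] = None) -> None:
    # The string annotation now resolves for static analysis (Ruff F821),
    # while nothing extra is imported at runtime.
    ...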

"""
Get a CUDA graph runner or return None (e.g. if CUDA graphs are disabled
@@ -908,8 +952,57 @@ def _maybe_get_cuda_graph(
else:
spec_metadata = None

lora_params = None

if self.has_lora_prefetched:
peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)

context_requests = batch.context_requests
generation_requests = batch.generation_requests

if len(context_requests) > 0 and len(generation_requests) > 0:
raise ValueError(
"SMOR, non empty context and generation requests isn't tested yet"
)

if len(context_requests) > 0:
raise ValueError("SMOR, context requests isn't tested yet")

if len(generation_requests) > 1:
raise ValueError("SMOR, generation requests isn't tested yet")

generation_request = generation_requests[0]
# TODO smor I have no idea why this is happening
generation_request.lora_weights = generation_request.lora_weights.reshape(
[1] + list(generation_request.lora_weights.shape))
generation_request.lora_config = generation_request.lora_config.reshape(
[1] + list(generation_request.lora_config.shape))
peft_cache_manager.impl.add_request_peft(generation_request, True)

py_lora_task_layer_module_configs = peft_cache_manager.impl.ensure_batch(
context_requests, generation_requests, False)
for req in context_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
for req in generation_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None

Comment on lines +955 to +993

🛠️ Refactor suggestion

⚠️ Potential issue

LoRA PEFT setup inside CUDA-graph path is brittle: direct .impl access, ad-hoc reshapes, and hard errors.

  • Don’t reshape here; PeftCacheManager.prepare_resources already does.
  • Stop calling impl methods directly; use PeftCacheManager.add_request_peft/ensure_batch.
  • Replace raises with graceful fallback when encountering unsupported mixed batches.
  • Replace print with logger.
-        lora_params = None
-
-        if self.has_lora_prefetched:
-            peft_cache_manager = resource_manager.get_resource_manager(
-                ResourceManagerType.PEFT_CACHE_MANAGER)
-
-            context_requests = batch.context_requests
-            generation_requests = batch.generation_requests
-
-            if len(context_requests) > 0 and len(generation_requests) > 0:
-                raise ValueError(
-                    "SMOR, non empty context and generation requests isn't tested yet"
-                )
-
-            if len(context_requests) > 0:
-                raise ValueError("SMOR, context requests isn't tested yet")
-
-            if len(generation_requests) > 1:
-                raise ValueError("SMOR, generation requests isn't tested yet")
-
-            generation_request = generation_requests[0]
-            # TODO smor I have no idea why this is happening
-            generation_request.lora_weights = generation_request.lora_weights.reshape(
-                [1] + list(generation_request.lora_weights.shape))
-            generation_request.lora_config = generation_request.lora_config.reshape(
-                [1] + list(generation_request.lora_config.shape))
-            peft_cache_manager.impl.add_request_peft(generation_request, True)
-
-            py_lora_task_layer_module_configs = peft_cache_manager.impl.ensure_batch(
-                context_requests, generation_requests, False)
-            for req in context_requests:
-                req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
-                    req.
-                    py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
-            for req in generation_requests:
-                req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
-                    req.
-                    py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
-
-            # TODO smor - look at get lora params from requests
-            # You need something that isn't scheduled requests
-            # It also appears that you should make sure resource manager is called, because prefetch
-            # has to be added to peftCacheManager as well. So it still shouldn't work
-
-            lora_params = self._get_lora_params_from_requests(
-                batch, attn_metadata)
-            print(f"SMOR, not failed on lora_params in maybe_get_cuda_graph")
+        lora_params = None
+        if self.has_lora_prefetched:
+            peft_cache_manager = resource_manager.get_resource_manager(
+                ResourceManagerType.PEFT_CACHE_MANAGER)
+            if peft_cache_manager is None:
+                logger.debug("LoRA prefetched, but no PEFT cache manager present; skipping LoRA for graphs.")
+            else:
+                # Only generation-only batches are CUDA-graphable today.
+                if len(batch.context_requests) == 0 and len(batch.generation_requests) >= 1:
+                    for req in batch.generation_requests:
+                        peft_cache_manager.add_request_peft(req)
+                    py_cfgs = peft_cache_manager.ensure_batch(
+                        batch.context_requests, batch.generation_requests, reset_gpu_cache=False)
+                    for req in batch.generation_requests:
+                        req.py_lora_task_layer_module_configs = py_cfgs.get(req.py_request_id)
+                    lora_params = self._get_lora_params_from_requests(batch, attn_metadata)
+                else:
+                    logger.debug("LoRA + CUDA graph currently supports generation-only batches; skipping LoRA params.")

Suggested change
lora_params = None
if self.has_lora_prefetched:
peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)
context_requests = batch.context_requests
generation_requests = batch.generation_requests
if len(context_requests) > 0 and len(generation_requests) > 0:
raise ValueError(
"SMOR, non empty context and generation requests isn't tested yet"
)
if len(context_requests) > 0:
raise ValueError("SMOR, context requests isn't tested yet")
if len(generation_requests) > 1:
raise ValueError("SMOR, generation requests isn't tested yet")
generation_request = generation_requests[0]
# TODO smor I have no idea why this is happening
generation_request.lora_weights = generation_request.lora_weights.reshape(
[1] + list(generation_request.lora_weights.shape))
generation_request.lora_config = generation_request.lora_config.reshape(
[1] + list(generation_request.lora_config.shape))
peft_cache_manager.impl.add_request_peft(generation_request, True)
py_lora_task_layer_module_configs = peft_cache_manager.impl.ensure_batch(
context_requests, generation_requests, False)
for req in context_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
for req in generation_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
lora_params = None
if self.has_lora_prefetched:
peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)
if peft_cache_manager is None:
logger.debug(
"LoRA prefetched, but no PEFT cache manager present; "
"skipping LoRA for graphs."
)
else:
# Only generation-only batches are CUDA-graphable today.
if len(batch.context_requests) == 0 and len(batch.generation_requests) >= 1:
for req in batch.generation_requests:
peft_cache_manager.add_request_peft(req)
py_cfgs = peft_cache_manager.ensure_batch(
batch.context_requests,
batch.generation_requests,
reset_gpu_cache=False
)
for req in batch.generation_requests:
req.py_lora_task_layer_module_configs = (
py_cfgs.get(req.py_request_id)
)
lora_params = self._get_lora_params_from_requests(
batch, attn_metadata
)
else:
logger.debug(
"LoRA + CUDA graph currently supports generation-only "
"batches; skipping LoRA params."
)

# TODO smor - look at get lora params from requests
# You need something that isn't scheduled requests
# It also appears that you should make sure resource manager is called, because prefetch
# has to be added to peftCacheManager as well. So it still shouldn't work

lora_params = self._get_lora_params_from_requests(
batch, attn_metadata)
print(f"SMOR, not failed on lora_params in maybe_get_cuda_graph")

self._cuda_graphs[batch_size] = DecodingCUDAGraphRunner(
batch_size, "cuda", attn_metadata, spec_metadata, self.use_mrope)
batch_size, "cuda", attn_metadata, spec_metadata, self.use_mrope,
lora_params)
return self._cuda_graphs[batch_size]

def __del__(self) -> None:
@@ -2040,7 +2133,9 @@ def forward(
with self._maybe_pad_batch(scheduled_requests,
kv_cache_manager) as scheduled_requests:
maybe_graph = self._maybe_get_cuda_graph(
scheduled_requests, spec_config=self.spec_config)
scheduled_requests,
spec_config=self.spec_config,
resource_manager=resource_manager)
if maybe_graph is not None:
attn_metadata = maybe_graph.attn_metadata
if self.is_spec_decode:
6 changes: 6 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -279,6 +279,9 @@ def __init__(self,
self.inflight_req_ids = ReqIdsSet()
self.canceled_req_ids = []

self.model_engine.set_lora_manager_cpp_peft_cache_manager(
self.resource_manager)
self.model_engine.prefetch_lora_dirs()
self.model_engine.warmup(self.resource_manager)
if self.draft_model_engine is not None:
self.draft_model_engine.warmup(self.resource_manager)
@@ -316,6 +319,9 @@ def __init__(self,
if start_worker:
self.start_worker()

def get_lora_manager(self):
return self.model_engine.lora_manager

def _event_loop_wrapper(self):
try:
with customized_gc_thresholds(
17 changes: 16 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -373,6 +373,7 @@ def add_dummy_requests(
prepare_resource: bool = True,
max_num_draft_tokens: int = 0,
use_mrope: bool = False,
lora_request=None,
):
Comment on lines +376 to 377

🛠️ Refactor suggestion

Type and API contract for lora_request are unclear; initialize and type it.

Document/annotate the expected type (bindings executor LoraConfig-like) and make has_lora_prefetched paths robust to None. Also initialize has_lora_prefetched at engine level to avoid AttributeError elsewhere.

Apply:

-        lora_request=None,
+        lora_request: Optional[object] = None,

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/resource_manager.py around lines 376-377, the
lora_request parameter is untyped and the code assumes an engine attribute
has_lora_prefetched exists causing potential AttributeError; annotate
lora_request with the expected type (e.g., a LoraConfig-like binding/executor
object or Optional[LoraConfig]) in the function signature and add a short
docstring/comment describing the expected fields, initialize
engine.has_lora_prefetched = False when engines are created/registered so the
attribute always exists, and update any checks like if
engine.has_lora_prefetched or engine.has_lora_prefetched == True to safely
handle None (use bool(engine.has_lora_prefetched) or explicit is True) and guard
code paths with None-safe checks before accessing lora_request properties.
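
One way to make the expected contract explicit is a small Protocol; the attribute names below mirror the bindings executor LoraConfig object used in the warmup path and are otherwise an assumption:

from typing import Optional, Protocol
import torch

class LoraRequestLike(Protocol):
    """Duck type for lora_request as consumed by add_dummy_requests."""
    task_id: int
    weights: torch.Tensor
    config: torch.Tensor

def add_dummy_requests_sketch(
        lora_request: Optional[LoraRequestLike] = None) -> None:
    # None means "no adapter prefetched"; the fields are only read when a
    # request object is actually passed in.
    ...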

beam_width = 1 # TODO: more than 1 beam?
requests = []
@@ -389,14 +390,28 @@ def add_dummy_requests(
# Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
mrope_position_deltas = torch.zeros(
1, device="cuda", dtype=torch.int32) if use_mrope else None

lora_task_id = None
lora_weights = None
lora_config = None

if lora_request is not None:
# TODO smor currently work with single adapter only, not sure how this should work with request ids
lora_task_id = lora_request.task_id
lora_weights = lora_request.weights
lora_config = lora_request.config

Comment on lines +394 to +403

⚠️ Potential issue

Guard against missing fields and wrong shapes on lora_request.

Accessing task_id/weights/config blindly risks AttributeError; torch reshape later expects tensors. Add duck-typing checks and early validation.

-            lora_task_id = None
-            lora_weights = None
-            lora_config = None
-
-            if lora_request is not None:
-                # TODO smor currently work with single adapter only, not sure how this should work with request ids
-                lora_task_id = lora_request.task_id
-                lora_weights = lora_request.weights
-                lora_config = lora_request.config
+            lora_task_id = None
+            lora_weights = None
+            lora_config = None
+            if lora_request is not None:
+                # Single-adapter warmup; multi-adapter not yet supported.
+                if not all(hasattr(lora_request, a) for a in ("task_id", "weights", "config")):
+                    raise TypeError("lora_request must expose task_id, weights, and config")
+                lora_task_id = int(lora_request.task_id)
+                lora_weights = lora_request.weights
+                lora_config = lora_request.config

Suggested change
lora_task_id = None
lora_weights = None
lora_config = None
if lora_request is not None:
# TODO smor currently work with single adapter only, not sure how this should work with request ids
lora_task_id = lora_request.task_id
lora_weights = lora_request.weights
lora_config = lora_request.config
lora_task_id = None
lora_weights = None
lora_config = None
if lora_request is not None:
# Single-adapter warmup; multi-adapter not yet supported.
if not all(hasattr(lora_request, a) for a in ("task_id", "weights", "config")):
raise TypeError("lora_request must expose task_id, weights, and config")
lora_task_id = int(lora_request.task_id)
lora_weights = lora_request.weights
lora_config = lora_request.config
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/resource_manager.py around lines 394 to 403,
the code accesses lora_request.task_id, .weights and .config without validation
which can raise AttributeError or later cause tensor reshape errors; update this
block to first duck-type-check that lora_request has the attributes (hasattr or
try/except AttributeError), verify lora_request.weights is not None and is a
torch.Tensor or convertable to one, check its ndim/shape matches the expected
shape before any reshape and raise a clear ValueError if not, and validate that
lora_request.config contains required keys/types (or set sensible defaults) so
downstream code doesn’t assume missing fields. Ensure any early-return or error
message includes the problematic field and expected shape/type.

req = LlmRequest(request_id=req_id,
max_new_tokens=1,
input_tokens=[1] * token_num,
sampling_config=SamplingConfig(
sampling_params._get_sampling_config()),
is_streaming=False,
mrope_position_deltas=mrope_position_deltas,
encoder_input_tokens=encoder_input_tokens)
encoder_input_tokens=encoder_input_tokens,
lora_task_id=lora_task_id,
lora_weights=lora_weights,
lora_config=lora_config)
req.is_dummy_request = True
req.paged_kv_block_ids = []
if prepare_resource:
7 changes: 1 addition & 6 deletions tensorrt_llm/executor/worker.py
@@ -161,12 +161,7 @@ def _create_engine():

if getattr(executor_config, "backend",
"") == "pytorch" and lora_config is not None:
from tensorrt_llm._torch.pyexecutor.resource_manager import \
ResourceManagerType
peft_cache_manager = self.engine.resource_manager.resource_managers.get(
ResourceManagerType.PEFT_CACHE_MANAGER)
self._lora_manager = LoraManager(
cpp_peft_cache_manager=peft_cache_manager.impl)
self._lora_manager = self.engine.get_lora_manager()
lora_model_config = self.engine.model_engine.lora_model_config
assert lora_model_config is not None
self._lora_model_config = lora_model_config
8 changes: 7 additions & 1 deletion tensorrt_llm/lora_manager.py
@@ -5,7 +5,7 @@
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import numpy as np
import torch
@@ -146,6 +146,7 @@ class LoraConfig(DictConversion):
trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
max_loras: int = 4
max_cpu_loras: int = 4
lora_request: Optional[List[Any]] = None # TODO smor fix

Comment on lines +149 to 150

🛠️ Refactor suggestion

Avoid Any for LoraConfig.lora_request; use typed forward reference to LoRARequest.

Prevents loss of type-safety and documents intent, while avoiding import cycles with TYPE_CHECKING.

-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
@@
 if TYPE_CHECKING:
     from .runtime import ModelConfig
+    from .executor.request import LoRARequest
@@
-    lora_request: Optional[List[Any]] = None  # TODO smor fix
+    lora_request: Optional[List["LoRARequest"]] = None

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 149-150, replace the liberal Any
annotation for lora_request with a typed forward reference to LoRARequest to
preserve type-safety and intent; change the type to
Optional[List["LoRARequest"]], add from typing import TYPE_CHECKING at the top
and under if TYPE_CHECKING: import LoRARequest from its module (or appropriate
path) so the runtime import cycle is avoided while static type checkers see the
real type.

def __post_init__(self):
assert self.lora_ckpt_source in ["hf", "nemo"], (
@@ -483,6 +484,11 @@ def __init__(
self._cpp_lora_weights: Dict[str, torch.Tensor] = {} # on cpu
self._cpp_lora_config: Dict[str, torch.Tensor] = {} # on cpu
self.lora_target_modules: List[str] = []
self._cpp_peft_cache_manager: Optional[tb_internal.batch_manager.PeftCacheManager] = None

def set_cpp_peft_cache_manager(
self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager
):
self._cpp_peft_cache_manager = cpp_peft_cache_manager

def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool:
31 changes: 31 additions & 0 deletions tests/unittest/llmapi/test_llm_pytorch.py
@@ -1,6 +1,7 @@
import pytest

from tensorrt_llm import LLM
from tensorrt_llm.llmapi.llm_args import CudaGraphConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.sampling_params import SamplingParams

@@ -430,3 +431,33 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
lora_request=lora_requests)

assert len(outputs) == 2


def test_lora_dir_with_graph():
lora_req = LoRARequest(
"task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")

lora_config = LoraConfig(
lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
max_lora_rank=8,
lora_request=[lora_req])

llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
lora_config=lora_config,
cuda_graph_config=CudaGraphConfig(max_batch_size=1))
# cuda_graph_config=None)

prompts = [
"美国的首都在哪里? \n答案:",
]
references = [
"美国的首都是华盛顿。\n\n美国的",
]
sampling_params = SamplingParams(max_tokens=20)
lora_request = [lora_req]

outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

assert similar(outputs[0].outputs[0].text, references[0])
print(f"lora output: {outputs[0].outputs[0].text}")
print(f"ref output: {references[0]}")
Comment on lines +436 to +463

🛠️ Refactor suggestion

Ensure resource cleanup, avoid redundant config, and gate by memory.

  • Always shut down the LLM in a finally block to prevent resource leaks.
  • Avoid providing the LoRA adapter both in LoraConfig and as a per-generate argument; the per-call lora_request is sufficient here.
  • Align with other 7B tests by adding the 40GB memory guard.
-@pytest.mark.parametrize(
+# keep above tests unchanged
@@
-def test_lora_dir_with_graph():
+@skip_gpu_memory_less_than_40gb
+def test_lora_dir_with_graph():
@@
-    lora_config = LoraConfig(
-        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
-        max_lora_rank=8,
-        lora_request=[lora_req])
+    lora_config = LoraConfig(
+        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
+        max_lora_rank=8)
@@
-    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
-              lora_config=lora_config,
-              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
+    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
+              lora_config=lora_config,
+              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
@@
-    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
-
-    assert similar(outputs[0].outputs[0].text, references[0])
-    print(f"lora output: {outputs[0].outputs[0].text}")
-    print(f"ref  output: {references[0]}")
+    try:
+        outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
+        assert similar(outputs[0].outputs[0].text, references[0])
+        print(f"lora output: {outputs[0].outputs[0].text}")
+        print(f"ref  output: {references[0]}")
+    finally:
+        llm.shutdown()

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tests/unittest/llmapi/test_llm_pytorch.py around lines 436-463, ensure the
test guards memory, avoids redundant LoRA specification, and always cleans up
the LLM: add the same 40GB memory guard used by other 7B tests at the top of the
test and return/skip if not met; construct the LoraConfig without duplicating
per-call adapters (remove lora_request from LoraConfig and keep the single
lora_request passed into llm.generate, or alternatively remove the per-call
lora_request and keep it only in LoraConfig — pick one approach and make them
consistent); wrap LLM usage in try/finally and call llm.shutdown() in the
finally block to guarantee resource cleanup even on assertion failures or
exceptions.
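
Put together, the suggested shape of the test is roughly the following sketch; skip_gpu_memory_less_than_40gb is assumed to be the same guard the neighbouring 7B tests use, and whether the adapter also stays in LoraConfig (as the prefetch path in this PR expects) remains the open question from the comment above:

@skip_gpu_memory_less_than_40gb  # assumed import, as in the other 7B LoRA tests
def test_lora_dir_with_graph():
    lora_dir = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
    lora_req = LoRARequest("task-0", 0, lora_dir)
    lora_config = LoraConfig(lora_dir=[lora_dir],
                             max_lora_rank=8,
                             lora_request=[lora_req])
    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
              lora_config=lora_config,
              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
    try:
        outputs = llm.generate(["美国的首都在哪里? \n答案:"],
                               SamplingParams(max_tokens=20),
                               lora_request=[lora_req])
        assert similar(outputs[0].outputs[0].text, "美国的首都是华盛顿。\n\n美国的")
    finally:
        # Release engine resources even if the assertion fails.
        llm.shutdown()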