diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
index a8335150207..b6c3a68930d 100644
--- a/tensorrt_llm/_torch/attention_backend/trtllm.py
+++ b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -763,8 +763,14 @@ def prepare(self) -> None:
         self.kv_cache_block_offsets[:, :self.num_seqs].copy_(
             self.host_kv_cache_block_offsets[:, :self.num_seqs],
             non_blocking=True)
+
+        error_message = (
+            f"The max KV cache length of input sequences ({self.kv_lens[:self.num_seqs].max()}) "
+            f"exceeds the KV cache manager's maximum supported length "
+            f"({self.kv_cache_manager.max_seq_len}).")
+
         assert self.kv_lens[:self.num_seqs].max(
-        ) <= self.kv_cache_manager.max_seq_len, f"Please set max_seq_len to at least {self.kv_lens[:self.num_seqs].max()} for kv cache manager."
+        ) <= self.kv_cache_manager.max_seq_len, error_message
         self.kv_lens_cuda_runtime = self.kv_lens_cuda[:self.num_seqs]
         self.kv_lens_runtime = self.kv_lens[:self.num_seqs]
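
For review context, a minimal standalone sketch of the check this hunk changes, assuming plain Python ints and lists in place of the real kv_lens tensor and KV cache manager object; check_kv_lens and its literal arguments are hypothetical, and only the message text mirrors the patch.

# Standalone sketch (not part of the patch): reproduces the shape of the new
# assertion message with plain Python values; `check_kv_lens` is hypothetical.
def check_kv_lens(kv_lens: list[int], num_seqs: int, max_seq_len: int) -> None:
    # Largest KV length among the active sequences, as in the patched prepare().
    max_kv_len = max(kv_lens[:num_seqs])
    error_message = (
        f"The max KV cache length of input sequences ({max_kv_len}) "
        f"exceeds the KV cache manager's maximum supported length "
        f"({max_seq_len}).")
    assert max_kv_len <= max_seq_len, error_message

# Example with hypothetical values: a 4096-token sequence against
# max_seq_len=2048 now reports both numbers in the AssertionError.
try:
    check_kv_lens(kv_lens=[1024, 4096, 512], num_seqs=3, max_seq_len=2048)
except AssertionError as exc:
    print(exc)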