Commit 7a1af1c

Cherry-pick #5947 (#5989)

Signed-off-by: Fanrong Li <[email protected]>

1 parent 0523f77, commit 7a1af1c

File tree

2 files changed: +11 −0 lines changed

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 10 additions & 0 deletions

@@ -1463,6 +1463,16 @@ def previous_seq_slots_device():
                               previous_batch_len * self.max_beam_width].copy_(
                                   new_tokens.flatten(), non_blocking=True)
 
+            if (not self._disable_overlap_scheduler
+                    and next_draft_tokens_device is None
+                    and len(extend_requests) > 0):
+                # During warmup, those generation requests have no previous tensors,
+                # so set previous_pos_id_offsets and previous_kv_lens_offsets to zeros
+                # to skip the value changes in _preprocess_inputs. Otherwise, there
+                # would be an illegal memory access when writing key/values to the KV cache.
+                self.previous_pos_id_offsets_cuda *= 0
+                self.previous_kv_lens_offsets_cuda *= 0
+
             position_ids = torch.tensor(position_ids,
                                         dtype=torch.int,
                                         pin_memory=True)
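The fix clears the stale offset buffers with an in-place multiply (`*= 0`) instead of reassigning them, so the preallocated storage keeps its address. A minimal sketch of that property, using numpy arrays as stand-ins for the torch CUDA tensors (buffer shapes here are illustrative assumptions, not taken from the source):

```python
import numpy as np

# Stand-ins for the preallocated offset buffers; the real code uses
# torch CUDA tensors, and the shapes below are made up for illustration.
previous_pos_id_offsets = np.arange(8, dtype=np.int32)
previous_kv_lens_offsets = np.arange(4, dtype=np.int32)

addr_before = previous_pos_id_offsets.ctypes.data

# In-place multiply-by-zero neutralizes stale offsets from a previous
# batch while reusing the same underlying storage (no reallocation).
previous_pos_id_offsets *= 0
previous_kv_lens_offsets *= 0

# The buffer address is unchanged and every element is now zero.
assert previous_pos_id_offsets.ctypes.data == addr_before
print(previous_pos_id_offsets.tolist())   # [0, 0, 0, 0, 0, 0, 0, 0]
print(previous_kv_lens_offsets.tolist())  # [0, 0, 0, 0]
```

Keeping the same storage matters for buffers whose addresses have been captured elsewhere (e.g. by CUDA graphs); a fresh `zeros(...)` assignment would leave the captured pointer pointing at the old, stale data.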

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
