Commit 383526f

Merge branch 'main' into clean_unused
2 parents 4d78f1e + 6268a60

File tree: 14 files changed, +201 / -132 lines

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 4 additions & 4 deletions

@@ -57,8 +57,8 @@ void initBindings(nb::module_& m)
     using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;

     // Create and register exceptions in module scope
-    nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
-    nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");
+    static nb::object peft_exc = nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
+    static nb::object lora_exc = nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");

     // Register with no captures
     nb::register_exception_translator(
@@ -71,11 +71,11 @@ void initBindings(nb::module_& m)
            }
            catch (const tb::PeftTaskNotCachedException& e)
            {
-                PyErr_SetString(nb::type<tb::PeftTaskNotCachedException>().ptr(), e.what());
+                PyErr_SetString(peft_exc.ptr(), e.what());
            }
            catch (const tr::LoraCacheFullException& e)
            {
-                PyErr_SetString(nb::type<tr::LoraCacheFullException>().ptr(), e.what());
+                PyErr_SetString(lora_exc.ptr(), e.what());
            }
        });

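The static nb::object handles keep the registered exception types alive for the translator lambda, which is registered without captures. From Python the change is invisible: callers keep catching the two exceptions by name. A minimal usage sketch, assuming the bindings module is importable as tensorrt_llm.bindings and using a hypothetical fetch_peft_task call (neither is confirmed by this commit):

# Hypothetical usage sketch; the import path and fetch_peft_task() call are
# illustrative assumptions, not taken from this commit.
try:
    from tensorrt_llm.bindings import PeftTaskNotCachedException, LoraCacheFullException
except ImportError:  # allow the sketch to run without the compiled extension
    class PeftTaskNotCachedException(RuntimeError): ...
    class LoraCacheFullException(RuntimeError): ...

def load_peft_task(executor, task_id):
    """Translate the C++ cache exceptions into a soft failure."""
    try:
        return executor.fetch_peft_task(task_id)  # hypothetical method name
    except PeftTaskNotCachedException:
        print(f"PEFT task {task_id} is not in the cache; schedule a reload")
    except LoraCacheFullException:
        print("LoRA cache is full; free older tasks before retrying")
    return None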
cpp/tensorrt_llm/nanobind/executor/request.cpp

Lines changed: 15 additions & 4 deletions

@@ -210,10 +210,21 @@ void initRequestBindings(nb::module_& m)
            nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(state[6]));
    };
    nb::class_<tle::OutputConfig>(m, "OutputConfig")
-        .def(nb::init<bool, bool, bool, bool, bool, bool, std::optional<std::vector<tle::AdditionalModelOutput>>>(),
-            nb::arg("return_log_probs").none() = false, nb::arg("return_context_logits") = false,
-            nb::arg("return_generation_logits") = false, nb::arg("exclude_input_from_output") = false,
-            nb::arg("return_encoder_output") = false, nb::arg("return_perf_metrics") = false,
+        .def(
+            "__init__",
+            [](tle::OutputConfig& self, std::optional<bool> return_log_probs, std::optional<bool> return_context_logits,
+                std::optional<bool> return_generation_logits, std::optional<bool> exclude_input_from_output,
+                std::optional<bool> return_encoder_output, std::optional<bool> return_perf_metrics,
+                std::optional<std::vector<tle::AdditionalModelOutput>> additional_model_outputs)
+            {
+                new (&self) tle::OutputConfig(return_log_probs.value_or(false), return_context_logits.value_or(false),
+                    return_generation_logits.value_or(false), exclude_input_from_output.value_or(false),
+                    return_encoder_output.value_or(false), return_perf_metrics.value_or(false),
+                    additional_model_outputs);
+            },
+            nb::arg("return_log_probs") = nb::none(), nb::arg("return_context_logits") = nb::none(),
+            nb::arg("return_generation_logits") = nb::none(), nb::arg("exclude_input_from_output") = nb::none(),
+            nb::arg("return_encoder_output") = nb::none(), nb::arg("return_perf_metrics") = nb::none(),
            nb::arg("additional_model_outputs") = nb::none())
        .def_rw("return_log_probs", &tle::OutputConfig::returnLogProbs)
        .def_rw("return_context_logits", &tle::OutputConfig::returnContextLogits)

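The replacement constructor lets every flag arrive as None from Python and falls back to false via value_or, so callers can forward keyword arguments unconditionally even when they have no opinion about them. A sketch of the intended Python-side behavior, assuming the class is exposed as tensorrt_llm.bindings.executor.OutputConfig; a stand-in is included so the sketch runs without the compiled extension:

# Sketch only: falls back to a tiny stand-in when the compiled bindings are absent.
try:
    from tensorrt_llm.bindings.executor import OutputConfig  # assumed module path
except ImportError:
    class OutputConfig:  # stand-in mirroring the new None-tolerant defaults
        def __init__(self, return_log_probs=None, return_context_logits=None, **kwargs):
            self.return_log_probs = bool(return_log_probs or False)
            self.return_context_logits = bool(return_context_logits or False)

# None (or omitting the argument) now means "use the default", i.e. False.
cfg = OutputConfig(return_log_probs=None, return_context_logits=True)
assert cfg.return_log_probs is False
assert cfg.return_context_logits is True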
cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 2 additions & 4 deletions

@@ -424,7 +424,7 @@ void initConfigBindings(pybind11::module_& m)
        .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI)
        .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX)
        .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL)
-        .def(py::init(
+        .def("from_string",
            [](std::string const& str)
            {
                if (str == "DEFAULT" || str == "default")
@@ -436,9 +436,7 @@ void initConfigBindings(pybind11::module_& m)
                if (str == "NIXL" || str == "nixl")
                    return tle::CacheTransceiverConfig::BackendType::NIXL;
                throw std::runtime_error("Invalid backend type: " + str);
-            }));
-
-    py::implicitly_convertible<std::string, tle::CacheTransceiverConfig::BackendType>();
+            });

    py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),

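Dropping py::implicitly_convertible means a bare string no longer silently becomes a BackendType; callers are expected to convert explicitly through the new from_string helper. A hedged sketch of the expected call site (the module path and the exact calling convention of from_string are assumptions, not confirmed here):

# Assumed usage; the real module path and method binding may differ slightly.
from tensorrt_llm.bindings.executor import CacheTransceiverConfig

BackendType = CacheTransceiverConfig.BackendType
backend = BackendType.from_string("UCX")   # explicit, replaces the implicit str conversion
print(backend)                             # expected: BackendType.UCX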
jenkins/L0_MergeRequest.groovy

Lines changed: 46 additions & 74 deletions

@@ -105,15 +105,13 @@ def EXTRA_STAGE_LIST = "extra_stage"
 @Field
 def MULTI_GPU_FILE_CHANGED = "multi_gpu_file_changed"
 @Field
-def ONLY_PYTORCH_FILE_CHANGED = "only_pytorch_file_changed"
+def ONLY_ONE_GROUP_CHANGED = "only_one_group_changed"
 @Field
 def AUTO_TRIGGER_TAG_LIST = "auto_trigger_tag_list"
 @Field
 def DEBUG_MODE = "debug"
 @Field
 def DETAILED_LOG = "detailed_log"
-@Field
-def ONLY_DOCS_FILE_CHANGED = "only_docs_file_changed"

 def testFilter = [
     (REUSE_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get(REUSE_STAGE_LIST, null)?.tokenize(',')),
@@ -127,11 +125,10 @@ def testFilter = [
     (DISABLE_MULTI_GPU_TEST): gitlabParamsFromBot.get((DISABLE_MULTI_GPU_TEST), false),
     (EXTRA_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get((EXTRA_STAGE_LIST), null)?.tokenize(',')),
     (MULTI_GPU_FILE_CHANGED): false,
-    (ONLY_PYTORCH_FILE_CHANGED): false,
+    (ONLY_ONE_GROUP_CHANGED): "",
     (DEBUG_MODE): gitlabParamsFromBot.get(DEBUG_MODE, false),
     (AUTO_TRIGGER_TAG_LIST): [],
     (DETAILED_LOG): gitlabParamsFromBot.get(DETAILED_LOG, false),
-    (ONLY_DOCS_FILE_CHANGED): false,
 ]

 String reuseBuild = gitlabParamsFromBot.get('reuse_build', null)
@@ -324,9 +321,8 @@ def setupPipelineEnvironment(pipeline, testFilter, globalVars)
     echo "Env.gitlabMergeRequestLastCommit: ${env.gitlabMergeRequestLastCommit}."
     echo "Freeze GitLab commit. Branch: ${env.gitlabBranch}. Commit: ${env.gitlabCommit}."
     testFilter[(MULTI_GPU_FILE_CHANGED)] = getMultiGpuFileChanged(pipeline, testFilter, globalVars)
-    testFilter[(ONLY_PYTORCH_FILE_CHANGED)] = getOnlyPytorchFileChanged(pipeline, testFilter, globalVars)
+    testFilter[(ONLY_ONE_GROUP_CHANGED)] = getOnlyOneGroupChanged(pipeline, testFilter, globalVars)
     testFilter[(AUTO_TRIGGER_TAG_LIST)] = getAutoTriggerTagList(pipeline, testFilter, globalVars)
-    testFilter[(ONLY_DOCS_FILE_CHANGED)] = getOnlyDocsFileChanged(pipeline, testFilter, globalVars)
     getContainerURIs().each { k, v ->
         globalVars[k] = v
     }
@@ -644,86 +640,62 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
     return relatedFileChanged
 }

-def getOnlyPytorchFileChanged(pipeline, testFilter, globalVars) {
+def getOnlyOneGroupChanged(pipeline, testFilter, globalVars) {
     def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
     if (env.alternativeTRT || isOfficialPostMergeJob) {
-        pipeline.echo("Force set ONLY_PYTORCH_FILE_CHANGED false.")
-        return false
+        pipeline.echo("Force set ONLY_ONE_GROUP_CHANGED \"\".")
+        return ""
     }
-    def pytorchOnlyList = [
-        "tensorrt_llm/_torch/",
-        "tensorrt_llm/scaffolding/",
-        "tests/unittest/_torch/",
-        "tests/unittest/scaffolding/",
-        "tests/unittest/llmapi/test_llm_pytorch.py",
-        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
-        "tests/integration/defs/accuracy/test_llm_api_pytorch.py",
-        "tests/integration/defs/disaggregated/",
-        "examples/auto_deploy",
-        "examples/disaggregated",
-        "examples/pytorch/",
-        "examples/scaffolding/",
-        "docs/"
+    def groupFileMap = [
+        "Docs": [ // TODO: Add more docs path to the list, e.g. *.md files in other directories
+            "docs/",
+        ],
+        "PyTorch": [
+            "tensorrt_llm/_torch/",
+            "tensorrt_llm/scaffolding/",
+            "tests/unittest/_torch/",
+            "tests/unittest/scaffolding/",
+            "tests/unittest/llmapi/test_llm_pytorch.py",
+            "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
+            "tests/integration/defs/accuracy/test_llm_api_pytorch.py",
+            "tests/integration/defs/disaggregated/",
+            "examples/auto_deploy",
+            "examples/disaggregated",
+            "examples/pytorch/",
+            "examples/scaffolding/",
+            "docs/",
+        ],
+        "Triton": [
+            "tests/integration/defs/triton_server/",
+            "triton_backend/",
+        ],
     ]

     def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars)
-
     if (!changedFileList || changedFileList.isEmpty()) {
-        return false
+        return ""
     }

-    def result = true
-    for (file in changedFileList) {
-        def isPytorchFile = false
-        for (prefix in pytorchOnlyList) {
-            if (file.startsWith(prefix)) {
-                isPytorchFile = true
-                break
-            }
+    for (group in groupFileMap.keySet()) {
+        def groupPrefixes = groupFileMap[group]
+        def allFilesInGroup = changedFileList.every { file ->
+            groupPrefixes.any { prefix -> file.startsWith(prefix) }
         }
-        if (!isPytorchFile) {
-            pipeline.echo("Found non-PyTorch file: ${file}")
-            result = false
-            break
-        }
-    }
-
-    pipeline.echo("Only PyTorch files changed: ${result}")
-    return result
-}
-
-def getOnlyDocsFileChanged(pipeline, testFilter, globalVars) {
-    def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
-    if (env.alternativeTRT || isOfficialPostMergeJob) {
-        pipeline.echo("Force set ONLY_DOCS_FILE_CHANGED false.")
-        return false
-    }
-
-    // TODO: Add more docs path to the list, e.g. *.md files in other directories
-    def docsFileList = [
-        "docs/",
-    ]
-
-    def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars)
-    if (!changedFileList || changedFileList.isEmpty()) {
-        return false
-    }

-    for (file in changedFileList) {
-        def isDocsFile = false
-        for (prefix in docsFileList) {
-            if (file.startsWith(prefix)) {
-                isDocsFile = true
-                break
+        if (allFilesInGroup) {
+            pipeline.echo("Only ${group} files changed.")
+            return group
+        } else {
+            def nonGroupFile = changedFileList.find { file ->
+                !groupPrefixes.any { prefix -> file.startsWith(prefix) }
             }
+            if (nonGroupFile != null) {
+                pipeline.echo("Found non-${group} file: ${nonGroupFile}")
             }
-        }
-        if (!isDocsFile) {
-            pipeline.echo("Found non-docs file: ${file}")
-            return false
         }
     }
-    pipeline.echo("Only docs files changed.")
-    return true
+
+    return ""
 }

 def collectTestResults(pipeline, testFilter)
@@ -1040,7 +1012,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         testStageName = "[Test-SBSA] Remote Run"
     }

-    if (testFilter[(ONLY_DOCS_FILE_CHANGED)]) {
+    if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
         echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
         return
     }

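The new getOnlyOneGroupChanged collapses the two previous booleans into a single group name: the first group whose path prefixes cover every changed file wins, otherwise the empty string is returned and the full pipeline runs. The same check restated as a small Python sketch with abbreviated file lists (illustrative only; the pipeline itself uses the Groovy above):

# Illustrative restatement of the Groovy grouping check; lists are abbreviated.
GROUP_FILE_MAP = {
    "Docs": ["docs/"],
    "PyTorch": ["tensorrt_llm/_torch/", "examples/pytorch/", "docs/"],
    "Triton": ["tests/integration/defs/triton_server/", "triton_backend/"],
}

def get_only_one_group_changed(changed_files):
    """Return the first group whose prefixes cover every changed file, else ""."""
    if not changed_files:
        return ""
    for group, prefixes in GROUP_FILE_MAP.items():
        if all(any(f.startswith(p) for p in prefixes) for f in changed_files):
            return group
    return ""

print(get_only_one_group_changed(["docs/source/index.rst"]))         # Docs
print(get_only_one_group_changed(["tensorrt_llm/_torch/model.py"]))  # PyTorch
print(get_only_one_group_changed(["docs/a.md", "cpp/foo.cpp"]))      # "" -> run everything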
jenkins/L0_Test.groovy

Lines changed: 19 additions & 16 deletions

@@ -449,16 +449,14 @@ def EXTRA_STAGE_LIST = "extra_stage"
 @Field
 def MULTI_GPU_FILE_CHANGED = "multi_gpu_file_changed"
 @Field
-def ONLY_PYTORCH_FILE_CHANGED = "only_pytorch_file_changed"
+def ONLY_ONE_GROUP_CHANGED = "only_one_group_changed"
 @Field
 def AUTO_TRIGGER_TAG_LIST = "auto_trigger_tag_list"
 @Field
 def DEBUG_MODE = "debug"
 @Field
 def DETAILED_LOG = "detailed_log"
 @Field
-def ONLY_DOCS_FILE_CHANGED = "only_docs_file_changed"
-@Field
 def testFilter = [
     (REUSE_STAGE_LIST): null,
     (ENABLE_SKIP_TEST): false,
@@ -471,11 +469,10 @@ def testFilter = [
     (DISABLE_MULTI_GPU_TEST): false,
     (EXTRA_STAGE_LIST): null,
     (MULTI_GPU_FILE_CHANGED): false,
-    (ONLY_PYTORCH_FILE_CHANGED): false,
+    (ONLY_ONE_GROUP_CHANGED): "",
     (DEBUG_MODE): false,
     (AUTO_TRIGGER_TAG_LIST): [],
     (DETAILED_LOG): false,
-    (ONLY_DOCS_FILE_CHANGED): false,
 ]

 @Field
@@ -2209,22 +2206,28 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
        println parallelJobsFiltered.keySet()
    }

-    if (testFilter[(ONLY_PYTORCH_FILE_CHANGED)]) {
+    if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
+        echo "Only docs files are changed, run doc build stage only."
+        parallelJobsFiltered = docBuildJobs
+        println parallelJobsFiltered.keySet()
+    } else if (testFilter[(ONLY_ONE_GROUP_CHANGED)] != "") {
        if (testFilter[(TEST_BACKEND)] != null) {
-            echo "Force disable ONLY_PYTORCH_FILE_CHANGED mode. Backend mode set by flag: ${testFilter[(TEST_BACKEND)]}."
+            echo "Force disable ONLY_ONE_GROUP_CHANGED mode. Backend mode set by flag: ${testFilter[(TEST_BACKEND)]}."
        } else {
-            echo "ONLY_PYTORCH_FILE_CHANGED mode is true."
-            parallelJobsFiltered = parallelJobsFiltered.findAll { !it.key.contains("-CPP-") && !it.key.contains("-TensorRT-") }
+            echo "ONLY_ONE_GROUP_CHANGED mode is true. The group is: ${testFilter[(ONLY_ONE_GROUP_CHANGED)]}."
+            def excludedBackends = new HashMap()
+            excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-Triton-"]
+            excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-"]
+            def group = testFilter[(ONLY_ONE_GROUP_CHANGED)]
+            if (excludedBackends.containsKey(group)) {
+                parallelJobsFiltered = parallelJobsFiltered.findAll { key, value ->
+                    !excludedBackends[group].any { backend -> key.contains(backend) }
+                }
+            }
            println parallelJobsFiltered.keySet()
        }
    }

-    if (testFilter[(ONLY_DOCS_FILE_CHANGED)]) {
-        echo "Only docs files are changed, run doc build stage only."
-        parallelJobsFiltered = docBuildJobs
-        println parallelJobsFiltered.keySet()
-    }
-
    // Check --stage-list, only run the stages in stage-list.
    if (testFilter[TEST_STAGE_LIST] != null) {
        echo "Use TEST_STAGE_LIST for filtering. Stages: ${testFilter[(TEST_STAGE_LIST)]}."
@@ -2405,7 +2408,7 @@ pipeline {
            expression {
                // Only run the test list validation when necessary
                env.targetArch == X86_64_TRIPLE &&
-                testFilter[ONLY_DOCS_FILE_CHANGED] == false &&
+                testFilter[ONLY_ONE_GROUP_CHANGED] != "Docs" &&
                !(env.JOB_NAME ==~ /.*Multi-GPU.*/) &&
                !(env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/)
            }

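On the test side, the single group name drives stage filtering: a "Docs"-only change runs just the doc build, while "PyTorch" or "Triton" changes drop stages whose names contain the other backends' markers. A Python sketch of that filtering rule (the stage names are invented for illustration):

# Sketch of the stage-filtering rule; stage names here are invented examples.
EXCLUDED_BACKENDS = {
    "PyTorch": ["-CPP-", "-TensorRT-", "-Triton-"],
    "Triton": ["-PyTorch-", "-CPP-", "-TensorRT-"],
}

def filter_stages(stages, group):
    markers = EXCLUDED_BACKENDS.get(group)
    if markers is None:  # unknown group; "Docs" is handled separately via docBuildJobs
        return stages
    return {name: job for name, job in stages.items()
            if not any(marker in name for marker in markers)}

stages = {"A100-PyTorch-1": "...", "A100-TensorRT-1": "...", "A100-Triton-1": "..."}
print(sorted(filter_stages(stages, "PyTorch")))  # ['A100-PyTorch-1']
print(sorted(filter_stages(stages, "Triton")))   # ['A100-Triton-1']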
tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 1 addition & 0 deletions

@@ -303,6 +303,7 @@ def __init__(
         self.py_batch_idx = None
         self.py_rewind_len = 0
         self.py_draft_tokens = [] if self.draft_tokens is None else self.draft_tokens
+        self.py_last_context_chunk = (None, None)
         self.py_last_draft_tokens = None
         self.py_num_accepted_draft_tokens = 0
         self.py_decoding_iter = 0

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 4 additions & 0 deletions

@@ -1311,6 +1311,10 @@ def _update_request_states_tp(self, scheduled_requests: ScheduledRequests):

         for request in scheduled_requests.context_requests:
             if request.state != LlmRequestState.GENERATION_COMPLETE:  # skip failed requests
+                request.py_last_context_chunk = (
+                    request.context_current_position,
+                    request.context_current_position +
+                    request.context_chunk_size)
                 request.move_to_next_context_chunk()
                 if request.context_remaining_length == 0:
                     request.state = LlmRequestState.GENERATION_IN_PROGRESS

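Together, the two Python changes record the span of the most recent context chunk before the request advances: py_last_context_chunk holds the half-open [start, end) positions of the chunk that was just processed. A small stand-in illustrating the bookkeeping (FakeRequest is not the real LlmRequest; only the attribute names mirror the diff):

# FakeRequest is a stand-in for illustration only; it is not the real LlmRequest.
class FakeRequest:
    def __init__(self, chunk_size):
        self.context_current_position = 0
        self.context_chunk_size = chunk_size
        self.py_last_context_chunk = (None, None)  # matches the new default in llm_request.py

    def move_to_next_context_chunk(self):
        self.context_current_position += self.context_chunk_size

req = FakeRequest(chunk_size=256)
# What _update_request_states_tp now does before advancing the chunk:
req.py_last_context_chunk = (req.context_current_position,
                             req.context_current_position + req.context_chunk_size)
req.move_to_next_context_chunk()
print(req.py_last_context_chunk)  # (0, 256): the span that was just processed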