Skip to content

Commit ba6c04a

Browse files
authored
Merge branch 'main' into fix-scaffolding-unittest
2 parents eef8cda + ed77ef2 commit ba6c04a

File tree

122 files changed

+4441
-5121
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

122 files changed

+4441
-5121
lines changed

.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
/tensorrt_llm/commands/bench.py @NVIDIA/trtllm-bench-reviewers
2020
docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
2121

22+
## TensorRT-LLM LLM API
23+
/tensorrt_llm/llmapi @NVIDIA/trt-llm-llmapi-devs
24+
/tensorrt_llm/executor @NVIDIA/trt-llm-llmapi-devs
2225

2326
# The rule below requires that any PR modifying public APIs must be approved by at least one member
2427
# of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team.

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@
2020
[submodule "3rdparty/xgrammar"]
2121
path = 3rdparty/xgrammar
2222
url = https://github.com/mlc-ai/xgrammar.git
23+
[submodule "3rdparty/nanobind"]
24+
path = 3rdparty/nanobind
25+
url = https://github.com/wjakob/nanobind

3rdparty/nanobind

Submodule nanobind added at a0ed258

constraints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,7 @@ h11>=0.16.0
77
tornado>=6.5.0
88
# WAR against https://github.com/advisories/GHSA-5rjg-fvgr-3xxf
99
setuptools>=78.1.1
10+
# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
11+
protobuf>=4.25.8
12+
# WAR against https://github.com/advisories/GHSA-33p9-3p43-82vq
13+
jupyter-core>=5.8.1

cpp/CMakeLists.txt

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ project(tensorrt_llm LANGUAGES CXX)
2828

2929
# Build options
3030
option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
31-
option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager"
32-
ON)
3331
option(BUILD_TESTS "Build Google tests" ON)
3432
option(BUILD_BENCHMARKS "Build benchmarks" ON)
3533
option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
@@ -68,6 +66,11 @@ endif()
6866
add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
6967
add_compile_definitions("TLLM_ENABLE_CUDA")
7068

69+
set(BINDING_TYPE
70+
"pybind"
71+
CACHE STRING
72+
"Binding type of Python bindings for C++ runtime and batch manager")
73+
7174
set(INTERNAL_CUTLASS_KERNELS_PATH
7275
""
7376
CACHE
@@ -195,7 +198,14 @@ set(TRT_LIB TensorRT::NvInfer)
195198
get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
196199

197200
set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
198-
add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
201+
if(BINDING_TYPE STREQUAL "pybind")
202+
add_subdirectory(${3RDPARTY_DIR}/pybind11
203+
${CMAKE_CURRENT_BINARY_DIR}/pybind11)
204+
endif()
205+
if(BINDING_TYPE STREQUAL "nanobind")
206+
add_subdirectory(${3RDPARTY_DIR}/nanobind
207+
${CMAKE_CURRENT_BINARY_DIR}/nanobind)
208+
endif()
199209

200210
# include as system to suppress warnings
201211
include_directories(
@@ -206,8 +216,13 @@ include_directories(
206216
${3RDPARTY_DIR}/cutlass/include
207217
${3RDPARTY_DIR}/cutlass/tools/util/include
208218
${3RDPARTY_DIR}/NVTX/include
209-
${3RDPARTY_DIR}/json/include
210-
${3RDPARTY_DIR}/pybind11/include)
219+
${3RDPARTY_DIR}/json/include)
220+
if(BINDING_TYPE STREQUAL "pybind")
221+
include_directories(${3RDPARTY_DIR}/pybind11/include)
222+
endif()
223+
if(BINDING_TYPE STREQUAL "nanobind")
224+
include_directories(${3RDPARTY_DIR}/nanobind/include)
225+
endif()
211226

212227
if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
213228
add_definitions("-DENABLE_BF16")

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,8 @@ class WindowBlockManager
553553

554554
void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
555555

556+
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
557+
556558
//! \brief Release blocks of the sequence.
557559
void releaseBlocks(GenerationRequest& sequence);
558560

@@ -1092,6 +1094,9 @@ class BlockManager
10921094
//! \brief Store context blocks
10931095
void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest);
10941096

1097+
//! \brief Store newest block for reuse
1098+
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
1099+
10951100
[[nodiscard]] static bool isUseOneMoreBlock(
10961101
SizeType32 windowSize, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth)
10971102
{
@@ -1262,6 +1267,10 @@ class BaseKVCacheManager
12621267
//! \details These blocks become reusable from next step.
12631268
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
12641269

1270+
//! \brief Store newest block for reuse.
1271+
//! \details This block becomes reusable from the next step.
1272+
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
1273+
12651274
//! \brief Get the block ids of a request [per beam] **for a given window size block manager**
12661275
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
12671276
LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
@@ -1568,6 +1577,9 @@ class KVCacheManager : public BaseKVCacheManager
15681577
//! \details These blocks become reusable from next step.
15691578
void storeContextBlocks(LlmRequest const& llmRequest) override;
15701579

1580+
//! \brief Store newest block for reuse
1581+
void storeNewBlock(LlmRequest const& llmRequest) override;
1582+
15711583
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
15721584

15731585
[[nodiscard]] SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const override;

cpp/micro_benchmarks/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,9 @@ function(add_benchmark test_name test_src)
4444
benchmark::benchmark)
4545

4646
target_compile_features(${test_name} PRIVATE cxx_std_17)
47-
target_compile_definitions(${test_name}
48-
PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
47+
target_compile_definitions(
48+
${test_name} PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}"
49+
USING_OSS_CUTLASS_MOE_GEMM)
4950

5051
add_dependencies(micro_benchmarks ${test_name})
5152
endfunction()

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,30 @@ struct UniformRoutingConfig : public RoutingConfig
222222
{
223223
std::uniform_int_distribution<int> dist(0, num_experts - 1);
224224
std::vector<int> input(k * num_tokens);
225-
std::generate(input.begin(), input.end(), [&] { return dist(twister); });
225+
for (int i = 0; i < num_tokens; i++)
226+
{
227+
for (int j = 0; j < k; j++)
228+
{
229+
while (true)
230+
{
231+
int expert_id = dist(twister);
232+
bool valid = true;
233+
for (int prev_j = 0; prev_j < j; prev_j++)
234+
{
235+
if (expert_id == input[i * k + prev_j])
236+
{
237+
valid = false;
238+
break;
239+
}
240+
}
241+
if (valid)
242+
{
243+
input[i * k + j] = expert_id;
244+
break;
245+
}
246+
}
247+
}
248+
}
226249
check_cuda_error(cudaMemcpyAsync(
227250
selected_experts, input.data(), input.size() * sizeof(int), cudaMemcpyHostToDevice, streamPtr->get()));
228251
check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
@@ -322,9 +345,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
322345
constexpr static int WEIGHT_ELEM_PER_BYTE = (INT4 || ANY_FP4) ? 2 : 1;
323346
int const BASE_HIDDEN_SIZE = 64 / sizeof(WeightType) * WEIGHT_ELEM_PER_BYTE;
324347

325-
constexpr static int64_t FP4_VECTOR_SIZE = NVFP4
326-
? tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
327-
: tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
348+
constexpr static int64_t FP4_VECTOR_SIZE = NVFP4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
349+
: TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
328350

329351
std::vector<BufferManager::IBufferPtr> managed_buffers;
330352
int* mSelectedExperts{};
@@ -476,7 +498,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
476498
float* mExpertFP8Scale3{};
477499

478500
float* mExpertFP4ActScale1{};
479-
using ElementSF = tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::ElementSF;
501+
using ElementSF = TmaWarpSpecializedGroupedGemmInput::ElementSF;
480502
ElementSF* mExpertFP4WeightSf1{};
481503
float* mExpertFP4GlobalScale1{};
482504
float* mExpertFP4ActScale2{};
@@ -532,7 +554,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
532554
mInterSize = inter_size / parallelism_config.tp_size;
533555
mNumExperts = num_experts;
534556
mK = k;
535-
mIsGated = tensorrt_llm::isGatedActivation(mActType);
557+
mIsGated = isGatedActivation(mActType);
536558
mGatedMultiplier = mIsGated ? 2 : 1;
537559
auto const gated_inter = mInterSize * mGatedMultiplier;
538560

@@ -811,7 +833,7 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
811833
int const num_tokens = state.range(7);
812834
mUseBias = state.range(8);
813835
mUseFinalScale = state.range(9);
814-
mActType = static_cast<tensorrt_llm::ActivationType>(state.range(10));
836+
mActType = static_cast<ActivationType>(state.range(10));
815837
int tactic_idx1 = state.range(11);
816838
int tactic_idx2 = state.range(12);
817839
int const routing_config = state.range(13);

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -472,16 +472,16 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
472472
if (!has_tactic_ids2)
473473
t2 = t1;
474474

475-
benchmark->Args({num_experts, //
476-
get_range("k"), //
477-
get_range("hidden_size"), //
478-
get_range("inter_size"), //
479-
tp_size, ep_size, world_rank, //
480-
get_range("num_tokens"), //
481-
bias, do_final_scale, //
482-
get_range("act_fn", 0, (int) tensorrt_llm::ActivationType::Identity), //
483-
t1, //
484-
t2, //
475+
benchmark->Args({num_experts, //
476+
get_range("k"), //
477+
get_range("hidden_size"), //
478+
get_range("inter_size"), //
479+
tp_size, ep_size, world_rank, //
480+
get_range("num_tokens"), //
481+
bias, do_final_scale, //
482+
get_range("act_fn", 0, (int) ActivationType::Identity), //
483+
t1, //
484+
t2, //
485485
*routing_config});
486486
}
487487
}
@@ -497,10 +497,10 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark)
497497
auto inter_size_mul = {4.f}; // {7.f/2.f, 4.f};
498498
auto num_tokens = {2048}; // {1, 20, 200, 2048};
499499
auto use_bias = {0}; // {0, 1};
500-
auto activation_type = {tensorrt_llm::ActivationType::Gelu};
501-
// {tensorrt_llm::ActivationType::Relu, tensorrt_llm::ActivationType::Gelu,
502-
// tensorrt_llm::ActivationType::Silu, tensorrt_llm::ActivationType::Geglu,
503-
// tensorrt_llm::ActivationType::Swiglu};
500+
auto activation_type = {ActivationType::Gelu};
501+
// {ActivationType::Relu, ActivationType::Gelu,
502+
// ActivationType::Silu, ActivationType::Geglu,
503+
// ActivationType::Swiglu};
504504
auto cutlass_tactic = {-1}; // {0,..., listAllTactics<BenchClass>().size()};
505505
auto routing_config = {LOAD_BALANCED_ROUTING_CONFIG}; // {0, 1, 2};
506506

@@ -518,7 +518,7 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark)
518518
for (auto tactic2 : cutlass_tactic)
519519
for (auto routing : routing_config)
520520
benchmark->Args({num_expert, k, size, inter_size, 1, 1, 0, tokens, bias,
521-
(int) act, tactic1, tactic2, routing});
521+
1, (int) act, tactic1, tactic2, routing});
522522
}
523523
}
524524

@@ -540,8 +540,9 @@ void argGen(benchmark::internal::Benchmark* benchmark)
540540

541541
// Generic setup
542542
benchmark->UseManualTime();
543-
benchmark->ArgNames({"Num Experts", "K", "Hidden Size", "Inter Size", "TP Size", "EP Size", "World Rank",
544-
"Num Tokens", "Use Bias", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID"});
543+
benchmark->ArgNames(
544+
{"Num Experts", "K", "Hidden Size", "Inter Size", "TP Size", "EP Size", "World Rank", "Num Tokens", "Use Bias",
545+
"Use Final Scale", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID"});
545546

546547
if (workloadFile)
547548
argGenLoadFile<BenchClass>(benchmark);

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,14 @@ if(BUILD_PYT)
302302
add_subdirectory(thop)
303303
endif()
304304

305-
if(BUILD_PYBIND)
305+
if(BINDING_TYPE STREQUAL "pybind")
306306
add_subdirectory(pybind)
307307
endif()
308308

309+
if(BINDING_TYPE STREQUAL "nanobind")
310+
add_subdirectory(nanobind)
311+
endif()
312+
309313
if(BUILD_DEEP_EP)
310314
add_subdirectory(deep_ep)
311315
endif()

0 commit comments

Comments
 (0)