Skip to content

Commit ba6c04a

Browse files
authored
Merge branch 'main' into fix-scaffolding-unittest
2 parents eef8cda + ed77ef2 commit ba6c04a

File tree

122 files changed

+4441
-5121
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

122 files changed

+4441
-5121
lines changed

.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
/tensorrt_llm/commands/bench.py @NVIDIA/trtllm-bench-reviewers
2020
docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
2121

22+
## TensorRT-LLM LLM API
23+
/tensorrt_llm/llmapi @NVIDIA/trt-llm-llmapi-devs
24+
/tensorrt_llm/executor @NVIDIA/trt-llm-llmapi-devs
2225

2326
# The rule below requires that any PR modifying public APIs must be approved by at least one member
2427
# of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team.

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@
2020
[submodule "3rdparty/xgrammar"]
2121
path = 3rdparty/xgrammar
2222
url = https://github.com/mlc-ai/xgrammar.git
23+
[submodule "3rdparty/nanobind"]
24+
path = 3rdparty/nanobind
25+
url = https://github.com/wjakob/nanobind

3rdparty/nanobind

Submodule nanobind added at a0ed258

constraints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,7 @@ h11>=0.16.0
77
tornado>=6.5.0
88
# WAR against https://github.com/advisories/GHSA-5rjg-fvgr-3xxf
99
setuptools>=78.1.1
10+
# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
11+
protobuf>=4.25.8
12+
# WAR against https://github.com/advisories/GHSA-33p9-3p43-82vq
13+
jupyter-core>=5.8.1

cpp/CMakeLists.txt

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ project(tensorrt_llm LANGUAGES CXX)
2828

2929
# Build options
3030
option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
31-
option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager"
32-
ON)
3331
option(BUILD_TESTS "Build Google tests" ON)
3432
option(BUILD_BENCHMARKS "Build benchmarks" ON)
3533
option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
@@ -68,6 +66,11 @@ endif()
6866
add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
6967
add_compile_definitions("TLLM_ENABLE_CUDA")
7068

69+
set(BINDING_TYPE
70+
"pybind"
71+
CACHE STRING
72+
"Binding type of Python bindings for C++ runtime and batch manager")
73+
7174
set(INTERNAL_CUTLASS_KERNELS_PATH
7275
""
7376
CACHE
@@ -195,7 +198,14 @@ set(TRT_LIB TensorRT::NvInfer)
195198
get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
196199

197200
set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
198-
add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
201+
if(BINDING_TYPE STREQUAL "pybind")
202+
add_subdirectory(${3RDPARTY_DIR}/pybind11
203+
${CMAKE_CURRENT_BINARY_DIR}/pybind11)
204+
endif()
205+
if(BINDING_TYPE STREQUAL "nanobind")
206+
add_subdirectory(${3RDPARTY_DIR}/nanobind
207+
${CMAKE_CURRENT_BINARY_DIR}/nanobind)
208+
endif()
199209

200210
# include as system to suppress warnings
201211
include_directories(
@@ -206,8 +216,13 @@ include_directories(
206216
${3RDPARTY_DIR}/cutlass/include
207217
${3RDPARTY_DIR}/cutlass/tools/util/include
208218
${3RDPARTY_DIR}/NVTX/include
209-
${3RDPARTY_DIR}/json/include
210-
${3RDPARTY_DIR}/pybind11/include)
219+
${3RDPARTY_DIR}/json/include)
220+
if(BINDING_TYPE STREQUAL "pybind")
221+
include_directories(${3RDPARTY_DIR}/pybind11/include)
222+
endif()
223+
if(BINDING_TYPE STREQUAL "nanobind")
224+
include_directories(${3RDPARTY_DIR}/nanobind/include)
225+
endif()
211226

212227
if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
213228
add_definitions("-DENABLE_BF16")

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,8 @@ class WindowBlockManager
553553

554554
void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
555555

556+
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
557+
556558
//! \brief Release blocks of the sequence.
557559
void releaseBlocks(GenerationRequest& sequence);
558560

@@ -1092,6 +1094,9 @@ class BlockManager
10921094
//! \brief Store context blocks
10931095
void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest);
10941096

1097+
//! \brief Store newest block for reuse
1098+
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
1099+
10951100
[[nodiscard]] static bool isUseOneMoreBlock(
10961101
SizeType32 windowSize, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth)
10971102
{
@@ -1262,6 +1267,10 @@ class BaseKVCacheManager
12621267
//! \details These blocks become reusable from next step.
12631268
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
12641269

1270+
//! \brief Store newest block for reuse.
1271+
//! \details This block becomes reusable from the next step.
1272+
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
1273+
12651274
//! \brief Get the block ids of a request [per beam] **for a given window size block manager**
12661275
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
12671276
LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
@@ -1568,6 +1577,9 @@ class KVCacheManager : public BaseKVCacheManager
15681577
//! \details These blocks become reusable from next step.
15691578
void storeContextBlocks(LlmRequest const& llmRequest) override;
15701579

1580+
//! \brief Store newest block for reuse
1581+
void storeNewBlock(LlmRequest const& llmRequest) override;
1582+
15711583
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
15721584

15731585
[[nodiscard]] SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const override;

cpp/micro_benchmarks/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,9 @@ function(add_benchmark test_name test_src)
4444
benchmark::benchmark)
4545

4646
target_compile_features(${test_name} PRIVATE cxx_std_17)
47-
target_compile_definitions(${test_name}
48-
PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
47+
target_compile_definitions(
48+
${test_name} PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}"
49+
USING_OSS_CUTLASS_MOE_GEMM)
4950

5051
add_dependencies(micro_benchmarks ${test_name})
5152
endfunction()

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,30 @@ struct UniformRoutingConfig : public RoutingConfig
222222
{
223223
std::uniform_int_distribution<int> dist(0, num_experts - 1);
224224
std::vector<int> input(k * num_tokens);
225-
std::generate(input.begin(), input.end(), [&] { return dist(twister); });
225+
for (int i = 0; i < num_tokens; i++)
226+
{
227+
for (int j = 0; j < k; j++)
228+
{
229+
while (true)
230+
{
231+
int expert_id = dist(twister);
232+
bool valid = true;
233+
for (int prev_j = 0; prev_j < j; prev_j++)
234+
{
235+
if (expert_id == input[i * k + prev_j])
236+
{
237+
valid = false;
238+
break;
239+
}
240+
}
241+
if (valid)
242+
{
243+
input[i * k + j] = expert_id;
244+
break;
245+
}
246+
}
247+
}
248+
}
226249
check_cuda_error(cudaMemcpyAsync(
227250
selected_experts, input.data(), input.size() * sizeof(int), cudaMemcpyHostToDevice, streamPtr->get()));
228251
check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
@@ -322,9 +345,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
322345
constexpr static int WEIGHT_ELEM_PER_BYTE = (INT4 || ANY_FP4) ? 2 : 1;
323346
int const BASE_HIDDEN_SIZE = 64 / sizeof(WeightType) * WEIGHT_ELEM_PER_BYTE;
324347

325-
constexpr static int64_t FP4_VECTOR_SIZE = NVFP4
326-
? tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
327-
: tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
348+
constexpr static int64_t FP4_VECTOR_SIZE = NVFP4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
349+
: TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
328350

329351
std::vector<BufferManager::IBufferPtr> managed_buffers;
330352
int* mSelectedExperts{};
@@ -476,7 +498,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
476498
float* mExpertFP8Scale3{};
477499

478500
float* mExpertFP4ActScale1{};
479-
using ElementSF = tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::ElementSF;
501+
using ElementSF = TmaWarpSpecializedGroupedGemmInput::ElementSF;
480502
ElementSF* mExpertFP4WeightSf1{};
481503
float* mExpertFP4GlobalScale1{};
482504
float* mExpertFP4ActScale2{};
@@ -532,7 +554,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
532554
mInterSize = inter_size / parallelism_config.tp_size;
533555
mNumExperts = num_experts;
534556
mK = k;
535-
mIsGated = tensorrt_llm::isGatedActivation(mActType);
557+
mIsGated = isGatedActivation(mActType);
536558
mGatedMultiplier = mIsGated ? 2 : 1;
537559
auto const gated_inter = mInterSize * mGatedMultiplier;
538560

@@ -811,7 +833,7 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
811833
int const num_tokens = state.range(7);
812834
mUseBias = state.range(8);
813835
mUseFinalScale = state.range(9);
814-
mActType = static_cast<tensorrt_llm::ActivationType>(state.range(10));
836+
mActType = static_cast<ActivationType>(state.range(10));
815837
int tactic_idx1 = state.range(11);
816838
int tactic_idx2 = state.range(12);
817839
int const routing_config = state.range(13);

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -472,16 +472,16 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
472472
if (!has_tactic_ids2)
473473
t2 = t1;
474474

475-
benchmark->Args({num_experts, //
476-
get_range("k"), //
477-
get_range("hidden_size"), //
478-
get_range("inter_size"), //
479-
tp_size, ep_size, world_rank, //
480-
get_range("num_tokens"), //
481-
bias, do_final_scale, //
482-
get_range("act_fn", 0, (int) tensorrt_llm::ActivationType::Identity), //
483-
t1, //
484-
t2, //
475+
benchmark->Args({num_experts, //
476+
get_range("k"), //
477+
get_range("hidden_size"), //
478+
get_range("inter_size"), //
479+
tp_size, ep_size, world_rank, //
480+
get_range("num_tokens"), //
481+
bias, do_final_scale, //
482+
get_range("act_fn", 0, (int) ActivationType::Identity), //
483+
t1, //
484+
t2, //
485485
*routing_config});
486486
}
487487
}
@@ -497,10 +497,10 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark)
497497
auto inter_size_mul = {4.f}; // {7.f/2.f, 4.f};
498498
auto num_tokens = {2048}; // {1, 20, 200, 2048};
499499
auto use_bias = {0}; // {0, 1};
500-
auto activation_type = {tensorrt_llm::ActivationType::Gelu};
501-
// {tensorrt_llm::ActivationType::Relu, tensorrt_llm::ActivationType::Gelu,
502-
// tensorrt_llm::ActivationType::Silu, tensorrt_llm::ActivationType::Geglu,
503-
// tensorrt_llm::ActivationType::Swiglu};
500+
auto activation_type = {ActivationType::Gelu};
501+
// {ActivationType::Relu, ActivationType::Gelu,
502+
// ActivationType::Silu, ActivationType::Geglu,
503+
// ActivationType::Swiglu};
504504
auto cutlass_tactic = {-1}; // {0,..., listAllTactics<BenchClass>().size()};
505505
auto routing_config = {LOAD_BALANCED_ROUTING_CONFIG}; // {0, 1, 2};
506506

@@ -518,7 +518,7 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark)
518518
for (auto tactic2 : cutlass_tactic)
519519
for (auto routing : routing_config)
520520
benchmark->Args({num_expert, k, size, inter_size, 1, 1, 0, tokens, bias,
521-
(int) act, tactic1, tactic2, routing});
521+
1, (int) act, tactic1, tactic2, routing});
522522
}
523523
}
524524

@@ -540,8 +540,9 @@ void argGen(benchmark::internal::Benchmark* benchmark)
540540

541541
// Generic setup
542542
benchmark->UseManualTime();
543-
benchmark->ArgNames({"Num Experts", "K", "Hidden Size", "Inter Size", "TP Size", "EP Size", "World Rank",
544-
"Num Tokens", "Use Bias", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID"});
543+
benchmark->ArgNames(
544+
{"Num Experts", "K", "Hidden Size", "Inter Size", "TP Size", "EP Size", "World Rank", "Num Tokens", "Use Bias",
545+
"Use Final Scale", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID"});
545546

546547
if (workloadFile)
547548
argGenLoadFile<BenchClass>(benchmark);

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,14 @@ if(BUILD_PYT)
302302
add_subdirectory(thop)
303303
endif()
304304

305-
if(BUILD_PYBIND)
305+
if(BINDING_TYPE STREQUAL "pybind")
306306
add_subdirectory(pybind)
307307
endif()
308308

309+
if(BINDING_TYPE STREQUAL "nanobind")
310+
add_subdirectory(nanobind)
311+
endif()
312+
309313
if(BUILD_DEEP_EP)
310314
add_subdirectory(deep_ep)
311315
endif()

0 commit comments

Comments
 (0)