2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ TensorRT LLM
* [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)

- * [07/26] N-Gram Speculative Decoding in TensorRTLLM
+ * [07/26] N-Gram Speculative Decoding in TensorRT LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)

* [06/19] Disaggregated Serving in TensorRT LLM
2 changes: 1 addition & 1 deletion benchmarks/cpp/bertBenchmark.cpp
@@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da

int main(int argc, char* argv[])
{
cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT.");
options.add_options()("h,help", "Print usage");
options.add_options()(
"m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));
2 changes: 1 addition & 1 deletion benchmarks/cpp/disaggServerBenchmark.cpp
@@ -1145,7 +1145,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,
int main(int argc, char* argv[])

{
cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark");
cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
options.add_options()("h,help", "Print usage");
options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
cxxopts::value<std::vector<std::string>>());
2 changes: 1 addition & 1 deletion benchmarks/cpp/gptManagerBenchmark.cpp
@@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
int main(int argc, char* argv[])
{
cxxopts::Options options(
"TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models.");
"TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models.");
options.add_options()("h,help", "Print usage");
options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.",
cxxopts::value<std::string>());
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/deep_gemm/compiler.cuh
@@ -217,7 +217,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs()
}
else
{
TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled.");
TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
}
}
return includeDirs;
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -165,7 +165,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};
@@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM100::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM120::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM120::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM100::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
}
}
else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
@@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
}
}
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
}
}

@@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
int* occupancy)
{
throw std::runtime_error(
"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.");
"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture.");
}

#else
@@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
{
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
+ std::to_string(mMaxSmemSize);
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
/* // Return workspace size */
if (!A && !B && !D)
@@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
{
std::string errMsg("Requested workspace size insufficient. Required "
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
auto can_implement = gemm.can_implement(args);
if (can_implement != cutlass::Status::kSuccess)
{
std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(args, workspace, stream);
if (initStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: "
+ std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
}
auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL());
if (runStatus != cutlass::Status::kSuccess)
{
std::string errMsg
= "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
}
return gemm.get_workspace_size(args);
}
@@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
int* occupancy) \
{ \
throw std::runtime_error( \
"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
}

#else
@@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
{ \
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
+ std::to_string(mMaxSmemSize); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
/* // Return workspace size */ \
if (!A && !B && !D) \
@@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
{ \
std::string errMsg("Requested workspace size insufficient. Required " \
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto can_implement = gemm.can_implement(args); \
if (can_implement != cutlass::Status::kSuccess) \
{ \
std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. Error: " \
+ std::string(cutlassGetStatusString(can_implement)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto initStatus = gemm.initialize(args, workspace, stream); \
if (initStatus != cutlass::Status::kSuccess) \
{ \
std::string errMsg \
= "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
if (runStatus != cutlass::Status::kSuccess) \
{ \
std::string errMsg \
= "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
return gemm.get_workspace_size(args); \
}