Skip to content

Commit 9402c85

Browse files
authored
Merge branch 'main' into vllmKV_development
2 parents e57f494 + a178cea commit 9402c85

File tree

189 files changed

+8183
-2678
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

189 files changed

+8183
-2678
lines changed

.clang-tidy

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1,5 +1,6 @@
11
Checks: '*,
22
-altera-id-dependent-backward-branch,
3+
-altera-struct-pack-align,
34
-altera-unroll-loops,
45
-boost-use-ranges,
56
-cppcoreguidelines-avoid-do-while,
@@ -9,8 +10,10 @@ Checks: '*,
910
-fuchsia-default-arguments-calls,
1011
-fuchsia-default-arguments-declarations,
1112
-fuchsia-overloaded-operator,
13+
-fuchsia-virtual-inheritance,
1214
-hicpp-vararg,
1315
-llvm-else-after-return,
1416
-llvmlibc-*,
17+
-misc-include-cleaner,
1518
-misc-non-private-member-variables-in-classes,
1619
-modernize-use-trailing-return-type'

.github/workflows/blossom-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -40,7 +40,7 @@ jobs:
4040
startsWith(github.event.comment.body, '/bot skip --comment') ||
4141
startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
4242
startsWith(github.event.comment.body, '/bot kill')) && contains(
43-
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99",
"BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv"]'),
43+
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99",
"BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025"]'),
4444
github.actor)
4545
steps:
4646
- name: Check if comment is issued by authorized person

.github/workflows/pr-check.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -29,7 +29,7 @@ jobs:
2929
uses: agenthunt/[email protected]
3030
continue-on-error: true
3131
with:
32-
pr-title-regex: "^(\\[(None|[A-Z0-9]+-[0-9]+|#[0-9]+|https:\\/\\/nvbugs\\/[0-9]+)\\])(\\[[a-z0-9]+\\]) (([^ ].*)?[^ ])$"
32+
pr-title-regex: "^(\\[(None|[A-Z0-9]+-[0-9]+|#[0-9]+|https:\\/\\/nvbugs\\/[0-9]+)\\]) *(\\[[a-z0-9]+\\]) (([^ ].*)?[^ ])$"
3333
pr-body-regex: ""
3434

3535
- name: PR Title Format Guide

README.md

Lines changed: 9 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -7,9 +7,9 @@ TensorRT-LLM
77
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
88
[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
99
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
10-
[![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
10+
[![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
1111
[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
12-
[![version](https://img.shields.io/badge/release-1.0.0rc5-green)](./tensorrt_llm/version.py)
12+
[![version](https://img.shields.io/badge/release-1.0.0rc6-green)](./tensorrt_llm/version.py)
1313
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
1414

1515
[Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -18,6 +18,13 @@ TensorRT-LLM
1818
<div align="left">
1919

2020
## Tech Blogs
21+
22+
* [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
23+
[➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
24+
25+
* [07/26] N-Gram Speculative Decoding in TensorRT-LLM
26+
[➡️ link](./docs/source/blogs/tech_blog/blog_7_NGram_performance_Analysis_And_Auto_Enablement.md)
27+
2128
* [06/19] Disaggregated Serving in TensorRT-LLM
2229
[➡️ link](./docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md)
2330

constraints.txt

Lines changed: 1 addition & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -1,13 +1,5 @@
1-
# These vulnerabilities were inherited from the base image (pytorch:25.05-py3) and should be removed when the base image
1+
# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
22
# is updated.
33

4-
# WAR against https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
5-
h11>=0.16.0
6-
# WAR against https://github.com/advisories/GHSA-7cx3-6m66-7c5m
7-
tornado>=6.5.0
8-
# WAR against https://github.com/advisories/GHSA-5rjg-fvgr-3xxf
9-
setuptools>=78.1.1
104
# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
115
protobuf>=4.25.8
12-
# WAR against https://github.com/advisories/GHSA-33p9-3p43-82vq
13-
jupyter-core>=5.8.1

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -467,6 +467,9 @@ class GenericLlmRequest
467467
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
468468
}
469469

470+
GenericLlmRequest(GenericLlmRequest&& request) = default;
471+
GenericLlmRequest(GenericLlmRequest const& request) = default;
472+
470473
void setExcludeInputFromOutput(bool exclude)
471474
{
472475
mExcludeInputFromOutput = exclude;
@@ -2318,6 +2321,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
23182321
mKvCacheRetentionConfig = request.getKvCacheRetentionConfig();
23192322
}
23202323

2324+
LlmRequest(LlmRequest&& request) = default;
2325+
LlmRequest(LlmRequest const& request) = default;
2326+
23212327
/// @brief Create a Response from the current state of the request
23222328
/// @details Note that there is some dependency on the order of operations in this method. Modify with care!
23232329
/// @return An optional Response

cpp/include/tensorrt_llm/runtime/decoderState.h

Lines changed: 1 addition & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -71,7 +71,7 @@ class DecoderState
7171
//! @returns [batchSize], number of finished sequences per request, on gpu
7272
[[nodiscard]] TensorPtr getFinishedSum() const;
7373

74-
//! @returns [batchSize, beamWidth], FinishedState value, on gpu
74+
//! @returns [batchSize, beamWidth], finished states of type FinishedState, on gpu
7575
[[nodiscard]] TensorPtr getFinishReasons() const;
7676

7777
//! @returns [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token
@@ -134,9 +134,6 @@ class DecoderState
134134
//! @returns [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
135135
[[nodiscard]] TensorPtr getAcceptedPackedPaths() const;
136136

137-
//! @returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
138-
[[nodiscard]] TensorPtr getFinishedSteps() const;
139-
140137
[[nodiscard]] SizeType32 getMaxBatchSize() const;
141138

142139
[[nodiscard]] SizeType32 getMaxBeamWidth() const;
@@ -221,10 +218,6 @@ class DecoderState
221218
//! @brief Stateful outputs for the decoder. Allocated for maxBatchSize slots.
222219
DecodingOutputPtr mJointDecodingOutput;
223220

224-
//! @brief [maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token
225-
//! of maxTokensPerStep, on gpu
226-
TensorPtr mFinishedSteps;
227-
228221
//! @brief Workspace for beam search in streaming mode.
229222
std::unique_ptr<BeamSearchBuffers> mBeamSearchBuffers;
230223

0 commit comments

Comments
 (0)