From ad027ced4c0211168a5519c2c46824a42511ed26 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:47:49 -0700 Subject: [PATCH 01/12] add initial v1.0 data without rtx 6000 blackwell se Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 225 ++++++++++++----------- 1 file changed, 113 insertions(+), 112 deletions(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index c06f4039045..7030eac6168 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -27,9 +27,9 @@ Testing was performed on models with weights quantized using [ModelOpt](https:// The following GPU variants were used for testing: - H100 SXM 80GB (DGX H100) - H200 SXM 141GB (DGX H200) -- GH200 96GB HBM3 (480GB LPDDR5X) - B200 180GB (DGX B200) - GB200 192GB (GB200 NVL72) +- RTX 6000 Pro Blackwell Server Edition Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads. @@ -38,46 +38,11 @@ Other hardware variants may have different TDP, memory bandwidth, core count, or ```text nvidia/Llama-3.3-70B-Instruct-FP4 nvidia/Llama-3.1-405B-Instruct-FP4 +nvidia/Qwen3-235B-A22B-FP4 +nvidia/Qwen3-30B-A3B-FP4 +nvidia/DeepSeek-R1-0528-FP4 ``` -#### Llama 3.3 70B FP4 - -| | GPU: | B200 | GB200 | -|:-----------------------------|:---|:----------|:--------------| -| | TP Size | 1 | 1 | -| ISL, OSL | | | | -| | | | | -| 128, 128 | | 10,613.84 | 11,100.97 | -| 128, 2048 | | 9,445.51 | 10,276.05 | -| 128, 4096 | | 6,276.85 | 7,351.12 | -| 500, 2000 | | 6,983.27 | 8,194.30 | -| 1000, 1000 | | 6,434.29 | 7,401.80 | -| 1000, 2000 | | 6,725.03 | 6,478.72 | -| 1024, 2048 | | 6,546.61 | 7,922.88 | -| 2048, 128 | | 1,330.35 | 1,418.47 | -| 2048, 2048 | | 4,528.48 | 5,326.77 | -| 5000, 500 | | 1,427.44 | 1,502.44 | -| 20000, 2000 | | 636.36 | 732.43 | - -#### Llama 3.1 405B FP4 - -| | GPU: | B200 | GB200 | -|:-----------------------------|:---|:---------|:--------------| -| | TP Size | 4 | 4 | -| ISL, OSL | | | | -| | | | | -| 128, 128 | | 6,218.89 | 6,598.97 | -| 128, 2048 | | 7,178.10 | 7,497.40 | -| 128, 4096 | | 5,890.89 | 5,898.19 | -| 500, 2000 | | 5,844.37 | 6,198.33 | -| 1000, 1000 | | 4,958.53 | 5,243.35 | -| 1000, 2000 | | 4,874.16 | 4,905.51 | -| 1024, 2048 | | 4,833.19 | 4,686.38 | -| 2048, 128 | | 737.95 | 761.58 | -| 2048, 2048 | | 4,024.02 | 4,326.56 | -| 5000, 500 | | 1,032.40 | 1,078.87 | -| 20000, 2000 | | 667.39 | 649.95 | - ### FP8 Models ```text @@ -85,81 +50,117 @@ nvidia/Llama-3.1-8B-Instruct-FP8 nvidia/Llama-3.3-70B-Instruct-FP8 nvidia/Llama-3.1-405B-Instruct-FP8 nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 +nvidia/Qwen3-235B-A22B-FP8 ``` -#### Llama 3.1 8B FP8 - -| | GPU: | GH200 | H100 | H200 | -|:-----------------------------|:---|:--------------|:-----------------|:------------------| -| | TP Size | 1 | 1 | 1 | -| ISL, OSL | | | | | -| | | | | | -| 128, 128 | | 27,304.25 | 26,401.48 | 27,027.80 | -| 128, 2048 | | 24,045.60 | 21,413.21 | 23,102.25 | -| 128, 4096 | | 15,409.85 | 13,541.54 | 17,396.83 | -| 500, 2000 | | 20,123.88 | 17,571.01 | 19,759.16 | -| 1000, 1000 | | 16,352.99 | 14,991.62 | 17,162.49 | -| 1000, 2000 | | 15,705.82 | 13,505.23 | 16,227.11 | -| 1024, 2048 | | 16,102.52 | 13,165.91 | 16,057.66 | -| 2048, 128 | | 3,573.85 | 3,275.55 | 3,390.69 | -| 2048, 2048 | | 10,767.05 | 9,462.43 | 11,822.14 | -| 5000, 500 | | 
3,584.74 | 3,276.47 | 3,758.08 | -| 20000, 2000 | | 1,393.31 | 1,340.69 | 1,705.68 | - -#### Llama 3.3 70B FP8 - -| | GPU: | H100 | H200 | -|:-----------------------------|:---|:-----------------|:------------------| -| | TP Size | 2 | 2 | -| ISL, OSL | | | | -| | | | | -| 128, 128 | | 6,092.28 | 6,327.98 | -| 128, 2048 | | 5,892.94 | 7,467.36 | -| 128, 4096 | | 3,828.46 | 5,526.42 | -| 500, 2000 | | 4,654.74 | 6,639.15 | -| 1000, 1000 | | 4,181.06 | 4,773.33 | -| 1000, 2000 | | 3,708.93 | 5,790.36 | -| 1024, 2048 | | 3,785.04 | 5,480.44 | -| 2048, 128 | | 723.40 | 747.55 | -| 2048, 2048 | | 2,785.53 | 3,775.80 | -| 5000, 500 | | 865.55 | 978.28 | -| 20000, 2000 | | 411.85 | 609.42 | - -#### Llama 3.1 405B FP8 -| | GPU: | H100 | H200 | -|:-----------------------------|:---|:-----------------|:------------------| -| | TP Size | 8 | 8 | -| Runtime Input/Output Lengths | | | | -| | | | | -| 128, 128 | | | 3,705.18 | -| 128, 2048 | | 4,517.39 | 4,715.13 | -| 128, 4096 | | 2,910.31 | 4,475.91 | -| 500, 2000 | | 3,664.62 | 4,804.10 | -| 1000, 1000 | | 2,955.50 | 3,208.25 | -| 1000, 2000 | | 2,884.69 | 3,630.29 | -| 1024, 2048 | | 3,237.41 | 3,609.50 | -| 2048, 128 | | 433.47 | 441.35 | -| 2048, 2048 | | 2,216.55 | 2,840.86 | -| 5000, 500 | | 579.05 | 645.26 | -| 20000, 2000 | | 363.27 | 509.87 | - -#### Llama 4 Maverick FP8 - -Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20 - -| | GPU | H200 | H100 | -|:-----------------------------|:---|:------------------|:-----------------| -| | TP Size | 8 | 8 | -| ISL, OSL | | | | -| | | | | -| 128, 2048 | | 27,543.87 | | -| 128, 4096 | | 18,541.01 | 11,163.12 | -| 500, 2000 | | 21,117.34 | | -| 1000, 2000 | | | 10,556.00 | -| 1024, 2048 | | 16,859.45 | 11,584.33 | -| 2048, 128 | | 4,364.06 | 3,832.38 | -| 2048, 2048 | | 12,800.89 | | -| 5000, 500 | | 5,128.60 | | +#### Llama 4 Scout + +| Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | +|---|---|---|---|---| +| 128/128 | 65,845 | 65,169 | 30,354 | | +| 128/2048 | 45,053 | 46,385 | 34,316 | 15,130 | +| 128/4096 | 31,834 | 34,050 | 21,332 | 8,603 | +| 500/2000 | 40,321 | 41,190 | 24,630 | 12,399 | +| 1000/1000 | 35,440 | 36,624 | 21,636 | 12,129 | +| 1000/2000 | 39,713 | 38,274 | 18,499 | 9,838 | +| 2048/128 | 8,139 | 8,546 | 3,699 | 3,253 | +| 2048/2048 | 24,542 | 24,303 | 14,949 | 7,972 | +| 5000/500 | 9,425 | 9,859 | 4,605 | 3,342 | +| 20000/2000 | 6,141 | 6,509 | 2,105 | | + +#### Llama 3.3 70B + +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
TP2 (FP8) | +|---|---|---|---|---| +| 128/128 | 11,147 | 12,189 | 3,803 | 6,378 | +| 128/2048 | 9,922 | 11,309 | 4,336 | 6,651 | +| 128/4096 | 6,831 | 7,849 | 2,872 | 4,199 | +| 500/2000 | 7,762 | 9,028 | 3,666 | 5,222 | +| 1000/1000 | 7,007 | 7,326 | 2,909 | 4,205 | +| 1000/2000 | 6,271 | 6,513 | 2,994 | 4,146 | +| 2048/128 | 1,339 | 1,450 | 442 | 762 | +| 2048/2048 | 4,783 | 5,646 | 2,003 | 3,082 | +| 5000/500 | 1,459 | 1,602 | 566 | 898 | +| 20000/2000 | 665 | 755 | 283 | 437 | + +#### Qwen3-235B-A22B + +| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | +|---|---|---|---| +| 128/128 | 63,362 | 30,429 | | +| 128/2048 | 66,057 | 42,821 | 19,658 | +| 128/4096 | 39,496 | 26,852 | 12,447 | +| 500/2000 | 57,117 | 28,026 | 18,351 | +| 1000/1000 | 42,391 | 23,789 | 14,898 | +| 1000/2000 | 34,105 | 22,061 | 15,136 | +| 2048/128 | 7,329 | 3,331 | | +| 2048/2048 | 26,854 | 16,672 | 9,924 | +| 5000/500 | 8,190 | 3,623 | 3,225 | +| 20000/2000 | 4,453 | 1,876 | | + +#### Qwen3-30B-A3B + +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | +|---|---|---| +| 128/128 | 39,827 | 24,107 | +| 128/2048 | 37,844 | 31,311 | +| 128/4096 | 24,953 | 26,337 | +| 500/2000 | 27,817 | 20,604 | +| 1000/1000 | 25,828 | 17,481 | +| 1000/2000 | 22,051 | 17,177 | +| 2048/128 | 6,251 | 6,595 | +| 2048/2048 | 17,554 | 14,719 | +| 5000/500 | 6,142 | 5,908 | +| 20000/2000 | 2,944 | 3,203 | + +#### Llama 4 Maverick + +| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | +|---|---|---|---| +| 128/128 | 124,323 | 63,994 | | +| 128/2048 | 112,676 | 40,572 | 10,829 | +| 128/4096 | 68,170 | 24,616 | 6,744 | +| 500/2000 | | 37,835 | 10,108 | +| 1000/1000 | 79,617 | 31,782 | 9,677 | +| 1000/2000 | 63,766 | 34,734 | 9,151 | +| 2048/128 | 18,088 | 7,307 | | +| 2048/2048 | 52,195 | 20,957 | 6,916 | +| 5000/500 | | 8,456 | 3,457 | +| 20000/2000 | 12,678 | 4,106 | | + +#### Llama 3.1 405B + +| Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | +|---|---|---|---|---| +| 128/128 | 6,281 | 6,740 | 3,814 | 3,627 | +| 128/2048 | 8,020 | 8,151 | 5,348 | 4,340 | +| 128/4096 | 6,345 | 6,608 | 4,741 | 3,116 | +| 500/2000 | 6,244 | 6,540 | 4,724 | 3,994 | +| 1000/1000 | 5,209 | 5,389 | 3,330 | 2,919 | +| 1000/2000 | 4,933 | 5,135 | 3,722 | 2,895 | +| 2048/128 | 749 | 797 | 456 | 453 | +| 2048/2048 | 4,212 | 4,407 | 2,948 | 2,296 | +| 5000/500 | 1,048 | 1,112 | 650 | 610 | +| 20000/2000 | 672 | 739 | 505 | 345 | + +#### Llama 3.1 8B + +| Sequence Length (ISL/OSL) | H200
TP1 (FP8) | H100
TP1 (FP8) | +|---|---|---| +| 128/128 | 27,911 | 28,312 | +| 128/2048 | 26,221 | 22,714 | +| 128/4096 | 18,027 | 14,325 | +| 500/2000 | 20,770 | 17,660 | +| 1000/1000 | 17,744 | 15,220 | +| 1000/2000 | 16,828 | 13,899 | +| 2048/128 | 3,538 | 3,450 | +| 2048/2048 | 12,194 | 9,305 | +| 5000/500 | 3,902 | 3,459 | +| 20000/2000 | 1,804 | 1,351 | + + + + ## Reproducing Benchmarked Results From 2dd90192c65e2c6a81f3a38f3e22e59963697121 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:22:54 -0700 Subject: [PATCH 02/12] add rtx 6000 data to perf-overview Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 80 ++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 7030eac6168..2cc45ee00e2 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -23,6 +23,10 @@ The performance numbers below were collected using the steps described in this d Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). +*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks* +RTX 6000 Pro Blackwell Server Edition data is now included in the perf overview. RTX 6000 systems can benefit from enabling pipeline parallelism (PP) in LLM workloads, so we included several new benchmarks for this GPU at various TP x PP combinations. That data is presented in a separate table for each network. + + ### Hardware The following GPU variants were used for testing: - H100 SXM 80GB (DGX H100) @@ -68,6 +72,18 @@ nvidia/Qwen3-235B-A22B-FP8 | 5000/500 | 9,425 | 9,859 | 4,605 | 3,342 | | 20000/2000 | 6,141 | 6,509 | 2,105 | | +RTX 6000 Pro Blackwell Server Edition +| Sequence Length (ISL/OSL) | **4 GPUs**
TP2,PP2 (FP4) | **8 GPUs**
TP4,PP2 (FP4) | +|---|---|---| +| 128/128 | 18,644 | 23,454 | +| 128/2048 | 12,321 | 21,035 | +| 128/4096 | 7,643 | 13,421 | +| 1000/1000 | 9,476 | 15,781 | +| 1000/2000 | 8,919 | 16,434 | +| 2048/128 | 2,615 | 2,941 | +| 2048/2048 | 6,208 | 10,410 | +| 5000/500 | 2,662 | | + #### Llama 3.3 70B | Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
TP2 (FP8) |
|---|---|---|---|---|
| 128/128 | 11,147 | 12,189 | 3,803 | 6,378 |
| 128/2048 | 9,922 | 11,309 | 4,336 | 6,651 |
| 128/4096 | 6,831 | 7,849 | 2,872 | 4,199 |
| 500/2000 | 7,762 | 9,028 | 3,666 | 5,222 |
| 1000/1000 | 7,007 | 7,326 | 2,909 | 4,205 |
| 1000/2000 | 6,271 | 6,513 | 2,994 | 4,146 |
| 2048/128 | 1,339 | 1,450 | 442 | 762 |
| 2048/2048 | 4,783 | 5,646 | 2,003 | 3,082 |
| 5000/500 | 1,459 | 1,602 | 566 | 898 |
| 20000/2000 | 665 | 755 | 283 | 437 |

+RTX 6000 Pro Blackwell Server Edition
+| Sequence Length (ISL/OSL) | **1 GPU**<br>
TP1,PP1 (FP4) | **2 GPUs**
TP1,PP2 (FP4) | **4 GPUs**
TP1,PP4 (FP4) | **8 GPUs**
TP1,PP8 (FP4) | +|---|---|---|---|---| +| 128/128 | 3,264 | 6,172 | 9,641 | 11,881 | +| 128/2048 | 2,422 | 4,993 | 7,922 | 9,833 | +| 128/4096 | 1,349 | 2,893 | 4,978 | 7,352 | +| 500/2000 | 1,856 | 4,114 | 6,939 | 9,435 | +| 1000/1000 | 1,787 | 3,707 | 5,961 | 8,166 | +| 1000/2000 | 1,594 | 2,993 | 5,274 | 6,943 | +| 2048/128 | 393 | 813 | 1,511 | 2,495 | +| 2048/2048 | 1,074 | 2,336 | 3,870 | 6,078 | +| 5000/500 | 401 | 812 | 1,511 | 2,491 | +| 20000/2000 | 142 | 319 | 630 | 1,148 | + #### Qwen3-235B-A22B | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | @@ -98,6 +128,19 @@ nvidia/Qwen3-235B-A22B-FP8 | 5000/500 | 8,190 | 3,623 | 3,225 | | 20000/2000 | 4,453 | 1,876 | | +RTX 6000 Pro Blackwell Server Edition +| Sequence Length (ISL/OSL) | **8 GPUs**
TP2,PP4 (FP4) | +|---|---| +| 128/128 | 18,890 | +| 128/2048 | 12,494 | +| 128/4096 | 7,715 | +| 500/2000 | 11,157 | +| 1000/1000 | 10,697 | +| 1000/2000 | 10,109 | +| 2048/128 | 3,181 | +| 2048/2048 | 6,712 | +| 5000/500 | 3,173 | + #### Qwen3-30B-A3B | Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) |
|---|---|---|
| 128/128 | 39,827 | 24,107 |
| 128/2048 | 37,844 | 31,311 |
| 128/4096 | 24,953 | 26,337 |
| 500/2000 | 27,817 | 20,604 |
| 1000/1000 | 25,828 | 17,481 |
| 1000/2000 | 22,051 | 17,177 |
| 2048/128 | 6,251 | 6,595 |
| 2048/2048 | 17,554 | 14,719 |
| 5000/500 | 6,142 | 5,908 |
| 20000/2000 | 2,944 | 3,203 |

+RTX 6000 Pro Blackwell Server Edition
+| Sequence Length (ISL/OSL) | **1 GPU**<br>
TP1,PP1 (FP4) | **2 GPUs**
TP2,PP1 (FP4) | **4 GPUs**
TP4,PP1 (FP4) | **8 GPUs**
TP8,PP1 (FP4) | +|---|---|---|---|---| +| 128/128 | 28,011 | 32,646 | 36,819 | 39,490 | +| 128/2048 | 12,540 | 22,744 | 35,715 | 52,676 | +| 128/4096 | 7,491 | 15,049 | 28,139 | 33,895 | +| 500/2000 | 10,695 | 17,266 | 26,175 | 44,088 | +| 1000/1000 | 9,910 | 16,431 | 24,046 | 31,785 | +| 1000/2000 | 8,378 | 13,323 | 25,131 | 28,881 | +| 2048/128 | 3,257 | 3,785 | 4,311 | 4,798 | +| 2048/2048 | 5,908 | 10,679 | 18,134 | 22,391 | +| 5000/500 | 2,530 | 3,799 | 5,212 | 5,965 | +| 20000/2000 | 871 | 1,558 | 2,551 | | + #### Llama 4 Maverick | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | @@ -128,6 +185,19 @@ nvidia/Qwen3-235B-A22B-FP8 | 5000/500 | | 8,456 | 3,457 | | 20000/2000 | 12,678 | 4,106 | | +RTX 6000 Pro Blackwell Server Edition +| Sequence Length (ISL/OSL) | **8 GPUs**
TP4,PP2 (FP4) | +|---|---| +| 128/128 | 30,583 | +| 128/2048 | 19,146 | +| 128/4096 | 12,165 | +| 500/2000 | 17,870 | +| 1000/1000 | 15,954 | +| 1000/2000 | 12,456 | +| 2048/128 | 4,463 | +| 2048/2048 | 10,727 | +| 5000/500 | 4,613 | + #### Llama 3.1 405B | Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | @@ -143,6 +213,16 @@ nvidia/Qwen3-235B-A22B-FP8 | 5000/500 | 1,048 | 1,112 | 650 | 610 | | 20000/2000 | 672 | 739 | 505 | 345 | +RTX 6000 Pro Blackwell Server Edition +| Sequence Length (ISL/OSL) | **8 GPUs**
TP1,PP8 (FP4) | +|---|---| +| 128/128 | 4,452 | +| 128/2048 | 2,981 | +| 1000/1000 | 2,369 | +| 1000/2000 | 1,931 | +| 2048/128 | 579 | +| 2048/2048 | 1,442 | + #### Llama 3.1 8B | Sequence Length (ISL/OSL) | H200
TP1 (FP8) | H100
TP1 (FP8) | From c580d47985a81021a2f76f115a81e5e5cc571026 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:44:31 -0700 Subject: [PATCH 03/12] update spacing in new text for perf-overview Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 2cc45ee00e2..18b15634877 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -23,7 +23,8 @@ The performance numbers below were collected using the steps described in this d Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). -*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks* +*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:* + RTX 6000 Pro Blackwell Server Edition data is now included in the perf overview. RTX 6000 systems can benefit from enabling pipeline parallelism (PP) in LLM workloads, so we included several new benchmarks for this GPU at various TP x PP combinations. That data is presented in a separate table for each network. From 181e81a56a25c8ffa936d4740469b24a71e4dcce Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:22:18 -0700 Subject: [PATCH 04/12] update instructions for reproducing data based on feedback from @flin3500 Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 27 +++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 18b15634877..c024add40e5 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -317,8 +317,9 @@ a model name (HuggingFace reference or path to a local model), a [generated data trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options ``` -The data collected for the v0.21 benchmarks was run with the following file: +The data collected for the v1.0 benchmarks was run with the following files: +Dense / non-MoE models: `llm_options.yml` ```yaml cuda_graph_config: @@ -341,6 +342,30 @@ cuda_graph_config: - 8192 ``` +MoE models: +`llm_options.yml` +```yaml +cuda_graph_config: + enable_attention_dp: true + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 384 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 +``` + In many cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` or lower if out-of-memory errors are encountered. The results will be printed to the terminal upon benchmark completion. 
For example, From 62584cb182d6cae20ea76cb330b6fe7f262b0ecd Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:27:11 -0700 Subject: [PATCH 05/12] update llama 4 scout table with newer data Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index c024add40e5..ba1860643fc 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -60,18 +60,18 @@ nvidia/Qwen3-235B-A22B-FP8 #### Llama 4 Scout -| Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | |---|---|---|---|---| -| 128/128 | 65,845 | 65,169 | 30,354 | | -| 128/2048 | 45,053 | 46,385 | 34,316 | 15,130 | -| 128/4096 | 31,834 | 34,050 | 21,332 | 8,603 | -| 500/2000 | 40,321 | 41,190 | 24,630 | 12,399 | -| 1000/1000 | 35,440 | 36,624 | 21,636 | 12,129 | -| 1000/2000 | 39,713 | 38,274 | 18,499 | 9,838 | -| 2048/128 | 8,139 | 8,546 | 3,699 | 3,253 | -| 2048/2048 | 24,542 | 24,303 | 14,949 | 7,972 | -| 5000/500 | 9,425 | 9,859 | 4,605 | 3,342 | -| 20000/2000 | 6,141 | 6,509 | 2,105 | | +| 128/128 | 25,342 | 19,954 | 30,354 | | +| 128/2048 | 14,699 | 15,238 | 34,316 | 15,130 | +| 128/4096 | 8,932 | 9,556 | 21,332 | 8,603 | +| 500/2000 | 11,977 | 11,795 | 24,630 | 12,399 | +| 1000/1000 | 10,591 | 7,738 | 21,636 | 12,129 | +| 1000/2000 | 9,356 | 8,581 | 18,499 | 9,838 | +| 2048/128 | 3,137 | 3,295 | 3,699 | 3,253 | +| 2048/2048 | 7,152 | 7,464 | 14,949 | 7,972 | +| 5000/500 | 2,937 | 3,107 | 4,605 | 3,342 | +| 20000/2000 | 1,644 | 1,767 | 2,105 | | RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **4 GPUs**
TP2,PP2 (FP4) | **8 GPUs**
TP4,PP2 (FP4) | From 00f37686aa854010d68bcbd0f4ab000221541ca6 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:44:53 -0700 Subject: [PATCH 06/12] update moe / non-moe instructions to include tp / ep / pp options Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index ba1860643fc..2dfb1804724 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -267,6 +267,7 @@ Starting with v0.19, testing was performed using the PyTorch backend - this work | `$osl` | Benchmark output sequence length. | | `$tp_size` | Tensor parallel mapping degree to run the benchmark with | | `$pp_size` | Pipeline parallel mapping degree to run the benchmark with | +| `$ep_size` | Expert parallel mapping degree to run the benchmark with | | `$model_name` | HuggingFace model name eg. meta-llama/Llama-2-7b-hf or use the path to a local weights directory | | `$dataset_file` | Location of the dataset file generated by `prepare_dataset.py` | | `$num_requests` | The number of requests to generate for dataset generation | @@ -313,13 +314,12 @@ To run the benchmark with the generated data set, simply use the `trtllm-bench t run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide a model name (HuggingFace reference or path to a local model), a [generated dataset](#preparing-a-dataset), and a file containing any desired extra options to the LLMApi (details in [tensorrt_llm/llmapi/llm_args.py:LlmArgs](../../../tensorrt_llm/llmapi/llm_args.py)). 
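+As a concrete illustration of how these pieces combine, a minimal sketch of a single dense-model run is shown below. The parallel mapping and the dataset path are placeholder values chosen for the example, not the settings behind any specific table above:
+
+```shell
+# Hypothetical example: Llama 3.3 70B FP8 with 2-way tensor parallelism.
+# dataset.jsonl is assumed to have been generated by prepare_dataset.py beforehand.
+trtllm-bench --tp 2 --pp 1 --model nvidia/Llama-3.3-70B-Instruct-FP8 throughput --dataset dataset.jsonl --backend pytorch --extra_llm_api_options llm_options.yml
+```
+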
+For dense / non-MoE models: + ```shell -trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options ``` -The data collected for the v1.0 benchmarks was run with the following files: - -Dense / non-MoE models: `llm_options.yml` ```yaml cuda_graph_config: @@ -342,7 +342,12 @@ cuda_graph_config: - 8192 ``` -MoE models: +For MoE models: + +```shell +trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +``` + `llm_options.yml` ```yaml cuda_graph_config: From a8234f0e05cb2bfc9321326b084a334e2362956f Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Fri, 19 Sep 2025 00:09:33 -0700 Subject: [PATCH 07/12] fix indenting in llm_options.yml Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 2dfb1804724..cd1dd4eb38c 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -350,8 +350,8 @@ trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throu `llm_options.yml` ```yaml +enable_attention_dp: true cuda_graph_config: - enable_attention_dp: true enable_padding: true batch_sizes: - 1 From ec21c8ddd472a0b5b8f9485c293d592be529702e Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Fri, 19 Sep 2025 00:13:59 -0700 Subject: [PATCH 08/12] add b200 dsr1 data Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index cd1dd4eb38c..67e99d5d125 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -171,6 +171,18 @@ RTX 6000 Pro Blackwell Server Edition | 5000/500 | 2,530 | 3,799 | 5,212 | 5,965 | | 20000/2000 | 871 | 1,558 | 2,551 | | +#### DeepSeek R1 + +| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | +|---|---| +| 128/128 | 46,513 | +| 128/2048 | 62,599 | +| 128/4096 | 44,046 | +| 1000/1000 | 37,634 | +| 1000/2000 | 40,538 | +| 2048/128 | 5,026 | +| 2048/2048 | 28,852 | + #### Llama 4 Maverick | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | From 6342de737a3dc55b519c7918a2588468cdcda7f5 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Fri, 19 Sep 2025 10:44:09 -0700 Subject: [PATCH 09/12] fix formatting on some tables Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 38 ++++++++---------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 67e99d5d125..0d0437aafe5 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -61,8 +61,7 @@ nvidia/Qwen3-235B-A22B-FP8 #### Llama 4 Scout | Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | -|---|---|---|---|---| -| 128/128 | 25,342 | 19,954 | 30,354 | | +|---|---|---|---| | 128/2048 | 14,699 | 15,238 | 34,316 | 15,130 | | 128/4096 | 8,932 | 9,556 | 21,332 | 8,603 | | 500/2000 | 11,977 | 11,795 | 24,630 | 12,399 | @@ -76,7 +75,6 @@ nvidia/Qwen3-235B-A22B-FP8 RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **4 GPUs**
TP2,PP2 (FP4) | **8 GPUs**
TP4,PP2 (FP4) | |---|---|---| -| 128/128 | 18,644 | 23,454 | | 128/2048 | 12,321 | 21,035 | | 128/4096 | 7,643 | 13,421 | | 1000/1000 | 9,476 | 15,781 | @@ -89,7 +87,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
TP2 (FP8) |
|---|---|---|---|---|
| 128/2048 | 9,922 | 11,309 | 4,336 | 6,651 |
| 128/4096 | 6,831 | 7,849 | 2,872 | 4,199 |
| 500/2000 | 7,762 | 9,028 | 3,666 | 5,222 |
| 1000/1000 | 7,007 | 7,326 | 2,909 | 4,205 |
| 1000/2000 | 6,271 | 6,513 | 2,994 | 4,146 |
| 2048/128 | 1,339 | 1,450 | 442 | 762 |
| 2048/2048 | 4,783 | 5,646 | 2,003 | 3,082 |
| 5000/500 | 1,459 | 1,602 | 566 | 898 |
| 20000/2000 | 665 | 755 | 283 | 437 |

RTX 6000 Pro Blackwell Server Edition
| Sequence Length (ISL/OSL) | **1 GPU**<br>
TP1,PP1 (FP4) | **2 GPUs**
TP1,PP2 (FP4) | **4 GPUs**
TP1,PP4 (FP4) | **8 GPUs**
TP1,PP8 (FP4) | |---|---|---|---|---| -| 128/128 | 3,264 | 6,172 | 9,641 | 11,881 | | 128/2048 | 2,422 | 4,993 | 7,922 | 9,833 | | 128/4096 | 1,349 | 2,893 | 4,978 | 7,352 | | 500/2000 | 1,856 | 4,114 | 6,939 | 9,435 | @@ -118,7 +114,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | |---|---|---|---| -| 128/128 | 63,362 | 30,429 | | | 128/2048 | 66,057 | 42,821 | 19,658 | | 128/4096 | 39,496 | 26,852 | 12,447 | | 500/2000 | 57,117 | 28,026 | 18,351 | @@ -132,7 +127,6 @@ RTX 6000 Pro Blackwell Server Edition RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **8 GPUs**
TP2,PP4 (FP4) | |---|---| -| 128/128 | 18,890 | | 128/2048 | 12,494 | | 128/4096 | 7,715 | | 500/2000 | 11,157 | @@ -144,23 +138,21 @@ RTX 6000 Pro Blackwell Server Edition #### Qwen3-30B-A3B -| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | -|---|---|---| -| 128/128 | 39,827 | 24,107 | -| 128/2048 | 37,844 | 31,311 | -| 128/4096 | 24,953 | 26,337 | -| 500/2000 | 27,817 | 20,604 | -| 1000/1000 | 25,828 | 17,481 | -| 1000/2000 | 22,051 | 17,177 | -| 2048/128 | 6,251 | 6,595 | -| 2048/2048 | 17,554 | 14,719 | -| 5000/500 | 6,142 | 5,908 | -| 20000/2000 | 2,944 | 3,203 | +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) |
+|---|---|
+| 128/2048 | 37,844 |
+| 128/4096 | 24,953 |
+| 500/2000 | 27,817 |
+| 1000/1000 | 25,828 |
+| 1000/2000 | 22,051 |
+| 2048/128 | 6,251 |
+| 2048/2048 | 17,554 |
+| 5000/500 | 6,142 |
+| 20000/2000 | 2,944 |

RTX 6000 Pro Blackwell Server Edition
| Sequence Length (ISL/OSL) | **1 GPU**<br>
TP1,PP1 (FP4) | **2 GPUs**
TP2,PP1 (FP4) | **4 GPUs**
TP4,PP1 (FP4) | **8 GPUs**
TP8,PP1 (FP4) | |---|---|---|---|---| -| 128/128 | 28,011 | 32,646 | 36,819 | 39,490 | | 128/2048 | 12,540 | 22,744 | 35,715 | 52,676 | | 128/4096 | 7,491 | 15,049 | 28,139 | 33,895 | | 500/2000 | 10,695 | 17,266 | 26,175 | 44,088 | @@ -175,7 +167,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | |---|---| -| 128/128 | 46,513 | | 128/2048 | 62,599 | | 128/4096 | 44,046 | | 1000/1000 | 37,634 | @@ -187,7 +178,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | |---|---|---|---| -| 128/128 | 124,323 | 63,994 | | | 128/2048 | 112,676 | 40,572 | 10,829 | | 128/4096 | 68,170 | 24,616 | 6,744 | | 500/2000 | | 37,835 | 10,108 | @@ -201,7 +191,6 @@ RTX 6000 Pro Blackwell Server Edition RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **8 GPUs**
TP4,PP2 (FP4) | |---|---| -| 128/128 | 30,583 | | 128/2048 | 19,146 | | 128/4096 | 12,165 | | 500/2000 | 17,870 | @@ -215,7 +204,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | |---|---|---|---|---| -| 128/128 | 6,281 | 6,740 | 3,814 | 3,627 | | 128/2048 | 8,020 | 8,151 | 5,348 | 4,340 | | 128/4096 | 6,345 | 6,608 | 4,741 | 3,116 | | 500/2000 | 6,244 | 6,540 | 4,724 | 3,994 | @@ -229,7 +217,6 @@ RTX 6000 Pro Blackwell Server Edition RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **8 GPUs**
TP1,PP8 (FP4) | |---|---| -| 128/128 | 4,452 | | 128/2048 | 2,981 | | 1000/1000 | 2,369 | | 1000/2000 | 1,931 | @@ -240,7 +227,6 @@ RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | H200
TP1 (FP8) | H100
TP1 (FP8) | |---|---|---| -| 128/128 | 27,911 | 28,312 | | 128/2048 | 26,221 | 22,714 | | 128/4096 | 18,027 | 14,325 | | 500/2000 | 20,770 | 17,660 | From d46d509171de0510a8e715181a9376c3240d9a90 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Fri, 19 Sep 2025 14:06:43 -0700 Subject: [PATCH 10/12] add missing commit Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- docs/source/performance/perf-overview.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 0d0437aafe5..d354d869aa1 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -61,16 +61,16 @@ nvidia/Qwen3-235B-A22B-FP8 #### Llama 4 Scout | Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | -|---|---|---|---| -| 128/2048 | 14,699 | 15,238 | 34,316 | 15,130 | -| 128/4096 | 8,932 | 9,556 | 21,332 | 8,603 | -| 500/2000 | 11,977 | 11,795 | 24,630 | 12,399 | -| 1000/1000 | 10,591 | 7,738 | 21,636 | 12,129 | -| 1000/2000 | 9,356 | 8,581 | 18,499 | 9,838 | -| 2048/128 | 3,137 | 3,295 | 3,699 | 3,253 | -| 2048/2048 | 7,152 | 7,464 | 14,949 | 7,972 | -| 5000/500 | 2,937 | 3,107 | 4,605 | 3,342 | -| 20000/2000 | 1,644 | 1,767 | 2,105 | | +|---------------------------|---------------------|---------------------|-------------------|-------------------| +| 128/2048 | 14,699 | 15,238 | 34,316 | 15,130 | +| 128/4096 | 8,932 | 9,556 | 21,332 | 8,603 | +| 500/2000 | 11,977 | 11,795 | 24,630 | 12,399 | +| 1000/1000 | 10,591 | 7,738 | 21,636 | 12,129 | +| 1000/2000 | 9,356 | 8,581 | 18,499 | 9,838 | +| 2048/128 | 3,137 | 3,295 | 3,699 | 3,253 | +| 2048/2048 | 7,152 | 7,464 | 14,949 | 7,972 | +| 5000/500 | 2,937 | 3,107 | 4,605 | 3,342 | +| 20000/2000 | 1,644 | 1,767 | 2,105 | | RTX 6000 Pro Blackwell Server Edition | Sequence Length (ISL/OSL) | **4 GPUs**
TP2,PP2 (FP4) | **8 GPUs**
TP4,PP2 (FP4) | From 9f028daf064da38bb7865c56b5402f34767c6c52 Mon Sep 17 00:00:00 2001 From: zpatel <22306219+zbpatel@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:19:34 -0700 Subject: [PATCH 11/12] exclude poetry lock from codespell Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com> --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74f830f07db..f149901e45f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,6 +67,7 @@ repos: - tomli # add ignore words list args: ["-L", "Mor,ans,thirdparty"] + exclude: "security_scanning/examples/auto_deploy/poetry.lock" - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.4 hooks: From 7bf6569924ea5ed7ce667686fc4a2b03510ad9a9 Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Mon, 22 Sep 2025 12:49:04 +0800 Subject: [PATCH 12/12] Update .pre-commit-config.yaml Signed-off-by: Yanchao Lu --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f149901e45f..74f830f07db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,7 +67,6 @@ repos: - tomli # add ignore words list args: ["-L", "Mor,ans,thirdparty"] - exclude: "security_scanning/examples/auto_deploy/poetry.lock" - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.4 hooks: