Merged

240 commits
8293182
wip
LucasWilkinson May 19, 2025
37c9bab
enable naive microbatching
LucasWilkinson May 19, 2025
df8f889
support MLA
LucasWilkinson May 20, 2025
f93bdd3
support more args in dp example
LucasWilkinson May 20, 2025
9ccfd09
fix dummy mode
LucasWilkinson May 20, 2025
020269c
added multhreading support
SageMoore May 20, 2025
ffb740a
manually manage stream
LucasWilkinson May 20, 2025
04f11d9
working but only on the same stream
LucasWilkinson May 21, 2025
2259b47
use vllm current_stream
LucasWilkinson May 21, 2025
9c60a62
tp1 working multistream tp > 1 broken
LucasWilkinson May 21, 2025
2a7f25f
fix hang
SageMoore May 21, 2025
a8439e2
dp working no yields
LucasWilkinson May 22, 2025
00f526f
seperate gpu wait
LucasWilkinson May 22, 2025
18bf91e
wip
LucasWilkinson May 23, 2025
2dc3b8b
wip
LucasWilkinson May 23, 2025
9edd082
debugging hang
LucasWilkinson May 23, 2025
952f3c5
tone down prints
LucasWilkinson May 23, 2025
e4419df
better debug utils
LucasWilkinson May 23, 2025
37bdf9f
better logging
LucasWilkinson May 23, 2025
020d9b0
fix dp=2 tp=2 hang
SageMoore May 26, 2025
2f39206
add comment
LucasWilkinson May 27, 2025
7b31e8a
wip seperate comm and compute threads
LucasWilkinson May 27, 2025
a743a35
fixes
LucasWilkinson May 27, 2025
f0b66d6
prints
LucasWilkinson May 27, 2025
5cc573e
misc fixes
SageMoore May 29, 2025
895a6c2
one a2a kernel per microbatch group
SageMoore May 30, 2025
5b0249b
various fixes
SageMoore May 30, 2025
62da375
more fixes
SageMoore May 30, 2025
252bf08
debugging
SageMoore May 31, 2025
0323e29
misc cleanups to prepare for rebase
SageMoore Jun 2, 2025
8f59252
misc cleanups to prepare for rebase
SageMoore Jun 2, 2025
90e46ee
misc cleanups to prepare for rebase
SageMoore Jun 2, 2025
065816d
misc cleanups to prepare for rebase
SageMoore Jun 2, 2025
6645882
comment prepare input
SageMoore Jun 2, 2025
d6eca0c
remove modular kernel
SageMoore Jun 2, 2025
21d9529
revert offline_inference/basic.py
SageMoore Jun 2, 2025
8ea80fc
revert offline_inference/basic.py
SageMoore Jun 2, 2025
92e0cc7
format
SageMoore Jun 2, 2025
44a595f
config format
SageMoore Jun 2, 2025
d4b502a
mla format
SageMoore Jun 2, 2025
8332924
dp format
SageMoore Jun 2, 2025
243eac5
forward context format
SageMoore Jun 2, 2025
d463976
pplx format
SageMoore Jun 2, 2025
e34e441
fa format
SageMoore Jun 2, 2025
919eef9
temporarily remove enable_microbatching
SageMoore Jun 2, 2025
2731e8c
temporarily remove enable_microbatching
SageMoore Jun 2, 2025
18e7d6c
Merge branch 'main' of https://github.com/neuralmagic/vllm into lwilk…
SageMoore Jun 3, 2025
539c0c3
first round of fixes
SageMoore Jun 3, 2025
5f4a501
more fixes
SageMoore Jun 3, 2025
e080e06
fix pplx a2a
SageMoore Jun 3, 2025
2e3484c
debugging
SageMoore Jun 3, 2025
f8848bb
misc fixes. lm_eval still gets a wrong answer but it no longer hangs
SageMoore Jun 4, 2025
8a75b3a
added support for ubatch padding. not working
SageMoore Jun 5, 2025
a8675b7
ubatch padding should work now
SageMoore Jun 5, 2025
a00dabc
more padding work. still gets the wrong answer
SageMoore Jun 6, 2025
05ddc34
misc padding fixes
SageMoore Jun 6, 2025
60499f6
padding is getting correctness but there are still some edgecases tri…
SageMoore Jun 7, 2025
e6e3407
fix ubatch padding to account for the case where the padding would re…
SageMoore Jun 8, 2025
642bf2d
Merge branch 'main' of https://github.com/neuralmagic/vllm into lwilk…
SageMoore Jun 8, 2025
ef3c01c
fix using the same buffer across ubatches
LucasWilkinson Jun 9, 2025
d682f5e
wip cudagraphs
SageMoore Jun 12, 2025
b74c731
more hacking
SageMoore Jun 12, 2025
1d112d9
misc changes
SageMoore Jun 17, 2025
0889f66
Merge branch 'main' of https://github.com/neuralmagic/vllm into lwilk…
SageMoore Jun 18, 2025
ff2dd13
more fixes
SageMoore Jun 18, 2025
a4def24
setup deepepll for ubatching
SageMoore Jun 24, 2025
930efd0
yields now work with deepep_ll
SageMoore Jun 24, 2025
96c0c4e
added initial code for cuda graph capturing ubatches
SageMoore Jun 24, 2025
97dbafa
fix correctness issue with full-cudagraphs + attn splitting
SageMoore Jun 24, 2025
144b148
initial full cudagraphs support. normal runs are working. ubatching d…
SageMoore Jun 25, 2025
44a2b34
add attention splitting to dummy runs
SageMoore Jun 25, 2025
e2ba707
factored out some of the context creation code along with misc commet…
SageMoore Jun 25, 2025
0e2b4bd
more refactoring
SageMoore Jun 25, 2025
54deb61
delete any notion of dummy_ubatch
SageMoore Jun 25, 2025
78228a6
refactor a bunch of misc parameters into a UbatchMetadata class
SageMoore Jun 26, 2025
af68574
reintegrate full cudagraphs
SageMoore Jun 26, 2025
4672c72
capture works replay does not
SageMoore Jun 28, 2025
d833982
random push
SageMoore Jun 30, 2025
57d404b
misc
SageMoore Jun 30, 2025
f7a3ee0
Merge remote-tracking branch 'origin/main' into lwilkinson/attn-slicing
LucasWilkinson Jul 1, 2025
c0efbbb
misc changes
SageMoore Jul 2, 2025
0767d98
fix data_parallel.py
SageMoore Jul 2, 2025
0e499c4
first round of cleanups
SageMoore Jul 2, 2025
3d833aa
cleanup
SageMoore Jul 2, 2025
18f7bfb
ubatching fix
SageMoore Jul 2, 2025
ce3ef95
turn yields on for pplx
SageMoore Jul 2, 2025
be2e163
delete basic-ub.py
SageMoore Jul 3, 2025
9b7edc0
cleanup data_parallel.py
SageMoore Jul 3, 2025
0c03d15
cleanup config.py
SageMoore Jul 3, 2025
3112714
cleanup logger.py
SageMoore Jul 3, 2025
1ca6541
cleanup backends/utils.py
SageMoore Jul 3, 2025
fc562e2
cleanup gpu_worker.py
SageMoore Jul 3, 2025
a9d47e8
remove always_microbatch_if_enabled
SageMoore Jul 3, 2025
631be12
refactoring pplx_prepare_finalize.py
SageMoore Jul 3, 2025
6e2a3c0
minor changes
SageMoore Jul 3, 2025
17a7cee
cleanup deepep ll
SageMoore Jul 3, 2025
1d75a02
remove cudagraph logic from flashmla.py
SageMoore Jul 3, 2025
7e2ff26
cleanup flashmla.py
SageMoore Jul 3, 2025
2f3461a
cleanup flashmla.py
SageMoore Jul 3, 2025
83caef8
cleanups for ubatching.py
SageMoore Jul 3, 2025
7cc5a54
cleanup some of the should_ubatch logic
SageMoore Jul 3, 2025
0056be2
less ARs
SageMoore Jul 3, 2025
f7b6e60
gpu_model_runner cleanup
SageMoore Jul 3, 2025
510e839
more cleanup
SageMoore Jul 3, 2025
bb0645c
separate ubatch and normal runs
SageMoore Jul 3, 2025
3a41a3d
cleanup
SageMoore Jul 3, 2025
06cc133
cleanup
SageMoore Jul 3, 2025
908e9f8
cleanup
SageMoore Jul 3, 2025
10ca263
split some of the ubatching logic out of _run_model
SageMoore Jul 3, 2025
82ae694
comments cleanup etc
SageMoore Jul 3, 2025
1a0e711
_prepare_inputs cleanup
SageMoore Jul 8, 2025
716b032
should_ubatch improvements
SageMoore Jul 8, 2025
dc1b6af
format
SageMoore Jul 8, 2025
bfa828f
format
SageMoore Jul 8, 2025
462c6b0
remove some dummy_run logic
SageMoore Jul 8, 2025
9033056
remove FA changes
SageMoore Jul 8, 2025
376e7eb
minor change
SageMoore Jul 8, 2025
9b5913e
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Jul 9, 2025
b53450e
fix deep ep ll teardown
SageMoore Jul 9, 2025
29a5ac1
remove previous fix
SageMoore Jul 9, 2025
6d83b5e
cache comm stream
SageMoore Jul 22, 2025
1ba3ae8
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Jul 25, 2025
ee70ce0
added splitting
SageMoore Jul 25, 2025
b9ad5e4
misc merge fixes
SageMoore Jul 25, 2025
1c41175
full cudagraphs
SageMoore Jul 25, 2025
582d301
add support for splitting dispatch/combine deepep ll kernels
SageMoore Jul 30, 2025
ba17d95
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Jul 31, 2025
e283eff
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Jul 31, 2025
4819bb8
fix eager mode
SageMoore Aug 5, 2025
6b0c303
misc fixes
SageMoore Aug 5, 2025
5bbfd95
add support for multiple builders in the model runner
SageMoore Aug 8, 2025
2cf200c
remove debug logging
SageMoore Aug 8, 2025
28e7c30
Fix pre-commit error
yewentao256 Aug 11, 2025
44ead56
fix set forward context error
yewentao256 Aug 11, 2025
e526b1c
fix num_tokens_across_dp sizing issue
SageMoore Aug 11, 2025
dd2a94f
fix assert error num_tokens_across_dp is None
yewentao256 Aug 11, 2025
5215c80
Merge commit '6e8d8c4afbddf725b34ef938616701869f5b3462' into sage/dbo…
yewentao256 Aug 13, 2025
9e16220
fix ubatch datatype issue
yewentao256 Aug 13, 2025
6d76bd0
revert kv connector fix
SageMoore Aug 13, 2025
090f485
add support for cutlass mla full cudagraphs
SageMoore Aug 13, 2025
143b09e
fix full cudagraphs for cutlass mla
SageMoore Aug 13, 2025
32de502
fixed acc issue
yewentao256 Aug 19, 2025
fc0aca4
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Aug 19, 2025
31ba624
fix full cudagraphs for DBO
SageMoore Aug 20, 2025
85ee541
Merge branch 'sage/dbo-full-cudagraphs' of https://github.com/neuralm…
SageMoore Aug 20, 2025
9ac75b5
refactor model input slicing
SageMoore Aug 21, 2025
34f0057
get eager mode ubatching working with UBatchWrapper
SageMoore Aug 22, 2025
ac6e221
get eager mode ubatching working with UBatchWrapper
SageMoore Aug 22, 2025
c8fdd62
get cudagraphs working with UBatchWrapper
SageMoore Aug 22, 2025
bca8aa9
cudagraphs should generally work now
SageMoore Aug 25, 2025
a35416e
gpu model runner cleanup
SageMoore Aug 25, 2025
4126a89
misc cleanup
SageMoore Aug 26, 2025
52fd4c1
cleanup
SageMoore Aug 26, 2025
717163a
misc cleanup
SageMoore Aug 26, 2025
197dad1
misc cleanup
SageMoore Aug 26, 2025
7813e15
single alloc buffer
LucasWilkinson Aug 26, 2025
968647a
use hooks for ll overlap; better perf; multinode fixes
LucasWilkinson Aug 26, 2025
57423ee
ht support partially working
LucasWilkinson Aug 27, 2025
a3c2d62
got rid of one all reduce
SageMoore Aug 27, 2025
bff1216
only one AR remains
SageMoore Aug 27, 2025
ee00620
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Aug 27, 2025
6b9bda2
minor merge fix
SageMoore Aug 27, 2025
a5bda74
plumb microbatchin_token_threshold
SageMoore Aug 28, 2025
8f63ba9
fix HT handle issue
yewentao256 Aug 28, 2025
d62286f
temp logging
SageMoore Aug 29, 2025
fe19b91
temp workaround for cudagraph dispatching bug
SageMoore Aug 29, 2025
64457a2
minor log update
SageMoore Aug 29, 2025
a762835
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Aug 29, 2025
528be37
misc ubatch_wrapper updates
SageMoore Sep 2, 2025
e104dfa
comment updates
SageMoore Sep 2, 2025
df6ed10
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 2, 2025
9390dcb
comment updates
SageMoore Sep 2, 2025
6660171
lint
SageMoore Sep 2, 2025
01c70b4
fix attn splitting in dummy run
SageMoore Sep 3, 2025
d464b9e
reenable torch.compile for grouped_topk
SageMoore Sep 3, 2025
53f5071
remove deepep ht changes
SageMoore Sep 3, 2025
76f3c96
remove pplx changes
SageMoore Sep 3, 2025
307ecf0
mla cleanup
SageMoore Sep 3, 2025
21b0f16
remove deepep ht changes
SageMoore Sep 3, 2025
c1c003f
mla cleanup
SageMoore Sep 3, 2025
aebacdc
pplx cleanup
SageMoore Sep 3, 2025
4718a2d
pplx cleanup
SageMoore Sep 3, 2025
0c54343
remove enable_async_comms
SageMoore Sep 3, 2025
b6d162f
padding bugfix
SageMoore Sep 3, 2025
44124af
simplify a2a kernel dispatching
SageMoore Sep 3, 2025
7427b2d
simplify ubatch padding
SageMoore Sep 3, 2025
756d721
dp metadata refactor
SageMoore Sep 3, 2025
9602070
debug cruft
SageMoore Sep 3, 2025
32fb038
fix piecewise compilation in the ubatch wrapper
SageMoore Sep 3, 2025
e42c0e7
moves types around
SageMoore Sep 3, 2025
10518bd
add check to assert we are using deepep_low_latency
SageMoore Sep 3, 2025
9e1f1af
misc gpu model runner refactoring
SageMoore Sep 3, 2025
b2ed6c3
misc gpu model runner refactoring
SageMoore Sep 4, 2025
ba00047
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 4, 2025
49cdc3d
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 4, 2025
ec9f13d
misc review comments
SageMoore Sep 4, 2025
6d31123
move files
SageMoore Sep 4, 2025
2def98d
add maybe_run_recv_hook to __exit__
SageMoore Sep 4, 2025
9da3928
padding refactor (doesnt work)
SageMoore Sep 8, 2025
6b6358a
padding fix
SageMoore Sep 9, 2025
87d300e
remove old ubatch splitting code
SageMoore Sep 9, 2025
46895f3
move splitting code into its own file
SageMoore Sep 9, 2025
4114f5c
remove logging
SageMoore Sep 9, 2025
2276ac6
remove context offset
SageMoore Sep 9, 2025
178ec20
minor yield fix
SageMoore Sep 9, 2025
ef313e5
remove flash attention metadata
SageMoore Sep 9, 2025
813ba08
modular kernel refactoring
SageMoore Sep 9, 2025
9e08d5d
eagle fixes
SageMoore Sep 9, 2025
1e3a145
second ubatch empty
SageMoore Sep 9, 2025
fc18cf4
misc review comments
SageMoore Sep 10, 2025
b99ea7d
comments
SageMoore Sep 10, 2025
0d36d13
comments
SageMoore Sep 10, 2025
d3ec67b
comments
SageMoore Sep 10, 2025
120569a
padding refactor
SageMoore Sep 10, 2025
0e479cb
padding refactor
SageMoore Sep 10, 2025
880783e
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 10, 2025
73848ab
fix cpu model runner
SageMoore Sep 11, 2025
7b239fd
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 11, 2025
9c6c6fd
lint
SageMoore Sep 11, 2025
ff75a86
lint
SageMoore Sep 11, 2025
fa30304
lint
SageMoore Sep 11, 2025
bbec31e
lint
SageMoore Sep 11, 2025
9185ffc
lint
SageMoore Sep 11, 2025
911dbe7
lint
SageMoore Sep 11, 2025
bce1898
lint
SageMoore Sep 11, 2025
588e79a
minor typename change
SageMoore Sep 11, 2025
a3d9969
moe layer refactoring
SageMoore Sep 12, 2025
92081eb
should_ubatch_across_dp refactor
SageMoore Sep 12, 2025
462d035
config option name change
SageMoore Sep 12, 2025
4fba0fe
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 12, 2025
e0d65df
Merge branch 'main' into sage/dbo-full-cudagraphs
LucasWilkinson Sep 14, 2025
77bc884
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 15, 2025
38a25a0
revert dp padding change
SageMoore Sep 15, 2025
025e726
comment fix
SageMoore Sep 15, 2025
b1269ef
review comments
SageMoore Sep 15, 2025
fe098a7
review comments
SageMoore Sep 15, 2025
2c5f726
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 15, 2025
76b6248
Merge branch 'main' into sage/dbo-full-cudagraphs
robertgshaw2-redhat Sep 15, 2025
bc1fcb0
spec decode test fixes
SageMoore Sep 16, 2025
9e86147
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/…
SageMoore Sep 16, 2025
8 changes: 8 additions & 0 deletions examples/offline_inference/data_parallel.py
@@ -87,6 +87,11 @@ def parse_args():
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
@@ -113,6 +118,7 @@ def main(
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
@@ -167,6 +173,7 @@ def start(rank):
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
@@ -227,6 +234,7 @@ def start(rank):
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
),
)
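For reference, a minimal sketch of how the new flag is consumed: the argparse option is threaded through `main()` into the `LLM` constructor, mirroring the hunks above. The model name and sampling call below are placeholders, not taken from this PR.

```python
# Sketch only: mirrors how --enable-dbo flows from parse_args() into LLM().
from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model, not from this diff
    enable_expert_parallel=True,
    enable_dbo=True,  # new flag added by this PR
    gpu_memory_utilization=0.8,
)
outputs = llm.generate(["San Francisco is a"],
                       SamplingParams(temperature=0.0, max_tokens=16))
```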
10 changes: 5 additions & 5 deletions tests/v1/attention/test_attention_splitting.py
@@ -6,7 +6,7 @@

from tests.v1.attention.test_attention_backends import BATCH_SPECS
from tests.v1.attention.utils import create_common_attn_metadata
from vllm.v1.attention.backends.utils import (UbatchSlice,
from vllm.v1.attention.backends.utils import (UBatchSlice,
_make_metadata_with_slice,
slice_query_start_locs,
split_attn_metadata)
@@ -106,7 +106,7 @@ def mixed_small_metadata():
def test_make_metadata_with_slice_decode_batch(small_decode_metadata):
"""Test slicing decode batch metadata"""
# Split first request only
ubatch_slice = UbatchSlice(slice(0, 1), slice(0, 1))
ubatch_slice = UBatchSlice(slice(0, 1), slice(0, 1))

result = _make_metadata_with_slice(ubatch_slice, small_decode_metadata)

@@ -120,7 +120,7 @@ def test_make_metadata_with_slice_decode_batch(small_decode_metadata):

def test_make_metadata_with_slice_mixed_batch(mixed_small_metadata):
"""Test slicing mixed batch metadata"""
ubatch_slice = UbatchSlice(slice(1, 3),
ubatch_slice = UBatchSlice(slice(1, 3),
slice(1, 7)) # Requests 1-3, tokens 1-7

result = _make_metadata_with_slice(ubatch_slice, mixed_small_metadata)
Expand All @@ -137,8 +137,8 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata):
num_tokens = large_decode_metadata.num_reqs
mid_point = num_tokens // 2
ubatch_slices = [
UbatchSlice(slice(0, mid_point), slice(0, mid_point)),
UbatchSlice(slice(mid_point, num_tokens), slice(mid_point,
UBatchSlice(slice(0, mid_point), slice(0, mid_point)),
UBatchSlice(slice(mid_point, num_tokens), slice(mid_point,
num_tokens)),
]

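The rename above is mechanical, but the shape of the API is worth spelling out: based on the constructor calls in these tests, a `UBatchSlice` pairs a request-index slice with a token-index slice, and a two-element list of them describes how a batch is split into microbatches. A small sketch using only what the tests show:

```python
# Sketch based on the test usage above; assumes a decode-only batch where each
# request contributes exactly one token, so request and token slices line up.
from vllm.v1.attention.backends.utils import UBatchSlice

num_reqs = num_tokens = 8
mid = num_tokens // 2
ubatch_slices = [
    UBatchSlice(slice(0, mid), slice(0, mid)),                  # first microbatch
    UBatchSlice(slice(mid, num_reqs), slice(mid, num_tokens)),  # second microbatch
]
```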
8 changes: 6 additions & 2 deletions tests/v1/spec_decode/test_eagle.py
@@ -365,7 +365,9 @@ def create_deterministic_logits(token_ids):
# Mock runner for attention metadata building
proposer.runner = mock.MagicMock()
proposer.runner.attn_groups.append([mock.MagicMock()])
proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder
proposer.runner.attn_groups[0][0].metadata_builders = [
attn_metadata_builder
]

result = proposer.propose(target_token_ids=target_token_ids,
target_positions=target_positions,
@@ -489,7 +491,9 @@ def create_deterministic_logits(token_ids, k: int):
# Mock runner for attention metadata building.
proposer.runner = mock.MagicMock()
proposer.runner.attn_groups.append([mock.MagicMock()])
proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder
proposer.runner.attn_groups[0][0].metadata_builders = [
attn_metadata_builder
]

# Setup inputs for the proposer.
target_token_ids = torch.randint(0,
8 changes: 8 additions & 0 deletions vllm/config/__init__.py
@@ -2848,6 +2848,14 @@ def __post_init__(self):
"when cudagraph_mode piecewise cudagraphs is used, "\
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

if self.parallel_config.enable_dbo:
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
assert a2a_backend == "deepep_low_latency", \
"Microbatching currently only supports the deepep_low_latency "\
f"all2all backend. {a2a_backend} is not supported. To fix set "\
"the VLLM_ALL2ALL_BACKEND environment variable to "\
"deepep_low_latency and install the DeepEP kerenls."

if not self.instance_id:
self.instance_id = random_uuid()[:5]

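In practice the assert above means DBO can only be enabled when the DeepEP low-latency all2all backend is selected. A minimal sketch of the required environment, assuming the engine is constructed afterwards as usual:

```python
# Sketch only: the backend must be selected via the environment variable
# checked above (envs.VLLM_ALL2ALL_BACKEND) before the engine is built.
import os

os.environ["VLLM_ALL2ALL_BACKEND"] = "deepep_low_latency"
# ...then construct the engine with enable_dbo=True, e.g. as in the
# data_parallel.py example earlier in this diff.
```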
8 changes: 8 additions & 0 deletions vllm/config/parallel.py
@@ -137,6 +137,14 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""

enable_dbo: bool = False
"""Enable microbatching for the model executor."""

dbo_decode_token_threshold: int = 32
"""The threshold for microbatching. If the number of tokens in the
request is greater than this threshold, microbatching will be used.
Otherwise, the request will be processed in a single batch."""

ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

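A hedged reading of the two new fields, expressed as a hypothetical helper (not part of this PR) so the threshold semantics in the docstring are concrete:

```python
# Hypothetical helper, not in the PR: paraphrases the docstring of
# dbo_decode_token_threshold. Microbatching kicks in only when DBO is enabled
# and the token count is strictly greater than the threshold.
def would_microbatch(enable_dbo: bool,
                     num_tokens: int,
                     dbo_decode_token_threshold: int = 32) -> bool:
    return enable_dbo and num_tokens > dbo_decode_token_threshold

assert would_microbatch(True, 64)
assert not would_microbatch(True, 16)
assert not would_microbatch(False, 64)
```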
5 changes: 0 additions & 5 deletions vllm/distributed/device_communicators/all2all.py
@@ -251,9 +251,4 @@ def get_handle(self, kwargs):
logger.debug("DeepEP all2all args %s", buffer_kwargs)
handle: deep_ep.Buffer = self.handle_cache.get_or_create(
buffer_kwargs, deep_ep.Buffer)
# It is dangerous to set num sms outside this function. num_sms is not
# a part of the hash-key that identifies this object. If we are in a
# situation where we make objects with different num_sms, the hash key
# in get_or_create must be updated.
handle.set_num_sms(self.num_sms)
return handle
10 changes: 10 additions & 0 deletions vllm/engine/arg_utils.py
@@ -327,6 +327,9 @@ class EngineArgs:
data_parallel_hybrid_lb: bool = False
data_parallel_backend: str = ParallelConfig.data_parallel_backend
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
enable_dbo: bool = ParallelConfig.enable_dbo
dbo_decode_token_threshold: int = \
ParallelConfig.dbo_decode_token_threshold
eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
enable_eplb: bool = ParallelConfig.enable_eplb
expert_placement_strategy: ExpertPlacementStrategy = \
@@ -695,6 +698,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parallel_group.add_argument(
"--enable-expert-parallel",
**parallel_kwargs["enable_expert_parallel"])
parallel_group.add_argument("--enable-dbo",
**parallel_kwargs["enable_dbo"])
parallel_group.add_argument(
"--dbo-decode-token-threshold",
**parallel_kwargs["dbo_decode_token_threshold"])
Comment on lines +703 to +705

Collaborator

What is the future plan for this argument? Will we add a separate --dbo-prefill-token-threshold? Could there be one argument instead?

Contributor Author

Yep we are planning to add a prefill version of this argument.

parallel_group.add_argument("--enable-eplb",
**parallel_kwargs["enable_eplb"])
parallel_group.add_argument("--eplb-config",
@@ -1339,6 +1347,8 @@ def create_engine_config(
data_parallel_backend=self.data_parallel_backend,
data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
enable_expert_parallel=self.enable_expert_parallel,
enable_dbo=self.enable_dbo,
dbo_decode_token_threshold=self.dbo_decode_token_threshold,
enable_eplb=self.enable_eplb,
eplb_config=self.eplb_config,
expert_placement_strategy=self.expert_placement_strategy,
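For completeness, a sketch of the same knobs at the `EngineArgs` level; only the field names come from this diff, the values are illustrative:

```python
# Sketch only: the new fields mirror the CLI flags added above and are copied
# into ParallelConfig by create_engine_config() (see the last hunk).
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
    enable_expert_parallel=True,
    enable_dbo=True,                   # --enable-dbo
    dbo_decode_token_threshold=32,     # --dbo-decode-token-threshold
)
```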
121 changes: 101 additions & 20 deletions vllm/forward_context.py
@@ -14,6 +14,7 @@
from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata
@@ -97,6 +98,53 @@ def num_tokens_across_dp(num_tokens: int, dp_size: int,
dist.all_reduce(num_tokens_tensor, group=group)
return num_tokens_tensor.cpu()

@staticmethod
def should_ubatch_across_dp(
should_ubatch: bool, orig_num_tokens_per_ubatch: int,
padded_num_tokens_per_ubatch: int, dp_size: int,
dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]:
"""
1. Decides if each DP rank is going to microbatch. Either all ranks
run with microbatching or none of them do. If this function decides
not to run with microbatching, it "aborts", meaning that no padding
information is returned to the caller and (False, None) is returned.

2. Determines the total number of tokens that each rank will run.
All ranks are padded out so that they run with the same number
of tokens.

Returns: tuple[
should_ubatch: Are all DP ranks going to microbatch?
num_tokens_after_padding: A tensor containing the total number of
tokens per-microbatch for each DP rank including padding. Will be
None if should_ubatch is False.
]
"""

device = current_platform.device_type
tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
tensor[0][dp_rank] = orig_num_tokens_per_ubatch
tensor[1][dp_rank] = padded_num_tokens_per_ubatch
tensor[2][dp_rank] = 1 if should_ubatch else 0

from vllm.distributed.parallel_state import get_dp_group
dist.all_reduce(tensor, group=get_dp_group().device_group)

result: bool = bool(torch.all(tensor[2] == 1).item())
if not result:
return result, None

orig_num_tokens_tensor = tensor[0, :]
padded_num_tokens_tensor = tensor[1, :]

orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
logger.debug("Aborting ubatching %s %s", orig_min_num_tokens,
padded_max_num_tokens)
return False, None
return result, padded_num_tokens_tensor.cpu()

@staticmethod
def make(
parallel_config: ParallelConfig,
@@ -119,14 +167,15 @@ def make(

# If num_tokens_across_dp is None, it will be computed by all_reduce
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
assert (num_tokens_across_dp is None
or num_tokens_across_dp[dp_rank] == batchsize)
assert (num_tokens_across_dp is None or num_tokens_across_dp[dp_rank]
== batchsize), f"{num_tokens_across_dp[dp_rank]} {batchsize}"
if num_tokens_across_dp is None:
num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
batchsize, dp_size, dp_rank)
max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu,
num_tokens_across_dp)

@contextmanager
def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int):
@@ -179,9 +228,12 @@ class ForwardContext:
Type AttentionMetadata for v0,
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
attention layer to its attention metadata
set dynamically for each forward pass
Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
for each microbatch.
Set dynamically for each forward pass
"""
attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"]]
attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"],
list[dict[str, "AttentionMetadata"]]]
# TODO: remove after making all virtual_engines share the same kv cache
virtual_engine: int # set dynamically for each forward pass
# set dynamically for each forward pass
@@ -191,6 +243,8 @@ class ForwardContext:
cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE
batch_descriptor: Optional[BatchDescriptor] = None

ubatch_slices: Optional[UBatchSlices] = None

def __post_init__(self):
assert self.cudagraph_runtime_mode in [
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
@@ -208,6 +262,39 @@ def get_forward_context() -> ForwardContext:
return _forward_context


def create_forward_context(
attn_metadata: Any,
vllm_config: VllmConfig,
virtual_engine: int = 0,
dp_metadata: Optional[DPMetadata] = None,
cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor: Optional[BatchDescriptor] = None,
ubatch_slices: Optional[UBatchSlices] = None):
return ForwardContext(no_compile_layers=vllm_config.compilation_config.
static_forward_context,
virtual_engine=virtual_engine,
attn_metadata=attn_metadata,
dp_metadata=dp_metadata,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=batch_descriptor,
ubatch_slices=ubatch_slices)


@contextmanager
def override_forward_context(forward_context: Optional[ForwardContext]):
"""A context manager that overrides the current forward context.
This is used to override the forward context for a specific
forward pass.
"""
global _forward_context
prev_context = _forward_context
_forward_context = forward_context
try:
yield
finally:
_forward_context = prev_context


@contextmanager
def set_forward_context(
attn_metadata: Any,
@@ -216,7 +303,8 @@ def set_forward_context(
num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None,
cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor: Optional[BatchDescriptor] = None):
batch_descriptor: Optional[BatchDescriptor] = None,
ubatch_slices: Optional[UBatchSlices] = None):
"""A context manager that stores the current forward context,
can be attention metadata, etc.
Here we can inject common logic for every model forward pass.
@@ -225,27 +313,22 @@ def set_forward_context(
need_to_track_batchsize = track_batchsize and attn_metadata is not None
if need_to_track_batchsize:
forward_start_time = time.perf_counter()

dp_metadata: Optional[DPMetadata] = None
if vllm_config.parallel_config.data_parallel_size > 1 and (
attn_metadata is not None or num_tokens is not None):
dp_metadata = DPMetadata.make(vllm_config.parallel_config,
attn_metadata, num_tokens or 0,
num_tokens_across_dp)

global _forward_context
prev_context = _forward_context
_forward_context = ForwardContext(
no_compile_layers=vllm_config.compilation_config.
static_forward_context,
virtual_engine=virtual_engine,
attn_metadata=attn_metadata,
dp_metadata=dp_metadata,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=batch_descriptor,
)
forward_context = create_forward_context(attn_metadata, vllm_config,
virtual_engine, dp_metadata,
cudagraph_runtime_mode,
batch_descriptor, ubatch_slices)

try:
yield
with override_forward_context(forward_context):
yield
finally:
global last_logging_time, batchsize_logging_interval
if need_to_track_batchsize:
@@ -282,5 +365,3 @@ def set_forward_context(
logger.info(("Batchsize forward time stats "
"(batchsize, count, median_time(ms)): %s"),
forward_stats)

_forward_context = prev_context
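
To make the coordination logic in `should_ubatch_across_dp` easier to follow, here is a single-process sketch of the same decision. The distributed `all_reduce` is replaced by building the full `(3, dp_size)` tensor directly, and `is_second_ubatch_empty` is passed in as a stand-in because its implementation lives in `vllm/v1/worker/ubatch_utils.py` and is not part of this hunk.

```python
# Single-process emulation of the DP microbatching decision shown above.
import torch


def should_ubatch_across_dp_sim(per_rank, second_ubatch_empty):
    """per_rank: one (orig_tokens_per_ubatch, padded_tokens_per_ubatch,
    wants_ubatch) tuple per DP rank. In the real code each rank fills only
    its own column and an all_reduce over the DP group makes the tensor
    identical on every rank."""
    tensor = torch.tensor(per_rank, dtype=torch.int32).T  # shape (3, dp_size)

    if not bool(torch.all(tensor[2] == 1).item()):
        return False, None  # any rank opting out aborts microbatching for all

    orig_min_num_tokens = int(tensor[0].min().item())
    padded_max_num_tokens = int(tensor[1].max().item())
    if second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
        return False, None  # padding would leave an empty second microbatch

    return True, tensor[1]  # per-rank padded token counts per microbatch


# Every rank wants to microbatch and padding is sane -> (True, tensor([48, 48]))
print(should_ubatch_across_dp_sim([(40, 48, 1), (44, 48, 1)], lambda o, p: False))
# One rank opts out -> (False, None)
print(should_ubatch_across_dp_sim([(40, 48, 1), (44, 48, 0)], lambda o, p: False))
```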