@@ -169,7 +169,6 @@ def __init__(self,
         self.draft_model_engine = draft_model_engine
 
         # enqueue and _fetch_new_requests used data
-        self.active = True
         self.next_req_id = max_batch_size  # The first max_batch_size request IDs are reserved for dummy requests
         self.max_beam_width = max_beam_width
         self.max_draft_len = max_draft_len
@@ -196,7 +195,6 @@ def __init__(self,
         self.max_num_active_requests = model_engine.get_max_num_sequences()
         self.active_requests: List[LlmRequest] = []
         self.expected_num_active_requests = 0
-        self.has_context_request = False
         self.ctx_in_transmission_requests = []
         self.previous_batch: Optional[BatchState] = None
         self.num_scheduled_requests: int = 0
@@ -1148,7 +1146,7 @@ def _check_disagg_gen_transfer_status(self):
     @nvtx_range("_pad_attention_dp_dummy_request")
     def _pad_attention_dp_dummy_request(self):
         """
-        Pad with a dummy request, if required, to ensure every attention_dp rank has at least one active request.
+        Pad with a generation dummy request, if required, to ensure every attention_dp rank has at least one active request.
         """
         if not self.enable_attention_dp:
             return
@@ -1166,8 +1164,8 @@ def _pad_attention_dp_dummy_request(self):
         if self.expected_num_active_requests - num_active_request > 0 and num_active_request == 0:
             llm_request = self.kv_cache_manager.add_dummy_requests(
                 request_ids=[0],
-                is_gen=not self.has_context_request,
-                prepare_resource=not self.has_context_request,
+                is_gen=True,
+                prepare_resource=True,
                 max_num_draft_tokens=self.max_draft_len,
            )[0]
             llm_request.is_attention_dp_dummy = True
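For context, a minimal sketch of the padding path as it reads after this change. It is simplified from the hunks above, not the full implementation: how num_active_request is computed and whether the dummy is appended to active_requests are assumptions, and only the add_dummy_requests arguments shown in the diff are relied on.

    def _pad_attention_dp_dummy_request(self):
        # No padding needed when attention data parallelism is disabled.
        if not self.enable_attention_dp:
            return

        # Assumption: the local rank's active request count drives the decision.
        num_active_request = len(self.active_requests)

        # Only a rank that is expected to host work but currently has nothing
        # active gets padded, so every attention_dp rank stays in lockstep.
        if self.expected_num_active_requests - num_active_request > 0 and num_active_request == 0:
            # Always pad with a generation dummy now. The removed
            # has_context_request flag previously flipped this to a context
            # dummy (is_gen=False, prepare_resource=False) whenever a context
            # request was in flight.
            llm_request = self.kv_cache_manager.add_dummy_requests(
                request_ids=[0],
                is_gen=True,
                prepare_resource=True,
                max_num_draft_tokens=self.max_draft_len,
            )[0]
            llm_request.is_attention_dp_dummy = True
            # Assumption: the dummy joins the active list so downstream
            # scheduling treats it like any other request.
            self.active_requests.append(llm_request)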