Skip to content

Commit e0fb9c0

Browse files
reasonsolo authored and yuanjingx87 committed
[https://nvbugs/5448767][fix] fix mpi4py deadlocks in pp event-loop (#6976)
Signed-off-by: Lizhi Zhou <[email protected]>
1 parent 77acc6d commit e0fb9c0

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -627,6 +627,12 @@ def _process_iter_stats(self, finished_requests: list[LlmRequest],
627627
batch_state.sample_state.scheduled_requests), req_stats)
628628

629629
def _executor_loop_cleanup(self):
630+
# Unblock receiving processes. When second-last rank quits before last rank,
631+
# last rank will never return from recv_object.
632+
for req in self.send_handles:
633+
if req is not None:
634+
req.wait()
635+
630636
with self.response_cv:
631637
self.is_shutdown = True
632638
self.response_cv.notify_all()
@@ -750,6 +756,7 @@ def _executor_loop_pp(self):
750756

751757
sample_state = self._sample_async(
752758
scheduled_batch, batch_outputs)
759+
assert sample_state is not None, "Sampling failed"
753760
sample_state.host.logits = logits_host
754761
self._update_request_states(scheduled_batch)
755762

@@ -801,6 +808,7 @@ def _executor_loop_pp(self):
801808
if not self.dist.is_second_last_pp_rank:
802809
if self.send_handles[prev_microbatch_id] is not None:
803810
self.send_handles[prev_microbatch_id].wait()
811+
self.send_handles[prev_microbatch_id] = None
804812
needs_logits = (
805813
self._need_return_logits(scheduled_batch)
806814
or (self._need_return_log_probs(scheduled_batch)

0 commit comments

Comments
 (0)