Skip to content

Commit c94f397

Browse files
authored
fix: Improve error handling for Python backend model initialization failures (#408) (#413)
1 parent 1ab97b7 commit c94f397

File tree

3 files changed

+59
-19
lines changed

3 files changed

+59
-19
lines changed

src/pb_stub.cc

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1040,11 +1040,13 @@ Stub::~Stub()
10401040
{
10411041
#ifdef TRITON_ENABLE_GPU
10421042
try {
1043-
CUDAHandler& cuda_api = CUDAHandler::getInstance();
1044-
for (auto& m :
1045-
shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) {
1046-
if (m.second != nullptr) {
1047-
cuda_api.CloseCudaHandle(m.first, m.second);
1043+
if (shm_pool_ != nullptr) {
1044+
CUDAHandler& cuda_api = CUDAHandler::getInstance();
1045+
for (auto& m :
1046+
shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) {
1047+
if (m.second != nullptr) {
1048+
cuda_api.CloseCudaHandle(m.first, m.second);
1049+
}
10481050
}
10491051
}
10501052
}
@@ -1053,13 +1055,14 @@ Stub::~Stub()
10531055
}
10541056
#endif
10551057

1056-
{
1058+
// Ensure the interpreter is active before trying to clean up.
1059+
if (Py_IsInitialized()) {
10571060
py::gil_scoped_acquire acquire;
10581061
py::object async_event_loop_local(std::move(async_event_loop_));
10591062
py::object background_futures_local(std::move(background_futures_));
10601063
py::object model_instance_local(std::move(model_instance_));
10611064
}
1062-
stub_instance_.reset();
1065+
10631066
stub_message_queue_.reset();
10641067
parent_message_queue_.reset();
10651068
stub_to_parent_mq_.reset();
@@ -2030,6 +2033,7 @@ main(int argc, char** argv)
20302033
catch (const PythonBackendException& pb_exception) {
20312034
LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what();
20322035
logger.reset();
2036+
stub.reset();
20332037
exit(1);
20342038
}
20352039

src/stub_launcher.cc

Lines changed: 46 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -280,7 +280,9 @@ StubLauncher::Launch()
280280
// Push a dummy message to the message queue so that the stub
281281
// process is notified that it can release the object stored in
282282
// shared memory.
283-
stub_message_queue_->Push(DUMMY_MESSAGE);
283+
if (stub_message_queue_) {
284+
stub_message_queue_->Push(DUMMY_MESSAGE);
285+
}
284286

285287
// If the model is not initialized, wait for the stub process to exit.
286288
if (!is_initialized_) {
@@ -299,11 +301,23 @@ StubLauncher::Launch()
299301
//
300302
// The reason it is broken into two steps is that creation of the health
301303
// monitoring thread may take longer which can make the server process think
302-
// that the stub process is unhealthy and return early. Waiting until the
303-
// health thread is spawn would make sure would prevent this issue.
304-
parent_message_queue_->Pop();
304+
// that the stub process is unhealthy and return early. Waiting with a longer
305+
// timeout prevents this issue.
306+
const uint64_t initialization_timeout_ms = 10000; // 10 sec
307+
LOG_MESSAGE(
308+
TRITONSERVER_LOG_VERBOSE,
309+
"Waiting for the stub health monitoring thread to start");
310+
311+
bi::managed_external_buffer::handle_t message;
312+
auto err = ReceiveMessageFromStub(message, initialization_timeout_ms);
313+
if (err != nullptr) {
314+
KillStubProcess();
315+
}
305316

306317
if (stub_process_kind_ == "AUTOCOMPLETE_STUB") {
318+
if (err != nullptr) {
319+
throw BackendModelException(err);
320+
}
307321
try {
308322
AutocompleteStubProcess();
309323
}
@@ -314,6 +328,7 @@ StubLauncher::Launch()
314328
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what()));
315329
}
316330
} else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") {
331+
RETURN_IF_ERROR(err);
317332
RETURN_IF_ERROR(ModelInstanceStubProcess());
318333
} else {
319334
return TRITONSERVER_ErrorNew(
@@ -435,7 +450,9 @@ StubLauncher::Launch()
435450
// Push a dummy message to the message queue so that the stub
436451
// process is notified that it can release the object stored in
437452
// shared memory.
438-
stub_message_queue_->Push(DUMMY_MESSAGE);
453+
if (stub_message_queue_) {
454+
stub_message_queue_->Push(DUMMY_MESSAGE);
455+
}
439456

440457
// If the model is not initialized, wait for the stub process to exit.
441458
if (!is_initialized_) {
@@ -456,11 +473,23 @@ StubLauncher::Launch()
456473
//
457474
// The reason it is broken into two steps is that creation of the health
458475
// monitoring thread may take longer which can make the server process think
459-
// that the stub process is unhealthy and return early. Waiting until the
460-
// health thread is spawn would prevent this issue.
461-
parent_message_queue_->Pop();
476+
// that the stub process is unhealthy and return early. Waiting with a
477+
// longer timeout prevents this issue.
478+
const uint64_t initialization_timeout_ms = 10000; // 10 sec
479+
LOG_MESSAGE(
480+
TRITONSERVER_LOG_VERBOSE,
481+
"Waiting for the stub health monitoring thread to start");
482+
483+
bi::managed_external_buffer::handle_t message;
484+
auto err = ReceiveMessageFromStub(message, initialization_timeout_ms);
485+
if (err != nullptr) {
486+
KillStubProcess();
487+
}
462488

463489
if (stub_process_kind_ == "AUTOCOMPLETE_STUB") {
490+
if (err != nullptr) {
491+
throw BackendModelException(err);
492+
}
464493
try {
465494
AutocompleteStubProcess();
466495
}
@@ -471,6 +500,7 @@ StubLauncher::Launch()
471500
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what()));
472501
}
473502
} else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") {
503+
RETURN_IF_ERROR(err);
474504
RETURN_IF_ERROR(ModelInstanceStubProcess());
475505
} else {
476506
return TRITONSERVER_ErrorNew(
@@ -592,8 +622,13 @@ StubLauncher::ModelInstanceStubProcess()
592622
initialize_message->Args() = initialize_map_handle;
593623
stub_message_queue_->Push(initialize_message->ShmHandle());
594624

625+
const uint64_t initialization_timeout_ms = 5000; // 5 sec
626+
LOG_MESSAGE(
627+
TRITONSERVER_LOG_VERBOSE,
628+
"Waiting for the stub process initialization response");
629+
595630
bi::managed_external_buffer::handle_t message;
596-
RETURN_IF_ERROR(ReceiveMessageFromStub(message));
631+
RETURN_IF_ERROR(ReceiveMessageFromStub(message, initialization_timeout_ms));
597632

598633
std::unique_ptr<IPCMessage> initialize_response_message =
599634
IPCMessage::LoadFromSharedMemory(shm_pool_, message);
@@ -726,11 +761,11 @@ StubLauncher::KillStubProcess()
726761

727762
TRITONSERVER_Error*
728763
StubLauncher::ReceiveMessageFromStub(
729-
bi::managed_external_buffer::handle_t& message)
764+
bi::managed_external_buffer::handle_t& message,
765+
uint64_t timeout_miliseconds)
730766
{
731767
bool success = false;
732768
while (!success) {
733-
uint64_t timeout_miliseconds = 1000;
734769
{
735770
boost::posix_time::ptime timeout =
736771
boost::get_system_time() +

src/stub_launcher.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,8 @@ class StubLauncher {
147147

148148
// Get a message from the stub process
149149
TRITONSERVER_Error* ReceiveMessageFromStub(
150-
bi::managed_external_buffer::handle_t& message);
150+
bi::managed_external_buffer::handle_t& message,
151+
uint64_t timeout_miliseconds = 1000);
151152

152153
// Wait for stub process
153154
void WaitForStubProcess();

0 commit comments

Comments
 (0)