@@ -280,7 +280,9 @@ StubLauncher::Launch()
280
280
// Push a dummy message to the message queue so that the stub
281
281
// process is notified that it can release the object stored in
282
282
// shared memory.
283
- stub_message_queue_->Push (DUMMY_MESSAGE);
283
+ if (stub_message_queue_) {
284
+ stub_message_queue_->Push (DUMMY_MESSAGE);
285
+ }
284
286
285
287
// If the model is not initialized, wait for the stub process to exit.
286
288
if (!is_initialized_) {
@@ -299,11 +301,23 @@ StubLauncher::Launch()
299
301
//
300
302
// The reason it is broken into two steps is that creation of the health
301
303
// monitoring thread may take longer which can make the server process think
302
- // that the stub process is unhealthy and return early. Waiting until the
303
- // health thread is spawn would make sure would prevent this issue.
304
- parent_message_queue_->Pop ();
304
+ // that the stub process is unhealthy and return early. Waiting with a longer
305
+ // timeout prevents this issue.
306
+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
307
+ LOG_MESSAGE (
308
+ TRITONSERVER_LOG_VERBOSE,
309
+ " Waiting for the stub health monitoring thread to start" );
310
+
311
+ bi::managed_external_buffer::handle_t message;
312
+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
313
+ if (err != nullptr ) {
314
+ KillStubProcess ();
315
+ }
305
316
306
317
if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
318
+ if (err != nullptr ) {
319
+ throw BackendModelException (err);
320
+ }
307
321
try {
308
322
AutocompleteStubProcess ();
309
323
}
@@ -314,6 +328,7 @@ StubLauncher::Launch()
314
328
TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
315
329
}
316
330
} else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
331
+ RETURN_IF_ERROR (err);
317
332
RETURN_IF_ERROR (ModelInstanceStubProcess ());
318
333
} else {
319
334
return TRITONSERVER_ErrorNew (
@@ -435,7 +450,9 @@ StubLauncher::Launch()
435
450
// Push a dummy message to the message queue so that the stub
436
451
// process is notified that it can release the object stored in
437
452
// shared memory.
438
- stub_message_queue_->Push (DUMMY_MESSAGE);
453
+ if (stub_message_queue_) {
454
+ stub_message_queue_->Push (DUMMY_MESSAGE);
455
+ }
439
456
440
457
// If the model is not initialized, wait for the stub process to exit.
441
458
if (!is_initialized_) {
@@ -456,11 +473,23 @@ StubLauncher::Launch()
456
473
//
457
474
// The reason it is broken into two steps is that creation of the health
458
475
// monitoring thread may take longer which can make the server process think
459
- // that the stub process is unhealthy and return early. Waiting until the
460
- // health thread is spawn would prevent this issue.
461
- parent_message_queue_->Pop ();
476
+ // that the stub process is unhealthy and return early. Waiting with a
477
+ // longer timeout prevents this issue.
478
+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
479
+ LOG_MESSAGE (
480
+ TRITONSERVER_LOG_VERBOSE,
481
+ " Waiting for the stub health monitoring thread to start" );
482
+
483
+ bi::managed_external_buffer::handle_t message;
484
+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
485
+ if (err != nullptr ) {
486
+ KillStubProcess ();
487
+ }
462
488
463
489
if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
490
+ if (err != nullptr ) {
491
+ throw BackendModelException (err);
492
+ }
464
493
try {
465
494
AutocompleteStubProcess ();
466
495
}
@@ -471,6 +500,7 @@ StubLauncher::Launch()
471
500
TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
472
501
}
473
502
} else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
503
+ RETURN_IF_ERROR (err);
474
504
RETURN_IF_ERROR (ModelInstanceStubProcess ());
475
505
} else {
476
506
return TRITONSERVER_ErrorNew (
@@ -592,8 +622,13 @@ StubLauncher::ModelInstanceStubProcess()
592
622
initialize_message->Args () = initialize_map_handle;
593
623
stub_message_queue_->Push (initialize_message->ShmHandle ());
594
624
625
+ const uint64_t initialization_timeout_ms = 5000 ; // 5 sec
626
+ LOG_MESSAGE (
627
+ TRITONSERVER_LOG_VERBOSE,
628
+ " Waiting for the stub process initialization response" );
629
+
595
630
bi::managed_external_buffer::handle_t message;
596
- RETURN_IF_ERROR (ReceiveMessageFromStub (message));
631
+ RETURN_IF_ERROR (ReceiveMessageFromStub (message, initialization_timeout_ms ));
597
632
598
633
std::unique_ptr<IPCMessage> initialize_response_message =
599
634
IPCMessage::LoadFromSharedMemory (shm_pool_, message);
@@ -726,11 +761,11 @@ StubLauncher::KillStubProcess()
726
761
727
762
TRITONSERVER_Error*
728
763
StubLauncher::ReceiveMessageFromStub (
729
- bi::managed_external_buffer::handle_t & message)
764
+ bi::managed_external_buffer::handle_t & message,
765
+ uint64_t timeout_miliseconds)
730
766
{
731
767
bool success = false ;
732
768
while (!success) {
733
- uint64_t timeout_miliseconds = 1000 ;
734
769
{
735
770
boost::posix_time::ptime timeout =
736
771
boost::get_system_time () +
0 commit comments