From 1fa2ea4ea9793b60a1c54fc4091f906c82d53dce Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Tue, 9 Apr 2024 01:07:26 +0200 Subject: [PATCH 1/2] Allow for workers dying in the middle of cleanup --- src/sch/Sch.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index a6b2ce6b4..cfb680bbd 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -405,7 +405,20 @@ function cleanup_proc(state, p, log_sink) delete!(WORKER_MONITOR_CHANS[wid], state.uid) end end - remote_do(_cleanup_proc, wid, state.uid, log_sink) + + # If the worker process is still alive, clean it up + if wid in workers() + try + remotecall_wait(_cleanup_proc, wid, state.uid, log_sink) + catch ex + # We allow a ProcessExitedException, which means that the worker + # shut down halfway through cleanup. + if !(ex isa ProcessExitedException) + rethrow() + end + end + end + timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing) end From 4d123ac8935eb18f66fbb6183979a0c946c7bd12 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Tue, 9 Apr 2024 13:06:52 +0200 Subject: [PATCH 2/2] Allow for dead workers in safepoint() --- src/sch/dynamic.jl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 7c52bf748..df78a1dd1 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -33,9 +33,18 @@ function safepoint(state) if state.halt.set # Force dynamic thunks and listeners to terminate for (inp_chan,out_chan) in values(state.worker_chans) - close(inp_chan) - close(out_chan) + # Closing these channels will fail if the worker died, which we + # allow. + try + close(inp_chan) + close(out_chan) + catch ex + if !(ex isa ProcessExitedException) + rethrow() + end + end end + # Throw out of scheduler throw(SchedulerHaltedException()) end