diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 52be64fba..48a66dcef 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -409,7 +409,15 @@ function cleanup_proc(state, p, log_sink) # If the worker process is still alive, clean it up if wid in workers() - remotecall_wait(_cleanup_proc, wid, state.uid, log_sink) + try + remotecall_wait(_cleanup_proc, wid, state.uid, log_sink) + catch ex + # We allow ProcessExitedException's, which means that the worker + # shutdown halfway through cleanup. + if !(ex isa ProcessExitedException) + rethrow() + end + end end timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing) diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index cac22c0ed..a0e689edf 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -32,9 +32,18 @@ function safepoint(state) if state.halt.set # Force dynamic thunks and listeners to terminate for (inp_chan,out_chan) in values(state.worker_chans) - close(inp_chan) - close(out_chan) + # Closing these channels will fail if the worker died, which we + # allow. + try + close(inp_chan) + close(out_chan) + catch ex + if !(ex isa ProcessExitedException) + rethrow() + end + end end + # Throw out of scheduler throw(SchedulerHaltedException()) end diff --git a/src/sch/eager.jl b/src/sch/eager.jl index b90646c27..8fcb10d92 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -6,7 +6,7 @@ const EAGER_STATE = Ref{Union{ComputeState,Nothing}}(nothing) function eager_context() if EAGER_CONTEXT[] === nothing - EAGER_CONTEXT[] = Context([myid(),workers()...]) + EAGER_CONTEXT[] = Context(procs()) end return EAGER_CONTEXT[] end