Skip to content

Commit bf4e613

Browse files
committed
Merge pull request alteryx#143 from rxin/scheduler-hang
Ignore a task update status if the executor doesn't exist anymore. Otherwise if the scheduler receives a task update message when the executor's been removed, the scheduler would hang. It is pretty hard to add unit tests for these right now because it is hard to mock the cluster scheduler. We should do that once @kayousterhout finishes merging the local scheduler and the cluster scheduler.
2 parents 9f7b9bb + a02eed6 commit bf4e613

File tree

1 file changed

+11
-3
lines changed

1 file changed

+11
-3
lines changed

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,14 @@ class CoarseGrainedSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Ac
8787
case StatusUpdate(executorId, taskId, state, data) =>
8888
scheduler.statusUpdate(taskId, state, data.value)
8989
if (TaskState.isFinished(state)) {
90-
freeCores(executorId) += 1
91-
makeOffers(executorId)
90+
if (executorActor.contains(executorId)) {
91+
freeCores(executorId) += 1
92+
makeOffers(executorId)
93+
} else {
94+
// Ignoring the update since we don't know about the executor.
95+
val msg = "Ignored task status update (%d state %s) from unknown executor %s with ID %s"
96+
logWarning(msg.format(taskId, state, sender, executorId))
97+
}
9298
}
9399

94100
case ReviveOffers =>
@@ -175,7 +181,9 @@ class CoarseGrainedSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Ac
175181
Props(new DriverActor(properties)), name = CoarseGrainedSchedulerBackend.ACTOR_NAME)
176182
}
177183

178-
private val timeout = Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")
184+
private val timeout = {
185+
Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")
186+
}
179187

180188
def stopExecutors() {
181189
try {

0 commit comments

Comments
 (0)