Skip to content

Commit c5c4cf2

Browse files
committed
Display the executor ID in the Web UI when ExecutorLostFailure happens
1 parent ad3bd0d commit c5c4cf2

File tree

5 files changed

+13
-8
lines changed

5 files changed

+13
-8
lines changed

core/src/main/scala/org/apache/spark/TaskEndReason.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ case object TaskKilled extends TaskFailedReason {
117117
* the task crashed the JVM.
118118
*/
119119
@DeveloperApi
120-
case object ExecutorLostFailure extends TaskFailedReason {
121-
override def toErrorString: String = "ExecutorLostFailure (executor lost)"
120+
case class ExecutorLostFailure(execId: String) extends TaskFailedReason {
121+
override def toErrorString: String = s"ExecutorLostFailure (executor ${execId} lost)"
122122
}
123123

124124
/**

core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ private[spark] class TaskSetManager(
706706
}
707707
// Also re-enqueue any tasks that were running on the node
708708
for ((tid, info) <- taskInfos if info.running && info.executorId == execId) {
709-
handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure)
709+
handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure(execId))
710710
}
711711
// recalculate valid locality levels and waits when executor is lost
712712
recomputeLocality()

core/src/main/scala/org/apache/spark/util/JsonProtocol.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ private[spark] object JsonProtocol {
272272

273273
def taskEndReasonToJson(taskEndReason: TaskEndReason): JValue = {
274274
val reason = Utils.getFormattedClassName(taskEndReason)
275-
val json = taskEndReason match {
275+
val json: JObject = taskEndReason match {
276276
case fetchFailed: FetchFailed =>
277277
val blockManagerAddress = Option(fetchFailed.bmAddress).
278278
map(blockManagerIdToJson).getOrElse(JNothing)
@@ -287,6 +287,8 @@ private[spark] object JsonProtocol {
287287
("Description" -> exceptionFailure.description) ~
288288
("Stack Trace" -> stackTrace) ~
289289
("Metrics" -> metrics)
290+
case ExecutorLostFailure(executorId) =>
291+
("Executor ID" -> executorId)
290292
case _ => Utils.emptyJson
291293
}
292294
("Reason" -> reason) ~ json
@@ -636,7 +638,9 @@ private[spark] object JsonProtocol {
636638
new ExceptionFailure(className, description, stackTrace, metrics)
637639
case `taskResultLost` => TaskResultLost
638640
case `taskKilled` => TaskKilled
639-
case `executorLostFailure` => ExecutorLostFailure
641+
case `executorLostFailure` =>
642+
val executorId = (json \ "Executor ID").extract[String]
643+
ExecutorLostFailure(executorId)
640644
case `unknownReason` => UnknownReason
641645
}
642646
}

core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc
119119
new ExceptionFailure("Exception", "description", null, None),
120120
TaskResultLost,
121121
TaskKilled,
122-
ExecutorLostFailure,
122+
ExecutorLostFailure("0"),
123123
UnknownReason)
124124
var failCount = 0
125125
for (reason <- taskFailedReasons) {

core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class JsonProtocolSuite extends FunSuite {
115115
testTaskEndReason(exceptionFailure)
116116
testTaskEndReason(TaskResultLost)
117117
testTaskEndReason(TaskKilled)
118-
testTaskEndReason(ExecutorLostFailure)
118+
testTaskEndReason(ExecutorLostFailure("100"))
119119
testTaskEndReason(UnknownReason)
120120

121121
// BlockId
@@ -403,7 +403,8 @@ class JsonProtocolSuite extends FunSuite {
403403
assertOptionEquals(r1.metrics, r2.metrics, assertTaskMetricsEquals)
404404
case (TaskResultLost, TaskResultLost) =>
405405
case (TaskKilled, TaskKilled) =>
406-
case (ExecutorLostFailure, ExecutorLostFailure) =>
406+
case (ExecutorLostFailure(execId1), ExecutorLostFailure(execId2)) =>
407+
assert(execId1 === execId2)
407408
case (UnknownReason, UnknownReason) =>
408409
case _ => fail("Task end reasons don't match in types!")
409410
}

0 commit comments

Comments
 (0)