@@ -1478,9 +1478,11 @@ private[spark] class DAGScheduler(
                 mapOutputTracker.unregisterAllMapOutput(failedMapStage.shuffleDep.shuffleId)

               case failedResultStage: ResultStage =>
-                // Mark all the partitions of the result stage to be not finished, to ensure retry
-                // all the tasks on resubmitted stage attempt.
-                failedResultStage.activeJob.map(_.resetAllPartitions())
+                // Abort the failed result stage since we may have committed output for some
+                // partitions.
+                val reason = "Could not recover from a failed barrier ResultStage. Most recent " +
+                  s"failure reason: $failureMessage"
+                abortStage(failedResultStage, reason, None)
             }
           }

@@ -1553,62 +1555,75 @@ private[spark] class DAGScheduler(

         // Always fail the current stage and retry all the tasks when a barrier task fail.
         val failedStage = stageIdToStage(task.stageId)
-        logInfo(s"Marking $failedStage (${failedStage.name}) as failed due to a barrier task " +
-          "failed.")
-        val message = s"Stage failed because barrier task $task finished unsuccessfully.\n" +
-          failure.toErrorString
-        try {
-          // killAllTaskAttempts will fail if a SchedulerBackend does not implement killTask.
-          val reason = s"Task $task from barrier stage $failedStage (${failedStage.name}) failed."
-          taskScheduler.killAllTaskAttempts(stageId, interruptThread = false, reason)
-        } catch {
-          case e: UnsupportedOperationException =>
-            // Cannot continue with barrier stage if failed to cancel zombie barrier tasks.
-            // TODO SPARK-24877 leave the zombie tasks and ignore their completion events.
-            logWarning(s"Could not kill all tasks for stage $stageId", e)
-            abortStage(failedStage, "Could not kill zombie barrier tasks for stage " +
-              s"$failedStage (${failedStage.name})", Some(e))
-        }
-        markStageAsFinished(failedStage, Some(message))
+        if (failedStage.latestInfo.attemptNumber != task.stageAttemptId) {
+          logInfo(s"Ignoring task failure from $task as it's from $failedStage attempt" +
+            s" ${task.stageAttemptId} and there is a more recent attempt for that stage " +
+            s"(attempt ${failedStage.latestInfo.attemptNumber}) running")
+        } else {
+          logInfo(s"Marking $failedStage (${failedStage.name}) as failed due to a barrier task " +
+            "failed.")
+          val message = s"Stage failed because barrier task $task finished unsuccessfully.\n" +
+            failure.toErrorString
+          try {
+            // killAllTaskAttempts will fail if a SchedulerBackend does not implement killTask.
+            val reason = s"Task $task from barrier stage $failedStage (${failedStage.name}) " +
+              "failed."
+            taskScheduler.killAllTaskAttempts(stageId, interruptThread = false, reason)
+          } catch {
+            case e: UnsupportedOperationException =>
+              // Cannot continue with barrier stage if failed to cancel zombie barrier tasks.
+              // TODO SPARK-24877 leave the zombie tasks and ignore their completion events.
+              logWarning(s"Could not kill all tasks for stage $stageId", e)
+              abortStage(failedStage, "Could not kill zombie barrier tasks for stage " +
+                s"$failedStage (${failedStage.name})", Some(e))
+          }
+          markStageAsFinished(failedStage, Some(message))

-        failedStage.failedAttemptIds.add(task.stageAttemptId)
-        // TODO Refactor the failure handling logic to combine similar code with that of
-        // FetchFailed.
-        val shouldAbortStage =
-          failedStage.failedAttemptIds.size >= maxConsecutiveStageAttempts ||
-            disallowStageRetryForTest
+          failedStage.failedAttemptIds.add(task.stageAttemptId)
+          // TODO Refactor the failure handling logic to combine similar code with that of
+          // FetchFailed.
+          val shouldAbortStage =
+            failedStage.failedAttemptIds.size >= maxConsecutiveStageAttempts ||
+              disallowStageRetryForTest

-        if (shouldAbortStage) {
-          val abortMessage = if (disallowStageRetryForTest) {
-            "Barrier stage will not retry stage due to testing config. Most recent failure " +
-              s"reason: $message"
+          if (shouldAbortStage) {
+            val abortMessage = if (disallowStageRetryForTest) {
+              "Barrier stage will not retry stage due to testing config. Most recent failure " +
+                s"reason: $message"
+            } else {
+              s"""$failedStage (${failedStage.name})
+                 |has failed the maximum allowable number of
+                 |times: $maxConsecutiveStageAttempts.
+                 |Most recent failure reason: $message
+               """.stripMargin.replaceAll("\n", " ")
+            }
+            abortStage(failedStage, abortMessage, None)
           } else {
-            s"""$failedStage (${failedStage.name})
-               |has failed the maximum allowable number of
-               |times: $maxConsecutiveStageAttempts.
-               |Most recent failure reason: $message""".stripMargin.replaceAll("\n", " ")
-          }
-          abortStage(failedStage, abortMessage, None)
-        } else {
-          failedStage match {
-            case failedMapStage: ShuffleMapStage =>
-              // Mark all the map as broken in the map stage, to ensure retry all the tasks on
-              // resubmitted stage attempt.
-              mapOutputTracker.unregisterAllMapOutput(failedMapStage.shuffleDep.shuffleId)
-
-            case failedResultStage: ResultStage =>
-              // Mark all the partitions of the result stage to be not finished, to ensure retry
-              // all the tasks on resubmitted stage attempt.
-              failedResultStage.activeJob.map(_.resetAllPartitions())
-          }
+            failedStage match {
+              case failedMapStage: ShuffleMapStage =>
+                // Mark all the map as broken in the map stage, to ensure retry all the tasks on
+                // resubmitted stage attempt.
+                mapOutputTracker.unregisterAllMapOutput(failedMapStage.shuffleDep.shuffleId)

-          // update failedStages and make sure a ResubmitFailedStages event is enqueued
-          failedStages += failedStage
-          logInfo(s"Resubmitting $failedStage (${failedStage.name}) due to barrier stage " +
-            "failure.")
-          messageScheduler.schedule(new Runnable {
-            override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
-          }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
+              case failedResultStage: ResultStage =>
+                // Abort the failed result stage since we may have committed output for some
+                // partitions.
+                val reason = "Could not recover from a failed barrier ResultStage. Most recent " +
+                  s"failure reason: $message"
+                abortStage(failedResultStage, reason, None)
+            }
+            // In case multiple task failures triggered for a single stage attempt, ensure we only
+            // resubmit the failed stage once.
+            val noResubmitEnqueued = !failedStages.contains(failedStage)
+            failedStages += failedStage
+            if (noResubmitEnqueued) {
+              logInfo(s"Resubmitting $failedStage (${failedStage.name}) due to barrier stage " +
+                "failure.")
+              messageScheduler.schedule(new Runnable {
+                override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
+              }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
+            }
+          }
         }

       case Resubmitted =>
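
For reference, a minimal standalone sketch of the "resubmit once" guard added in the second hunk. The object name ResubmitOnceSketch, the plain Int stage ids, the executor, and the timeout value are hypothetical stand-ins rather than Spark's own classes; the point is only that the first barrier task failure for a stage records it in the failed set and schedules a single delayed resubmission, while later failures for the same stage find it already recorded and skip scheduling.

import java.util.concurrent.{Executors, TimeUnit}
import scala.collection.mutable

object ResubmitOnceSketch {
  private val failedStages = mutable.HashSet[Int]()   // stand-in for the scheduler's failed-stage set
  private val messageScheduler = Executors.newSingleThreadScheduledExecutor()
  private val resubmitTimeoutMs = 200L                // assumed delay, in the spirit of RESUBMIT_TIMEOUT

  def onBarrierTaskFailed(stageId: Int): Unit = {
    // Only the first failure for this stage enqueues a resubmission; subsequent
    // failures see the stage already recorded and do not schedule again.
    val noResubmitEnqueued = !failedStages.contains(stageId)
    failedStages += stageId
    if (noResubmitEnqueued) {
      messageScheduler.schedule(new Runnable {
        override def run(): Unit = println(s"Resubmitting stage $stageId")
      }, resubmitTimeoutMs, TimeUnit.MILLISECONDS)
    }
  }
}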