Skip to content

Commit 5f419bf

Browse files
sryzatgravescs
authored andcommitted
SPARK-1032. If Yarn app fails before registering, app master stays aroun...
...d long after This reopens https://github.com/apache/incubator-spark/pull/648 against the new repo. Author: Sandy Ryza <[email protected]> Closes #28 from sryza/sandy-spark-1032 and squashes the following commits: 5953f50 [Sandy Ryza] SPARK-1032. If Yarn app fails before registering, app master stays around long after
1 parent edf8a56 commit 5f419bf

File tree

2 files changed

+38
-18
lines changed

2 files changed

+38
-18
lines changed

yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
6565
private val maxNumWorkerFailures = sparkConf.getInt("spark.yarn.max.worker.failures",
6666
math.max(args.numWorkers * 2, 3))
6767

68+
private var registered = false
69+
6870
def run() {
6971
// Setup the directories so things go to yarn approved directories rather
7072
// then user specified and /tmp.
@@ -110,7 +112,12 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
110112
waitForSparkContextInitialized()
111113

112114
// Do this after spark master is up and SparkContext is created so that we can register UI Url
113-
val appMasterResponse: RegisterApplicationMasterResponse = registerApplicationMaster()
115+
synchronized {
116+
if (!isFinished) {
117+
registerApplicationMaster()
118+
registered = true
119+
}
120+
}
114121

115122
// Allocate all containers
116123
allocateWorkers()
@@ -208,7 +215,8 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
208215
var count = 0
209216
val waitTime = 10000L
210217
val numTries = sparkConf.getInt("spark.yarn.ApplicationMaster.waitTries", 10)
211-
while (ApplicationMaster.sparkContextRef.get() == null && count < numTries) {
218+
while (ApplicationMaster.sparkContextRef.get() == null && count < numTries
219+
&& !isFinished) {
212220
logInfo("Waiting for spark context initialization ... " + count)
213221
count = count + 1
214222
ApplicationMaster.sparkContextRef.wait(waitTime)
@@ -341,17 +349,19 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
341349
return
342350
}
343351
isFinished = true
352+
353+
logInfo("finishApplicationMaster with " + status)
354+
if (registered) {
355+
val finishReq = Records.newRecord(classOf[FinishApplicationMasterRequest])
356+
.asInstanceOf[FinishApplicationMasterRequest]
357+
finishReq.setAppAttemptId(appAttemptId)
358+
finishReq.setFinishApplicationStatus(status)
359+
finishReq.setDiagnostics(diagnostics)
360+
// Set tracking url to empty since we don't have a history server.
361+
finishReq.setTrackingUrl("")
362+
resourceManager.finishApplicationMaster(finishReq)
363+
}
344364
}
345-
346-
logInfo("finishApplicationMaster with " + status)
347-
val finishReq = Records.newRecord(classOf[FinishApplicationMasterRequest])
348-
.asInstanceOf[FinishApplicationMasterRequest]
349-
finishReq.setAppAttemptId(appAttemptId)
350-
finishReq.setFinishApplicationStatus(status)
351-
finishReq.setDiagnostics(diagnostics)
352-
// Set tracking url to empty since we don't have a history server.
353-
finishReq.setTrackingUrl("")
354-
resourceManager.finishApplicationMaster(finishReq)
355365
}
356366

357367
/**

yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
6767
private val maxNumWorkerFailures = sparkConf.getInt("spark.yarn.max.worker.failures",
6868
math.max(args.numWorkers * 2, 3))
6969

70+
private var registered = false
71+
7072
def run() {
7173
// Setup the directories so things go to YARN approved directories rather
7274
// than user specified and /tmp.
@@ -99,7 +101,12 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
99101
waitForSparkContextInitialized()
100102

101103
// Do this after Spark master is up and SparkContext is created so that we can register UI Url.
102-
val appMasterResponse: RegisterApplicationMasterResponse = registerApplicationMaster()
104+
synchronized {
105+
if (!isFinished) {
106+
registerApplicationMaster()
107+
registered = true
108+
}
109+
}
103110

104111
// Allocate all containers
105112
allocateWorkers()
@@ -180,7 +187,8 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
180187
var numTries = 0
181188
val waitTime = 10000L
182189
val maxNumTries = sparkConf.getInt("spark.yarn.applicationMaster.waitTries", 10)
183-
while (ApplicationMaster.sparkContextRef.get() == null && numTries < maxNumTries) {
190+
while (ApplicationMaster.sparkContextRef.get() == null && numTries < maxNumTries
191+
&& !isFinished) {
184192
logInfo("Waiting for Spark context initialization ... " + numTries)
185193
numTries = numTries + 1
186194
ApplicationMaster.sparkContextRef.wait(waitTime)
@@ -313,11 +321,13 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
313321
return
314322
}
315323
isFinished = true
316-
}
317324

318-
logInfo("finishApplicationMaster with " + status)
319-
// Set tracking URL to empty since we don't have a history server.
320-
amClient.unregisterApplicationMaster(status, "" /* appMessage */ , "" /* appTrackingUrl */)
325+
logInfo("finishApplicationMaster with " + status)
326+
if (registered) {
327+
// Set tracking URL to empty since we don't have a history server.
328+
amClient.unregisterApplicationMaster(status, "" /* appMessage */ , "" /* appTrackingUrl */)
329+
}
330+
}
321331
}
322332

323333
/**

0 commit comments

Comments
 (0)