@@ -33,6 +33,7 @@ import org.apache.hadoop.yarn.api.records._
33
33
import org .apache .hadoop .yarn .conf .YarnConfiguration
34
34
35
35
import org .apache .spark .{Logging , SecurityManager , SparkConf , SparkContext , SparkEnv }
36
+ import org .apache .spark .SparkException
36
37
import org .apache .spark .deploy .SparkHadoopUtil
37
38
import org .apache .spark .deploy .history .HistoryServer
38
39
import org .apache .spark .scheduler .cluster .CoarseGrainedSchedulerBackend
@@ -136,13 +137,12 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
136
137
runExecutorLauncher(securityMgr)
137
138
}
138
139
} catch {
139
- case e : Throwable => {
140
+ case e : Exception =>
140
141
// catch everything else if not specifically handled
141
142
logError(" Uncaught exception: " , e)
142
143
finish(FinalApplicationStatus .FAILED ,
143
144
ApplicationMaster .EXIT_UNCAUGHT_EXCEPTION ,
144
145
" Uncaught exception: " + e.getMessage())
145
- }
146
146
}
147
147
exitCode
148
148
}
@@ -213,6 +213,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
213
213
214
214
private def runDriver (securityMgr : SecurityManager ): Unit = {
215
215
addAmIpFilter()
216
+ setupSystemSecurityManager()
216
217
userClassThread = startUserClass()
217
218
218
219
// This a bit hacky, but we need to wait until the spark.driver.port property has
@@ -258,7 +259,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
258
259
val t = new Thread {
259
260
override def run () {
260
261
var failureCount = 0
261
- while (! finished && ! Thread .currentThread().isInterrupted() ) {
262
+ while (! finished) {
262
263
try {
263
264
if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
264
265
finish(FinalApplicationStatus .FAILED ,
@@ -328,7 +329,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
328
329
sparkContextRef.synchronized {
329
330
var count = 0
330
331
val waitTime = 10000L
331
- val numTries = sparkConf.getInt(" spark.yarn.ApplicationMaster .waitTries" , 10 )
332
+ val numTries = sparkConf.getInt(" spark.yarn.applicationMaster .waitTries" , 10 )
332
333
while (sparkContextRef.get() == null && count < numTries && ! finished) {
333
334
logInfo(" Waiting for spark context initialization ... " + count)
334
335
count = count + 1
@@ -355,7 +356,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
355
356
// spark driver should already be up since it launched us, but we don't want to
356
357
// wait forever, so wait 100 seconds max to match the cluster mode setting.
357
358
// Leave this config unpublished for now.
358
- val numTries = sparkConf.getInt(" spark.yarn.ApplicationMaster.client .waitTries" , 1000 )
359
+ val numTries = sparkConf.getInt(" spark.yarn.applicationMaster .waitTries" , 1000 )
359
360
360
361
while (! driverUp && ! finished && count < numTries) {
361
362
try {
@@ -373,7 +374,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
373
374
}
374
375
375
376
if (! driverUp) {
376
- throw new Exception (" Failed to connect to driver!" )
377
+ throw new SparkException (" Failed to connect to driver!" )
377
378
}
378
379
379
380
sparkConf.set(" spark.driver.host" , driverHost)
@@ -403,55 +404,59 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
403
404
actor ! AddWebUIFilter (amFilter, params, proxyBase)
404
405
}
405
406
}
407
+ '
408
+ /**
409
+ * This system security manager applies to the entire process.
410
+ * It's main purpose is to handle the case if the user code does a System.exit.
411
+ * This allows us to catch that and properly set the YARN application status and
412
+ * cleanup if needed.
413
+ */
414
+ private def setupSystemSecurityManager () = {
415
+ try {
416
+ var stopped = false
417
+ System .setSecurityManager(new java.lang.SecurityManager () {
418
+ override def checkExit (paramInt : Int ) {
419
+ if (! stopped) {
420
+ logInfo(" In securityManager checkExit, exit code: " + paramInt)
421
+ if (paramInt == 0 ) {
422
+ finish(FinalApplicationStatus .SUCCEEDED , ApplicationMaster .EXIT_SUCCESS )
423
+ } else {
424
+ finish(FinalApplicationStatus .FAILED ,
425
+ paramInt,
426
+ " User class exited with non-zero exit code" )
427
+ }
428
+ stopped = true
429
+ }
430
+ }
431
+ // required for the checkExit to work properly
432
+ override def checkPermission (perm : java.security.Permission ): Unit = {
433
+ }
434
+ })
435
+ }
436
+ catch {
437
+ case e : SecurityException =>
438
+ finish(FinalApplicationStatus .FAILED ,
439
+ ApplicationMaster .EXIT_SECURITY ,
440
+ " Error in setSecurityManager" )
441
+ logError(" Error in setSecurityManager:" , e)
442
+ }
443
+ }
406
444
407
445
/**
408
- * Start the user class, which contains the spark driver.
446
+ * Start the user class, which contains the spark driver, in a separate Thread .
409
447
* If the main routine exits cleanly or exits with System.exit(0) we
410
448
* assume it was successful, for all other cases we assume failure.
449
+ *
450
+ * Returns the user thread that was started.
411
451
*/
412
452
private def startUserClass (): Thread = {
413
453
logInfo(" Starting the user JAR in a separate Thread" )
414
454
System .setProperty(" spark.executor.instances" , args.numExecutors.toString)
415
- var stopped = false
416
455
val mainMethod = Class .forName(args.userClass, false ,
417
456
Thread .currentThread.getContextClassLoader).getMethod(" main" , classOf [Array [String ]])
418
457
419
458
val userThread = new Thread {
420
459
override def run () {
421
-
422
- try {
423
- // Note this security manager applies to the entire process, not
424
- // just this thread. It's here to handle the case if the user code
425
- // does System.exit
426
- System .setSecurityManager(new java.lang.SecurityManager () {
427
- override def checkExit (paramInt : Int ) {
428
- if (! stopped) {
429
- logInfo(" In securityManager checkExit, exit code: " + paramInt)
430
- if (paramInt == 0 ) {
431
- finish(FinalApplicationStatus .SUCCEEDED , ApplicationMaster .EXIT_SUCCESS )
432
- } else {
433
- finish(FinalApplicationStatus .FAILED ,
434
- paramInt,
435
- " User class exited with non-zero exit code" )
436
- }
437
- stopped = true
438
- }
439
- }
440
-
441
- // required for the checkExit to work properly
442
- override def checkPermission (perm : java.security.Permission ): Unit = {
443
- }
444
- })
445
- }
446
- catch {
447
- case e : SecurityException => {
448
- finish(FinalApplicationStatus .FAILED ,
449
- ApplicationMaster .EXIT_SECURITY ,
450
- " Error in setSecurityManager" )
451
- logError(" Error in setSecurityManager:" , e)
452
- }
453
- }
454
-
455
460
try {
456
461
val mainArgs = new Array [String ](args.userArgs.size)
457
462
args.userArgs.copyToArray(mainArgs, 0 , args.userArgs.size)
@@ -463,14 +468,12 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
463
468
e.getCause match {
464
469
case _ : InterruptedException =>
465
470
// Reporter thread can interrupt to stop user class
466
-
467
- case e : Throwable => {
471
+ case e : Throwable =>
468
472
finish(FinalApplicationStatus .FAILED ,
469
473
ApplicationMaster .EXIT_EXCEPTION_USER_CLASS ,
470
474
" User class threw exception: " + e.getMessage)
471
475
// re-throw to get it logged
472
476
throw e
473
- }
474
477
}
475
478
}
476
479
}
@@ -512,13 +515,13 @@ object ApplicationMaster extends Logging {
512
515
val SHUTDOWN_HOOK_PRIORITY : Int = 30
513
516
514
517
// exit codes for different causes, no reason behind the values
515
- val EXIT_SUCCESS = 0
516
- val EXIT_UNCAUGHT_EXCEPTION = 10
517
- val EXIT_MAX_EXECUTOR_FAILURES = 11
518
- val EXIT_REPORTER_FAILURE = 12
519
- val EXIT_SC_NOT_INITED = 13
520
- val EXIT_SECURITY = 14
521
- val EXIT_EXCEPTION_USER_CLASS = 15
518
+ private val EXIT_SUCCESS = 0
519
+ private val EXIT_UNCAUGHT_EXCEPTION = 10
520
+ private val EXIT_MAX_EXECUTOR_FAILURES = 11
521
+ private val EXIT_REPORTER_FAILURE = 12
522
+ private val EXIT_SC_NOT_INITED = 13
523
+ private val EXIT_SECURITY = 14
524
+ private val EXIT_EXCEPTION_USER_CLASS = 15
522
525
523
526
private var master : ApplicationMaster = _
524
527
0 commit comments