Skip to content

Commit 896a371

Browse files
zzvarajeanlyn
authored and committed
[SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats.
Added faster RM-heartbeats on pending container allocations with multiplicative back-off. Also updated related documentations. Author: ehnalis <[email protected]> Closes apache#6082 from ehnalis/yarn and squashes the following commits: a1d2101 [ehnalis] MIss-spell fixed. 90f8ba4 [ehnalis] Changed default HB values. 6120295 [ehnalis] Removed the bug, when allocation heartbeat would not start from initial value. 08bac63 [ehnalis] Refined style, grammar, removed duplicated code. 073d283 [ehnalis] [SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats. d4408c9 [ehnalis] [SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats.
1 parent 07faedb commit 896a371

File tree

2 files changed

+39
-10
lines changed

2 files changed

+39
-10
lines changed

docs/running-on-yarn.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,22 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
7171
</tr>
7272
<tr>
7373
<td><code>spark.yarn.scheduler.heartbeat.interval-ms</code></td>
74-
<td>5000</td>
74+
<td>3000</td>
7575
<td>
7676
The interval in ms in which the Spark application master heartbeats into the YARN ResourceManager.
77+
The value is capped at half the value of YARN's configuration for the expiry interval
78+
(<code>yarn.am.liveness-monitor.expiry-interval-ms</code>).
79+
</td>
80+
</tr>
81+
<tr>
82+
<td><code>spark.yarn.scheduler.initial-allocation.interval</code></td>
83+
<td>200ms</td>
84+
<td>
85+
The initial interval in which the Spark application master eagerly heartbeats to the YARN ResourceManager
86+
when there are pending container allocation requests. It should be no larger than
87+
<code>spark.yarn.scheduler.heartbeat.interval-ms</code>. The allocation interval will be doubled on
88+
successive eager heartbeats if pending containers still exist, until
89+
<code>spark.yarn.scheduler.heartbeat.interval-ms</code> is reached.
7790
</td>
7891
</tr>
7992
<tr>

yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -300,11 +300,14 @@ private[spark] class ApplicationMaster(
300300
val expiryInterval = yarnConf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 120000)
301301

302302
// we want to be reasonably responsive without causing too many requests to RM.
303-
val schedulerInterval =
304-
sparkConf.getTimeAsMs("spark.yarn.scheduler.heartbeat.interval-ms", "5s")
303+
val heartbeatInterval = math.max(0, math.min(expiryInterval / 2,
304+
sparkConf.getTimeAsMs("spark.yarn.scheduler.heartbeat.interval-ms", "3s")))
305305

306-
// must be <= expiryInterval / 2.
307-
val interval = math.max(0, math.min(expiryInterval / 2, schedulerInterval))
306+
// we want to check more frequently for pending containers
307+
val initialAllocationInterval = math.min(heartbeatInterval,
308+
sparkConf.getTimeAsMs("spark.yarn.scheduler.initial-allocation.interval", "200ms"))
309+
310+
var nextAllocationInterval = initialAllocationInterval
308311

309312
// The number of failures in a row until the Reporter thread gives up
310313
val reporterMaxFailures = sparkConf.getInt("spark.yarn.scheduler.reporterThread.maxFailures", 5)
@@ -330,15 +333,27 @@ private[spark] class ApplicationMaster(
330333
if (!NonFatal(e) || failureCount >= reporterMaxFailures) {
331334
finish(FinalApplicationStatus.FAILED,
332335
ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " +
333-
s"${failureCount} time(s) from Reporter thread.")
334-
336+
s"$failureCount time(s) from Reporter thread.")
335337
} else {
336-
logWarning(s"Reporter thread fails ${failureCount} time(s) in a row.", e)
338+
logWarning(s"Reporter thread fails $failureCount time(s) in a row.", e)
337339
}
338340
}
339341
}
340342
try {
341-
Thread.sleep(interval)
343+
val numPendingAllocate = allocator.getNumPendingAllocate
344+
val sleepInterval =
345+
if (numPendingAllocate > 0) {
346+
val currentAllocationInterval =
347+
math.min(heartbeatInterval, nextAllocationInterval)
348+
nextAllocationInterval *= 2
349+
currentAllocationInterval
350+
} else {
351+
nextAllocationInterval = initialAllocationInterval
352+
heartbeatInterval
353+
}
354+
logDebug(s"Number of pending allocations is $numPendingAllocate. " +
355+
s"Sleeping for $sleepInterval.")
356+
Thread.sleep(sleepInterval)
342357
} catch {
343358
case e: InterruptedException =>
344359
}
@@ -349,7 +364,8 @@ private[spark] class ApplicationMaster(
349364
t.setDaemon(true)
350365
t.setName("Reporter")
351366
t.start()
352-
logInfo("Started progress reporter thread - sleep time : " + interval)
367+
logInfo(s"Started progress reporter thread with (heartbeat : $heartbeatInterval, " +
368+
s"initial allocation : $initialAllocationInterval) intervals")
353369
t
354370
}
355371

0 commit comments

Comments
 (0)