Skip to content

Commit c937f00

Browse files
committed
SPARK-3837. Warn when YARN kills containers for exceeding memory limits
1 parent bfa614b commit c937f00

File tree

2 files changed

+63
-3
lines changed

2 files changed

+63
-3
lines changed

yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package org.apache.spark.deploy.yarn
2020
import java.util.{List => JList}
2121
import java.util.concurrent._
2222
import java.util.concurrent.atomic.AtomicInteger
23+
import java.util.regex.Pattern
2324

2425
import scala.collection.JavaConversions._
2526
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
@@ -330,12 +331,21 @@ private[yarn] abstract class YarnAllocator(
330331
logInfo("Completed container %s (state: %s, exit status: %s)".format(
331332
containerId,
332333
completedContainer.getState,
333-
completedContainer.getExitStatus()))
334+
completedContainer.getExitStatus))
334335
// Hadoop 2.2.X added a ContainerExitStatus we should switch to use
335336
// there are some exit status' we shouldn't necessarily count against us, but for
336337
// now I think its ok as none of the containers are expected to exit
337-
if (completedContainer.getExitStatus() != 0) {
338-
logInfo("Container marked as failed: " + containerId)
338+
if (completedContainer.getExitStatus == -103) { // vmem limit exceeded
339+
logWarning(MemLimitLogger.memLimitExceededLogMessage(
340+
completedContainer.getDiagnostics,
341+
MemLimitLogger.VMEM_EXCEEDED_PATTERN))
342+
} else if (completedContainer.getExitStatus == -104) { // pmem limit exceeded
343+
logWarning(MemLimitLogger.memLimitExceededLogMessage(
344+
completedContainer.getDiagnostics,
345+
MemLimitLogger.PMEM_EXCEEDED_PATTERN))
346+
} else if (completedContainer.getExitStatus != 0) {
347+
logInfo("Container marked as failed: " + containerId + ". Exit status: " +
348+
completedContainer.getExitStatus)
339349
numExecutorsFailed.incrementAndGet()
340350
}
341351
}
@@ -463,3 +473,19 @@ private[yarn] abstract class YarnAllocator(
463473
}
464474

465475
}
476+
477+
/**
 * Builds the warning that is logged when YARN kills a container for exceeding
 * its virtual or physical memory limit.
 *
 * The container diagnostics are searched for a "<used> of <limit> ... memory used"
 * fragment so that the concrete numbers can be echoed in the warning.
 */
private[yarn] object MemLimitLogger {
  // A memory quantity as it appears in the diagnostics text, e.g. "2.1 MB" or "2 GB".
  // The unit prefix is optional and covers units above GB so that a usage figure
  // expressed in plain bytes or in TB and beyond is still recognised instead of
  // being silently dropped from the warning.
  private val MEM_REGEX = "[0-9.]+ [KMGTPE]?B"

  /** Matches the physical-memory usage fragment of the diagnostics. */
  val PMEM_EXCEEDED_PATTERN: Pattern =
    Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")

  /** Matches the virtual-memory usage fragment of the diagnostics. */
  val VMEM_EXCEEDED_PATTERN: Pattern =
    Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used")

  /**
   * Formats the "killed for exceeding memory limits" warning.
   *
   * @param diagnostics diagnostics text reported for the completed container; may be
   *                    null or lack the usage fragment, in which case the usage
   *                    summary is simply left out of the message
   * @param pattern     which usage fragment to extract, normally
   *                    [[PMEM_EXCEEDED_PATTERN]] or [[VMEM_EXCEEDED_PATTERN]]
   * @return the warning text, including the first matching usage fragment if present
   */
  def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = {
    // Option(...) turns a null diagnostics string into None rather than letting
    // Pattern.matcher throw a NullPointerException while we are only trying to log.
    val usage = Option(diagnostics)
      .map(text => pattern.matcher(text))
      .filter(_.find())
      .map(found => " " + found.group() + ".")
      .getOrElse("")
    "Container killed by YARN for exceeding memory limits." + usage +
      " Consider boosting spark.yarn.executor.memoryOverhead."
  }
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.deploy.yarn
19+
20+
import org.apache.spark.deploy.yarn.MemLimitLogger._
21+
import org.scalatest.FunSuite
22+
23+
/**
 * Checks that the memory-limit patterns pull the expected usage fragment out of a
 * sample container diagnostics string.
 */
class YarnAllocatorSuite extends FunSuite {
  test("memory exceeded diagnostic regexes") {
    // Sample diagnostics containing both a physical and a virtual usage fragment.
    val killReport = Seq(
      "Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running ",
      "beyond physical memory limits. Current usage: 2.1 MB of 2 GB physical memory used; ",
      "5.8 GB of 4.2 GB virtual memory used. Killing container.").mkString

    // Each pattern paired with the fragment its log message must contain.
    val expectations = Seq(
      VMEM_EXCEEDED_PATTERN -> "5.8 GB of 4.2 GB virtual memory used.",
      PMEM_EXCEEDED_PATTERN -> "2.1 MB of 2 GB physical memory used.")

    for ((pattern, expectedUsage) <- expectations) {
      assert(memLimitExceededLogMessage(killReport, pattern).contains(expectedUsage))
    }
  }
}

0 commit comments

Comments
 (0)