
Commit 104b366

Use the actual reference queue length
The previous code used the length of the referenceBuffer, which counts the elements registered for clean-up, rather than the elements that have been both registered and garbage collected (i.e., actually enqueued). What we want is the length of the referenceQueue itself. However, Java does not expose this, so we must access it through reflection. Since this is potentially expensive, we limit how often we read the queue length this way.
1 parent: 0b7e768
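
For context (not part of the commit): a minimal standalone sketch of the reflection trick this change relies on. It illustrates the distinction the message draws: a freshly registered reference does not count toward ReferenceQueue's private queueLength field until the GC has actually enqueued it. The object name QueueLengthProbe is invented for illustration, GC timing is nondeterministic, and on JDK 9+ the setAccessible call may additionally require --add-opens java.base/java.lang.ref=ALL-UNNAMED.

import java.lang.ref.{ReferenceQueue, WeakReference}

object QueueLengthProbe {
  def main(args: Array[String]): Unit = {
    val queue = new ReferenceQueue[AnyRef]
    var referent: AnyRef = new AnyRef
    val weak = new WeakReference(referent, queue)

    // The same reflective access the diff below uses: queueLength is a
    // private field of java.lang.ref.ReferenceQueue with no public getter.
    val field = classOf[ReferenceQueue[AnyRef]].getDeclaredField("queueLength")
    field.setAccessible(true)

    // Registered but not yet collected: a buffer of registered references
    // would already count 1 here, while the queue length is still 0.
    println("before GC: queueLength = " + field.getLong(queue))

    referent = null    // drop the only strong reference to the referent
    System.gc()        // request a collection (best effort, not guaranteed)
    Thread.sleep(100)  // give the reference-handler thread time to enqueue

    // Typically prints 1 once `weak` has been enqueued by the GC.
    println("after GC: queueLength = " + field.getLong(queue) +
      ", referent collected: " + (weak.get() == null))
  }
}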

1 file changed: +46 -7 lines changed


core/src/main/scala/org/apache/spark/ContextCleaner.scala

Lines changed: 46 additions & 7 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark
 
 import java.lang.ref.{ReferenceQueue, WeakReference}
+import java.lang.reflect.Field
 
 import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}

@@ -64,9 +65,26 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
 
   private val cleaningThread = new Thread() { override def run() { keepCleaning() }}
 
-  // Capacity of the reference buffer before we log an error message
-  private val referenceBufferCapacity = 10000
+  /**
+   * Keep track of the reference queue length and log an error if this exceeds a certain capacity.
+   * Unfortunately, Java's ReferenceQueue exposes neither the queue length nor the enqueue method,
+   * so we have to do this through reflection. This is expensive, however, so we should access
+   * this field only once in a while.
+   */
+  private val queueCapacity = 10000
   private var queueFullErrorMessageLogged = false
+  private val queueLengthAccessor: Option[Field] = {
+    try {
+      val f = classOf[ReferenceQueue[AnyRef]].getDeclaredField("queueLength")
+      f.setAccessible(true)
+      Some(f)
+    } catch {
+      case e: Exception =>
+        logDebug("Failed to expose java.lang.ref.ReferenceQueue's queueLength field: " + e)
+        None
+    }
+  }
+  private val logQueueLengthInterval = 1000
 
   /**
    * Whether the cleaning thread will block on cleanup tasks.
@@ -117,13 +135,11 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   /** Register an object for cleanup. */
   private def registerForCleanup(objectForCleanup: AnyRef, task: CleanupTask) {
     referenceBuffer += new CleanupTaskWeakReference(task, objectForCleanup, referenceQueue)
-    if (referenceBuffer.size > referenceBufferCapacity) {
-      logQueueFullErrorMessage()
-    }
   }
 
   /** Keep cleaning RDD, shuffle, and broadcast state. */
   private def keepCleaning(): Unit = Utils.logUncaughtExceptions {
+    var iteration = 0
     while (!stopped) {
       try {
         val reference = Option(referenceQueue.remove(ContextCleaner.REF_QUEUE_POLL_TIMEOUT))
@@ -140,10 +156,14 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
             case CleanBroadcast(broadcastId) =>
               doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks)
           }
+          if (iteration % logQueueLengthInterval == 0) {
+            logQueueLength()
+          }
         }
       } catch {
         case e: Exception => logError("Error in cleaning thread", e)
       }
+      iteration += 1
     }
   }

@@ -190,16 +210,35 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   private def logQueueFullErrorMessage(): Unit = {
     if (!queueFullErrorMessageLogged) {
       queueFullErrorMessageLogged = true
-      logError(s"Reference queue size in ContextCleaner has exceeded $referenceBufferCapacity! " +
+      logError(s"Reference queue size in ContextCleaner has exceeded $queueCapacity! " +
         "This means the rate at which we clean up RDDs, shuffles, and/or broadcasts is too slow.")
-      if (!blockOnCleanupTasks) {
+      if (blockOnCleanupTasks) {
         logError("Consider setting spark.cleaner.referenceTracking.blocking to false." +
           "Note that there is a known issue (SPARK-3015) in disabling blocking, especially if " +
           "the workload involves creating many RDDs in quick successions.")
       }
     }
   }
 
+  /**
+   * Log the length of the reference queue through reflection.
+   * This is an expensive operation and should be called sparingly.
+   */
+  private def logQueueLength(): Unit = {
+    try {
+      queueLengthAccessor.foreach { field =>
+        val length = field.getLong(referenceQueue)
+        logDebug("Reference queue size is " + length)
+        if (length > queueCapacity) {
+          logQueueFullErrorMessage()
+        }
+      }
+    } catch {
+      case e: Exception =>
+        logDebug("Failed to access reference queue's length through reflection: " + e)
+    }
+  }
+
   private def blockManagerMaster = sc.env.blockManager.master
   private def broadcastManager = sc.env.broadcastManager
   private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
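
Also for illustration (not part of the commit): the rate-limiting pattern that logQueueLengthInterval implements above, reduced to a self-contained sketch. The expensive probe runs only once every `interval` passes through the hot loop, so its cost is amortized over many iterations; the names below are invented.

object RateLimitedProbe {
  def main(args: Array[String]): Unit = {
    val interval = 1000  // plays the role of logQueueLengthInterval
    var iteration = 0
    var probes = 0
    while (iteration < 5000) {
      // ... per-iteration work (e.g. handling one cleanup task) goes here ...
      if (iteration % interval == 0) {
        probes += 1      // stands in for the expensive reflective queue read
      }
      iteration += 1
    }
    // Probes fire at iterations 0, 1000, 2000, 3000, and 4000.
    println(s"$iteration iterations, $probes probes")  // 5000 iterations, 5 probes
  }
}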
