Skip to content

Commit f6e9eef

Browse files
JoshRosen authored and nemccarthy committed
[SPARK-7766] KryoSerializerInstance reuse is unsafe when auto-reset is disabled
SPARK-3386 / apache#5606 modified the shuffle write path to re-use serializer instances across multiple calls to DiskBlockObjectWriter. It turns out that this introduced a very rare bug when using `KryoSerializer`: if auto-reset is disabled and reference-tracking is enabled, then we'll end up re-using the same serializer instance to write multiple output streams without calling `reset()` between write calls, which can lead to cases where objects in one file may contain references to objects that are in previous files, causing errors during deserialization.

This patch fixes this bug by calling `reset()` at the start of `serialize()` and `serializeStream()`. I also added a regression test which demonstrates that this problem only occurs when auto-reset is disabled and reference-tracking is enabled.

Author: Josh Rosen <[email protected]>

Closes apache#6293 from JoshRosen/kryo-instance-reuse-bug and squashes the following commits:

e19726d [Josh Rosen] Add fix for SPARK-7766.
71845e3 [Josh Rosen] Add failing regression test to trigger Kryo re-use bug
1 parent dce43d7 commit f6e9eef

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ
177177

178178
override def serialize[T: ClassTag](t: T): ByteBuffer = {
179179
output.clear()
180+
kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
180181
try {
181182
kryo.writeClassAndObject(output, t)
182183
} catch {
@@ -202,6 +203,7 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ
202203
}
203204

204205
/**
 * Wraps the output stream `s` in a new KryoSerializationStream backed by this instance's
 * `kryo` object.
 *
 * `kryo` is reset before the stream is created, so state left behind by an earlier use of
 * this instance is not carried into the new stream (SPARK-7766).
 */
override def serializeStream(s: OutputStream): SerializationStream = {
206+
kryo.reset() // We must reset in case this serializer instance was reused (see SPARK-7766)
205207
new KryoSerializationStream(kryo, s)
206208
}
207209

core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.spark.serializer
1919

20+
import java.io.ByteArrayOutputStream
21+
2022
import scala.collection.mutable
2123
import scala.reflect.ClassTag
2224

@@ -319,6 +321,37 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext {
319321
val ser2 = new KryoSerializer(conf).newInstance().asInstanceOf[KryoSerializerInstance]
320322
assert(!ser2.getAutoReset)
321323
}
324+
325+
/**
 * Asserts that a single KryoSerializerInstance emits identical bytes when it is used to
 * write the same objects to two separate output streams (regression check for SPARK-7766).
 *
 * @param autoReset         expected auto-reset setting of the instance; when false, the
 *                          RegistratorWithoutAutoReset registrator is configured
 * @param referenceTracking value written to `spark.kryo.referenceTracking`
 */
private def testSerializerInstanceReuse(autoReset: Boolean, referenceTracking: Boolean): Unit = {
326+
val conf = new SparkConf(loadDefaults = false)
327+
.set("spark.kryo.referenceTracking", referenceTracking.toString)
328+
// Auto-reset is only switched off through the custom registrator; otherwise the
// configuration is left untouched.
if (!autoReset) {
329+
conf.set("spark.kryo.registrator", classOf[RegistratorWithoutAutoReset].getName)
330+
}
331+
val ser = new KryoSerializer(conf)
332+
val serInstance = ser.newInstance().asInstanceOf[KryoSerializerInstance]
333+
// Sanity check: the instance must actually report the auto-reset mode under test.
assert (serInstance.getAutoReset() === autoReset)
334+
val obj = ("Hello", "World")
335+
// Opens a fresh stream on the shared instance, writes `obj` twice, and returns the bytes.
def serializeObjects(): Array[Byte] = {
336+
val baos = new ByteArrayOutputStream()
337+
val serStream = serInstance.serializeStream(baos)
338+
serStream.writeObject(obj)
339+
serStream.writeObject(obj)
340+
serStream.close()
341+
baos.toByteArray
342+
}
343+
// Both calls reuse `serInstance`; the second output must not differ from the first.
val output1: Array[Byte] = serializeObjects()
344+
val output2: Array[Byte] = serializeObjects()
345+
assert (output1 === output2)
346+
}
347+
348+
// Regression test for SPARK-7766, an issue where disabling auto-reset and enabling
349+
// reference-tracking would lead to corrupted output when serializer instances are re-used
350+
// Registers one test per (referenceTracking, autoReset) combination, four in total, each
// delegating to testSerializerInstanceReuse with that combination.
for (referenceTracking <- Set(true, false); autoReset <- Set(true, false)) {
351+
test(s"instance reuse with autoReset = $autoReset, referenceTracking = $referenceTracking") {
352+
testSerializerInstanceReuse(autoReset = autoReset, referenceTracking = referenceTracking)
353+
}
354+
}
322355
}
323356

324357

0 commit comments

Comments
 (0)