Commit b8dd7eb

simplify the unit test
1 parent 871764c commit b8dd7eb

File tree: 3 files changed (+50, -3 lines)

core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
6 additions, 0 deletions

@@ -188,6 +188,12 @@ private[spark] class ExternalSorter[K, V, C](

   private val spills = new ArrayBuffer[SpilledFile]

+  /**
+   * Number of files this sorter has spilled so far.
+   * Exposed for testing.
+   */
+  private[spark] def numSpills: Int = spills.size
+
   override def insertAll(records: Iterator[Product2[K, V]]): Unit = {
     // TODO: stop combining if we find that the reduction factor isn't high
     val shouldCombine = aggregator.isDefined

sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
1 addition, 1 deletion

@@ -72,7 +72,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInstance
   override def writeKey[T: ClassTag](key: T): SerializationStream = {
     // The key is only needed on the map side when computing partition ids. It does not need to
     // be shuffled.
-    assert(key.isInstanceOf[Int])
+    assert(null == key || key.isInstanceOf[Int])
     this
   }

sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
43 additions, 2 deletions

@@ -17,13 +17,16 @@

 package org.apache.spark.sql.execution

-import java.io.{DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
+import java.io.{File, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}

-import org.apache.spark.SparkFunSuite
+import org.apache.spark.executor.ShuffleWriteMetrics
+import org.apache.spark.storage.ShuffleBlockId
+import org.apache.spark.util.collection.ExternalSorter
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.types._
+import org.apache.spark._


 /**
@@ -87,4 +90,42 @@ class UnsafeRowSerializerSuite extends SparkFunSuite {
     assert(!deserializerIter.hasNext)
     assert(input.closed)
   }
+
+  test("SPARK-10466: external sorter spilling with unsafe row serializer") {
+    val conf = new SparkConf()
+      .set("spark.shuffle.spill.initialMemoryThreshold", "1024")
+      .set("spark.shuffle.sort.bypassMergeThreshold", "0")
+      .set("spark.shuffle.memoryFraction", "0.0001")
+    var sc: SparkContext = null
+    var outputFile: File = null
+    try {
+      sc = new SparkContext("local", "test", conf)
+      outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "")
+      val data = (1 to 1000).iterator.map { i =>
+        (i, toUnsafeRow(Row(i), Array(IntegerType)))
+      }
+      val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow](
+        partitioner = Some(new HashPartitioner(10)),
+        serializer = Some(new UnsafeRowSerializer(numFields = 1)))
+
+      // Ensure we spilled something and have to merge them later
+      assert(sorter.numSpills === 0)
+      sorter.insertAll(data)
+      assert(sorter.numSpills > 0)
+
+      // Merging spilled files should not throw assertion error
+      val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null, InternalAccumulator.create(sc))
+      taskContext.taskMetrics.shuffleWriteMetrics = Some(new ShuffleWriteMetrics)
+      sorter.writePartitionedFile(ShuffleBlockId(0, 0, 0), taskContext, outputFile)
+
+    } finally {
+      // Clean up
+      if (sc != null) {
+        sc.stop()
+      }
+      if (outputFile != null) {
+        outputFile.delete()
+      }
+    }
+  }
 }
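Note: the new test calls a toUnsafeRow helper that already exists elsewhere in UnsafeRowSerializerSuite, so it does not appear in this diff. A minimal sketch of what such a helper can look like, assuming only the UnsafeProjection and CatalystTypeConverters APIs brought in by the imports above (the exact body in the suite may differ):

  // Hypothetical sketch, placed inside the suite class:
  // convert an external Row into an UnsafeRow for the given column types.
  def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = {
    // Projection from Catalyst's InternalRow representation to UnsafeRow.
    val projection = UnsafeProjection.create(schema)
    // Convert the external Row to an InternalRow before projecting it.
    val internalRow =
      CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]
    projection(internalRow)
  }

The test then pushes 1,000 such rows through an ExternalSorter configured with very low memory thresholds, so the sorter is forced to spill (numSpills > 0) and later merge the spilled files through UnsafeRowSerializer via writePartitionedFile, which is the path the relaxed writeKey assertion fixes.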
