Skip to content

Commit 681243a

Browse files
committed
optimize limit using coalesce
1 parent 5e7a6dc commit 681243a

File tree

1 file changed

+5
-10
lines changed

1 file changed

+5
-10
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -148,20 +148,15 @@ case class Limit(limit: Int, child: SparkPlan)
148148
}
149149

150150
override def execute() = {
151-
val rdd: RDD[_ <: Product2[Boolean, Row]] = if (sortBasedShuffleOn) {
152-
child.execute().mapPartitions { iter =>
153-
iter.take(limit).map(row => (false, row.copy()))
151+
if (sortBasedShuffleOn) {
152+
child.execute().map(_.copy).coalesce(1).mapPartitions { iter =>
153+
iter.take(limit)
154154
}
155155
} else {
156-
child.execute().mapPartitions { iter =>
157-
val mutablePair = new MutablePair[Boolean, Row]()
158-
iter.take(limit).map(row => mutablePair.update(false, row))
156+
child.execute().coalesce(1).mapPartitions { iter =>
157+
iter.take(limit)
159158
}
160159
}
161-
val part = new HashPartitioner(1)
162-
val shuffled = new ShuffledRDD[Boolean, Row, Row](rdd, part)
163-
shuffled.setSerializer(new SparkSqlSerializer(new SparkConf(false)))
164-
shuffled.mapPartitions(_.take(limit).map(_._2))
165160
}
166161
}
167162

0 commit comments

Comments (0)