
Commit cca1dda

top-most limit should not cause memory leak
1 parent 12411b5 commit cca1dda

4 files changed, 19 insertions(+), 1 deletion(-)


sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 6 additions & 1 deletion

@@ -72,7 +72,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           execution.TakeOrderedAndProjectExec(
             limit, order, projectList, planLater(child)) :: Nil
         case logical.Limit(IntegerLiteral(limit), child) =>
-          execution.CollectLimitExec(limit, planLater(child)) :: Nil
+          // Normally wrapping the child with `LocalLimitExec` here is a no-op, because
+          // `CollectLimitExec.executeCollect` will call `LocalLimitExec.executeTake`, which
+          // calls `child.executeTake`. But if the child supports whole stage codegen, adding this
+          // `LocalLimitExec` can break the input consuming loop inside whole stage codegen and
+          // trigger the resource releasing work, after we consume `limit` rows.
+          execution.CollectLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil
        case other => planLater(other) :: Nil
      }
    case logical.Limit(IntegerLiteral(limit), logical.Sort(order, true, child)) =>
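The comment above compresses the whole fix into a few lines, so here is a minimal, self-contained analogy in plain Scala. None of these names are Spark APIs: `RowProducer`, `consumeWithLimit`, and friends are stand-ins. The point it illustrates is that resources are released by the consuming loop itself, so a limit that participates in the loop can end it cleanly and trigger cleanup, while a caller that simply stops pulling rows leaves the producer open.

// Minimal analogy, not Spark's codegen API: resources are released by the
// consuming loop, so only a limit that participates in the loop triggers cleanup.
object ConsumeLoopSketch {
  final class RowProducer {               // stand-in for a codegen stage's input
    var open = true
    def nextRow(): Long = 1L
    def close(): Unit = { open = false }  // the "resource releasing work"
  }

  // Without a limit inside the loop: the caller takes n rows and walks away,
  // never reaching close() -- the shape of the leak this commit fixes.
  def takeWithoutLimit(p: RowProducer, n: Int): Seq[Long] =
    (1 to n).map(_ => p.nextRow())        // p.open stays true

  // With the limit inside the loop (what the inserted LocalLimitExec
  // contributes): the stop condition fires after `limit` rows, so cleanup runs.
  def consumeWithLimit(p: RowProducer, limit: Int): Seq[Long] = {
    val out = Seq.newBuilder[Long]
    var count = 0
    while (count < limit) { out += p.nextRow(); count += 1 }
    p.close()                             // reached because the loop ended cleanly
    out.result()
  }

  def main(args: Array[String]): Unit = {
    val leaky = new RowProducer
    takeWithoutLimit(leaky, 1)
    assert(leaky.open)                    // producer never released

    val clean = new RowProducer
    consumeWithLimit(clean, 1)
    assert(!clean.open)                   // limit broke the loop; cleanup ran
  }
}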

sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala

Lines changed: 4 additions & 0 deletions

@@ -474,6 +474,10 @@ case class CollapseCodegenStages(conf: SQLConf) extends Rule[SparkPlan] {
   }

   private def supportCodegen(plan: SparkPlan): Boolean = plan match {
+    // Do not enable whole stage codegen for a single limit.
+    case limit: BaseLimitExec if !limit.child.isInstanceOf[CodegenSupport] ||
+      !limit.child.asInstanceOf[CodegenSupport].supportCodegen =>
+      false
     case plan: CodegenSupport if plan.supportCodegen =>
       val willFallback = plan.expressions.exists(_.find(e => !supportCodegen(e)).isDefined)
       // the generated code will be huge if there are too many columns
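Why the extra case matters: now that the planner wraps every top-most limit in a `LocalLimitExec`, a limit whose child cannot join a codegen stage would otherwise be compiled into a tiny stage of its own, with nothing to fuse with. A trimmed-down model of the guard follows; the `Plan`, `Codegen`, `Limit`, and `Scan` names are stand-ins for illustration, not Spark classes.

// Stand-in model of the supportCodegen guard; none of these types are Spark's.
object SingleLimitGuardSketch {
  sealed trait Plan
  trait Codegen { def supportCodegen: Boolean = true }
  case class Limit(limit: Int, child: Plan) extends Plan with Codegen
  case class Scan() extends Plan                          // no codegen support

  def supportCodegenFor(plan: Plan): Boolean = plan match {
    // Do not compile a lone limit: its child cannot join the stage anyway.
    case l: Limit if !l.child.isInstanceOf[Codegen] ||
        !l.child.asInstanceOf[Codegen].supportCodegen =>
      false
    case p: Codegen => p.supportCodegen
    case _ => false
  }

  def main(args: Array[String]): Unit = {
    assert(!supportCodegenFor(Limit(1, Scan())))            // lone limit: stays interpreted
    assert(supportCodegenFor(Limit(1, Limit(10, Scan()))))  // codegen-capable child: compiled
  }
}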

sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala

Lines changed: 4 additions & 0 deletions

@@ -54,6 +54,10 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport {
   val limit: Int
   override def output: Seq[Attribute] = child.output

+  override def executeTake(n: Int): Array[InternalRow] = child.executeTake(math.min(n, limit))
+
+  override def executeCollect(): Array[InternalRow] = child.executeTake(limit)
+
   protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter =>
     iter.take(limit)
   }
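These two overrides are what makes the `LocalLimitExec` wrapper a no-op on the non-codegen path: a limit node asked for n rows never needs more than min(n, limit) rows from its child, and collecting the node's full output needs exactly `limit` rows, so both calls can delegate straight to the child. A hedged sketch of that delegation with stand-in types (not Spark's operator hierarchy):

// Stand-in model of the executeTake/executeCollect delegation semantics.
object LimitDelegationSketch {
  trait Node {
    def executeTake(n: Int): Array[Long]
    def executeCollect(): Array[Long]
  }
  final class Source(rows: Array[Long]) extends Node {
    def executeTake(n: Int): Array[Long] = rows.take(n)
    def executeCollect(): Array[Long] = rows
  }
  final class LimitNode(limit: Int, child: Node) extends Node {
    // Asking the child directly skips this node's own execution path entirely.
    def executeTake(n: Int): Array[Long] = child.executeTake(math.min(n, limit))
    def executeCollect(): Array[Long] = child.executeTake(limit)
  }

  def main(args: Array[String]): Unit = {
    val plan = new LimitNode(3, new Source(Array(1L, 2L, 3L, 4L, 5L)))
    assert(plan.executeTake(10).length == 3)                 // capped by the limit
    assert(plan.executeCollect().sameElements(Array(1L, 2L, 3L)))
  }
}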

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 5 additions & 0 deletions

@@ -2658,4 +2658,9 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       checkAnswer(sql("SELECT __auto_generated_subquery_name.i from (SELECT i FROM v)"), Row(1))
     }
   }
+
+  test("SPARK-21743: top-most limit should not cause memory leak") {
+    // In unit tests, Spark will fail the query if a memory leak is detected.
+    spark.range(100).groupBy("id").count().limit(1).collect()
+  }
 }
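A note on why this one-liner is a sufficient regression test: Spark's test harness runs with leak detection on (the `spark.unsafe.exceptionOnMemoryLeak` setting, assuming the standard test configuration), so any task that completes with unreleased execution memory fails the query outright. The same shape can be tried interactively; a usage sketch, assuming a local spark-shell built with this patch:

// Top-most LIMIT over a hash aggregate: the aggregate holds execution memory,
// and before this fix the early-stopping collect left it unreleased.
spark.range(100).groupBy("id").count().limit(1).collect()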
