Better config, docs, etc.

marmbrus · marmbrus · commit 4220f1e82cea · 2014-07-10T22:23:10.000-07:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -22,7 +22,7 @@ package org.apache.spark.sql.catalyst.expressions
  * new row. If the schema of the input row is specified, then the given expression will be bound to
  * that schema.
  */
-class InterpretedProjection(expressions: Seq[Expression]) extends (Row => Row) {
+class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
   def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) =
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -23,6 +23,9 @@ import org.apache.spark.sql.catalyst.types.BooleanType
 
 
 object InterpretedPredicate {
+  def apply(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) =
+    apply(BindReferences.bindReference(expression, inputSchema))
+
   def apply(expression: Expression): (Row => Boolean) = {
     (r: Row) => expression.eval(r).asInstanceOf[Boolean]
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -35,6 +35,18 @@ trait SQLConf {
   /** Number of partitions to use for shuffle operators. */
   private[spark] def numShufflePartitions: Int = get("spark.sql.shuffle.partitions", "200").toInt
 
+  /**
+   * When set to true, Spark SQL will use the Scala compiler at runtime to generate custom bytecode
+   * that evaluates expressions found in queries.  In general this custom code runs much faster
+   * than interpreted evaluation, but there are significant start-up costs due to compilation.
+   * As a result codegen is only beneficial when queries run for a long time, or when the same
+   * expressions are used multiple times.
+   *
+   * Defaults to false as this feature is currently experimental.
+   */
+  private[spark] def codegenEnabled: Boolean =
+    get("spark.sql.codegen", "false") == "true"
+
   /**
    * Upper bound on the sizes (in bytes) of the tables qualified for the auto conversion to
    * a broadcast value during the physical executions of join operations.  Setting this to 0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -234,6 +234,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
     val sqlContext: SQLContext = self
 
+    def codegenEnabled = self.codegenEnabled
+
     def numPartitions = self.numShufflePartitions
 
     val strategies: Seq[Strategy] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
@@ -50,6 +50,9 @@ case class Generate(
   override def output =
     if (join) child.output ++ generatorOutput else generatorOutput
 
+  /** Codegenned rows are not serializable... */
+  override val codegenEnabled = false
+
   override def execute() = {
     if (join) {
       child.execute().mapPartitions { iter =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Logging, Row}
+import org.apache.spark.sql.{SQLContext, Logging, Row}
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.expressions._
@@ -35,6 +35,8 @@ import org.apache.spark.sql.catalyst.plans.physical._
 abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging {
   self: Product =>
 
+  val codegenEnabled = true
+
   // TODO: Move to `DistributedPlan`
   /** Specifies how data is partitioned across different nodes in the cluster. */
   def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH!
@@ -53,17 +55,29 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging {
   def executeCollect(): Array[Row] = execute().map(_.copy()).collect()
 
   def newProjection(expressions: Seq[Expression], inputSchema: Seq[Attribute]): Projection =
-    GenerateProjection(expressions, inputSchema)
+    if (codegenEnabled) {
+      GenerateProjection(expressions, inputSchema)
+    } else {
+      new InterpretedProjection(expressions, inputSchema)
+    }
 
   def newMutableProjection(
       expressions: Seq[Expression],
       inputSchema: Seq[Attribute]): () => MutableProjection = {
-    GenerateMutableProjection(expressions, inputSchema)
+    if(codegenEnabled) {
+      GenerateMutableProjection(expressions, inputSchema)
+    } else {
+      () => new InterpretedMutableProjection(expressions, inputSchema)
+    }
   }
 
 
   def newPredicate(expression: Expression, inputSchema: Seq[Attribute]): (Row) => Boolean = {
-    GeneratePredicate(expression, inputSchema)
+    if (codegenEnabled) {
+      GeneratePredicate(expression, inputSchema)
+    } else {
+      InterpretedPredicate(expression, inputSchema)
+    }
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.sql.{SQLContext, execution}
+import org.apache.spark.sql.{SQLConf, SQLContext, execution}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
@@ -108,7 +108,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              child)
              if canBeCodeGened(
                   allAggregates(partialComputation) ++
-                  allAggregates(rewrittenAggregateExpressions))=>
+                  allAggregates(rewrittenAggregateExpressions)) &&
+               codegenEnabled =>
           execution.GeneratedAggregate(
             partial = false,
             namedGroupingAttributes,
@@ -119,7 +120,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
               partialComputation,
               planLater(child))(sqlContext))(sqlContext) :: Nil
 
-
       // Cases where some aggregate can not be codegened
       case PartialAggregation(
              namedGroupingAttributes,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregates.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregates.scala
@@ -17,10 +17,8 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.types._
@@ -51,8 +49,6 @@ case class GeneratedAggregate(
     child: SparkPlan)(@transient sqlContext: SQLContext)
   extends UnaryNode with NoBind {
 
-  private def sc = sqlContext.sparkContext
-
   override def requiredChildDistribution =
     if (partial) {
       UnspecifiedDistribution :: Nil
@@ -66,24 +62,24 @@ case class GeneratedAggregate(
 
   override def otherCopyArgs = sqlContext :: Nil
 
-  def output = aggregateExpressions.map(_.toAttribute)
+  override def output = aggregateExpressions.map(_.toAttribute)
 
-  def execute() = {
+  override def execute() = {
     val aggregatesToCompute = aggregateExpressions.flatMap { a =>
       a.collect { case agg: AggregateExpression => agg}
     }
 
     val computeFunctions = aggregatesToCompute.map {
-      case c@Count(expr) =>
-        val currentCount = AttributeReference("currentCount", LongType, true)()
+      case c @ Count(expr) =>
+        val currentCount = AttributeReference("currentCount", LongType, nullable = false)()
         val initialValue = Literal(0L)
         val updateFunction = If(IsNotNull(expr), Add(currentCount, Literal(1L)), currentCount)
         val result = currentCount
 
         AggregateEvaluation(currentCount :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
 
       case Sum(expr) =>
-        val currentSum = AttributeReference("currentSum", expr.dataType, true)()
+        val currentSum = AttributeReference("currentSum", expr.dataType, nullable = false)()
         val initialValue = Cast(Literal(0L), expr.dataType)
 
         // Coalasce avoids double calculation...
@@ -93,9 +89,9 @@ case class GeneratedAggregate(
 
         AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
 
-      case a@Average(expr) =>
-        val currentCount = AttributeReference("currentCount", LongType, true)()
-        val currentSum = AttributeReference("currentSum", expr.dataType, true)()
+      case a @ Average(expr) =>
+        val currentCount = AttributeReference("currentCount", LongType, nullable = false)()
+        val currentSum = AttributeReference("currentSum", expr.dataType, nullable = false)()
         val initialCount = Literal(0L)
         val initialSum = Cast(Literal(0L), expr.dataType)
         val updateCount = If(IsNotNull(expr), Add(currentCount, Literal(1L)), currentCount)
@@ -131,50 +127,70 @@ case class GeneratedAggregate(
 
     child.execute().mapPartitions { iter =>
       // Builds a new custom class for holding the results of aggregation for a group.
+      @transient
       val newAggregationBuffer =
         newProjection(computeFunctions.flatMap(_.initialValues), child.output)
 
       // A projection that is used to update the aggregate values for a group given a new tuple.
       // This projection should be targeted at the current values for the group and then applied
       // to a joined row of the current values with the new input row.
+      @transient
       val updateProjection =
         newMutableProjection(
           computeFunctions.flatMap(_.update),
           computeFunctions.flatMap(_.schema) ++ child.output)()
 
       // A projection that computes the group given an input tuple.
+      @transient
       val groupProjection = newProjection(groupingExpressions, child.output)
 
       // A projection that produces the final result, given a computation.
+      @transient
       val resultProjectionBuilder =
         newMutableProjection(
           resultExpressions,
           (namedGroups.map(_._2.toAttribute) ++ computationSchema).toSeq)
 
-      val buffers = new java.util.HashMap[Row, MutableRow]()
       val joinedRow = new JoinedRow
 
-      var currentRow: Row = null
-      while (iter.hasNext) {
-        currentRow = iter.next()
-        val currentGroup = groupProjection(currentRow)
-        var currentBuffer = buffers.get(currentGroup)
-        if (currentBuffer == null) {
-          currentBuffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow]
-          buffers.put(currentGroup, currentBuffer)
+      if (groupingExpressions.isEmpty) {
+        // TODO: Codegenning anything other than the updateProjection is probably overkill.
+        val buffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow]
+        var currentRow: Row = null
+        while (iter.hasNext) {
+          currentRow = iter.next()
+          updateProjection.target(buffer)(joinedRow(buffer, currentRow))
+        }
+
+        val resultProjection = resultProjectionBuilder()
+        Iterator(resultProjection(buffer))
+      } else {
+        val buffers = new java.util.HashMap[Row, MutableRow]()
+
+        var currentRow: Row = null
+        while (iter.hasNext) {
+          currentRow = iter.next()
+          val currentGroup = groupProjection(currentRow)
+          var currentBuffer = buffers.get(currentGroup)
+          if (currentBuffer == null) {
+            currentBuffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow]
+            buffers.put(currentGroup, currentBuffer)
+          }
+          // Target the projection at the current aggregation buffer and then project the updated
+          // values.
+          updateProjection.target(currentBuffer)(joinedRow(currentBuffer, currentRow))
         }
-        updateProjection.target(currentBuffer)(joinedRow(currentBuffer, currentRow))
-      }
 
-      new Iterator[Row] {
-        private[this] val resultIterator = buffers.entrySet.iterator()
-        private[this] val resultProjection = resultProjectionBuilder()
+        new Iterator[Row] {
+          private[this] val resultIterator = buffers.entrySet.iterator()
+          private[this] val resultProjection = resultProjectionBuilder()
 
-        def hasNext = resultIterator.hasNext
+          def hasNext = resultIterator.hasNext
 
-        def next() = {
-          val currentGroup = resultIterator.next()
-          resultProjection(joinedRow(currentGroup.getKey, currentGroup.getValue))
+          def next() = {
+            val currentGroup = resultIterator.next()
+            resultProjection(joinedRow(currentGroup.getKey, currentGroup.getValue))
+          }
         }
       }
     }

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,9 @@ import org.apache.spark.sql.catalyst.types.BooleanType`
`23`	`23`
`24`	`24`
`25`	`25`	`object InterpretedPredicate {`
	`26`	`+ def apply(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) =`
	`27`	`+ apply(BindReferences.bindReference(expression, inputSchema))`
	`28`	`+`
`26`	`29`	`def apply(expression: Expression): (Row => Boolean) = {`
`27`	`30`	`(r: Row) => expression.eval(r).asInstanceOf[Boolean]`
`28`	`31`	`}`