Skip to content

Commit bf0e7dc

Browse files
committed
Added SortedOperation pattern to match *some* definitely sorted operations and avoid some sorting cost in HiveComparisonTest.
1 parent 6d1c642 commit bf0e7dc

File tree

3 files changed

+20
-12
lines changed

3 files changed

+20
-12
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,16 @@ object Unions {
168168
case other => other :: Nil
169169
}
170170
}
171+
172+
/**
 * A pattern that matches (some) sorted operations and returns the corresponding sort orders.
 * Every plan matched by this pattern is guaranteed to be sorted, but the converse does not
 * hold: some sorted plans are not recognized here.
 */
object SortedOperation {
  // TODO (lian) detect more sorted operations
  def unapply(plan: LogicalPlan): Option[Seq[SortOrder]] =
    // Wrap the plan so collect can apply the partial match in a single expression.
    Some(plan).collect {
      case FilteredOperation(_, Sort(sortOrder, _)) => sortOrder
    }
}

sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ case class Aggregate(
116116
*/
117117
@transient
118118
private[this] lazy val resultMap =
119-
(computedAggregates.map { agg => agg.unbound -> agg.resultAttribute} ++ namedGroups).toMap
119+
(computedAggregates.map { agg => agg.unbound -> agg.resultAttribute } ++ namedGroups).toMap
120120

121121
/**
122122
* Substituted version of aggregateExpressions expressions which are used to compute final

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ package org.apache.spark.sql.hive.execution
1919

2020
import java.io._
2121

22+
import org.scalatest.{BeforeAndAfterAll, FunSuite, GivenWhenThen}
23+
2224
import org.apache.spark.sql.Logging
25+
import org.apache.spark.sql.catalyst.planning.SortedOperation
2326
import org.apache.spark.sql.catalyst.plans.logical.{ExplainCommand, NativeCommand}
2427
import org.apache.spark.sql.catalyst.util._
25-
import org.apache.spark.sql.execution.Sort
26-
import org.scalatest.{BeforeAndAfterAll, FunSuite, GivenWhenThen}
2728
import org.apache.spark.sql.hive.test.TestHive
2829

2930
/**
@@ -131,14 +132,8 @@ abstract class HiveComparisonTest
131132
val orderedAnswer = hiveQuery.logical match {
132133
// Clean out non-deterministic time schema info.
133134
case _: NativeCommand => answer.filterNot(nonDeterministicLine).filterNot(_ == "")
134-
case _: ExplainCommand => answer
135-
case _ =>
136-
// TODO: Really we only care about the final total ordering here...
137-
val isOrdered = hiveQuery.executedPlan.collect {
138-
case s @ Sort(_, global, _) if global => s
139-
}.nonEmpty
140-
// If the query results aren't sorted, then sort them to ensure deterministic answers.
141-
if (!isOrdered) answer.sorted else answer
135+
case _: ExplainCommand | SortedOperation(_) => answer
136+
case _ => answer.sorted
142137
}
143138
orderedAnswer.map(cleanPaths)
144139
}
@@ -161,7 +156,7 @@ abstract class HiveComparisonTest
161156
"minFileSize"
162157
)
163158
protected def nonDeterministicLine(line: String) =
164-
nonDeterministicLineIndicators.map(line contains _).reduceLeft(_||_)
159+
nonDeterministicLineIndicators.exists(line contains _)
165160

166161
/**
167162
* Removes non-deterministic paths from `str` so cached answers will compare correctly.

0 commit comments

Comments
 (0)