
Commit e1d88aa

update the code as suggested
1 parent 03bc3d9 commit e1d88aa

Showing 3 changed files with 69 additions and 64 deletions.

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 20 additions & 12 deletions
```diff
@@ -685,10 +685,13 @@ class DataFrame private[sql](
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr), GroupByType)
+  def groupBy(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.GroupByType)
+  }

   /**
-   * Rollup the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
    * See [[GroupedData]] for all the available aggregate functions.
    *
    * {{{
@@ -705,10 +708,13 @@ class DataFrame private[sql](
    * @since 1.4.0
    */
   @scala.annotation.varargs
-  def rollup(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr), RollupType)
+  def rollup(cols: Column*): GroupedData = {
+    GroupedData(this, cols.map(_.expr), GroupedData.RollupType)
+  }

   /**
-   * Cube the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
    * See [[GroupedData]] for all the available aggregate functions.
    *
    * {{{
@@ -725,7 +731,7 @@ class DataFrame private[sql](
    * @since 1.4.0
    */
   @scala.annotation.varargs
-  def cube(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr), CubeType)
+  def cube(cols: Column*): GroupedData = GroupedData(this, cols.map(_.expr), GroupedData.CubeType)

   /**
    * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
@@ -750,14 +756,15 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def groupBy(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    new GroupedData(this, colNames.map(colName => resolve(colName)), GroupByType)
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.GroupByType)
   }

   /**
-   * Rollup the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * Create a multi-dimensional rollup for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
    * See [[GroupedData]] for all the available aggregate functions.
    *
-   * This is a variant of groupBy that can only group by existing columns using column names
+   * This is a variant of rollup that can only group by existing columns using column names
    * (i.e. cannot construct expressions).
    *
    * {{{
@@ -776,14 +783,15 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def rollup(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    new GroupedData(this, colNames.map(colName => resolve(colName)), RollupType)
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.RollupType)
   }

   /**
-   * Cube the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * Create a multi-dimensional cube for the current [[DataFrame]] using the specified columns,
+   * so we can run aggregation on them.
    * See [[GroupedData]] for all the available aggregate functions.
    *
-   * This is a variant of groupBy that can only group by existing columns using column names
+   * This is a variant of cube that can only group by existing columns using column names
    * (i.e. cannot construct expressions).
    *
    * {{{
@@ -802,7 +810,7 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def cube(col1: String, cols: String*): GroupedData = {
     val colNames: Seq[String] = col1 +: cols
-    new GroupedData(this, colNames.map(colName => resolve(colName)), CubeType)
+    GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.CubeType)
   }

   /**
```
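For orientation, here is a minimal, hypothetical driver showing how the three entry points touched above are called; the `Record` case class, the app name, and the `local[2]` master are illustrative assumptions, not part of this commit:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Hypothetical input type for the sketch below.
case class Record(a: Int, b: Int)

object GroupingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("grouping-example").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(Seq(Record(1, 2), Record(1, 3), Record(2, 4))).toDF()

    // groupBy: one output row per (a, b) combination present in the data.
    df.groupBy("a", "b").count().show()

    // rollup: grouping sets (a, b), (a) and (), i.e. per-(a, b) counts,
    // per-a subtotals (b is null), and a grand-total row (both null).
    df.rollup("a", "b").count().show()

    // cube: all combinations of the grouping columns,
    // i.e. grouping sets (a, b), (a), (b) and ().
    df.cube("a", "b").count().show()

    sc.stop()
  }
}
```

Either overload behaves the same way; per the scaladoc above, the `Column*` variants (e.g. `df.rollup($"a", $"b")`) also accept constructed expressions such as `$"a" + $"b"`, which the `String*` variants deliberately do not.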

sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala

Lines changed: 45 additions & 33 deletions
```diff
@@ -32,19 +32,31 @@ import org.apache.spark.sql.types.NumericType
 sealed private[sql] trait GroupType

 /**
- * To indicate it's the GroupBy
+ * Companion object for GroupedData
  */
-private[sql] object GroupByType extends GroupType
+private[sql] object GroupedData {
+  def apply(
+      df: DataFrame,
+      groupingExprs: Seq[Expression],
+      groupType: GroupType): GroupedData = {
+    new GroupedData(df, groupingExprs).withNewGroupType(groupType)
+  }

-/**
- * To indicate it's the CUBE
- */
-private[sql] object CubeType extends GroupType
+  /**
+   * To indicate it's the GroupBy
+   */
+  private[sql] object GroupByType extends GroupType

-/**
- * To indicate it's the ROLLUP
- */
-private[sql] object RollupType extends GroupType
+  /**
+   * To indicate it's the CUBE
+   */
+  private[sql] object CubeType extends GroupType
+
+  /**
+   * To indicate it's the ROLLUP
+   */
+  private[sql] object RollupType extends GroupType
+}

 /**
  * :: Experimental ::
@@ -53,35 +65,36 @@ private[sql] object RollupType extends GroupType
  * @since 1.3.0
  */
 @Experimental
-class GroupedData protected[sql](
-    df: DataFrame,
-    groupingExprs: Seq[Expression],
-    groupType: GroupType) {
+class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) {

-  protected def aggregateExpressions(aggrExprs: Seq[NamedExpression])
-    : Seq[NamedExpression] = {
-    if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
-      val retainedExprs = groupingExprs.map {
-        case expr: NamedExpression => expr
-        case expr: Expression => Alias(expr, expr.prettyString)()
-      }
-      retainedExprs ++ aggrExprs
-    } else {
-      aggrExprs
-    }
+  private var groupType: GroupType = _
+
+  private[sql] def withNewGroupType(groupType: GroupType): GroupedData = {
+    this.groupType = groupType
+    this
   }

-  protected[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+  private[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val aggregates = if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
+      val retainedExprs = groupingExprs.map {
+        case expr: NamedExpression => expr
+        case expr: Expression => Alias(expr, expr.prettyString)()
+      }
+      retainedExprs ++ aggExprs
+    } else {
+      aggExprs
+    }
+
     groupType match {
-      case GroupByType =>
+      case GroupedData.GroupByType =>
         DataFrame(
-          df.sqlContext, Aggregate(groupingExprs, aggregateExpressions(aggExprs), df.logicalPlan))
-      case RollupType =>
+          df.sqlContext, Aggregate(groupingExprs, aggregates, df.logicalPlan))
+      case GroupedData.RollupType =>
         DataFrame(
-          df.sqlContext, Rollup(groupingExprs, df.logicalPlan, aggregateExpressions(aggExprs)))
-      case CubeType =>
+          df.sqlContext, Rollup(groupingExprs, df.logicalPlan, aggregates))
+      case GroupedData.CubeType =>
         DataFrame(
-          df.sqlContext, Cube(groupingExprs, df.logicalPlan, aggregateExpressions(aggExprs)))
+          df.sqlContext, Cube(groupingExprs, df.logicalPlan, aggregates))
     }
   }

@@ -288,5 +301,4 @@ class GroupedData protected[sql](
   def sum(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames:_*)(Sum)
   }
-
 }
```
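The shape of this refactoring, in isolation: the companion object's `apply` builds the instance and then attaches the group type through a one-shot builder-style mutator, so the `protected[sql]` constructor no longer carries a `groupType` parameter. A self-contained sketch of the same pattern, with `Widget` and `Mode` as hypothetical stand-ins for `GroupedData` and `GroupType`:

```scala
sealed trait Mode
case object Add extends Mode
case object Multiply extends Mode

object Widget {
  // Companion factory, mirroring GroupedData.apply: construct first,
  // then attach the mode via the one-shot mutator.
  def apply(payload: Seq[Int], mode: Mode): Widget =
    new Widget(payload).withNewMode(mode)
}

class Widget private (payload: Seq[Int]) {
  // Mirrors `private var groupType: GroupType = _`: assigned exactly once,
  // by the companion factory, before the instance escapes to callers.
  private var mode: Mode = _

  // The companion can call this despite `private`, because a Scala class
  // and its companion object share private access.
  private def withNewMode(m: Mode): Widget = {
    this.mode = m
    this
  }

  // Mirrors the `groupType match { ... }` dispatch in toDF.
  def run(): Int = mode match {
    case Add      => payload.sum
    case Multiply => payload.product
  }
}

object WidgetDemo extends App {
  println(Widget(Seq(1, 2, 4), Add).run())      // 7
  println(Widget(Seq(1, 2, 4), Multiply).run()) // 8
}
```

The trade-off, as in the commit, is an uninitialized `var` on a class whose instances are otherwise immutable; it stays safe only as long as every construction path goes through the factory.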

sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala

Lines changed: 4 additions & 19 deletions
```diff
@@ -25,7 +25,10 @@ import org.apache.spark.sql.hive.test.TestHive.implicits._

 case class TestData2Int(a: Int, b: Int)

-class HiveDataFrameAnalyticsSuiteSuite extends QueryTest {
+// TODO ideally we should put the test suite into the package `sql`, as
+// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
+// support the `cube` or `rollup` yet.
+class HiveDataFrameAnalyticsSuite extends QueryTest {
   val testData =
     TestHive.sparkContext.parallelize(
       TestData2Int(1, 2) ::
@@ -56,22 +59,4 @@ class HiveDataFrameAnalyticsSuiteSuite extends QueryTest {
       sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
     )
   }
-
-  test("spark.sql.retainGroupColumns config") {
-    val oldConf = conf.getConf("spark.sql.retainGroupColumns", "true")
-    try {
-      conf.setConf("spark.sql.retainGroupColumns", "false")
-      checkAnswer(
-        testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
-        sql("select sum(a-b) from mytable group by a + b, b with rollup").collect()
-      )
-
-      checkAnswer(
-        testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
-        sql("select sum(a-b) from mytable group by a + b, b with cube").collect()
-      )
-    } finally {
-      conf.setConf("spark.sql.retainGroupColumns", oldConf)
-    }
-  }
 }
```
