Add Cube / Rollup for dataframe

chenghao-intel · chenghao-intel · commit ef357e1bd15a · 2015-05-18T23:30:29.000-07:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -687,6 +687,46 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr))
 
+  /**
+   * Roll up the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * The result is a [[RollupedData]]; see [[GroupedData]] for the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolled up by department and group.
+   *   df.rollup($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, rolled up by department and gender.
+   *   df.rollup($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(cols: Column*): GroupedData = new RollupedData(this, cols.map(_.expr))
+
+  /**
+   * Cube the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * The result is a [[CubedData]]; see [[GroupedData]] for the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube($"department", $"group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(cols: Column*): GroupedData = new CubedData(this, cols.map(_.expr))
+
   /**
    * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
    * See [[GroupedData]] for all the available aggregate functions.
@@ -713,6 +753,58 @@ class DataFrame private[sql](
     new GroupedData(this, colNames.map(colName => resolve(colName)))
   }
 
+  /**
+   * Roll up the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * The result is a [[RollupedData]]; see [[GroupedData]] for the available aggregate functions.
+   *
+   * This is a variant of rollup that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns rolled up by department and group.
+   *   df.rollup("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, rolled up by department and gender.
+   *   df.rollup("department", "gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def rollup(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    new RollupedData(this, colNames.map(colName => resolve(colName)))
+  }
+
+  /**
+   * Cube the [[DataFrame]] using the specified columns, so we can run aggregation on them.
+   * The result is a [[CubedData]]; see [[GroupedData]] for the available aggregate functions.
+   *
+   * This is a variant of cube that can only group by existing columns using column names
+   * (i.e. cannot construct expressions).
+   *
+   * {{{
+   *   // Compute the average for all numeric columns cubed by department and group.
+   *   df.cube("department", "group").avg()
+   *
+   *   // Compute the max age and average salary, cubed by department and gender.
+   *   df.cube("department", "gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   * @group dfops
+   * @since 1.4.0
+   */
+  @scala.annotation.varargs
+  def cube(col1: String, cols: String*): GroupedData = {
+    val colNames: Seq[String] = col1 +: cols
+    new CubedData(this, colNames.map(colName => resolve(colName)))
+  }
+
   /**
    * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
    * {{{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -23,7 +23,7 @@ import scala.language.implicitConversions
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.Star
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.Aggregate
+import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
 import org.apache.spark.sql.types.NumericType
 
 
@@ -36,13 +36,22 @@ import org.apache.spark.sql.types.NumericType
 @Experimental
 class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) {
 
-  private[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
-    val namedGroupingExprs = groupingExprs.map {
-      case expr: NamedExpression => expr
-      case expr: Expression => Alias(expr, expr.prettyString)()
+  /** Prepends the named grouping columns to `aggrExprs` when the conf retains group columns. */
+  protected def aggregateExpressions(aggrExprs: Seq[NamedExpression]): Seq[NamedExpression] = {
+    if (!df.sqlContext.conf.dataFrameRetainGroupColumns) {
+      aggrExprs
+    } else {
+      val namedGroupingExprs = groupingExprs.map {
+        case named: NamedExpression => named
+        case other: Expression => Alias(other, other.prettyString)()
+      }
+      namedGroupingExprs ++ aggrExprs
     }
+  }
+
+  protected[sql] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
     DataFrame(
-      df.sqlContext, Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan))
+      df.sqlContext, Aggregate(groupingExprs, aggregateExpressions(aggExprs), df.logicalPlan))
   }
 
   private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => Expression)
@@ -175,19 +184,10 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    */
   @scala.annotation.varargs
   def agg(expr: Column, exprs: Column*): DataFrame = {
-    val aggExprs = (expr +: exprs).map(_.expr).map {
+    (expr +: exprs).map(_.expr).map {
       case expr: NamedExpression => expr
       case expr: Expression => Alias(expr, expr.prettyString)()
     }
-    if (df.sqlContext.conf.dataFrameRetainGroupColumns) {
-      val retainedExprs = groupingExprs.map {
-        case expr: NamedExpression => expr
-        case expr: Expression => Alias(expr, expr.prettyString)()
-      }
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, retainedExprs ++ aggExprs, df.logicalPlan))
-    } else {
-      DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan))
-    }
   }
 
   /**
@@ -256,5 +256,38 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
   @scala.annotation.varargs
   def sum(colNames: String*): DataFrame = {
     aggregateNumericColumns(colNames:_*)(Sum)
-  }    
+  }
+
+}
+
+/**
+ * :: Experimental ::
+ * Aggregation methods for a [[DataFrame]] cube; instances come from [[DataFrame.cube]].
+ * Overrides `toDF` so that results are planned with the `Cube` logical operator.
+ * @since 1.4.0
+ */
+@Experimental
+class CubedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
+  extends GroupedData(df, groupingExprs) {
+
+  protected[sql] implicit override def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val cubePlan = Cube(groupingExprs, df.logicalPlan, aggregateExpressions(aggExprs))
+    DataFrame(df.sqlContext, cubePlan)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Aggregation methods for a [[DataFrame]] rollup; instances come from [[DataFrame.rollup]].
+ * Overrides `toDF` so that results are planned with the `Rollup` logical operator.
+ * @since 1.4.0
+ */
+@Experimental
+class RollupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
+  extends GroupedData(df, groupingExprs) {
+
+  protected[sql] implicit override def toDF(aggExprs: Seq[NamedExpression]): DataFrame = {
+    val rollupPlan = Rollup(groupingExprs, df.logicalPlan, aggregateExpressions(aggExprs))
+    DataFrame(df.sqlContext, rollupPlan)
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+case class TestData2Int(a: Int, b: Int) // Row type of the two-int-column fixture used below.
+
+/** Checks DataFrame rollup/cube against the equivalent SQL `with rollup` / `with cube`. */
+class HiveDataFrameAnalyticsSuite extends QueryTest {
+  val testData =
+    TestHive.sparkContext.parallelize(TestData2Int(1, 2) :: TestData2Int(2, 4) :: Nil).toDF()
+
+  // Registered so the SQL side of each comparison reads the same rows as `testData`.
+  testData.registerTempTable("mytable")
+
+  test("rollup") {
+    checkAnswer(
+      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
+    )
+
+    checkAnswer(
+      testData.rollup("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
+    )
+  }
+
+  test("cube") {
+    checkAnswer(
+      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
+    )
+
+    checkAnswer(
+      testData.cube("a", "b").agg(sum("b")),
+      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
+    )
+  }
+
+  test("spark.sql.retainGroupColumns config") {
+    val oldConf = conf.getConf("spark.sql.retainGroupColumns", "true")
+    try {
+      conf.setConf("spark.sql.retainGroupColumns", "false")
+      checkAnswer(
+        testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+        sql("select sum(a-b) from mytable group by a + b, b with rollup").collect()
+      )
+
+      checkAnswer(
+        testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
+        sql("select sum(a-b) from mytable group by a + b, b with cube").collect()
+      )
+    } finally {
+      conf.setConf("spark.sql.retainGroupColumns", oldConf)
+    }
+  }
+}