
Commit 27ae625

unbiased standard deviation aggregation function

1 parent ebfd91c, commit 27ae625

File tree: 8 files changed, +158 / -7 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala

Lines changed: 84 additions & 0 deletions
@@ -302,3 +302,87 @@ case class Sum(child: Expression) extends AlgebraicAggregate {

   override val evaluateExpression = Cast(currentSum, resultType)
 }
+
+/**
+ * Calculates the unbiased Standard Deviation using the online formula here:
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ */
+case class StandardDeviation(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Return data type.
+  override def dataType: DataType = resultType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, NullType))
+
+  private lazy val resultType = child.dataType match {
+    case DecimalType.Fixed(p, s) =>
+      DecimalType.bounded(p + 4, s + 4)
+    case _ => DoubleType
+  }
+
+  private lazy val sumDataType = child.dataType match {
+    case _ @ DecimalType.Fixed(p, s) => DecimalType.bounded(p + 10, s)
+    case _ => DoubleType
+  }
+
+  private lazy val currentCount = AttributeReference("currentCount", LongType)()
+  private lazy val currentAvg = AttributeReference("currentAverage", sumDataType)()
+  private lazy val currentMk = AttributeReference("currentMoment", sumDataType)()
+
+  // the values should be updated in a special order, because they re-use each other
+  override lazy val bufferAttributes = currentCount :: currentAvg :: currentMk :: Nil
+
+  override lazy val initialValues = Seq(
+    /* currentCount = */ Literal(0L),
+    /* currentAvg = */ Cast(Literal(0), sumDataType),
+    /* currentMk = */ Cast(Literal(0), sumDataType)
+  )
+
+  override lazy val updateExpressions = {
+    val currentValue = Coalesce(Cast(child, sumDataType) :: Cast(Literal(0), sumDataType) :: Nil)
+    val deltaX = Subtract(currentValue, currentAvg)
+    val updatedCount = If(IsNull(child), currentCount, currentCount + 1L)
+    val updatedAvg = Add(currentAvg, Divide(deltaX, updatedCount))
+    Seq(
+      /* currentCount = */ updatedCount,
+      /* currentAvg = */ If(IsNull(child), currentAvg, updatedAvg),
+      /* currentMk = */ If(IsNull(child),
+        currentMk, Add(currentMk, deltaX * Subtract(currentValue, updatedAvg)))
+    )
+  }
+
+  override lazy val mergeExpressions = {
+    val totalCount = currentCount.left + currentCount.right
+    val deltaX = currentAvg.left - currentAvg.right
+    val deltaX2 = deltaX * deltaX
+    val sumMoments = currentMk.left + currentMk.right
+    val sumLeft = currentAvg.left * currentCount.left
+    val sumRight = currentAvg.right * currentCount.right
+    Seq(
+      /* currentCount = */ totalCount,
+      /* currentAvg = */ If(EqualTo(totalCount, Cast(Literal(0L), LongType)),
+        Cast(Literal(0), sumDataType), (sumLeft + sumRight) / totalCount),
+      /* currentMk = */ If(EqualTo(totalCount, Cast(Literal(0L), LongType)),
+        Cast(Literal(0), sumDataType),
+        sumMoments + deltaX2 * currentCount.left / totalCount * currentCount.right)
+    )
+  }
+
+  override lazy val evaluateExpression = {
+    val count = If(EqualTo(currentCount, Cast(Literal(0L), LongType)),
+      currentCount, currentCount - Cast(Literal(1L), LongType))
+    child.dataType match {
+      case DecimalType.Fixed(p, s) =>
+        // increase the precision and scale to prevent precision loss
+        val dt = DecimalType.bounded(p + 14, s + 4)
+        Cast(Sqrt(Cast(currentMk, dt) / Cast(count, dt)), resultType)
+      case _ =>
+        Sqrt(Cast(currentMk, resultType) / Cast(count, resultType))
+    }
+  }
+}
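To make the buffer algebra above easier to follow, here is a minimal plain-Scala sketch (not part of the commit; the StdBuffer name is invented) of the same math: Welford's online update per row, the parallel merge step from the linked Wikipedia article, and the final unbiased evaluation.

    case class StdBuffer(count: Long, avg: Double, mk: Double) {
      // Mirrors updateExpressions: fold one non-null value into the buffer.
      def update(x: Double): StdBuffer = {
        val n = count + 1
        val delta = x - avg
        val newAvg = avg + delta / n
        StdBuffer(n, newAvg, mk + delta * (x - newAvg))
      }
      // Mirrors mergeExpressions: combine two partial buffers (Chan et al.).
      def merge(that: StdBuffer): StdBuffer = {
        val n = count + that.count
        if (n == 0) StdBuffer(0, 0.0, 0.0)
        else {
          val delta = avg - that.avg
          val newAvg = (avg * count + that.avg * that.count) / n
          StdBuffer(n, newAvg, mk + that.mk + delta * delta * count / n * that.count)
        }
      }
      // Mirrors evaluateExpression: divide by n - 1 for the unbiased estimate;
      // zero or one row has no defined result (the SQL version returns null).
      def result: Option[Double] =
        if (count < 2) None else Some(math.sqrt(mk / (count - 1)))
    }

    // Example: two "partitions" folded separately, then merged, matching
    // how a distributed aggregation combines partial buffers.
    val merged = Seq(Seq(1.0, 2.0), Seq(3.0, 4.0))
      .map(_.foldLeft(StdBuffer(0L, 0.0, 0.0))(_ update _))
      .reduce(_ merge _)
    // merged.result: Some(≈ 1.2909944487358056), the sample stddev of 1, 2, 3, 4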

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala

Lines changed: 8 additions & 0 deletions
@@ -164,4 +164,12 @@ object Utils {
     }
     case other => None
   }
+
+  def standardDeviation(e: Expression): Expression = {
+    val std = aggregate.AggregateExpression2(
+      aggregateFunction = aggregate.StandardDeviation(e),
+      mode = aggregate.Complete,
+      isDistinct = false)
+    Alias(std, s"std(${e.prettyString})")()
+  }
 }

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 2 additions & 5 deletions
@@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.StandardDeviation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}

@@ -1268,15 +1269,11 @@ class DataFrame private[sql](
   @scala.annotation.varargs
   def describe(cols: String*): DataFrame = {

-    // TODO: Add stddev as an expression, and remove it from here.
-    def stddevExpr(expr: Expression): Expression =
-      Sqrt(Subtract(Average(Multiply(expr, expr)), Multiply(Average(expr), Average(expr))))
-
     // The list of summary statistics to compute, in the form of expressions.
     val statistics = List[(String, Expression => Expression)](
      "count" -> Count,
      "mean" -> Average,
-     "stddev" -> stddevExpr,
+     "stddev" -> aggregate.Utils.standardDeviation,
      "min" -> Min,
      "max" -> Max)
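The removed stddevExpr computed sqrt(E[X²] − E[X]²), which is both the biased population estimate and numerically unstable: when the mean is large relative to the spread, the two averages agree in most of their significant digits and the subtraction cancels catastrophically. A small illustrative sketch (not part of the commit):

    // Values with a large mean and a tiny spread; the true population
    // variance is 2/3 and the unbiased sample variance is 1.0.
    val xs = Seq(1e8, 1e8 + 1, 1e8 + 2)
    val n = xs.length
    val meanOfSquares = xs.map(x => x * x).sum / n
    val squareOfMean = math.pow(xs.sum / n, 2)
    val naiveVariance = meanOfSquares - squareOfMean
    // Both terms are ~1e16 but only ~2/3 apart, so naiveVariance comes out
    // as a small garbage value (often 0.0 or even negative) in doubles.
    // The online algorithm keeps the running moment small and avoids this.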

sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala

Lines changed: 13 additions & 0 deletions
@@ -23,6 +23,7 @@ import scala.language.implicitConversions
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute, Star}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.StandardDeviation
 import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
 import org.apache.spark.sql.types.NumericType

@@ -283,6 +284,18 @@ class GroupedData protected[sql](
     aggregateNumericColumns(colNames : _*)(Min)
   }

+  /**
+   * Compute the sample standard deviation for each numeric column for each group.
+   * The resulting [[DataFrame]] will also contain the grouping columns.
+   * When specified columns are given, only compute the standard deviation for them.
+   *
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def std(colNames: String*): DataFrame = {
+    aggregateNumericColumns(colNames : _*)(aggregate.Utils.standardDeviation)
+  }
+
   /**
    * Compute the sum for each numeric columns for each group.
    * The resulting [[DataFrame]] will also contain the grouping columns.
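A hedged usage sketch of the new grouped aggregation (the data is invented; assumes a SQLContext with its implicits imported so toDF is available):

    val df = Seq((1, 10.0), (1, 30.0), (2, 5.0)).toDF("key", "value")
    df.groupBy("key").std("value").show()
    // key 1 -> 14.142135623730951 (sample stddev of 10.0 and 30.0)
    // key 2 -> null (a single row has no unbiased standard deviation)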

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala

Lines changed: 6 additions & 1 deletion
@@ -87,6 +87,8 @@ class SortBasedAggregationIterator(
   // The aggregation buffer used by the sort-based aggregation.
   private[this] val sortBasedAggregationBuffer: MutableRow = newBuffer

+  private val dataTypes = allAggregateFunctions.flatMap(_.bufferAttributes).map(_.dataType)
+
   /** Processes rows in the current group. It will stop when it find a new group. */
   protected def processCurrentSortedGroup(): Unit = {
     currentGroupingKey = nextGroupingKey

@@ -95,6 +97,7 @@ class SortBasedAggregationIterator(
     var findNextPartition = false
     // firstRowInNextGroup is the first row of this group. We first process it.
     processRow(sortBasedAggregationBuffer, firstRowInNextGroup)
+    println(dataTypes.zipWithIndex.map(d => sortBasedAggregationBuffer.get(d._2, d._1)).mkString("[", ",", "]"))

     // The search will stop when we see the next group or there is no
     // input row left in the iter.

@@ -107,7 +110,9 @@ class SortBasedAggregationIterator(
       // Check if the current row belongs the current input row.
       if (currentGroupingKey == groupingKey) {
         processRow(sortBasedAggregationBuffer, currentRow)
-
+        println("Second")
+        println(currentRow)
+        println(dataTypes.zipWithIndex.map(d => sortBasedAggregationBuffer.get(d._2, d._1)).mkString("[", ",", "]"))
         hasNext = inputKVIterator.next()
       } else {
         // We find a new group.

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 17 additions & 0 deletions
@@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag}
 import scala.util.Try

 import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.catalyst.expressions.aggregate.StandardDeviation
 import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection}
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._

@@ -294,6 +295,22 @@ object functions {
   */
  def min(columnName: String): Column = min(Column(columnName))

+  /**
+   * Aggregate function: returns the sample standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def std(e: Column): Column = aggregate.Utils.standardDeviation(e.expr)
+
+  /**
+   * Aggregate function: returns the sample standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def std(columnName: String): Column = std(Column(columnName))
+
  /**
   * Aggregate function: returns the sum of all values in the expression.
   *
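A hedged usage sketch of both overloads (illustrative only; assumes a SQLContext named sqlContext, whose range method yields a single-column DataFrame that we rename):

    import org.apache.spark.sql.functions.std

    val df = sqlContext.range(0, 10).toDF("value") // one column holding 0..9
    df.select(std("value")).show()      // string-name overload, ≈ 3.0277
    df.select(std(df("value"))).show()  // Column overload, same result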

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -442,7 +442,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
     val describeResult = Seq(
       Row("count", "4", "4"),
       Row("mean", "33.0", "178.0"),
-      Row("stddev", "16.583123951777", "10.0"),
+      Row("stddev", "19.148542155126762", "11.547005383792516"),
       Row("min", "16", "164"),
       Row("max", "60", "192"))
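The expected values change because describe now reports the unbiased sample standard deviation rather than the population one; for n = 4 the two differ by a factor of sqrt(n / (n - 1)). A quick check (illustrative, not part of the suite):

    // Population stddev times sqrt(n / (n - 1)) gives the sample stddev for n = 4.
    val factor = math.sqrt(4.0 / 3.0)
    16.583123951777 * factor   // ≈ 19.148542155126762
    10.0 * factor              // ≈ 11.547005383792516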

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala

Lines changed: 27 additions & 0 deletions
@@ -22,6 +22,7 @@ import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
 import org.apache.spark.sql._
+import org.apache.spark.sql.functions.std
 import org.scalatest.BeforeAndAfterAll
 import _root_.test.org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum}

@@ -84,6 +85,32 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be
     sqlContext.setConf(SQLConf.USE_SQL_AGGREGATE2.key, originalUseAggregate2.toString)
   }

+  test("test standard deviation") {
+    val df = Seq.tabulate(10)(i => (i, 1)).toDF("val", "key")
+    checkAnswer(
+      df.select(std("val")),
+      Row(3.0276503540974917) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("agg1").groupBy("key").std("value"),
+      Row(1, 10.0) :: Row(2, 0.7071067811865476) :: Row(3, null) ::
+        Row(null, 81.8535277187245) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("agg1").select(std("key"), std("value")),
+      Row(0.7817359599705717, 44.898098909801135) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("agg2").groupBy("key", "value1").std("value2"),
+      Row(1, 10, null) :: Row(1, 30, 42.42640687119285) :: Row(2, -1, null) ::
+        Row(2, 1, 0.0) :: Row(2, null, null) :: Row(3, null, null) :: Row(null, -10, null) ::
+        Row(null, -60, null) :: Row(null, 100, null) :: Row(null, null, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("emptyTable").select(std("value")),
+      Row(null) :: Nil)
+  }
+
   test("empty table") {
     // If there is no GROUP BY clause and the table is empty, we will generate a single row.
     checkAnswer(
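As a sanity check on the first expected value (a worked computation, not part of the suite): for the values 0 through 9 the mean is 4.5, the squared deviations sum to 82.5, and the unbiased variance divides by n - 1 = 9.

    val xs = (0 until 10).map(_.toDouble)
    val mean = xs.sum / xs.length                      // 4.5
    val m2 = xs.map(x => (x - mean) * (x - mean)).sum  // 82.5
    math.sqrt(m2 / (xs.length - 1))                    // 3.0276503540974917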

0 commit comments
