4 files changed: +13 -14 lines

Changed paths:
  catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical
  core/src/main/scala/org/apache/spark/sql
  hive/src/main/scala/org/apache/spark/sql/hive

@@ -26,18 +26,17 @@ import org.apache.spark.sql.catalyst.trees
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
   self: Product =>
 
-  // TODO: make a case class?
   /**
    * Estimates of various statistics. The default estimation logic simply sums up the corresponding
    * statistic produced by the children. To override this behavior, override `statistics` and
    * assign it a overriden version of `Statistics`.
    */
-  protected class Statistics {
-    lazy val childrenStats = children.map(_.statistics)
-    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
-    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
-  }
+  case class Statistics(
+    numTuples: Long = childrenStats.map(_.numTuples).sum,
+    sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
+  )
 
   lazy val statistics: Statistics = new Statistics
+  lazy val childrenStats = children.map(_.statistics)
 
   /**
    * Returns the set of attributes that are referenced by this node
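Note on the hunk above: with Statistics as a case class whose parameters default to sums over the children's estimates, an operator overrides an estimate by naming the field instead of subclassing. Below is a minimal, self-contained sketch of that pattern; the Node/Leaf/Join hierarchy and the sizes are hypothetical stand-ins, not Spark's LogicalPlan.

// Hypothetical stand-ins for LogicalPlan, just to show the case-class pattern.
abstract class Node {
  def children: Seq[Node]

  // Defaults sum up the children's estimates, mirroring LogicalPlan.Statistics.
  case class Statistics(
    numTuples: Long = childrenStats.map(_.numTuples).sum,
    sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum)

  lazy val statistics: Statistics = Statistics()
  lazy val childrenStats: Seq[Statistics] = children.map(_.statistics)
}

// A leaf names only the field it can actually estimate.
class Leaf(bytesOnDisk: Long) extends Node {
  def children: Seq[Node] = Nil
  override lazy val statistics = Statistics(sizeInBytes = bytesOnDisk)
}

// An inner node simply inherits the summing defaults.
class Join(left: Node, right: Node) extends Node {
  def children: Seq[Node] = Seq(left, right)
}

object StatsDemo extends App {
  val plan = new Join(new Leaf(100L), new Leaf(50L))
  println(plan.statistics.sizeInBytes) // 150, summed from the leaves
}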
@@ -81,19 +81,19 @@ case class SparkLogicalPlan(alreadyPlanned: SparkPlan)
     }).asInstanceOf[this.type]
   }
 
-  override lazy val statistics = new Statistics {
+  @transient override lazy val statistics = Statistics(
     // If this is wrapping around ExistingRdd and no reasonable estimation logic is implemented,
     // return a default value.
-    override lazy val sizeInBytes: Long = {
+    sizeInBytes = {
       val defaultSum = childrenStats.map(_.sizeInBytes).sum
       alreadyPlanned match {
         // TODO: Instead of returning a default value here, find a way to return a meaningful
-        // estimate for RDDs. See PR 1238 for more discussions.
+        // size estimate for RDDs. See PR 1238 for more discussions.
         case e: ExistingRdd if defaultSum == 0 => statsDefaultSizeInBytes
         case _ => defaultSum
       }
     }
-  }
+  )
 
 }
 
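Aside on the `@transient` added in this hunk (my reading, not stated in the diff): plans can be serialized and shipped with closures, so marking the cached statistics transient keeps the estimate out of the serialized form; being a lazy val, it is simply recomputed on first access after deserialization. A generic illustration of that idiom, independent of Spark's classes:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Standalone illustration of the `@transient lazy val` idiom: the cached value
// is dropped on serialization and lazily recomputed on the deserialized copy.
class Plan(val rows: Long) extends Serializable {
  @transient lazy val sizeInBytes: Long = {
    println("estimating...") // runs once per instance that actually accesses it
    rows * 8
  }
}

object TransientDemo extends App {
  val original = new Plan(10)
  println(original.sizeInBytes) // prints "estimating...", then 80

  // Round-trip through Java serialization.
  val buffer = new ByteArrayOutputStream()
  new ObjectOutputStream(buffer).writeObject(original)
  val copy = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
    .readObject().asInstanceOf[Plan]

  println(copy.sizeInBytes) // recomputed lazily: prints "estimating..." again, then 80
}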
@@ -53,14 +53,14 @@ private[sql] case class ParquetRelation(
 
   self: Product =>
 
-  @transient override lazy val statistics = new Statistics {
+  @transient override lazy val statistics = Statistics(
     // TODO: investigate getting encoded column statistics in the parquet file?
-    override lazy val sizeInBytes: Long = {
+    sizeInBytes = {
       val hdfsPath = new Path(path)
       val fs = hdfsPath.getFileSystem(conf.getOrElse(ContextUtil.getConfiguration(new Job())))
       fs.getContentSummary(hdfsPath).getLength // TODO: in bytes or system-dependent?
     }
-  }
+  )
 
   /** Schema derived from ParquetFile */
   def parquetSchema: MessageType =
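On the Parquet estimate above: as far as I know, Hadoop's ContentSummary.getLength reports the summed logical file size in bytes under the path (it does not multiply by the replication factor), which would answer the inline TODO. A self-contained sketch of the same lookup, with a hypothetical directory:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Total logical size, in bytes, of everything under a directory, as reported
// by the filesystem's content summary.
object ParquetSizeEstimate {
  def sizeInBytes(dir: String, conf: Configuration = new Configuration()): Long = {
    val hdfsPath = new Path(dir)
    val fs = hdfsPath.getFileSystem(conf)
    fs.getContentSummary(hdfsPath).getLength
  }

  def main(args: Array[String]): Unit = {
    println(sizeInBytes("/tmp/some-parquet-dir")) // hypothetical path
  }
}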
@@ -273,7 +273,7 @@ private[hive] case class MetastoreRelation
   @transient override lazy val statistics = new Statistics {
     // TODO: check if this estimate is valid for tables after partition pruning.
     // Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
-    override lazy val sizeInBytes: Long =
+    override val sizeInBytes: Long =
       maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path)
 
   private[this] def maybeGetSize(conf: HiveConf, size: String, path: Path): Long = {
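Why this hunk drops `lazy` (my reading): `sizeInBytes` is now a strict constructor val on the Statistics case class, and Scala does not allow a strict val to be overridden by a lazy val, so the anonymous-subclass override has to be a plain `override val`. A tiny sketch of that rule with throwaway names:

// Throwaway example of the override rule behind the change above.
case class Stats(sizeInBytes: Long = 0L)

object OverrideDemo extends App {
  val s = new Stats() {
    override val sizeInBytes: Long = 42L   // fine: a val may override a val
    // override lazy val sizeInBytes = 42L // does not compile: lazy cannot override a strict val
  }
  println(s.sizeInBytes) // 42
}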