
Commit 2a0a8f7

maropu and tejasapatil authored and committed
[SPARK-23034][SQL] Show RDD/relation names in RDD/Hive table scan nodes
## What changes were proposed in this pull request?

This PR proposes to show RDD/relation names in RDD/Hive table scan nodes. This change makes these names show up in the web UI and in explain results. For example:

```
scala> sql("CREATE TABLE t(c1 int) USING hive")
scala> sql("INSERT INTO t VALUES(1)")
scala> spark.table("t").explain()
== Physical Plan ==
Scan hive default.t [c1#8], HiveTableRelation `default`.`t`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c1#8]
^^^^^^^^^^^
```

<img width="212" alt="spark-pr-hive" src="https://user-images.githubusercontent.com/692303/44501013-51264c80-a6c6-11e8-94f8-0704aee83bb6.png">

Closes #20226

## How was this patch tested?

Added tests in `DataFrameSuite`, `DatasetSuite`, and `HiveExplainSuite`.

Closes #22153 from maropu/pr20226.

Lead-authored-by: Takeshi Yamamuro <[email protected]>
Co-authored-by: Tejas Patil <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
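For the RDD side of the change, a minimal companion sketch (assuming a running `spark` session; the RDD name `testRdd` and the `c0 int, c1 string` schema are illustrative values mirroring the `DataFrameSuite` test added below):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

// Name the source RDD; with this change the name surfaces in the plan's RDD scan node.
val rddWithName = spark.sparkContext.parallelize(Row(1, "abc") :: Nil).setName("testRdd")
val df = spark.createDataFrame(rddWithName, StructType.fromDDL("c0 int, c1 string"))
df.explain()  // expected to print a node like "Scan ExistingRDD testRdd[c0#...,c1#...]"
```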
1 parent 4972090 commit 2a0a8f7

File tree

10 files changed, +56 -11 lines changed


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ private[kafka010] class KafkaRelation(
         DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.timestamp)),
         cr.timestampType.id)
     }
-    sqlContext.internalCreateDataFrame(rdd, schema).rdd
+    sqlContext.internalCreateDataFrame(rdd.setName("kafka"), schema).rdd
   }

   private def getPartitionOffsets(
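For context on the one-line change above: `RDD.setName` attaches a human-readable name and returns the same RDD, which is why it can be chained inline as done for the Kafka RDD here; the name is what the web UI and, with this patch, the scan node display. A minimal standalone sketch (the `local[*]` master and app name are arbitrary):

```scala
import org.apache.spark.sql.SparkSession

// Arbitrary local session, just to obtain a SparkContext for the sketch.
val spark = SparkSession.builder().master("local[*]").appName("rdd-naming-sketch").getOrCreate()

// setName returns the receiver, so naming composes with other calls inline.
val named = spark.sparkContext.parallelize(Seq(1, 2, 3)).setName("kafka")
assert(named.name == "kafka")

spark.stop()
```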

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala

Lines changed: 2 additions & 2 deletions
@@ -215,7 +215,7 @@ private[kafka010] class KafkaSource(
     }
     if (start.isDefined && start.get == end) {
       return sqlContext.internalCreateDataFrame(
-        sqlContext.sparkContext.emptyRDD, schema, isStreaming = true)
+        sqlContext.sparkContext.emptyRDD[InternalRow].setName("empty"), schema, isStreaming = true)
     }
     val fromPartitionOffsets = start match {
       case Some(prevBatchEndOffset) =>
@@ -299,7 +299,7 @@
     logInfo("GetBatch generating RDD of offset range: " +
       offsetRanges.sortBy(_.topicPartition.toString).mkString(", "))

-    sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
+    sqlContext.internalCreateDataFrame(rdd.setName("kafka"), schema, isStreaming = true)
   }

   /** Stop this source and free any resources it has allocated. */

sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala

Lines changed: 3 additions & 3 deletions
@@ -270,7 +270,7 @@ class SparkSession private(
   */
  @transient
  lazy val emptyDataFrame: DataFrame = {
-    createDataFrame(sparkContext.emptyRDD[Row], StructType(Nil))
+    createDataFrame(sparkContext.emptyRDD[Row].setName("empty"), StructType(Nil))
  }

  /**
@@ -395,7 +395,7 @@ class SparkSession private(
      // BeanInfo is not serializable so we must rediscover it remotely for each partition.
      SQLContext.beansToRows(iter, Utils.classForName(className), attributeSeq)
    }
-    Dataset.ofRows(self, LogicalRDD(attributeSeq, rowRdd)(self))
+    Dataset.ofRows(self, LogicalRDD(attributeSeq, rowRdd.setName(rdd.name))(self))
  }

  /**
@@ -594,7 +594,7 @@ class SparkSession private(
    } else {
      rowRDD.map { r: Row => InternalRow.fromSeq(r.toSeq) }
    }
-    internalCreateDataFrame(catalystRows, schema)
+    internalCreateDataFrame(catalystRows.setName(rowRDD.name), schema)
  }

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 11 additions & 3 deletions
@@ -103,6 +103,10 @@ case class ExternalRDDScanExec[T](
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

+  private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("")
+
+  override val nodeName: String = s"Scan$rddName"
+
   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
     val outputDataType = outputObjAttr.dataType
@@ -116,7 +120,7 @@ case class ExternalRDDScanExec[T](
   }

   override def simpleString: String = {
-    s"Scan $nodeName${output.mkString("[", ",", "]")}"
+    s"$nodeName${output.mkString("[", ",", "]")}"
   }
 }

@@ -169,10 +173,14 @@ case class LogicalRDD(
 case class RDDScanExec(
     output: Seq[Attribute],
     rdd: RDD[InternalRow],
-    override val nodeName: String,
+    name: String,
     override val outputPartitioning: Partitioning = UnknownPartitioning(0),
     override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode {

+  private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("")
+
+  override val nodeName: String = s"Scan $name$rddName"
+
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

@@ -189,6 +197,6 @@ case class RDDScanExec(
   }

   override def simpleString: String = {
-    s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}"
+    s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}"
   }
 }
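Both `rddName` helpers added above rely on the same detail: `RDD.name` is `null` until `setName` is called, so wrapping it in an `Option` lets an unnamed RDD contribute nothing while a named one appends " name". A small sketch of how the node names compose (the literal prefixes are the ones used in the diff):

```scala
// Mirrors the helper above: null -> "", "testRdd" -> " testRdd".
def rddSuffix(rddName: String): String = Option(rddName).map(n => s" $n").getOrElse("")

// ExternalRDDScanExec builds nodeName as s"Scan$rddName".
assert("Scan" + rddSuffix(null) == "Scan")
assert("Scan" + rddSuffix("testRdd") == "Scan testRdd")

// RDDScanExec builds nodeName as s"Scan $name$rddName", e.g. with name = "ExistingRDD".
assert("Scan ExistingRDD" + rddSuffix(null) == "Scan ExistingRDD")
assert("Scan ExistingRDD" + rddSuffix("testRdd") == "Scan ExistingRDD testRdd")
```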

sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala

Lines changed: 1 addition & 1 deletion
@@ -211,6 +211,6 @@ private[sql] object ArrowConverters {
       ArrowConverters.fromPayloadIterator(iter.map(new ArrowPayload(_)), context)
     }
     val schema = DataType.fromJson(schemaString).asInstanceOf[StructType]
-    sqlContext.internalCreateDataFrame(rdd, schema)
+    sqlContext.internalCreateDataFrame(rdd.setName("arrow"), schema)
   }
 }

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 10 additions & 0 deletions
@@ -2552,4 +2552,14 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       assert(numJobs == 1)
     }
   }
+
+  test("SPARK-23034 show rdd names in RDD scan nodes") {
+    val rddWithName = spark.sparkContext.parallelize(Row(1, "abc") :: Nil).setName("testRdd")
+    val df2 = spark.createDataFrame(rddWithName, StructType.fromDDL("c0 int, c1 string"))
+    val output2 = new java.io.ByteArrayOutputStream()
+    Console.withOut(output2) {
+      df2.explain(extended = false)
+    }
+    assert(output2.toString.contains("Scan ExistingRDD testRdd"))
+  }
 }

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 10 additions & 0 deletions
@@ -1498,6 +1498,16 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       df.where($"city".contains(new java.lang.Character('A'))),
       Seq(Row("Amsterdam")))
   }
+
+  test("SPARK-23034 show rdd names in RDD scan nodes") {
+    val rddWithName = spark.sparkContext.parallelize(SingleData(1) :: Nil).setName("testRdd")
+    val df = spark.createDataFrame(rddWithName)
+    val output = new java.io.ByteArrayOutputStream()
+    Console.withOut(output) {
+      df.explain(extended = false)
+    }
+    assert(output.toString.contains("Scan testRdd"))
+  }
 }

 case class TestDataUnion(x: Int, y: Int, z: Int)
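`SingleData` above is presumably one of the small helper case classes already defined elsewhere in `DatasetSuite`; a self-contained equivalent of the test (with a stand-in case class whose field name is illustrative, assuming a `spark` session) would be:

```scala
// Stand-in for the suite's SingleData case class; the field name is a guess.
case class SingleData(id: Int)

val rddWithName = spark.sparkContext.parallelize(SingleData(1) :: Nil).setName("testRdd")
val df = spark.createDataFrame(rddWithName)
df.explain()  // per the assertion above, the plan output should contain "Scan testRdd"
```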

sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala

Lines changed: 4 additions & 1 deletion
@@ -27,6 +27,7 @@ import org.scalatest.exceptions.TestFailedException
 import org.apache.spark.SparkException
 import org.apache.spark.api.java.function.FlatMapGroupsWithStateFunction
 import org.apache.spark.sql.Encoder
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsWithState
 import org.apache.spark.sql.catalyst.plans.physical.UnknownPartitioning
@@ -1229,6 +1230,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest
       timeoutType: GroupStateTimeout = GroupStateTimeout.NoTimeout,
       batchTimestampMs: Long = NO_TIMESTAMP): FlatMapGroupsWithStateExec = {
     val stateFormatVersion = spark.conf.get(SQLConf.FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION)
+    val emptyRdd = spark.sparkContext.emptyRDD[InternalRow]
     MemoryStream[Int]
       .toDS
       .groupByKey(x => x)
@@ -1237,7 +1239,8 @@
       case FlatMapGroupsWithState(f, k, v, g, d, o, s, m, _, t, _) =>
         FlatMapGroupsWithStateExec(
           f, k, v, g, d, o, None, s, stateFormatVersion, m, t,
-          Some(currentBatchTimestamp), Some(currentBatchWatermark), RDDScanExec(g, null, "rdd"))
+          Some(currentBatchTimestamp), Some(currentBatchWatermark),
+          RDDScanExec(g, emptyRdd, "rdd"))
       }.get
   }

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala

Lines changed: 2 additions & 0 deletions
@@ -62,6 +62,8 @@ case class HiveTableScanExec(

   override def conf: SQLConf = sparkSession.sessionState.conf

+  override def nodeName: String = s"Scan hive ${relation.tableMeta.qualifiedName}"
+
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala

Lines changed: 12 additions & 0 deletions
@@ -170,4 +170,16 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
       sql("EXPLAIN EXTENDED CODEGEN SELECT 1")
     }
   }
+
+  test("SPARK-23034 show relation names in Hive table scan nodes") {
+    val tableName = "tab"
+    withTable(tableName) {
+      sql(s"CREATE TABLE $tableName(c1 int) USING hive")
+      val output = new java.io.ByteArrayOutputStream()
+      Console.withOut(output) {
+        spark.table(tableName).explain(extended = false)
+      }
+      assert(output.toString.contains(s"Scan hive default.$tableName"))
+    }
+  }
 }
