
Commit 1d9c13a

Update applySchema API.

1 parent 85e9b51

3 files changed: +34 -52 lines

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 18 additions & 35 deletions
@@ -88,33 +88,18 @@ class SQLContext(@transient val sparkContext: SparkContext)
     new SchemaRDD(this, SparkLogicalPlan(ExistingRdd.fromProductRdd(rdd)))
 
   /**
-   * Creates a [[SchemaRDD]] from an [[RDD]] by applying a schema to this RDD and using a function
-   * that will be applied to each partition of the RDD to convert RDD records to [[Row]]s.
+   * :: DeveloperApi ::
+   * Creates a [[SchemaRDD]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD.
+   * It is important to make sure that the structure of every [[Row]] of the provided RDD matches
+   * the provided schema. Otherwise, there will be runtime exception.
    *
    * @group userf
    */
-  def applySchema[A](rdd: RDD[A], schema: StructType, f: A => Row): SchemaRDD =
-    applySchemaToPartitions(rdd, schema, (iter: Iterator[A]) => iter.map(f))
-
-  /**
-   * Creates a [[SchemaRDD]] from an [[RDD]] by applying a schema to this RDD and using a function
-   * that will be applied to each partition of the RDD to convert RDD records to [[Row]]s.
-   * Similar to `RDD.mapPartitions`, this function can be used to improve performance where there
-   * is other setup work that can be amortized and used repeatedly for all of the
-   * elements in a partition.
-   * @group userf
-   */
-  def applySchemaToPartitions[A](
-      rdd: RDD[A],
-      schema: StructType,
-      f: Iterator[A] => Iterator[Row]): SchemaRDD =
-    new SchemaRDD(this, makeCustomRDDScan(rdd, schema, f))
-
-  protected[sql] def makeCustomRDDScan[A](
-      rdd: RDD[A],
-      schema: StructType,
-      f: Iterator[A] => Iterator[Row]): LogicalPlan =
-    SparkLogicalPlan(ExistingRdd(schema.toAttributes, rdd.mapPartitions(f)))
+  @DeveloperApi
+  def applySchema(rowRDD: RDD[Row], schema: StructType): SchemaRDD = {
+    val logicalPlan = SparkLogicalPlan(ExistingRdd(schema.toAttributes, rowRDD))
+    new SchemaRDD(this, logicalPlan)
+  }
 
   /**
    * Loads a Parquet file, returning the result as a [[SchemaRDD]].
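
For context only, not part of the commit: a minimal usage sketch of the reworked API. The caller now builds an RDD[Row] and a StructType directly instead of passing a record-conversion function. The input path, field names, and the assumption that Row, StructType, StructField, and StringType are exported from org.apache.spark.sql (and that a SparkContext `sc` exists) are illustrative, not taken from this commit.

```scala
// Hypothetical sketch of calling the new applySchema(rowRDD, schema).
// "people.txt" and the name/age fields are made up.
import org.apache.spark.sql._

val sqlContext = new SQLContext(sc)   // assumes an existing SparkContext `sc`

val schema =
  StructType(
    StructField("name", StringType, nullable = true) ::
    StructField("age", StringType, nullable = true) :: Nil)

// Every Row has to line up with the schema; applySchema itself does not check
// this, so a mismatch only surfaces as a runtime exception when querying.
val rowRDD = sc.textFile("people.txt")   // lines like "Alice,29"
  .map(_.split(","))
  .map(p => Row(p(0), p(1).trim))

val people: SchemaRDD = sqlContext.applySchema(rowRDD, schema)
```
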
@@ -133,11 +118,13 @@ class SQLContext(@transient val sparkContext: SparkContext)
   def jsonFile(path: String): SchemaRDD = jsonFile(path, 1.0)
 
   /**
+   * :: Experimental ::
    * Loads a JSON file (one object per line) and applies the given schema,
    * returning the result as a [[SchemaRDD]].
    *
    * @group userf
    */
+  @Experimental
   def jsonFile(path: String, schema: StructType): SchemaRDD = {
     val json = sparkContext.textFile(path)
     jsonRDD(json, schema)
@@ -162,32 +149,28 @@ class SQLContext(@transient val sparkContext: SparkContext)
   def jsonRDD(json: RDD[String]): SchemaRDD = jsonRDD(json, 1.0)
 
   /**
+   * :: Experimental ::
    * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema,
    * returning the result as a [[SchemaRDD]].
    *
    * @group userf
    */
+  @Experimental
   def jsonRDD(json: RDD[String], schema: StructType): SchemaRDD = {
     val appliedSchema =
       Option(schema).getOrElse(JsonRDD.nullTypeToStringType(JsonRDD.inferSchema(json, 1.0)))
-
-    applySchemaToPartitions(
-      json,
-      appliedSchema,
-      JsonRDD.jsonStringToRow(appliedSchema, _: Iterator[String]))
+    val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema)
+    applySchema(rowRDD, appliedSchema)
   }
 
   /**
    * :: Experimental ::
    */
   @Experimental
   def jsonRDD(json: RDD[String], samplingRatio: Double): SchemaRDD = {
-    val schema = JsonRDD.nullTypeToStringType(JsonRDD.inferSchema(json, samplingRatio))
-
-    applySchemaToPartitions(
-      json,
-      schema,
-      JsonRDD.jsonStringToRow(schema, _: Iterator[String]))
+    val appliedSchema = JsonRDD.nullTypeToStringType(JsonRDD.inferSchema(json, samplingRatio))
+    val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema)
+    applySchema(rowRDD, appliedSchema)
   }
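
Similarly, a hedged sketch of the now-@Experimental schema-taking JSON entry points. The file path, field names, and sample record are made up, and `sc`, `sqlContext`, and the LongType alias are assumed as in the previous sketch.

```scala
// Hypothetical usage of jsonFile/jsonRDD with a user-supplied schema instead of
// sampling-based inference.
import org.apache.spark.sql._

val jsonSchema =
  StructType(
    StructField("name", StringType, nullable = true) ::
    StructField("age", LongType, nullable = true) :: Nil)

// One JSON object per line, e.g. {"name": "Alice", "age": 29}
val fromFile = sqlContext.jsonFile("people.json", jsonSchema)

// Or from an existing RDD[String] of JSON records. Passing a null schema falls
// back to inference, per the Option(schema).getOrElse(...) in the diff above.
val records = sc.parallelize("""{"name": "Bob", "age": 31}""" :: Nil)
val fromRDD = sqlContext.jsonRDD(records, jsonSchema)
```
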

sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala

Lines changed: 16 additions & 15 deletions
@@ -32,18 +32,17 @@ import org.apache.spark.sql.Logging
 private[sql] object JsonRDD extends Logging {
 
   private[sql] def jsonStringToRow(
-      schema: StructType,
-      jsonIter: Iterator[String]): Iterator[Row] = {
-    parseJson(jsonIter).map(parsed => asRow(parsed, schema))
+      json: RDD[String],
+      schema: StructType): RDD[Row] = {
+    parseJson(json).map(parsed => asRow(parsed, schema))
   }
 
   private[sql] def inferSchema(
       json: RDD[String],
       samplingRatio: Double = 1.0): StructType = {
     require(samplingRatio > 0, s"samplingRatio ($samplingRatio) should be greater than 0")
     val schemaData = if (samplingRatio > 0.99) json else json.sample(false, samplingRatio, 1)
-    val allKeys =
-      schemaData.mapPartitions(iter => parseJson(iter)).map(allKeysWithValueTypes).reduce(_ ++ _)
+    val allKeys = parseJson(schemaData).map(allKeysWithValueTypes).reduce(_ ++ _)
     createSchema(allKeys)
   }

@@ -255,7 +254,7 @@ private[sql] object JsonRDD extends Logging {
     case atom => atom
   }
 
-  private def parseJson(jsonIter: Iterator[String]): Iterator[Map[String, Any]] = {
+  private def parseJson(json: RDD[String]): RDD[Map[String, Any]] = {
     // According to [Jackson-72: https://jira.codehaus.org/browse/JACKSON-72],
     // ObjectMapper will not return BigDecimal when
     // "DeserializationFeature.USE_BIG_DECIMAL_FOR_FLOATS" is disabled
@@ -264,15 +263,17 @@ private[sql] object JsonRDD extends Logging {
     // for every float number, which will be slow.
     // So, right now, we will have Infinity for those BigDecimal number.
     // TODO: Support BigDecimal.
-    // Also, when there is a key appearing multiple times (a duplicate key),
-    // the ObjectMapper will take the last value associated with this duplicate key.
-    // For example: for {"key": 1, "key":2}, we will get "key"->2.
-    val mapper = new ObjectMapper()
-    jsonIter.map {
-      record =>
-        val parsed = scalafy(mapper.readValue(record, classOf[java.util.Map[String, Any]]))
-        parsed.asInstanceOf[Map[String, Any]]
-    }
+    json.mapPartitions(iter => {
+      // Also, when there is a key appearing multiple times (a duplicate key),
+      // the ObjectMapper will take the last value associated with this duplicate key.
+      // For example: for {"key": 1, "key":2}, we will get "key"->2.
+      val mapper = new ObjectMapper()
+      iter.map {
+        record =>
+          val parsed = scalafy(mapper.readValue(record, classOf[java.util.Map[String, Any]]))
+          parsed.asInstanceOf[Map[String, Any]]
+      }
+    })
   }
 
   private def toLong(value: Any): Long = {
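
The parseJson rewrite above follows the usual RDD.mapPartitions pattern for amortizing per-record setup cost: the Jackson ObjectMapper is now built once per partition and reused for every record in it. A stripped-down sketch of that pattern, using a shallow Java-to-Scala conversion where the commit's code calls its own `scalafy` helper; the function name `parseRecords` is made up.

```scala
// Sketch of the per-partition setup pattern used by the new parseJson.
import scala.collection.JavaConverters._
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.rdd.RDD

def parseRecords(json: RDD[String]): RDD[Map[String, Any]] =
  json.mapPartitions { iter =>
    val mapper = new ObjectMapper()   // created once per partition, not per record
    iter.map { record =>
      // Shallow conversion for illustration; the real code converts nested
      // maps and lists recursively via scalafy.
      mapper.readValue(record, classOf[java.util.Map[String, Any]]).asScala.toMap
    }
  }
```
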

sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala

Lines changed: 0 additions & 2 deletions
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.json
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
-import org.apache.spark.sql.catalyst.plans.logical.LeafNode
 import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.json.JsonRDD.{enforceCorrectType, compatibleType}
