This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 6fa73df

Address comments of Josh and Andrew.
1 parent 807fbf9 commit 6fa73df

2 files changed: 16 additions & 18 deletions


sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala

Lines changed: 14 additions & 15 deletions
@@ -50,7 +50,6 @@ private[spark] class SqlNewHadoopPartition(
 }
 
 /**
- * :: DeveloperApi ::
  * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS,
  * sources in HBase, or S3), using the new MapReduce API (`org.apache.hadoop.mapreduce`).
  * It is based on [[org.apache.spark.rdd.NewHadoopRDD]]. It has three additions.
@@ -60,13 +59,10 @@ private[spark] class SqlNewHadoopPartition(
  * 3. An optional closure `initLocalJobFuncOpt` that set configurations at both the driver side
  *    and the executor side to the shared Hadoop Configuration.
  *
- * @param sc The SparkContext to associate the RDD with.
- * @param inputFormatClass Storage format of the data to be read.
- * @param keyClass Class of the key associated with the inputFormatClass.
- * @param valueClass Class of the value associated with the inputFormatClass.
- * @param conf The Hadoop configuration.
+ * Note: This is RDD is basically a cloned version of [[org.apache.spark.rdd.NewHadoopRDD]] with
+ * changes based on [[org.apache.spark.rdd.HadoopRDD]]. In future, this functionality will be
+ * folded into core.
  */
-@DeveloperApi
 private[sql] class SqlNewHadoopRDD[K, V](
     @transient sc : SparkContext,
     broadcastedConf: Broadcast[SerializableWritable[Configuration]],
@@ -85,11 +81,22 @@ private[sql] class SqlNewHadoopRDD[K, V](
 
   protected def getJob(): Job = {
     val conf: Configuration = broadcastedConf.value.value
+    // "new Job" will make a copy of the conf. Then, it is
+    // safe to mutate conf properties with initLocalJobFuncOpt
+    // and initDriverSideJobFuncOpt.
     val newJob = new Job(conf)
     initLocalJobFuncOpt.map(f => f(newJob))
     newJob
   }
 
+  def getConf(isDriverSide: Boolean): Configuration = {
+    val job = getJob()
+    if (isDriverSide) {
+      initDriverSideJobFuncOpt.map(f => f(job))
+    }
+    job.getConfiguration
+  }
+
   private val jobTrackerId: String = {
     val formatter = new SimpleDateFormat("yyyyMMddHHmm")
     formatter.format(new Date())
@@ -235,14 +242,6 @@ private[sql] class SqlNewHadoopRDD[K, V](
     }
     super.persist(storageLevel)
   }
-
-  def getConf(isDriverSide: Boolean): Configuration = {
-    val job = getJob()
-    if (isDriverSide) {
-      initDriverSideJobFuncOpt.map(f => f(job))
-    }
-    job.getConfiguration
-  }
 }
 
 private[spark] object SqlNewHadoopRDD {
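
The inline comment added to getJob is the key detail of this change: `new Job(conf)` makes a copy of the broadcast Hadoop Configuration, so `initLocalJobFuncOpt` and `initDriverSideJobFuncOpt` can mutate job properties without leaking changes back into the shared conf. Below is a minimal, self-contained sketch of that copy-then-mutate pattern; the closure bodies, the property keys, and the `GetConfSketch` wrapper are hypothetical stand-ins rather than code from this commit (which applies the local closure inside getJob and the driver-side closure inside getConf, in the same order).

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

object GetConfSketch {
  // Hypothetical stand-ins for the RDD's optional initialization closures;
  // the "example.*" keys are made up for the demo.
  val initLocalJobFuncOpt: Option[Job => Unit] =
    Some(job => job.getConfiguration.set("example.local.key", "executor-side"))
  val initDriverSideJobFuncOpt: Option[Job => Unit] =
    Some(job => job.getConfiguration.set("example.driver.key", "driver-side"))

  def getConf(sharedConf: Configuration, isDriverSide: Boolean): Configuration = {
    // "new Job" copies sharedConf, so the closures below mutate the copy,
    // never the shared (broadcast) configuration.
    val job = new Job(sharedConf)
    initLocalJobFuncOpt.foreach(f => f(job))
    if (isDriverSide) {
      initDriverSideJobFuncOpt.foreach(f => f(job))
    }
    job.getConfiguration
  }

  def main(args: Array[String]): Unit = {
    val shared = new Configuration()
    val driverConf = getConf(shared, isDriverSide = true)
    val executorConf = getConf(shared, isDriverSide = false)

    assert(shared.get("example.driver.key") == null)              // shared conf untouched
    assert(driverConf.get("example.driver.key") == "driver-side") // driver-side copy mutated
    assert(executorConf.get("example.driver.key") == null)        // driver-side closure skipped
    assert(executorConf.get("example.local.key") == "executor-side")
  }
}

The same reasoning explains why the commit can expose getConf(isDriverSide) publicly and drop the @DeveloperApi annotation: each call hands back a private copy, so callers on either side can tweak it freely.
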

sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala

Lines changed: 2 additions & 3 deletions
@@ -17,9 +17,6 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.SerializableWritable
-import org.apache.spark.broadcast.Broadcast
-
 import scala.collection.mutable
 import scala.util.Try
 
@@ -28,7 +25,9 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
+import org.apache.spark.SerializableWritable
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
