
Commit a3eb717

Merge branch 'master' of github.com:apache/spark into SPARK-5811

2 parents: c60156d + c06e42f

180 files changed: 4563 additions and 1916 deletions

assembly/pom.xml

Lines changed: 10 additions & 0 deletions
@@ -114,6 +114,16 @@
           <exclude>META-INF/*.RSA</exclude>
         </excludes>
       </filter>
+      <filter>
+        <!-- Exclude libgfortran, libgcc for license issues -->
+        <artifact>org.jblas:jblas</artifact>
+        <excludes>
+          <!-- Linux amd64 is OK; not statically linked -->
+          <exclude>lib/Linux/i386/**</exclude>
+          <exclude>lib/Mac OS X/**</exclude>
+          <exclude>lib/Windows/**</exclude>
+        </excludes>
+      </filter>
     </filters>
   </configuration>
   <executions>

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 0 deletions
@@ -1363,6 +1363,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     cleaner.foreach(_.stop())
     dagScheduler.stop()
     dagScheduler = null
+    progressBar.foreach(_.stop())
     taskScheduler = null
     // TODO: Cache.stop()?
     env.stop()
core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.python
+
+import java.io.DataOutputStream
+import java.net.Socket
+
+import py4j.GatewayServer
+
+import org.apache.spark.Logging
+import org.apache.spark.util.Utils
+
+/**
+ * Process that starts a Py4J GatewayServer on an ephemeral port and communicates the bound port
+ * back to its caller via a callback port specified by the caller.
+ *
+ * This process is launched (via SparkSubmit) by the PySpark driver (see java_gateway.py).
+ */
+private[spark] object PythonGatewayServer extends Logging {
+  def main(args: Array[String]): Unit = Utils.tryOrExit {
+    // Start a GatewayServer on an ephemeral port
+    val gatewayServer: GatewayServer = new GatewayServer(null, 0)
+    gatewayServer.start()
+    val boundPort: Int = gatewayServer.getListeningPort
+    if (boundPort == -1) {
+      logError("GatewayServer failed to bind; exiting")
+      System.exit(1)
+    } else {
+      logDebug(s"Started PythonGatewayServer on port $boundPort")
+    }
+
+    // Communicate the bound port back to the caller via the caller-specified callback port
+    val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST")
+    val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt
+    logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort")
+    val callbackSocket = new Socket(callbackHost, callbackPort)
+    val dos = new DataOutputStream(callbackSocket.getOutputStream)
+    dos.writeInt(boundPort)
+    dos.close()
+    callbackSocket.close()
+
+    // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies:
+    while (System.in.read() != -1) {
+      // Do nothing
+    }
+    logDebug("Exiting due to broken pipe from Python driver")
+    System.exit(0)
+  }
+}
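
For context, the other half of this handshake lives in PySpark's java_gateway.py, which is not part of this diff. The sketch below is an illustrative Scala rendering of the caller's side of the protocol, assuming only what the file above shows: the caller listens on a callback port, exports _PYSPARK_DRIVER_CALLBACK_HOST and _PYSPARK_DRIVER_CALLBACK_PORT, launches PythonGatewayServer via spark-submit, then reads one 4-byte integer carrying the bound Py4J port. The object and variable names here are made up.

import java.io.DataInputStream
import java.net.ServerSocket

object CallbackListenerSketch {
  def main(args: Array[String]): Unit = {
    // Listen on an ephemeral port; its number would be passed to the child process
    // via the two _PYSPARK_DRIVER_CALLBACK_* environment variables before launching
    // PythonGatewayServer.
    val server = new ServerSocket(0)
    println(s"Callback port: ${server.getLocalPort}")

    // PythonGatewayServer connects back and writes a single 4-byte int: the port
    // its Py4J GatewayServer actually bound to.
    val socket = server.accept()
    val in = new DataInputStream(socket.getInputStream)
    val gatewayPort = in.readInt()
    println(s"Py4J gateway is listening on port $gatewayPort")

    in.close()
    socket.close()
    server.close()
  }
}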

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 19 additions & 6 deletions
@@ -144,11 +144,24 @@ private[spark] class PythonRDD(
         stream.readFully(update)
         accumulator += Collections.singletonList(update)
       }
+
       // Check whether the worker is ready to be re-used.
-      if (stream.readInt() == SpecialLengths.END_OF_STREAM) {
-        if (reuse_worker) {
-          env.releasePythonWorker(pythonExec, envVars.toMap, worker)
-          released = true
+      if (reuse_worker) {
+        // The ending mark is very likely already available, and the current task
+        // should not be blocked by checking for it.
+
+        if (stream.available() >= 4) {
+          val ending = stream.readInt()
+          if (ending == SpecialLengths.END_OF_STREAM) {
+            env.releasePythonWorker(pythonExec, envVars.toMap, worker)
+            released = true
+            logInfo(s"Communication with worker ended cleanly, re-use it: $worker")
+          } else {
+            logInfo(s"Communication with worker did not end cleanly " +
+              s"(ending with $ending), close it: $worker")
+          }
+        } else {
+          logInfo(s"The ending mark from worker is not available, close it: $worker")
         }
       }
       null
@@ -248,13 +261,13 @@ private[spark] class PythonRDD(
     } catch {
       case e: Exception if context.isCompleted || context.isInterrupted =>
         logDebug("Exception thrown after task completion (likely due to cleanup)", e)
-        worker.shutdownOutput()
+        Utils.tryLog(worker.shutdownOutput())

       case e: Exception =>
         // We must avoid throwing exceptions here, because the thread uncaught exception handler
         // will kill the whole executor (see org.apache.spark.executor.Executor).
         _exception = e
-        worker.shutdownOutput()
+        Utils.tryLog(worker.shutdownOutput())
     } finally {
       // Release memory used by this thread for shuffles
       env.shuffleMemoryManager.releaseMemoryForThisThread()
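
The first hunk's core idea is that the writer thread only consumes the end-of-stream marker when it is already buffered, so a slow or dead worker cannot block task completion. A minimal, self-contained sketch of that non-blocking check follows; the constant value and the in-memory stream are stand-ins, not Spark's actual objects.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

object NonBlockingEndMarkSketch {
  val END_OF_STREAM = -4  // stand-in for SpecialLengths.END_OF_STREAM

  def main(args: Array[String]): Unit = {
    // Simulate a worker that wrote its end-of-stream marker before we check.
    val buf = new ByteArrayOutputStream()
    new DataOutputStream(buf).writeInt(END_OF_STREAM)
    val stream = new DataInputStream(new ByteArrayInputStream(buf.toByteArray))

    // available() tells us whether the 4-byte marker is already buffered; if it is
    // not, we skip the read instead of blocking on a slow or dead worker.
    if (stream.available() >= 4 && stream.readInt() == END_OF_STREAM) {
      println("worker ended cleanly, safe to reuse")
    } else {
      println("no clean ending mark, close the worker")
    }
  }
}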

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 1 addition & 3 deletions
@@ -39,7 +39,6 @@ import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver}

 import org.apache.spark.SPARK_VERSION
 import org.apache.spark.deploy.rest._
-import org.apache.spark.executor._
 import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils}

 /**
@@ -302,8 +301,7 @@
     // If we're running a python app, set the main class to our specific python runner
     if (args.isPython && deployMode == CLIENT) {
       if (args.primaryResource == PYSPARK_SHELL) {
-        args.mainClass = "py4j.GatewayServer"
-        args.childArgs = ArrayBuffer("--die-on-broken-pipe", "0")
+        args.mainClass = "org.apache.spark.api.python.PythonGatewayServer"
       } else {
         // If a python file is provided, add it to the child arguments and list of files to deploy.
         // Usage: PythonAppRunner <main python file> <extra python files> [app arguments]

core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala

Lines changed: 8 additions & 8 deletions
@@ -17,8 +17,6 @@

 package org.apache.spark.deploy.rest

-import scala.util.Try
-
 import com.fasterxml.jackson.annotation._
 import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility
 import com.fasterxml.jackson.annotation.JsonInclude.Include
@@ -111,12 +109,14 @@ private[spark] object SubmitRestProtocolMessage {
   * If the action field is not found, throw a [[SubmitRestMissingFieldException]].
   */
  def parseAction(json: String): String = {
-    parse(json).asInstanceOf[JObject].obj
-      .find { case (f, _) => f == "action" }
-      .map { case (_, v) => v.asInstanceOf[JString].s }
-      .getOrElse {
-        throw new SubmitRestMissingFieldException(s"Action field not found in JSON:\n$json")
-      }
+    val value: Option[String] = parse(json) match {
+      case JObject(fields) =>
+        fields.collectFirst { case ("action", v) => v }.collect { case JString(s) => s }
+      case _ => None
+    }
+    value.getOrElse {
+      throw new SubmitRestMissingFieldException(s"Action field not found in JSON:\n$json")
+    }
  }

  /**
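
The rewritten parseAction drops the asInstanceOf casts, so JSON that parses but is not an object (or whose action value is not a string) now falls through to the missing-field error instead of throwing a ClassCastException. Below is a hedged, standalone sketch of the same extraction against json4s; extractAction and the sample inputs are illustrative, not Spark API.

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object ParseActionSketch {
  def extractAction(json: String): Option[String] = parse(json) match {
    case JObject(fields) =>
      // Take the first "action" field, and only if its value is a JSON string.
      fields.collectFirst { case ("action", v) => v }.collect { case JString(s) => s }
    case _ => None  // e.g. a bare JSON string or array: report a missing field, don't crash
  }

  def main(args: Array[String]): Unit = {
    println(extractAction("""{"action": "CreateSubmissionRequest"}"""))  // Some(CreateSubmissionRequest)
    println(extractAction("\"hello, world\""))                           // None
  }
}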

core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala

Lines changed: 9 additions & 4 deletions
@@ -49,7 +49,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
   }
   private val subDirs = Array.fill(localDirs.length)(new Array[File](subDirsPerLocalDir))

-  addShutdownHook()
+  private val shutdownHook = addShutdownHook()

   /** Looks up a file by hashing it into one of our local subdirectories. */
   // This method should be kept in sync with
@@ -134,17 +134,22 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
     }
   }

-  private def addShutdownHook() {
-    Runtime.getRuntime.addShutdownHook(new Thread("delete Spark local dirs") {
+  private def addShutdownHook(): Thread = {
+    val shutdownHook = new Thread("delete Spark local dirs") {
       override def run(): Unit = Utils.logUncaughtExceptions {
         logDebug("Shutdown hook called")
         DiskBlockManager.this.stop()
       }
-    })
+    }
+    Runtime.getRuntime.addShutdownHook(shutdownHook)
+    shutdownHook
   }

   /** Cleanup local dirs and stop shuffle sender. */
   private[spark] def stop() {
+    // Remove the shutdown hook. It causes memory leaks if we leave it around.
+    Runtime.getRuntime.removeShutdownHook(shutdownHook)
+
     // Only perform cleanup if an external service is not serving our shuffle files.
     if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) {
       localDirs.foreach { localDir =>
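
The pattern adopted here is worth spelling out: keep a handle to the shutdown hook you register so that stop() can deregister it, because a registered hook stays strongly reachable, along with everything it closes over, until the JVM exits. A minimal sketch of that register-then-remove pattern with hypothetical names (LocalDirCleaner, cleanUp); only the Runtime calls are real API.

class LocalDirCleaner {
  private val hook = new Thread("delete local dirs (sketch)") {
    override def run(): Unit = cleanUp()
  }
  Runtime.getRuntime.addShutdownHook(hook)

  private def cleanUp(): Unit = {
    // delete temporary directories, close handles, etc.
  }

  def stop(): Unit = {
    // Deregister first: a registered hook keeps a strong reference to this instance
    // alive until JVM exit, which is a leak in long-running processes that create
    // and stop many such objects.
    Runtime.getRuntime.removeShutdownHook(hook)
    cleanUp()
  }
}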

core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala

Lines changed: 6 additions & 1 deletion
@@ -28,7 +28,6 @@ import org.apache.spark._
  * of them will be combined together, showed in one line.
  */
 private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging {
-
   // Carrige return
   val CR = '\r'
   // Update period of progress bar, in milliseconds
@@ -121,4 +120,10 @@
     clear()
     lastFinishTime = System.currentTimeMillis()
   }
+
+  /**
+   * Tear down the timer thread. The timer thread is a GC root, and it retains the entire
+   * SparkContext if it's not terminated.
+   */
+  def stop(): Unit = timer.cancel()
 }
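
The new stop() exists because a java.util.Timer runs its tasks on a dedicated background thread, and a live thread is a GC root: until the timer is cancelled, the scheduled task, and through it the ConsoleProgressBar and its SparkContext, remains reachable. A small illustrative sketch of that lifecycle with made-up names:

import java.util.{Timer, TimerTask}

class ProgressReporterSketch(label: String) {
  // The Timer spawns its own thread; that thread keeps the scheduled task, and
  // through it this ProgressReporterSketch instance, strongly reachable.
  private val timer = new Timer(s"refresh $label", /* isDaemon = */ true)

  timer.schedule(new TimerTask {
    override def run(): Unit = println(s"[$label] still running...")
  }, 0L, 200L)

  // Without this, the instance (and whatever it references) is never collected,
  // even after all other references to it are dropped.
  def stop(): Unit = timer.cancel()
}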

core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala

Lines changed: 4 additions & 0 deletions
@@ -245,13 +245,15 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach {
     val goodJson = constructSubmitRequest(masterUrl).toJson
     val badJson1 = goodJson.replaceAll("action", "fraction") // invalid JSON
     val badJson2 = goodJson.substring(goodJson.size / 2) // malformed JSON
+    val notJson = "\"hello, world\""
     val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST") // missing JSON
     val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson1)
     val (response3, code3) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson2)
     val (response4, code4) = sendHttpRequestWithResponse(killRequestPath, "POST") // missing ID
     val (response5, code5) = sendHttpRequestWithResponse(s"$killRequestPath/", "POST")
     val (response6, code6) = sendHttpRequestWithResponse(statusRequestPath, "GET") // missing ID
     val (response7, code7) = sendHttpRequestWithResponse(s"$statusRequestPath/", "GET")
+    val (response8, code8) = sendHttpRequestWithResponse(submitRequestPath, "POST", notJson)
     // these should all fail as error responses
     getErrorResponse(response1)
     getErrorResponse(response2)
@@ -260,13 +262,15 @@
     getErrorResponse(response5)
     getErrorResponse(response6)
     getErrorResponse(response7)
+    getErrorResponse(response8)
     assert(code1 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code2 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code3 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code4 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code5 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code6 === HttpServletResponse.SC_BAD_REQUEST)
     assert(code7 === HttpServletResponse.SC_BAD_REQUEST)
+    assert(code8 === HttpServletResponse.SC_BAD_REQUEST)
   }

   test("bad request paths") {

core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala

Lines changed: 10 additions & 2 deletions
@@ -28,7 +28,7 @@ import org.scalatest.FunSuite

 import org.apache.hadoop.io.Text

-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.util.Utils
 import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, GzipCodec}

@@ -42,7 +42,15 @@ class WholeTextFileRecordReaderSuite extends FunSuite with BeforeAndAfterAll {
   private var factory: CompressionCodecFactory = _

   override def beforeAll() {
-    sc = new SparkContext("local", "test")
+    // Hadoop's FileSystem caching does not use the Configuration as part of its cache key, which
+    // can cause FileSystem.get(Configuration) to return a cached instance created with a different
+    // configuration than the one passed to get() (see HADOOP-8490 for more details). This caused
+    // hard-to-reproduce test failures, since any suites that were run after this one would inherit
+    // the new value of "fs.local.block.size" (see SPARK-5227 and SPARK-5679). To work around this,
+    // we disable FileSystem caching in this suite.
+    val conf = new SparkConf().set("spark.hadoop.fs.file.impl.disable.cache", "true")
+
+    sc = new SparkContext("local", "test", conf)

     // Set the block size of local file system to test whether files are split right or not.
     sc.hadoopConfiguration.setLong("fs.local.block.size", 32)
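
The workaround leans on two behaviors: Spark copies spark.hadoop.* settings into the Hadoop Configuration it builds, and Hadoop's fs.<scheme>.impl.disable.cache property makes FileSystem.get construct a fresh instance instead of returning the process-wide cached one, whose cache key ignores the Configuration (HADOOP-8490). The sketch below shows how that caching can be observed directly against the Hadoop API; the object name is made up, and the expected results in comments reflect the HADOOP-8490 behavior described in the diff's own comment.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object FsCacheSketch {
  def main(args: Array[String]): Unit = {
    // With the default cache, the Configuration is not part of the cache key:
    // both calls return the same LocalFileSystem instance, so settings applied
    // through one Configuration can leak into code that passed the other one.
    val a = new Configuration()
    val b = new Configuration()
    b.setLong("fs.local.block.size", 32)
    println(FileSystem.get(a) eq FileSystem.get(b))  // expected: true (same cached instance)

    // Disabling the cache for the "file" scheme gives each call its own instance.
    val c = new Configuration()
    c.setBoolean("fs.file.impl.disable.cache", true)
    println(FileSystem.get(c) eq FileSystem.get(c))  // expected: false (fresh FileSystem per call)
  }
}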
