
Commit 8fb02e5

Commit message: merged master
2 parents: 560d13b + 25c9b9f

File tree

40 files changed: +635 additions, -455 deletions


build/mvn

Lines changed: 4 additions & 1 deletion
@@ -21,6 +21,8 @@
 _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 # Preserve the calling directory
 _CALLING_DIR="$(pwd)"
+# Options used during compilation
+_COMPILE_JVM_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"

 # Installs any application tarball given a URL, the expected tarball name,
 # and, optionally, a checkable binary path to determine if the binary has
@@ -136,14 +138,15 @@ cd "${_CALLING_DIR}"
 # Now that zinc is ensured to be installed, check its status and, if its
 # not running or just installed, start it
 if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then
+  export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"}
   ${ZINC_BIN} -shutdown
   ${ZINC_BIN} -start -port ${ZINC_PORT} \
     -scala-compiler "${SCALA_COMPILER}" \
     -scala-library "${SCALA_LIBRARY}" &>/dev/null
 fi

 # Set any `mvn` options if not already present
-export MAVEN_OPTS=${MAVEN_OPTS:-"-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"}
+export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"}

 # Last, call the `mvn` command as usual
 ${MVN_BIN} "$@"

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 4 additions & 17 deletions
@@ -144,24 +144,11 @@ private[spark] class PythonRDD(
             stream.readFully(update)
             accumulator += Collections.singletonList(update)
           }
-
           // Check whether the worker is ready to be re-used.
-          if (reuse_worker) {
-            // It has a high possibility that the ending mark is already available,
-            // And current task should not be blocked by checking it
-
-            if (stream.available() >= 4) {
-              val ending = stream.readInt()
-              if (ending == SpecialLengths.END_OF_STREAM) {
-                env.releasePythonWorker(pythonExec, envVars.toMap, worker)
-                released = true
-                logInfo(s"Communication with worker ended cleanly, re-use it: $worker")
-              } else {
-                logInfo(s"Communication with worker did not end cleanly " +
-                  s"(ending with $ending), close it: $worker")
-              }
-            } else {
-              logInfo(s"The ending mark from worker is not available, close it: $worker")
+          if (stream.readInt() == SpecialLengths.END_OF_STREAM) {
+            if (reuse_worker) {
+              env.releasePythonWorker(pythonExec, envVars.toMap, worker)
+              released = true
             }
           }
           null

mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala

Lines changed: 5 additions & 1 deletion
@@ -355,6 +355,10 @@ class LogisticRegressionWithLBFGS
   }

   override protected def createModel(weights: Vector, intercept: Double) = {
-    new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1)
+    if (numOfLinearPredictor == 1) {
+      new LogisticRegressionModel(weights, intercept)
+    } else {
+      new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1)
+    }
   }
 }

mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala

Lines changed: 4 additions & 3 deletions
@@ -126,7 +126,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   /**
    * The dimension of training features.
    */
-  protected var numFeatures: Int = 0
+  protected var numFeatures: Int = -1

   /**
    * Set if the algorithm should use feature scaling to improve the convergence during optimization.
@@ -163,7 +163,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   * RDD of LabeledPoint entries.
   */
  def run(input: RDD[LabeledPoint]): M = {
-    numFeatures = input.first().features.size
+    if (numFeatures < 0) {
+      numFeatures = input.map(_.features.size).first()
+    }

    /**
     * When `numOfLinearPredictor > 1`, the intercepts are encapsulated into weights,
@@ -193,7 +195,6 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   * of LabeledPoint entries starting from the initial weights provided.
   */
  def run(input: RDD[LabeledPoint], initialWeights: Vector): M = {
-    numFeatures = input.first().features.size

    if (input.getStorageLevel == StorageLevel.NONE) {
      logWarning("The input data is not directly cached, which may hurt performance if its"

project/SparkBuild.scala

Lines changed: 10 additions & 2 deletions
@@ -361,9 +361,16 @@ object Unidoc {
     publish := {},

     unidocProjectFilter in(ScalaUnidoc, unidoc) :=
-      inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, catalyst, streamingFlumeSink, yarn),
+      inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn),
     unidocProjectFilter in(JavaUnidoc, unidoc) :=
-      inAnyProject -- inProjects(OldDeps.project, repl, bagel, examples, tools, catalyst, streamingFlumeSink, yarn),
+      inAnyProject -- inProjects(OldDeps.project, repl, bagel, examples, tools, streamingFlumeSink, yarn),
+
+    // Skip actual catalyst, but include the subproject.
+    // Catalyst is not public API and contains quasiquotes which break scaladoc.
+    unidocAllSources in (ScalaUnidoc, unidoc) := {
+      (unidocAllSources in (ScalaUnidoc, unidoc)).value
+        .map(_.filterNot(_.getCanonicalPath.contains("sql/catalyst")))
+    },

     // Skip class names containing $ and some internal packages in Javadocs
     unidocAllSources in (JavaUnidoc, unidoc) := {
@@ -376,6 +383,7 @@ object Unidoc {
         .map(_.filterNot(_.getCanonicalPath.contains("executor")))
        .map(_.filterNot(_.getCanonicalPath.contains("python")))
        .map(_.filterNot(_.getCanonicalPath.contains("collection")))
+        .map(_.filterNot(_.getCanonicalPath.contains("sql/catalyst")))
     },

     // Javadoc options: create a window title, and group key packages on index page

python/pyspark/context.py

Lines changed: 4 additions & 3 deletions
@@ -64,6 +64,8 @@ class SparkContext(object):
     _lock = Lock()
     _python_includes = None  # zip and egg files that need to be added to PYTHONPATH

+    PACKAGE_EXTENSIONS = ('.zip', '.egg', '.jar')
+
     def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                  environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
                  gateway=None, jsc=None, profiler_cls=BasicProfiler):
@@ -185,8 +187,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize,
         for path in self._conf.get("spark.submit.pyFiles", "").split(","):
             if path != "":
                 (dirname, filename) = os.path.split(path)
-                if filename.lower().endswith("zip") or filename.lower().endswith("egg") \
-                        or filename.lower().endswith("jar"):
+                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                     self._python_includes.append(filename)
                     sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

@@ -706,7 +707,7 @@ def addPyFile(self, path):
         self.addFile(path)
         (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix

-        if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'):
+        if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
            self._python_includes.append(filename)
            # for tests in local mode
            sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
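With PACKAGE_EXTENSIONS in place, .jar archives passed through spark.submit.pyFiles or addPyFile are put on the workers' PYTHONPATH just like .zip and .egg files. A minimal usage sketch, assuming a local SparkContext; the app name and file paths below are hypothetical placeholders:

from pyspark import SparkContext

sc = SparkContext("local[2]", "package-extensions-demo")  # hypothetical app name

# Extensions are matched case-insensitively against PACKAGE_EXTENSIONS, so each
# archive below would land on the workers' PYTHONPATH (paths are hypothetical).
sc.addPyFile("/tmp/deps/helpers.zip")
sc.addPyFile("/tmp/deps/helpers.egg")
sc.addPyFile("/tmp/deps/udf-support.jar")  # .jar is newly accepted by this change

sc.stop()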

python/pyspark/sql/context.py

Lines changed: 14 additions & 14 deletions
@@ -252,7 +252,7 @@ def applySchema(self, rdd, schema):
         >>> schema = StructType([StructField("field1", IntegerType(), False),
         ...     StructField("field2", StringType(), False)])
         >>> df = sqlCtx.applySchema(rdd2, schema)
-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.sql("SELECT * from table1")
         >>> df2.collect()
         [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')]
@@ -405,17 +405,17 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):

         return self.applySchema(data, schema)

-    def registerRDDAsTable(self, rdd, tableName):
+    def registerDataFrameAsTable(self, rdd, tableName):
         """Registers the given RDD as a temporary table in the catalog.

         Temporary tables exist only during the lifetime of this instance of
         SQLContext.

-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         """
         if (rdd.__class__ is DataFrame):
             df = rdd._jdf
-            self._ssql_ctx.registerRDDAsTable(df, tableName)
+            self._ssql_ctx.registerDataFrameAsTable(df, tableName)
         else:
             raise ValueError("Can only register DataFrame as table")

@@ -456,7 +456,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0):
         ...     print>>ofn, json
         >>> ofn.close()
         >>> df1 = sqlCtx.jsonFile(jsonFile)
-        >>> sqlCtx.registerRDDAsTable(df1, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df1, "table1")
         >>> df2 = sqlCtx.sql(
         ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, "
         ...   "field6 as f4 from table1")
@@ -467,7 +467,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0):
         Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None)

         >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema)
-        >>> sqlCtx.registerRDDAsTable(df3, "table2")
+        >>> sqlCtx.registerDataFrameAsTable(df3, "table2")
         >>> df4 = sqlCtx.sql(
         ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, "
         ...   "field6 as f4 from table2")
@@ -485,7 +485,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0):
         ...     StructField("field5",
         ...         ArrayType(IntegerType(), False), True)]), False)])
         >>> df5 = sqlCtx.jsonFile(jsonFile, schema)
-        >>> sqlCtx.registerRDDAsTable(df5, "table3")
+        >>> sqlCtx.registerDataFrameAsTable(df5, "table3")
         >>> df6 = sqlCtx.sql(
         ...   "SELECT field2 AS f1, field3.field5 as f2, "
         ...   "field3.field5[0] as f3 from table3")
@@ -509,7 +509,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
         determine the schema.

         >>> df1 = sqlCtx.jsonRDD(json)
-        >>> sqlCtx.registerRDDAsTable(df1, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df1, "table1")
         >>> df2 = sqlCtx.sql(
         ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, "
         ...   "field6 as f4 from table1")
@@ -520,7 +520,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
         Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None)

         >>> df3 = sqlCtx.jsonRDD(json, df1.schema)
-        >>> sqlCtx.registerRDDAsTable(df3, "table2")
+        >>> sqlCtx.registerDataFrameAsTable(df3, "table2")
         >>> df4 = sqlCtx.sql(
         ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, "
         ...   "field6 as f4 from table2")
@@ -538,7 +538,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
         ...     StructField("field5",
         ...         ArrayType(IntegerType(), False), True)]), False)])
         >>> df5 = sqlCtx.jsonRDD(json, schema)
-        >>> sqlCtx.registerRDDAsTable(df5, "table3")
+        >>> sqlCtx.registerDataFrameAsTable(df5, "table3")
         >>> df6 = sqlCtx.sql(
         ...   "SELECT field2 AS f1, field3.field5 as f2, "
         ...   "field3.field5[0] as f3 from table3")
@@ -628,7 +628,7 @@ def createExternalTable(self, tableName, path=None, source=None,
     def sql(self, sqlQuery):
         """Return a L{DataFrame} representing the result of the given query.

-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1")
         >>> df2.collect()
         [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
@@ -638,7 +638,7 @@ def sql(self, sqlQuery):
     def table(self, tableName):
         """Returns the specified table as a L{DataFrame}.

-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.table("table1")
         >>> sorted(df.collect()) == sorted(df2.collect())
         True
@@ -653,7 +653,7 @@ def tables(self, dbName=None):
         The returned DataFrame has two columns, tableName and isTemporary
         (a column with BooleanType indicating if a table is a temporary one or not).

-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.tables()
         >>> df2.filter("tableName = 'table1'").first()
         Row(tableName=u'table1', isTemporary=True)
@@ -668,7 +668,7 @@ def tableNames(self, dbName=None):

         If `dbName` is not specified, the current database will be used.

-        >>> sqlCtx.registerRDDAsTable(df, "table1")
+        >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> "table1" in sqlCtx.tableNames()
         True
         >>> "table1" in sqlCtx.tableNames("db")

python/pyspark/sql/dataframe.py

Lines changed: 54 additions & 11 deletions
@@ -238,6 +238,22 @@ def printSchema(self):
         """
         print (self._jdf.schema().treeString())

+    def explain(self, extended=False):
+        """
+        Prints the plans (logical and physical) to the console for
+        debugging purpose.
+
+        If extended is False, only prints the physical plan.
+        """
+        self._jdf.explain(extended)
+
+    def isLocal(self):
+        """
+        Returns True if the `collect` and `take` methods can be run locally
+        (without any Spark executors).
+        """
+        return self._jdf.isLocal()
+
     def show(self):
         """
         Print the first 20 rows.
@@ -247,14 +263,12 @@ def show(self):
         2   Alice
         5   Bob
         >>> df
-        age name
-        2   Alice
-        5   Bob
+        DataFrame[age: int, name: string]
         """
-        print (self)
+        print self._jdf.showString().encode('utf8', 'ignore')

     def __repr__(self):
-        return self._jdf.showString()
+        return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))

     def count(self):
         """Return the number of elements in this RDD.
@@ -336,13 +350,40 @@ def mapPartitions(self, f, preservesPartitioning=False):
         """
         Return a new RDD by applying a function to each partition.

+        It's a shorthand for df.rdd.mapPartitions()
+
         >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
         >>> def f(iterator): yield 1
         >>> rdd.mapPartitions(f).sum()
         4
         """
         return self.rdd.mapPartitions(f, preservesPartitioning)

+    def foreach(self, f):
+        """
+        Applies a function to all rows of this DataFrame.
+
+        It's a shorthand for df.rdd.foreach()
+
+        >>> def f(person):
+        ...     print person.name
+        >>> df.foreach(f)
+        """
+        return self.rdd.foreach(f)
+
+    def foreachPartition(self, f):
+        """
+        Applies a function to each partition of this DataFrame.
+
+        It's a shorthand for df.rdd.foreachPartition()
+
+        >>> def f(people):
+        ...     for person in people:
+        ...         print person.name
+        >>> df.foreachPartition(f)
+        """
+        return self.rdd.foreachPartition(f)
+
     def cache(self):
         """ Persist with the default storage level (C{MEMORY_ONLY_SER}).
         """
@@ -377,8 +418,13 @@ def repartition(self, numPartitions):
         """ Return a new :class:`DataFrame` that has exactly `numPartitions`
         partitions.
         """
-        rdd = self._jdf.repartition(numPartitions, None)
-        return DataFrame(rdd, self.sql_ctx)
+        return DataFrame(self._jdf.repartition(numPartitions, None), self.sql_ctx)
+
+    def distinct(self):
+        """
+        Return a new :class:`DataFrame` containing the distinct rows in this DataFrame.
+        """
+        return DataFrame(self._jdf.distinct(), self.sql_ctx)

     def sample(self, withReplacement, fraction, seed=None):
         """
@@ -957,10 +1003,7 @@ def cast(self, dataType):
             return Column(jc, self.sql_ctx)

     def __repr__(self):
-        if self._jdf.isComputable():
-            return self._jdf.samples()
-        else:
-            return 'Column<%s>' % self._jdf.toString()
+        return 'Column<%s>' % self._jdf.toString().encode('utf8')

     def toPandas(self):
         """
