This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit d18c9f9

Author: Andrew Or
Merge branch 'master' of github.com:apache/spark into clean-moar
2 parents: 65ef07b + 60336e3

17 files changed: +739 −185 lines


core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
Lines changed: 9 additions & 2 deletions

@@ -50,8 +50,15 @@ private[spark] object PythonUtils {
   /**
    * Convert list of T into seq of T (for calling API with varargs)
    */
-  def toSeq[T](cols: JList[T]): Seq[T] = {
-    cols.toList.toSeq
+  def toSeq[T](vs: JList[T]): Seq[T] = {
+    vs.toList.toSeq
+  }
+
+  /**
+   * Convert list of T into array of T (for calling API with array)
+   */
+  def toArray[T](vs: JList[T]): Array[T] = {
+    vs.toArray().asInstanceOf[Array[T]]
   }
 
   /**
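A minimal standalone Scala sketch (not part of the diff) of what these helpers do: a java.util.List, e.g. one arriving over Py4J from the Python API, is converted into a Scala collection so it can be fed to a Scala varargs method. The helper name and values below are illustrative only.

import java.util.{Arrays => JArrays, List => JList}
import scala.collection.JavaConverters._

// Illustrative re-implementation of the toSeq helper above.
def toSeqSketch[T](vs: JList[T]): Seq[T] = vs.asScala.toList

val cols: JList[String] = JArrays.asList("name", "age")  // e.g. column names sent from Python
val scalaCols: Seq[String] = toSeqSketch(cols)
println(scalaCols)  // List(name, age)
// A varargs call would then look like: someApi(scalaCols.head, scalaCols.tail: _*)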

docs/ml-features.md
Lines changed: 89 additions & 0 deletions

@@ -106,6 +106,95 @@ for features_label in featurized.select("features", "label").take(3):
 </div>
 </div>
 
+## Word2Vec
+
+`Word2Vec` is an `Estimator` which takes sequences of words representing documents and trains a `Word2VecModel`. The model is essentially a `Map(String, Vector)` that maps each word to a unique fixed-size vector. The `Word2VecModel` transforms each document into a vector by averaging the vectors of all the words in the document; this vector can then be used in further computations on documents, such as similarity calculations. Please refer to the [MLlib user guide on Word2Vec](mllib-feature-extraction.html#Word2Vec) for more details on Word2Vec.
+
+Word2Vec is implemented in [Word2Vec](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec). In the following code segment, we start with a set of documents, each represented as a sequence of words. We transform each document into a feature vector, which could then be passed to a learning algorithm.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.Word2Vec
+
+// Input data: Each row is a bag of words from a sentence or document.
+val documentDF = sqlContext.createDataFrame(Seq(
+  "Hi I heard about Spark".split(" "),
+  "I wish Java could use case classes".split(" "),
+  "Logistic regression models are neat".split(" ")
+).map(Tuple1.apply)).toDF("text")
+
+// Learn a mapping from words to Vectors.
+val word2Vec = new Word2Vec()
+  .setInputCol("text")
+  .setOutputCol("result")
+  .setVectorSize(3)
+  .setMinCount(0)
+val model = word2Vec.fit(documentDF)
+val result = model.transform(documentDF)
+result.select("result").take(3).foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Word2Vec;
+import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+JavaSparkContext jsc = ...
+SQLContext sqlContext = ...
+
+// Input data: Each row is a bag of words from a sentence or document.
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))),
+  RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))),
+  RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" ")))
+));
+StructType schema = new StructType(new StructField[]{
+  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+});
+DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+// Learn a mapping from words to Vectors.
+Word2Vec word2Vec = new Word2Vec()
+  .setInputCol("text")
+  .setOutputCol("result")
+  .setVectorSize(3)
+  .setMinCount(0);
+Word2VecModel model = word2Vec.fit(documentDF);
+DataFrame result = model.transform(documentDF);
+for (Row r: result.select("result").take(3)) {
+  System.out.println(r);
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import Word2Vec
+
+# Input data: Each row is a bag of words from a sentence or document.
+documentDF = sqlContext.createDataFrame([
+  ("Hi I heard about Spark".split(" "), ),
+  ("I wish Java could use case classes".split(" "), ),
+  ("Logistic regression models are neat".split(" "), )
+], ["text"])
+# Learn a mapping from words to Vectors.
+word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
+model = word2Vec.fit(documentDF)
+result = model.transform(documentDF)
+for feature in result.select("result").take(3):
+  print(feature)
+{% endhighlight %}
+</div>
+</div>
 
 # Feature Transformers
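A minimal standalone Scala sketch (not part of the diff above) of the averaging that the new Word2Vec section describes: the feature vector for a document is the element-wise mean of the vectors of its words. The numeric values here are made up for illustration.

// Two hypothetical 3-dimensional word vectors for a 2-word document.
val wordVectors: Seq[Array[Double]] = Seq(
  Array(1.0, 2.0, 3.0),
  Array(3.0, 0.0, 1.0))

// Element-wise sum, then divide by the number of words.
val docVector: Array[Double] = wordVectors
  .reduce((a, b) => a.zip(b).map { case (x, y) => x + y })
  .map(_ / wordVectors.size)

println(docVector.mkString("[", ", ", "]"))  // [2.0, 1.0, 2.0]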

docs/mllib-data-types.md
Lines changed: 64 additions & 64 deletions

@@ -296,70 +296,6 @@ backed by an RDD of its entries.
 The underlying RDDs of a distributed matrix must be deterministic, because we cache the matrix size.
 In general the use of non-deterministic RDDs can lead to errors.
 
-### BlockMatrix
-
-A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where a `MatrixBlock` is
-a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is
-the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`.
-`BlockMatrix` supports methods such as `add` and `multiply` with another `BlockMatrix`.
-`BlockMatrix` also has a helper function `validate` which can be used to check whether the
-`BlockMatrix` is set up properly.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be
-most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
-`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
-Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
-
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
-
-val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries
-// Create a CoordinateMatrix from an RDD[MatrixEntry].
-val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
-// Transform the CoordinateMatrix to a BlockMatrix
-val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate()
-
-// Calculate A^T A.
-val ata = matA.transpose.multiply(matA)
-{% endhighlight %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-A [`BlockMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) can be
-most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
-`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
-Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
-
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-
-JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
-// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
-CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
-// Transform the CoordinateMatrix to a BlockMatrix
-BlockMatrix matA = coordMat.toBlockMatrix().cache();
-
-// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
-// Nothing happens if it is valid.
-matA.validate();
-
-// Calculate A^T A.
-BlockMatrix ata = matA.transpose().multiply(matA);
-{% endhighlight %}
-</div>
-</div>
-
 ### RowMatrix
 
 A `RowMatrix` is a row-oriented distributed matrix without meaningful row indices, backed by an RDD
@@ -530,3 +466,67 @@ IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix();
 {% endhighlight %}
 </div>
 </div>
+
+### BlockMatrix
+
+A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where a `MatrixBlock` is
+a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is
+the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`.
+`BlockMatrix` supports methods such as `add` and `multiply` with another `BlockMatrix`.
+`BlockMatrix` also has a helper function `validate` which can be used to check whether the
+`BlockMatrix` is set up properly.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be
+most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
+`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
+Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
+
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry}
+
+val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries
+// Create a CoordinateMatrix from an RDD[MatrixEntry].
+val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)
+// Transform the CoordinateMatrix to a BlockMatrix
+val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
+
+// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
+// Nothing happens if it is valid.
+matA.validate()
+
+// Calculate A^T A.
+val ata = matA.transpose.multiply(matA)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+A [`BlockMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) can be
+most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`.
+`toBlockMatrix` creates blocks of size 1024 x 1024 by default.
+Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`.
+
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
+import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
+import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
+
+JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
+// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
+CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
+// Transform the CoordinateMatrix to a BlockMatrix
+BlockMatrix matA = coordMat.toBlockMatrix().cache();
+
+// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
+// Nothing happens if it is valid.
+matA.validate();
+
+// Calculate A^T A.
+BlockMatrix ata = matA.transpose().multiply(matA);
+{% endhighlight %}
+</div>
+</div>
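As an aside (not part of the diff), a short Scala sketch of the `MatrixBlock` structure the section above describes: a `BlockMatrix` can also be built directly from `((Int, Int), Matrix)` tuples. This assumes an existing SparkContext `sc`; the block values are made up.

// Two 2x2 blocks laid out side by side: block index (0, 0) and block index (0, 1).
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.BlockMatrix

val blocks = sc.parallelize(Seq(
  ((0, 0), Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))),
  ((0, 1), Matrices.dense(2, 2, Array(5.0, 6.0, 7.0, 8.0)))))
val mat = new BlockMatrix(blocks, 2, 2)  // rowsPerBlock = 2, colsPerBlock = 2
println(s"${mat.numRows()} x ${mat.numCols()}")  // 2 x 4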

ec2/spark_ec2.py
Lines changed: 5 additions & 1 deletion

@@ -864,7 +864,11 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state):
         for i in cluster_instances:
             i.update()
 
-        statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances])
+        max_batch = 100
+        statuses = []
+        for j in xrange(0, len(cluster_instances), max_batch):
+            batch = [i.id for i in cluster_instances[j:j + max_batch]]
+            statuses.extend(conn.get_all_instance_status(instance_ids=batch))
 
         if cluster_state == 'ssh-ready':
             if all(i.state == 'running' for i in cluster_instances) and \
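The same batching idea, sketched in Scala purely for illustration: the diff caps each status request at 100 instance ids (presumably to stay within the EC2 API's per-request limits), so the id list is processed in chunks. The `describeStatus` function below is a stand-in for `conn.get_all_instance_status`, not a real API.

// Stand-in for the real status call; returns one status string per id.
def describeStatus(ids: Seq[String]): Seq[String] =
  ids.map(id => s"$id: running")

val instanceIds = (1 to 250).map(i => s"i-$i")  // hypothetical instance ids
val maxBatch = 100
// Query statuses in batches so no single request carries more than maxBatch ids.
val statuses = instanceIds.grouped(maxBatch).flatMap(describeStatus).toSeq
println(statuses.size)  // 250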

mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
Lines changed: 24 additions & 17 deletions

@@ -21,13 +21,11 @@ import java.lang.{Iterable => JIterable}
 
 import scala.collection.JavaConverters._
 
-import breeze.linalg.{Axis, DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
-import breeze.numerics.{exp => brzExp, log => brzLog}
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.{Logging, SparkContext, SparkException}
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, SparseVector, Vector}
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
@@ -50,6 +48,9 @@ class NaiveBayesModel private[mllib] (
     val modelType: String)
   extends ClassificationModel with Serializable with Saveable {
 
+  private val piVector = new DenseVector(pi)
+  private val thetaMatrix = new DenseMatrix(labels.size, theta(0).size, theta.flatten, true)
+
   private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
     this(labels, pi, theta, "Multinomial")
 
@@ -60,17 +61,18 @@ class NaiveBayesModel private[mllib] (
       theta: JIterable[JIterable[Double]]) =
     this(labels.asScala.toArray, pi.asScala.toArray, theta.asScala.toArray.map(_.asScala.toArray))
 
-  private val brzPi = new BDV[Double](pi)
-  private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t
-
   // Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
-  // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
+  // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
   // application of this condition (in predict function).
-  private val (brzNegTheta, brzNegThetaSum) = modelType match {
+  private val (thetaMinusNegTheta, negThetaSum) = modelType match {
     case "Multinomial" => (None, None)
    case "Bernoulli" =>
-      val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x))
-      (Option(negTheta), Option(brzSum(negTheta, Axis._1)))
+      val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
+      val ones = new DenseVector(Array.fill(thetaMatrix.numCols){1.0})
+      val thetaMinusNegTheta = thetaMatrix.map { value =>
+        value - math.log(1.0 - math.exp(value))
+      }
+      (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
    case _ =>
      // This should never happen.
      throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
@@ -85,17 +87,22 @@ class NaiveBayesModel private[mllib] (
   }
 
   override def predict(testData: Vector): Double = {
-    val brzData = testData.toBreeze
     modelType match {
       case "Multinomial" =>
-        labels(brzArgmax(brzPi + brzTheta * brzData))
+        val prob = thetaMatrix.multiply(testData)
+        BLAS.axpy(1.0, piVector, prob)
+        labels(prob.argmax)
      case "Bernoulli" =>
-        if (!brzData.forall(v => v == 0.0 || v == 1.0)) {
-          throw new SparkException(
-            s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
+        testData.foreachActive { (index, value) =>
+          if (value != 0.0 && value != 1.0) {
+            throw new SparkException(
+              s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
+          }
         }
-        labels(brzArgmax(brzPi +
-          (brzTheta - brzNegTheta.get) * brzData + brzNegThetaSum.get))
+        val prob = thetaMinusNegTheta.get.multiply(testData)
+        BLAS.axpy(1.0, piVector, prob)
+        BLAS.axpy(1.0, negThetaSum.get, prob)
+        labels(prob.argmax)
      case _ =>
        // This should never happen.
        throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
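For reference, the algebra behind the Bernoulli branch above (this derivation is not part of the diff; it assumes, as the code comments indicate, that `theta` stores log conditional probabilities log θ_ci and `pi` stores log class priors log π_c). For a 0/1 feature vector x,

\[
\log p(y = c \mid x) \;\propto\; \log\pi_c + \sum_i \bigl[ x_i \log\theta_{ci} + (1 - x_i)\log(1 - \theta_{ci}) \bigr]
= \log\pi_c + \sum_i x_i \bigl(\log\theta_{ci} - \log(1 - \theta_{ci})\bigr) + \sum_i \log(1 - \theta_{ci}).
\]

The first sum is what `thetaMinusNegTheta.get.multiply(testData)` computes, and the remaining two terms are added via the `BLAS.axpy` calls on `piVector` and `negThetaSum.get` before taking the argmax over classes.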
