Skip to content

Commit c3b8ce0

Browse files
committed
Adds predict() method
2 parents 2df336b + b99ecc4 commit c3b8ce0

File tree

3 files changed

+42
-1
lines changed

3 files changed

+42
-1
lines changed

examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,9 @@ object DenseGmmEM {
4747
println("weight=%f mu=%s sigma=\n%s\n" format
4848
(clusters.weight(i), clusters.mu(i), clusters.sigma(i)))
4949
}
50+
val (responsibility_matrix, cluster_labels) = clusters.predict(data)
51+
for(x <- cluster_labels.collect()){
52+
print(" " + x)
53+
}
5054
}
5155
}

mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.mllib.clustering
1919

20+
import org.apache.spark.rdd.RDD
2021
import org.apache.spark.mllib.linalg.Matrix
2122
import org.apache.spark.mllib.linalg.Vector
2223

@@ -38,4 +39,12 @@ class GaussianMixtureModel(
3839

3940
/** Number of Gaussians in the mixture. */
def k: Int = weight.length
42+
43+
/**
 * Maps the given points to their cluster indices.
 *
 * @param points RDD of input vectors to classify
 * @return a pair (responsibility matrix, cluster labels): the responsibility
 *         matrix holds, for each point, its membership weight in each of the
 *         k mixture components; the label is the index of the component with
 *         the highest membership weight for that point.
 */
def predict(points: RDD[Vector]): (RDD[Array[Double]], RDD[Int]) = {
  // Soft assignments: one Array[Double] of length k per input point.
  val responsibilityMatrix = new GaussianMixtureModelEM()
    .predictClusters(points, mu, sigma, weight, k)
  // Hard assignment: index of the maximum responsibility for each point.
  val clusterLabels = responsibilityMatrix.map(r => r.indexOf(r.max))
  (responsibilityMatrix, clusterLabels)
}
4150
}

mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix}
2121
import breeze.linalg.Transpose
2222

2323
import org.apache.spark.rdd.RDD
24-
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
24+
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
2525
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
2626
import org.apache.spark.{Accumulator, AccumulatorParam, SparkContext}
2727
import org.apache.spark.SparkContext.DoubleAccumulatorParam
@@ -208,6 +208,34 @@ class GaussianMixtureModelEM private (
208208
cov
209209
}
210210

211+
/**
212+
Given the input vectors, return the membership value of each vector
213+
to all mixture components.
214+
*/
215+
def predictClusters(points:RDD[Vector],mu:Array[Vector],sigma:Array[Matrix],
216+
weight:Array[Double],k:Int):RDD[Array[Double]]= {
217+
val ctx = points.sparkContext
218+
val dists = ctx.broadcast((0 until k).map(i =>
219+
new MultivariateGaussian(mu(i).toBreeze.toDenseVector,sigma(i).toBreeze.toDenseMatrix))
220+
.toArray)
221+
val weights = ctx.broadcast((0 until k).map(i => weight(i)).toArray)
222+
points.map(x=>compute_log_likelihood(x.toBreeze.toDenseVector,dists.value,weights.value,k))
223+
224+
}
225+
/**
 * Compute the normalized weighted density of a single point under each
 * mixture component: p(i) = eps + weight(i) * N(pt | mu(i), sigma(i)),
 * normalized so the k values sum to 1.
 *
 * NOTE(review): despite the name, no logarithm is taken here — this returns
 * normalized densities (responsibilities), not log-likelihoods.
 *
 * @param pt the point, as a dense Breeze vector
 * @param dists the k component distributions
 * @param weights the k mixing weights
 * @param k number of mixture components
 * @return array of length k of membership values for pt
 */
def compute_log_likelihood(
    pt: DenseDoubleVector,
    dists: Array[MultivariateGaussian],
    weights: Array[Double],
    k: Int): Array[Double] = {
  // eps guards against all densities underflowing to zero, which would
  // otherwise make the normalizing sum zero and the divisions NaN.
  val p = (0 until k).map(i => eps + weights(i) * dists(i).pdf(pt)).toArray
  val pSum = p.sum
  // Normalize in place so the memberships sum to 1.
  (0 until k).foreach(i => p(i) /= pSum)
  p
}
238+
211239
/** AccumulatorParam for Dense Breeze Vectors */
212240
private object DenseDoubleVectorAccumulatorParam extends AccumulatorParam[DenseDoubleVector] {
213241
def zero(initialVector: DenseDoubleVector): DenseDoubleVector = {

0 commit comments

Comments
 (0)