 package org.apache.spark.mllib.stat.correlation
 
+import org.apache.spark.storage.StorageLevel
+
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.{Logging, HashPartitioner}
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.{DenseVector, Matrix, Vector}
+import org.apache.spark.mllib.linalg.{Vectors, DenseVector, Matrix, Vector}
 import org.apache.spark.rdd.{CoGroupedRDD, RDD}
 
 /**
@@ -48,82 +50,57 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging {
    * numCol RDD[Double]s, each of which is sorted, and then joined back into a single RDD[Vector].
    */
   override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
-    val indexed = X.zipWithUniqueId()
-
-    val numCols = X.first.size
-    if (numCols > 50) {
-      logWarning("Computing the Spearman correlation matrix can be slow for large RDDs with more"
-        + " than 50 columns.")
-    }
-    val ranks = new Array[RDD[(Long, Double)]](numCols)
-
-    // Note: we use a for loop here instead of a while loop with a single index variable
-    // to avoid race condition caused by closure serialization
-    for (k <- 0 until numCols) {
-      val column = indexed.map { case (vector, index) => (vector(k), index) }
-      ranks(k) = getRanks(column)
-    }
-
-    val ranksMat: RDD[Vector] = makeRankMatrix(ranks, X)
-    PearsonCorrelation.computeCorrelationMatrix(ranksMat)
-  }
-
-  /**
-   * Compute the ranks for elements in the input RDD, using the average method for ties.
-   *
-   * With the average method, elements with the same value receive the same rank that's computed
-   * by taking the average of their positions in the sorted list.
-   * e.g. ranks([2, 1, 0, 2]) = [2.5, 1.0, 0.0, 2.5]
-   * Note that positions here are 0-indexed rather than 1-indexed as in the standard definition
-   * of Spearman's correlation. This does not affect the final results and is slightly more
-   * performant.
-   *
-   * @param indexed RDD[(Double, Long)] containing pairs of the format (originalValue, uniqueId)
-   * @return RDD[(Long, Double)] containing pairs of the format (uniqueId, rank), where uniqueId
-   *         is copied from the input RDD.
-   */
-  private def getRanks(indexed: RDD[(Double, Long)]): RDD[(Long, Double)] = {
-    // Get elements' positions in the sorted list for computing average rank for duplicate values
-    val sorted = indexed.sortByKey().zipWithIndex()
-
-    val ranks: RDD[(Long, Double)] = sorted.mapPartitions { iter =>
-      // add an extra element to signify the end of the list so that flatMap can flush the last
-      // batch of duplicates
-      val end = -1L
-      val padded = iter ++ Iterator[((Double, Long), Long)](((Double.NaN, end), end))
-      val firstEntry = padded.next()
-      var lastVal = firstEntry._1._1
-      var firstRank = firstEntry._2.toDouble
-      val idBuffer = ArrayBuffer(firstEntry._1._2)
-      padded.flatMap { case ((v, id), rank) =>
-        if (v == lastVal && id != end) {
-          idBuffer += id
-          Iterator.empty
+    // Transpose the input into ((columnIndex, value), rowUid) entries so that one global
+    // sort, instead of one sort per column, ranks all columns at once.
+    val transposed = X.zipWithUniqueId().flatMap { case (vec, uid) =>
+      vec.toArray.view.zipWithIndex.map { case (v, j) =>
+        ((j, v), uid)
+      }
+    }.persist(StorageLevel.MEMORY_AND_DISK)
+    val sorted = transposed.sortByKey().persist(StorageLevel.MEMORY_AND_DISK)
+    val ranked = sorted.zipWithIndex().mapPartitions { iter =>
+      var preCol = -1
+      var preVal = Double.NaN
+      var startRank = -1.0
+      val cachedIds = ArrayBuffer.empty[Long]
+      // Emit the buffered ids for the current run of tied values with their average rank.
+      def flush(): Iterable[(Long, (Int, Double))] = {
+        val averageRank = startRank + (cachedIds.size - 1) / 2.0
+        val output = cachedIds.map { i =>
+          (i, (preCol, averageRank))
+        }
+        cachedIds.clear()
+        output
+      }
+      iter.flatMap { case (((j, v), uid), rank) =>
+        // A new column or a new value ends the current run of ties.
+        if (j != preCol || v != preVal) {
+          val output = flush()
+          preCol = j
+          preVal = v
+          startRank = rank
+          cachedIds += uid
+          output
         } else {
-          val entries = if (idBuffer.size == 1) {
-            Iterator((idBuffer(0), firstRank))
-          } else {
-            val averageRank = firstRank + (idBuffer.size - 1.0) / 2.0
-            idBuffer.map(id => (id, averageRank))
-          }
-          lastVal = v
-          firstRank = rank
-          idBuffer.clear()
-          idBuffer += id
-          entries
+          cachedIds += uid
+          Iterator.empty
         }
+      } ++ {
+        // Flush the final run of ties at the end of the partition.
+        flush()
       }
     }
-    ranks
-  }
-
-  private def makeRankMatrix(ranks: Array[RDD[(Long, Double)]], input: RDD[Vector]): RDD[Vector] = {
-    val partitioner = new HashPartitioner(input.partitions.size)
-    val cogrouped = new CoGroupedRDD[Long](ranks, partitioner)
-    cogrouped.map {
-      case (_, values: Array[Iterable[_]]) =>
-        val doubles = values.asInstanceOf[Array[Iterable[Double]]]
-        new DenseVector(doubles.flatten.toArray)
+    // Reassemble each row from its (columnIndex, rank) pairs, ordered by column index.
+    val ranks = ranked.groupByKey().map { case (uid, iter) =>
+      val values = iter.toSeq.sortBy(_._1).map(_._2).toArray
+      Vectors.dense(values)
     }
+    val corrMatrix = PearsonCorrelation.computeCorrelationMatrix(ranks)
+
+    transposed.unpersist(blocking = false)
+    sorted.unpersist(blocking = false)
+
+    corrMatrix
   }
 }
+
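The tie handling in the added `flush`-based pass reuses the average-rank rule documented in the removed `getRanks` scaladoc: every value in a run of ties gets the mean of the run's 0-based sort positions, i.e. `startRank + (runLength - 1) / 2.0`. The following standalone sketch (not part of the patch; `RankSketch` and `ranks` are illustrative names) applies the same rule to a local array and reproduces the removed doc comment's example, `ranks([2, 1, 0, 2]) = [2.5, 1.0, 0.0, 2.5]`:

```scala
// Standalone sketch, not part of the patch: average ranks with ties,
// 0-indexed as in the removed getRanks doc comment.
object RankSketch {
  def ranks(values: Array[Double]): Array[Double] = {
    val out = new Array[Double](values.length)
    // Pair each value with its original position, then sort by value,
    // mirroring sortByKey().zipWithIndex() in the RDD version.
    val sorted = values.zipWithIndex.sortBy(_._1)
    var i = 0
    while (i < sorted.length) {
      // Find the end of the current run of tied values.
      var j = i
      while (j < sorted.length && sorted(j)._1 == sorted(i)._1) {
        j += 1
      }
      // Same formula as flush(): startRank + (runLength - 1) / 2.0
      val averageRank = i + (j - i - 1) / 2.0
      var k = i
      while (k < j) {
        out(sorted(k)._2) = averageRank
        k += 1
      }
      i = j
    }
    out
  }

  def main(args: Array[String]): Unit = {
    // Expected per the removed doc comment: [2.5, 1.0, 0.0, 2.5]
    println(ranks(Array(2.0, 1.0, 0.0, 2.0)).mkString("[", ", ", "]"))
  }
}
```

As the removed doc comment noted, using 0-based instead of 1-based ranks shifts every rank in a column by the same constant, which leaves the Pearson correlation of the rank vectors, and hence the Spearman result, unchanged.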
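For end-to-end use, callers normally go through the public `Statistics.corr` API rather than invoking the private `SpearmanCorrelation` object directly. A minimal sketch, assuming a running `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

// Three rows, three columns; assumes `sc` is an existing SparkContext.
val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Dispatches to SpearmanCorrelation.computeCorrelationMatrix for this method name.
val corr = Statistics.corr(data, "spearman")
println(corr)
```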