@@ -51,16 +51,16 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging {
51
51
vec.toArray.view.zipWithIndex.map { case (v, j) =>
52
52
((j, v), uid)
53
53
}
54
- }.persist( StorageLevel . MEMORY_AND_DISK ) // used by sortByKey
54
+ }
55
55
// global sort by (columnIndex, value)
56
- val sorted = colBased.sortByKey().persist( StorageLevel . MEMORY_AND_DISK ) // used by zipWithIndex
56
+ val sorted = colBased.sortByKey()
57
57
// Assign global ranks (using average ranks for tied values)
58
58
val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
59
59
var preCol = - 1
60
60
var preVal = Double .NaN
61
61
var startRank = - 1.0
62
62
var cachedIds = ArrayBuffer .empty[Long ]
63
- def flush () : Iterable [(Long , (Int , Double ))] = {
63
+ def flush : () => Iterable [(Long , (Int , Double ))] = () => {
64
64
val averageRank = startRank + (cachedIds.size - 1 ) / 2.0
65
65
val output = cachedIds.map { i =>
66
66
(i, (preCol, averageRank))
@@ -69,7 +69,8 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging {
69
69
output
70
70
}
71
71
iter.flatMap { case (((j, v), uid), rank) =>
72
- if (j != preCol || v != preVal) {
72
+ // If we see a new value or cachedIds is too big, we flush ids with their average rank.
73
+ if (j != preCol || v != preVal || cachedIds.size >= 10000000 ) {
73
74
val output = flush()
74
75
preCol = j
75
76
preVal = v
0 commit comments