@@ -39,7 +39,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
39
39
40
40
// Check: describeTopics() with all terms
41
41
val fullTopicSummary = model.describeTopics()
42
- assert(fullTopicSummary.size === tinyK)
42
+ assert(fullTopicSummary.length === tinyK)
43
43
fullTopicSummary.zip(tinyTopicDescription).foreach {
44
44
case ((algTerms, algTermWeights), (terms, termWeights)) =>
45
45
assert(algTerms === terms)
@@ -101,7 +101,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
101
101
// Check: per-doc topic distributions
102
102
val topicDistributions = model.topicDistributions.collect()
103
103
// Ensure all documents are covered.
104
- assert(topicDistributions.size === tinyCorpus.size )
104
+ assert(topicDistributions.length === tinyCorpus.length )
105
105
assert(tinyCorpus.map(_._1).toSet === topicDistributions.map(_._1).toSet)
106
106
// Ensure we have proper distributions
107
107
topicDistributions.foreach { case (docId, topicDistribution) =>
@@ -139,8 +139,8 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
139
139
val corpus = sc.parallelize(tinyCorpus, 2 )
140
140
val op = new OnlineLDAOptimizer ().initialize(corpus, lda)
141
141
op.setKappa(0.9876 ).setMiniBatchFraction(0.123 ).setTau_0(567 )
142
- assert(op.alpha == 0.5 ) // default 1.0 / k
143
- assert(op.eta == 0.5 ) // default 1.0 / k
142
+ assert(op.getAlpha == 0.5 ) // default 1.0 / k
143
+ assert(op.getEta == 0.5 ) // default 1.0 / k
144
144
assert(op.getKappa == 0.9876 )
145
145
assert(op.getMiniBatchFraction == 0.123 )
146
146
assert(op.getTau_0 == 567 )
@@ -154,14 +154,14 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
154
154
155
155
def docs : Array [(Long , Vector )] = Array (
156
156
Vectors .sparse(vocabSize, Array (0 , 1 , 2 ), Array (1 , 1 , 1 )), // apple, orange, banana
157
- Vectors .sparse(vocabSize, Array (3 , 4 , 5 ), Array (1 , 1 , 1 ))) // tiger, cat, dog
158
- .zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
157
+ Vectors .sparse(vocabSize, Array (3 , 4 , 5 ), Array (1 , 1 , 1 )) // tiger, cat, dog
158
+ ) .zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
159
159
val corpus = sc.parallelize(docs, 2 )
160
160
161
- // setGammaShape large so to avoid the stochastic impact.
161
+ // Set GammaShape large to avoid the stochastic impact.
162
162
val op = new OnlineLDAOptimizer ().setTau_0(1024 ).setKappa(0.51 ).setGammaShape(1e40 )
163
163
.setMiniBatchFraction(1 )
164
- val lda = new LDA ().setK(k).setMaxIterations(1 ).setOptimizer(op)
164
+ val lda = new LDA ().setK(k).setMaxIterations(1 ).setOptimizer(op).setSeed( 12345 )
165
165
166
166
val state = op.initialize(corpus, lda)
167
167
// override lambda to simulate an intermediate state
@@ -175,8 +175,8 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
175
175
176
176
// verify the result; note this generates an identical result to
177
177
// [[https://github.com/Blei-Lab/onlineldavb]]
178
- val topic1 = op.lambda (0 , :: ).inner.toArray.map(" %.4f" .format(_)).mkString(" , " )
179
- val topic2 = op.lambda (1 , :: ).inner.toArray.map(" %.4f" .format(_)).mkString(" , " )
178
+ val topic1 = op.getLambda (0 , :: ).inner.toArray.map(" %.4f" .format(_)).mkString(" , " )
179
+ val topic2 = op.getLambda (1 , :: ).inner.toArray.map(" %.4f" .format(_)).mkString(" , " )
180
180
assert(" 1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950" == topic1)
181
181
assert(" 0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050" == topic2)
182
182
}
@@ -186,7 +186,6 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
186
186
Vectors .sparse(6 , Array (0 , 1 ), Array (1 , 1 )),
187
187
Vectors .sparse(6 , Array (1 , 2 ), Array (1 , 1 )),
188
188
Vectors .sparse(6 , Array (0 , 2 ), Array (1 , 1 )),
189
-
190
189
Vectors .sparse(6 , Array (3 , 4 ), Array (1 , 1 )),
191
190
Vectors .sparse(6 , Array (3 , 5 ), Array (1 , 1 )),
192
191
Vectors .sparse(6 , Array (4 , 5 ), Array (1 , 1 ))
@@ -200,6 +199,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
200
199
.setTopicConcentration(0.01 )
201
200
.setMaxIterations(100 )
202
201
.setOptimizer(op)
202
+ .setSeed(12345 )
203
203
204
204
val ldaModel = lda.run(docs)
205
205
val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10 )
@@ -208,10 +208,10 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
208
208
}
209
209
210
210
// check distribution for each topic, typical distribution is (0.3, 0.3, 0.3, 0.02, 0.02, 0.02)
211
- topics.foreach( topic => {
212
- val smalls = topic.filter(t => ( t._2 < 0.1 ) ).map(_._2)
213
- assert(smalls.size == 3 && smalls.sum < 0.2 )
214
- })
211
+ topics.foreach { topic =>
212
+ val smalls = topic.filter(t => t._2 < 0.1 ).map(_._2)
213
+ assert(smalls.length == 3 && smalls.sum < 0.2 )
214
+ }
215
215
}
216
216
217
217
}
0 commit comments