@@ -227,8 +227,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
227
227
private var k : Int = 0
228
228
private var corpusSize : Long = 0
229
229
private var vocabSize : Int = 0
230
- private var alpha : Double = 0
231
- private var eta : Double = 0
230
+ private [clustering] var alpha : Double = 0
231
+ private [clustering] var eta : Double = 0
232
232
private var randomGenerator : java.util.Random = null
233
233
234
234
// Online LDA specific parameters
@@ -238,12 +238,11 @@ class OnlineLDAOptimizer extends LDAOptimizer {
238
238
239
239
// internal data structure
240
240
private var docs : RDD [(Long , Vector )] = null
241
- private var lambda : BDM [Double ] = null
242
- private var Elogbeta : BDM [Double ] = null
243
- private var expElogbeta : BDM [Double ] = null
241
+ private [clustering] var lambda : BDM [Double ] = null
244
242
245
243
// count of invocations of next(), which helps decide the weight for each iteration
246
244
private var iteration : Int = 0
245
+ private var gammaShape : Double = 100
247
246
248
247
/**
249
248
* A (positive) learning parameter that downweights early iterations. Larger values make early
@@ -295,7 +294,24 @@ class OnlineLDAOptimizer extends LDAOptimizer {
295
294
this
296
295
}
297
296
298
- override private [clustering] def initialize (docs : RDD [(Long , Vector )], lda : LDA ): LDAOptimizer = {
297
+ /**
298
+ * The function is for test only now. In the future, it can help support training stop/resume
299
+ */
300
+ private [clustering] def setLambda (lambda : BDM [Double ]): this .type = {
301
+ this .lambda = lambda
302
+ this
303
+ }
304
+
305
+ /**
306
+ * Used to control the gamma distribution. A larger value produces samples closer to 1.0.
307
+ */
308
+ private [clustering] def setGammaShape (shape : Double ): this .type = {
309
+ this .gammaShape = shape
310
+ this
311
+ }
312
+
313
+ override private [clustering] def initialize (docs : RDD [(Long , Vector )], lda : LDA ):
314
+ OnlineLDAOptimizer = {
299
315
this .k = lda.getK
300
316
this .corpusSize = docs.count()
301
317
this .vocabSize = docs.first()._2.size
@@ -307,26 +323,30 @@ class OnlineLDAOptimizer extends LDAOptimizer {
307
323
308
324
// Initialize the variational distribution q(beta|lambda)
309
325
this .lambda = getGammaMatrix(k, vocabSize)
310
- this .Elogbeta = dirichletExpectation(lambda)
311
- this .expElogbeta = exp(Elogbeta )
312
326
this .iteration = 0
313
327
this
314
328
}
315
329
330
+ override private [clustering] def next (): OnlineLDAOptimizer = {
331
+ val batch = docs.sample(withReplacement = true , miniBatchFraction, randomGenerator.nextLong())
332
+ if (batch.isEmpty()) return this
333
+ submitMiniBatch(batch)
334
+ }
335
+
336
+
316
337
/**
317
338
* Submit a subset (like 1%, decided by the miniBatchFraction) of the corpus to the Online LDA
318
339
* model, and it will update the topic distribution adaptively for the terms appearing in the
319
340
* subset.
320
341
*/
321
- override private [clustering] def next ( ): OnlineLDAOptimizer = {
342
+ private [clustering] def submitMiniBatch ( batch : RDD [( Long , Vector )] ): OnlineLDAOptimizer = {
322
343
iteration += 1
323
- val batch = docs.sample(withReplacement = true , miniBatchFraction, randomGenerator.nextLong())
324
- if (batch.isEmpty()) return this
325
-
326
344
val k = this .k
327
345
val vocabSize = this .vocabSize
328
- val expElogbeta = this .expElogbeta
346
+ val Elogbeta = dirichletExpectation(lambda)
347
+ val expElogbeta = exp(Elogbeta )
329
348
val alpha = this .alpha
349
+ val gammaShape = this .gammaShape
330
350
331
351
val stats : RDD [BDM [Double ]] = batch.mapPartitions { docs =>
332
352
val stat = BDM .zeros[Double ](k, vocabSize)
@@ -340,7 +360,7 @@ class OnlineLDAOptimizer extends LDAOptimizer {
340
360
}
341
361
342
362
// Initialize the variational distribution q(theta|gamma) for the mini-batch
343
- var gammad = new Gamma (100 , 1.0 / 100.0 ).samplesVector(k).t // 1 * K
363
+ var gammad = new Gamma (gammaShape , 1.0 / gammaShape ).samplesVector(k).t // 1 * K
344
364
var Elogthetad = digamma(gammad) - digamma(sum(gammad)) // 1 * K
345
365
var expElogthetad = exp(Elogthetad ) // 1 * K
346
366
val expElogbetad = expElogbeta(:: , ids).toDenseMatrix // K * ids
@@ -350,7 +370,7 @@ class OnlineLDAOptimizer extends LDAOptimizer {
350
370
val ctsVector = new BDV [Double ](cts).t // 1 * ids
351
371
352
372
// Iterate between gamma and phi until convergence
353
- while (meanchange > 1e-5 ) {
373
+ while (meanchange > 1e-3 ) {
354
374
val lastgamma = gammad
355
375
// 1*K 1 * ids ids * k
356
376
gammad = (expElogthetad :* ((ctsVector / phinorm) * expElogbetad.t)) + alpha
@@ -372,7 +392,10 @@ class OnlineLDAOptimizer extends LDAOptimizer {
372
392
Iterator (stat)
373
393
}
374
394
375
- val batchResult : BDM [Double ] = stats.reduce(_ += _)
395
+ val statsSum : BDM [Double ] = stats.reduce(_ += _)
396
+ val batchResult = statsSum :* expElogbeta
397
+
398
+ // Note that this is an optimization to avoid batch.count
376
399
update(batchResult, iteration, (miniBatchFraction * corpusSize).toInt)
377
400
this
378
401
}
@@ -384,28 +407,23 @@ class OnlineLDAOptimizer extends LDAOptimizer {
384
407
/**
385
408
* Update lambda based on the batch submitted. batchSize can be different for each iteration.
386
409
*/
387
- private def update (raw : BDM [Double ], iter : Int , batchSize : Int ): Unit = {
410
+ private [clustering] def update (stat : BDM [Double ], iter : Int , batchSize : Int ): Unit = {
388
411
val tau_0 = this .getTau_0
389
412
val kappa = this .getKappa
390
413
391
414
// weight of the mini-batch.
392
415
val weight = math.pow(tau_0 + iter, - kappa)
393
416
394
- // This step finishes computing the sufficient statistics for the M step
395
- val stat = raw :* expElogbeta
396
-
397
417
// Update lambda based on documents.
398
418
lambda = lambda * (1 - weight) +
399
419
(stat * (corpusSize.toDouble / batchSize.toDouble) + eta) * weight
400
- Elogbeta = dirichletExpectation(lambda)
401
- expElogbeta = exp(Elogbeta )
402
420
}
403
421
404
422
/**
405
423
* Get a random matrix to initialize lambda
406
424
*/
407
425
private def getGammaMatrix (row : Int , col : Int ): BDM [Double ] = {
408
- val gammaRandomGenerator = new Gamma (100 , 1.0 / 100.0 )
426
+ val gammaRandomGenerator = new Gamma (gammaShape , 1.0 / gammaShape )
409
427
val temp = gammaRandomGenerator.sample(row * col).toArray
410
428
new BDM [Double ](col, row, temp).t
411
429
}
0 commit comments