@@ -78,35 +78,29 @@ class LDA private (
78
78
*
79
79
* This is the parameter to a symmetric Dirichlet distribution.
80
80
*/
81
- def getDocConcentration : Double = {
82
- if (this .docConcentration == - 1 ) {
83
- (50.0 / k) + 1.0
84
- } else {
85
- this .docConcentration
86
- }
87
- }
81
+ def getDocConcentration : Double = this .docConcentration
88
82
89
83
/**
90
84
* Concentration parameter (commonly named "alpha") for the prior placed on documents'
91
85
* distributions over topics ("theta").
92
86
*
93
- * This is the parameter to a symmetric Dirichlet distribution.
87
+ * This is the parameter to a symmetric Dirichlet distribution, where larger values
88
+ * mean more smoothing (more regularization).
94
89
*
95
- * This value should be > 1.0, where larger values mean more smoothing (more regularization).
96
90
* If set to -1, then docConcentration is set automatically.
97
91
* (default = -1 = automatic)
98
92
*
99
- * Automatic setting of parameter:
100
- * - For EM: default = (50 / k) + 1.
101
- * - The 50/k is common in LDA libraries.
102
- * - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
103
- *
104
- * Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
105
- * but values in (0,1) are not yet supported.
93
+ * Optimizer-specific parameter settings:
94
+ * - EM
95
+ * - Value should be > 1.0
96
+ * - default = (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
97
+ * Asuncion et al. (2009), who recommend a +1 adjustment for EM.
98
+ * - Online
99
+ * - Value should be >= 0
100
+ * - default = (1.0 / k), following the implementation from
101
+ * [[https://github.com/Blei-Lab/onlineldavb]].
106
102
*/
107
103
def setDocConcentration (docConcentration : Double ): this .type = {
108
- require(docConcentration > 1.0 || docConcentration == - 1.0 ,
109
- s " LDA docConcentration must be > 1.0 (or -1 for auto), but was set to $docConcentration" )
110
104
this .docConcentration = docConcentration
111
105
this
112
106
}
@@ -126,13 +120,7 @@ class LDA private (
126
120
* Note: The topics' distributions over terms are called "beta" in the original LDA paper
127
121
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
128
122
*/
129
- def getTopicConcentration : Double = {
130
- if (this .topicConcentration == - 1 ) {
131
- 1.1
132
- } else {
133
- this .topicConcentration
134
- }
135
- }
123
+ def getTopicConcentration : Double = this .topicConcentration
136
124
137
125
/**
138
126
* Concentration parameter (commonly named "beta" or "eta") for the prior placed on topics'
@@ -143,21 +131,20 @@ class LDA private (
143
131
* Note: The topics' distributions over terms are called "beta" in the original LDA paper
144
132
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
145
133
*
146
- * This value should be > 0.0.
147
134
* If set to -1, then topicConcentration is set automatically.
148
135
* (default = -1 = automatic)
149
136
*
150
- * Automatic setting of parameter:
151
- * - For EM: default = 0.1 + 1.
152
- * - The 0.1 gives a small amount of smoothing.
153
- * - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
154
- *
155
- * Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
156
- * but values in (0,1) are not yet supported.
137
+ * Optimizer-specific parameter settings:
138
+ * - EM
139
+ * - Value should be > 1.0
140
+ * - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows
141
+ * Asuncion et al. (2009), who recommend a +1 adjustment for EM.
142
+ * - Online
143
+ * - Value should be >= 0
144
+ * - default = (1.0 / k), following the implementation from
145
+ * [[https://github.com/Blei-Lab/onlineldavb]].
157
146
*/
158
147
def setTopicConcentration (topicConcentration : Double ): this .type = {
159
- require(topicConcentration > 1.0 || topicConcentration == - 1.0 ,
160
- s " LDA topicConcentration must be > 1.0 (or -1 for auto), but was set to $topicConcentration" )
161
148
this .topicConcentration = topicConcentration
162
149
this
163
150
}
@@ -223,14 +210,15 @@ class LDA private (
223
210
224
211
/**
225
212
* Set the LDAOptimizer used to perform the actual calculation by algorithm name.
226
- * Currently "em" is supported.
213
+ * Currently "em" and "online" are supported.
227
214
*/
228
215
def setOptimizer (optimizerName : String ): this .type = {
229
216
this .ldaOptimizer =
230
217
optimizerName.toLowerCase match {
231
218
case " em" => new EMLDAOptimizer
219
+ case " online" => new OnlineLDAOptimizer
232
220
case other =>
233
- throw new IllegalArgumentException (s " Only em is supported but got $other. " )
221
+ throw new IllegalArgumentException (s " Only em, online are supported but got $other. " )
234
222
}
235
223
this
236
224
}
@@ -245,8 +233,7 @@ class LDA private (
245
233
* @return Inferred LDA model
246
234
*/
247
235
def run (documents : RDD [(Long , Vector )]): LDAModel = {
248
- val state = ldaOptimizer.initialState(documents, k, getDocConcentration, getTopicConcentration,
249
- seed, checkpointInterval)
236
+ val state = ldaOptimizer.initialize(documents, this )
250
237
var iter = 0
251
238
val iterationTimes = Array .fill[Double ](maxIterations)(0 )
252
239
while (iter < maxIterations) {
0 commit comments