Commit 22eea62

Merge branch 'master' of github.com:apache/spark into maven-utils
2 parents: 05cd0de + 5a1a107

15 files changed: +1190 additions, -97 deletions


dev/run-tests
Lines changed: 1 addition & 1 deletion

@@ -236,7 +236,7 @@ echo "========================================================================="
 CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS
 
 # add path for python 3 in jenkins
-export PATH="${PATH}:/home/anaonda/envs/py3k/bin"
+export PATH="${PATH}:/home/anaconda/envs/py3k/bin"
 ./python/run-tests
 
 echo ""

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
Lines changed: 26 additions & 39 deletions
@@ -78,35 +78,29 @@ class LDA private (
    *
    * This is the parameter to a symmetric Dirichlet distribution.
    */
-  def getDocConcentration: Double = {
-    if (this.docConcentration == -1) {
-      (50.0 / k) + 1.0
-    } else {
-      this.docConcentration
-    }
-  }
+  def getDocConcentration: Double = this.docConcentration
 
   /**
    * Concentration parameter (commonly named "alpha") for the prior placed on documents'
    * distributions over topics ("theta").
    *
-   * This is the parameter to a symmetric Dirichlet distribution.
+   * This is the parameter to a symmetric Dirichlet distribution, where larger values
+   * mean more smoothing (more regularization).
    *
-   * This value should be > 1.0, where larger values mean more smoothing (more regularization).
    * If set to -1, then docConcentration is set automatically.
    * (default = -1 = automatic)
    *
-   * Automatic setting of parameter:
-   *  - For EM: default = (50 / k) + 1.
-   *  - The 50/k is common in LDA libraries.
-   *  - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
-   *
-   * Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
-   * but values in (0,1) are not yet supported.
+   * Optimizer-specific parameter settings:
+   *  - EM
+   *     - Value should be > 1.0
+   *     - default = (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
+   *       Asuncion et al. (2009), who recommend a +1 adjustment for EM.
+   *  - Online
+   *     - Value should be >= 0
+   *     - default = (1.0 / k), following the implementation from
+   *       [[https://github.com/Blei-Lab/onlineldavb]].
    */
   def setDocConcentration(docConcentration: Double): this.type = {
-    require(docConcentration > 1.0 || docConcentration == -1.0,
-      s"LDA docConcentration must be > 1.0 (or -1 for auto), but was set to $docConcentration")
     this.docConcentration = docConcentration
     this
   }
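
With this change, getDocConcentration simply echoes whatever was set, including the -1 sentinel; substituting an optimizer-specific default becomes the optimizer's job. As a rough sketch of what that resolution can look like (the resolveDocConcentration helper below is hypothetical, not part of this patch):

import org.apache.spark.mllib.clustering.LDA

// Hypothetical helper, not from this patch: resolve the -1 sentinel to the
// optimizer-specific default documented in the scaladoc above.
def resolveDocConcentration(lda: LDA, optimizerName: String): Double = {
  val alpha = lda.getDocConcentration
  if (alpha != -1.0) {
    alpha // an explicit user-supplied value passes through unchanged
  } else optimizerName.toLowerCase match {
    case "em" => (50.0 / lda.getK) + 1.0 // (50 / k) + 1, the EM default
    case "online" => 1.0 / lda.getK      // 1.0 / k, the online default
    case other => throw new IllegalArgumentException(s"Unknown optimizer: $other")
  }
}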
@@ -126,13 +120,7 @@ class LDA private (
    * Note: The topics' distributions over terms are called "beta" in the original LDA paper
    * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
    */
-  def getTopicConcentration: Double = {
-    if (this.topicConcentration == -1) {
-      1.1
-    } else {
-      this.topicConcentration
-    }
-  }
+  def getTopicConcentration: Double = this.topicConcentration
 
   /**
    * Concentration parameter (commonly named "beta" or "eta") for the prior placed on topics'
@@ -143,21 +131,20 @@ class LDA private (
    * Note: The topics' distributions over terms are called "beta" in the original LDA paper
    * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
    *
-   * This value should be > 0.0.
    * If set to -1, then topicConcentration is set automatically.
    * (default = -1 = automatic)
    *
-   * Automatic setting of parameter:
-   *  - For EM: default = 0.1 + 1.
-   *  - The 0.1 gives a small amount of smoothing.
-   *  - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
-   *
-   * Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
-   * but values in (0,1) are not yet supported.
+   * Optimizer-specific parameter settings:
+   *  - EM
+   *     - Value should be > 1.0
+   *     - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows
+   *       Asuncion et al. (2009), who recommend a +1 adjustment for EM.
+   *  - Online
+   *     - Value should be >= 0
+   *     - default = (1.0 / k), following the implementation from
+   *       [[https://github.com/Blei-Lab/onlineldavb]].
    */
   def setTopicConcentration(topicConcentration: Double): this.type = {
-    require(topicConcentration > 1.0 || topicConcentration == -1.0,
-      s"LDA topicConcentration must be > 1.0 (or -1 for auto), but was set to $topicConcentration")
     this.topicConcentration = topicConcentration
     this
   }
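
Note that both setters drop their eager require checks, since the valid range now depends on which optimizer consumes the value. A caller that wants to fail fast could add a guard along these lines (illustrative only; the checkedTopicConcentration helper is not part of this patch):

import org.apache.spark.mllib.clustering.LDA

// Illustrative only, not from this patch: reject values that are out of
// range for the chosen optimizer, per the scaladoc ranges above.
def checkedTopicConcentration(lda: LDA, beta: Double, optimizerName: String): LDA = {
  optimizerName.toLowerCase match {
    case "em" =>
      require(beta == -1.0 || beta > 1.0,
        s"EM expects topicConcentration > 1.0 (or -1 for auto), got $beta")
    case "online" =>
      require(beta == -1.0 || beta >= 0.0,
        s"Online expects topicConcentration >= 0 (or -1 for auto), got $beta")
    case other => throw new IllegalArgumentException(s"Unknown optimizer: $other")
  }
  lda.setTopicConcentration(beta)
}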
@@ -223,14 +210,15 @@ class LDA private (
 
   /**
    * Set the LDAOptimizer used to perform the actual calculation by algorithm name.
-   * Currently "em" is supported.
+   * Currently "em" and "online" are supported.
    */
   def setOptimizer(optimizerName: String): this.type = {
     this.ldaOptimizer =
       optimizerName.toLowerCase match {
         case "em" => new EMLDAOptimizer
+        case "online" => new OnlineLDAOptimizer
         case other =>
-          throw new IllegalArgumentException(s"Only em is supported but got $other.")
+          throw new IllegalArgumentException(s"Only em, online are supported but got $other.")
       }
     this
   }
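
After this patch the online optimizer can be selected by name, for example (a usage sketch; the parameter values are illustrative):

import org.apache.spark.mllib.clustering.LDA

// Usage sketch: selecting the newly supported online optimizer by name.
val lda = new LDA()
  .setK(10)
  .setOptimizer("online") // previously only "em" was accepted here
  .setMaxIterations(50)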
@@ -245,8 +233,7 @@ class LDA private (
    * @return Inferred LDA model
    */
   def run(documents: RDD[(Long, Vector)]): LDAModel = {
-    val state = ldaOptimizer.initialState(documents, k, getDocConcentration, getTopicConcentration,
-      seed, checkpointInterval)
+    val state = ldaOptimizer.initialize(documents, this)
     var iter = 0
     val iterationTimes = Array.fill[Double](maxIterations)(0)
     while (iter < maxIterations) {