 package org.apache.spark.mllib.regression
 
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.mllib.feature.StandardScaler
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.optimization._
@@ -94,6 +95,22 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   protected var validateData: Boolean = true
 
+  /**
+   * Whether to perform feature scaling before model training to reduce the condition number,
+   * which can significantly help the optimizer converge faster. The scaling correction will be
+   * translated back into the resulting model weights, so it is transparent to users.
+   * Note: this technique is also used in the LIBSVM and GLMNET packages. Default false.
+   */
+  private var useFeatureScaling = false
+
+  /**
+   * Set whether the algorithm should use feature scaling to improve convergence during
+   * optimization.
+   */
+  private[mllib] def setFeatureScaling(useFeatureScaling: Boolean): this.type = {
+    this.useFeatureScaling = useFeatureScaling
+    this
+  }
+
   /**
    * Create a model given the weights and intercept
    */
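
Note on usage: because setFeatureScaling is private[mllib], the flag is an internal knob that a concrete algorithm opts into; callers of the public API never see the scaled space. Below is a minimal, hypothetical sketch of the pattern. The class name is invented (it is not the actual LogisticRegressionWithLBFGS source), and it must live under the org.apache.spark.mllib package for the package-private setter to be visible:

package org.apache.spark.mllib.classification

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}
import org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm

// Hypothetical algorithm that opts into feature scaling during construction.
class ScaledLogisticRegression
  extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

  override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)

  // Train in the scaled space; run() transparently converts the resulting
  // weights back to the original scale before createModel is called.
  setFeatureScaling(true)

  override protected def createModel(weights: Vector, intercept: Double) = {
    new LogisticRegressionModel(weights, intercept)
  }
}
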
@@ -137,11 +154,45 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
       throw new SparkException("Input validation failed.")
     }
 
+    /**
+     * Scaling to minimize the condition number:
+     *
+     * During the optimization process, the convergence (rate) depends on the condition number of
+     * the training dataset. Scaling the variables often reduces this condition number, thus
+     * improving the convergence rate dramatically. Without reducing the condition number, some
+     * training datasets that mix columns of very different scales may fail to converge at all.
+     *
+     * The GLMNET and LIBSVM packages perform this scaling to reduce the condition number, and
+     * return the weights in the original scale.
+     * See page 9 in http://cran.r-project.org/web/packages/glmnet/glmnet.pdf
+     *
+     * Here, if useFeatureScaling is enabled, we standardize the training features by dividing
+     * each column by its standard deviation (without subtracting the mean), and train the model
+     * in the scaled space. We then transform the coefficients from the scaled space back to the
+     * original scale, as GLMNET and LIBSVM do.
+     *
+     * Currently, this is only enabled in LogisticRegressionWithLBFGS.
+     */
+    val scaler = if (useFeatureScaling) {
+      (new StandardScaler).fit(input.map(x => x.features))
+    } else {
+      null
+    }
+
     // Prepend an extra variable consisting of all 1.0's for the intercept.
     val data = if (addIntercept) {
-      input.map(labeledPoint => (labeledPoint.label, appendBias(labeledPoint.features)))
+      if (useFeatureScaling) {
+        input.map(labeledPoint =>
+          (labeledPoint.label, appendBias(scaler.transform(labeledPoint.features))))
+      } else {
+        input.map(labeledPoint => (labeledPoint.label, appendBias(labeledPoint.features)))
+      }
     } else {
-      input.map(labeledPoint => (labeledPoint.label, labeledPoint.features))
+      if (useFeatureScaling) {
+        input.map(labeledPoint => (labeledPoint.label, scaler.transform(labeledPoint.features)))
+      } else {
+        input.map(labeledPoint => (labeledPoint.label, labeledPoint.features))
+      }
     }
 
     val initialWeightsWithIntercept = if (addIntercept) {
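
To make the standardization step concrete, here is a small self-contained sketch of the StandardScaler behavior the code above relies on: the no-argument constructor fits a model with withMean = false and withStd = true, so transform only divides each column by its standard deviation. The object name and the data are illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors

object ConditionNumberDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("scaler-demo").setMaster("local"))
    // Two columns on wildly different scales; unscaled, such data yields a
    // large condition number and slow (or failed) convergence.
    val features = sc.parallelize(Seq(
      Vectors.dense(1000.0, 0.01),
      Vectors.dense(2000.0, 0.02),
      Vectors.dense(3000.0, 0.03)))
    // fit() computes each column's standard deviation; since the mean is not
    // subtracted, transform only rescales the columns.
    val scaler = (new StandardScaler).fit(features)
    println(scaler.transform(Vectors.dense(2000.0, 0.02)))  // ~[2.0,2.0]
    sc.stop()
  }
}
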
@@ -153,13 +204,25 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     val weightsWithIntercept = optimizer.optimize(data, initialWeightsWithIntercept)
 
     val intercept = if (addIntercept) weightsWithIntercept(weightsWithIntercept.size - 1) else 0.0
-    val weights =
+    var weights =
       if (addIntercept) {
         Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1))
       } else {
         weightsWithIntercept
       }
 
+    /**
+     * The weights and intercept are trained in the scaled space; here we convert them back to
+     * the original scale.
+     *
+     * A little math shows that if we only standardize without subtracting the means, the
+     * intercept is unchanged, and w_i = w_i' / sigma_i, where w_i' is the coefficient in the
+     * scaled space, w_i is the coefficient in the original space, and sigma_i is the standard
+     * deviation of column i.
+     */
+    if (useFeatureScaling) {
+      weights = scaler.transform(weights)
+    }
+
     createModel(weights, intercept)
   }
 }
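
As a sanity check on the w_i = w_i' / sigma_i identity, a tiny sketch with invented values: if training in the scaled space produced w' = (6.0, 0.5) for columns with standard deviations (3.0, 0.25), the original-scale weights are (2.0, 2.0), which is exactly the element-wise division that scaler.transform performs on the weight vector above:

import org.apache.spark.mllib.linalg.Vectors

object RescaleCheck {
  def main(args: Array[String]): Unit = {
    // Weights learned in the scaled space, and the column standard deviations
    // (both invented for illustration).
    val wScaled = Vectors.dense(6.0, 0.5)
    val sigma = Vectors.dense(3.0, 0.25)

    // w_i = w_i' / sigma_i; the intercept needs no correction because the
    // means were never subtracted during scaling.
    val wOriginal = Vectors.dense(
      wScaled.toArray.zip(sigma.toArray).map { case (w, s) => w / s })

    println(wOriginal)  // [2.0,2.0]
  }
}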