From 3ac8874c351a228b596dfac018eb1578876aebef Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Sun, 27 Jul 2014 22:23:06 -0400 Subject: [PATCH 1/9] Added support for regularizer and intercept parameters for linear regression method. --- .../mllib/api/python/PythonMLLibAPI.scala | 59 +++++++++++++++++++ python/pyspark/mllib/regression.py | 17 ++++++ 2 files changed, 76 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 954621ee8b933..a4170dbd8a20a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -24,6 +24,7 @@ import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} import org.apache.spark.mllib.classification._ import org.apache.spark.mllib.clustering._ import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} +import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.recommendation._ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.util.MLUtils @@ -261,6 +262,64 @@ class PythonMLLibAPI extends Serializable { initialWeightsBA) } + /** + * Java stub for Python mllib LinearRegressionWithSGD.train() function + * allowing users to define the regularizer and intercept parameters using L2 + * optimization. + */ + def trainLinearRegressionModelWithSGDL2Opt( + dataBytesJRDD: JavaRDD[Array[Byte]], + numIterations: Int, + stepSize: Double, + regParam: Double, + intercept: Boolean, + miniBatchFraction: Double, + initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { + val lrAlg = new LinearRegressionWithSGD() + lrAlg.setIntercept(intercept) + lrAlg.optimizer. + setNumIterations(numIterations). + setRegParam(regParam). + setStepSize(stepSize). 
+ setUpdater(new SquaredL2Updater) + trainRegressionModel( + (data, initialWeights) => + lrAlg.run( + data, + initialWeights), + dataBytesJRDD, + initialWeightsBA) + } + + /** + * Java stub for Python mllib LinearRegressionWithSGD.train() function + * allowing users to define the regularizer and intercept parameters using L1 + * optimization. + */ + def trainLinearRegressionModelWithSGDL1Opt( + dataBytesJRDD: JavaRDD[Array[Byte]], + numIterations: Int, + stepSize: Double, + regParam: Double, + intercept: Boolean, + miniBatchFraction: Double, + initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { + val lrAlg = new LinearRegressionWithSGD() + lrAlg.setIntercept(intercept) + lrAlg.optimizer. + setNumIterations(numIterations). + setRegParam(regParam). + setStepSize(stepSize). + setUpdater(new L1Updater) + trainRegressionModel( + (data, initialWeights) => + lrAlg.run( + data, + initialWeights), + dataBytesJRDD, + initialWeightsBA) + } + /** * Java stub for Python mllib LassoWithSGD.train() */ diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index b84bc531dec8c..fa50407909c7a 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -120,6 +120,23 @@ def train(cls, data, iterations=100, step=1.0, d._jrdd, iterations, step, miniBatchFraction, i) return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) + @classmethod + def trainL2Opt(cls, data, iterations=100, step=1.0, regParam=1.0, + intercept=False, miniBatchFraction=1.0, initialWeights=None): + """Train a linear regression model on the given data using L2 optimizer.""" + sc = data.context + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGDL2Opt( + d._jrdd, iterations, step, regParam, intercept, miniBatchFraction, i) + return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) + + @classmethod + def trainL1Opt(cls, data, iterations=100, 
step=1.0, regParam=1.0, + intercept=False, miniBatchFraction=1.0, initialWeights=None): + """Train a linear regression model on the given data using L1 optimizer.""" + sc = data.context + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGDL1Opt( + d._jrdd, iterations, step, regParam, intercept, miniBatchFraction, i) + return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an From 78853ec7fa680e8256e94a64c43dd4f7b886af89 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Wed, 30 Jul 2014 00:55:39 -0400 Subject: [PATCH 2/9] Providing intercept and regularizer functionality for linear methods in only one function. --- .../mllib/api/python/PythonMLLibAPI.scala | 74 +++++-------------- python/pyspark/mllib/regression.py | 44 +++++------ 2 files changed, 39 insertions(+), 79 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a4170dbd8a20a..fbce412c773ef 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -43,6 +43,16 @@ class PythonMLLibAPI extends Serializable { private val DENSE_MATRIX_MAGIC: Byte = 3 private val LABELED_POINT_MAGIC: Byte = 4 + /** + * Enumeration used to define the type of Regularizer + * used for linear methods. 
+ */ + object RegularizerType extends Serializable { + val L2 : Int = 0 + val L1 : Int = 1 + val NONE : Int = 2 + } + private[python] def deserializeDoubleVector(bytes: Array[Byte], offset: Int = 0): Vector = { require(bytes.length - offset >= 5, "Byte array too short") val magic = bytes(offset) @@ -245,33 +255,11 @@ class PythonMLLibAPI extends Serializable { * Java stub for Python mllib LinearRegressionWithSGD.train() */ def trainLinearRegressionModelWithSGD( - dataBytesJRDD: JavaRDD[Array[Byte]], - numIterations: Int, - stepSize: Double, - miniBatchFraction: Double, - initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { - trainRegressionModel( - (data, initialWeights) => - LinearRegressionWithSGD.train( - data, - numIterations, - stepSize, - miniBatchFraction, - initialWeights), - dataBytesJRDD, - initialWeightsBA) - } - - /** - * Java stub for Python mllib LinearRegressionWithSGD.train() function - * allowing users to define the regularizer and intercept parameters using L2 - * optimization. - */ - def trainLinearRegressionModelWithSGDL2Opt( dataBytesJRDD: JavaRDD[Array[Byte]], numIterations: Int, stepSize: Double, regParam: Double, + regType: Int, intercept: Boolean, miniBatchFraction: Double, initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { @@ -280,42 +268,14 @@ class PythonMLLibAPI extends Serializable { lrAlg.optimizer. setNumIterations(numIterations). setRegParam(regParam). - setStepSize(stepSize). 
- setUpdater(new SquaredL2Updater) + setStepSize(stepSize) + if (regType == RegularizerType.L2) + lrAlg.optimizer.setUpdater(new SquaredL2Updater) + else if (regType == RegularizerType.L1) + lrAlg.optimizer.setUpdater(new L1Updater) trainRegressionModel( (data, initialWeights) => - lrAlg.run( - data, - initialWeights), - dataBytesJRDD, - initialWeightsBA) - } - - /** - * Java stub for Python mllib LinearRegressionWithSGD.train() function - * allowing users to define the regularizer and intercept parameters using L1 - * optimization. - */ - def trainLinearRegressionModelWithSGDL1Opt( - dataBytesJRDD: JavaRDD[Array[Byte]], - numIterations: Int, - stepSize: Double, - regParam: Double, - intercept: Boolean, - miniBatchFraction: Double, - initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { - val lrAlg = new LinearRegressionWithSGD() - lrAlg.setIntercept(intercept) - lrAlg.optimizer. - setNumIterations(numIterations). - setRegParam(regParam). - setStepSize(stepSize). - setUpdater(new L1Updater) - trainRegressionModel( - (data, initialWeights) => - lrAlg.run( - data, - initialWeights), + lrAlg.run(data, initialWeights), dataBytesJRDD, initialWeightsBA) } diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index fa50407909c7a..644ae849d2033 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -109,33 +109,33 @@ class LinearRegressionModel(LinearRegressionModelBase): True """ +class RegularizerType(object): + L2 = 0 + L1 = 1 + NONE = 2 class LinearRegressionWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, - miniBatchFraction=1.0, initialWeights=None): + def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, + intercept=False, miniBatchFraction=1.0, initialWeights=None): """Train a linear regression model on the given data.""" sc = data.context - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, 
iterations, step, miniBatchFraction, i) - return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) - - @classmethod - def trainL2Opt(cls, data, iterations=100, step=1.0, regParam=1.0, - intercept=False, miniBatchFraction=1.0, initialWeights=None): - """Train a linear regression model on the given data using L2 optimizer.""" - sc = data.context - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGDL2Opt( - d._jrdd, iterations, step, regParam, intercept, miniBatchFraction, i) - return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) - - @classmethod - def trainL1Opt(cls, data, iterations=100, step=1.0, regParam=1.0, - intercept=False, miniBatchFraction=1.0, initialWeights=None): - """Train a linear regression model on the given data using L1 optimizer.""" - sc = data.context - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGDL1Opt( - d._jrdd, iterations, step, regParam, intercept, miniBatchFraction, i) + if regType is None: + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( + d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().NONE(), + intercept, miniBatchFraction, i) + elif regType == RegularizerType.L2: + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( + d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().L2(), + intercept, miniBatchFraction, i) + elif regType == RegularizerType.L1: + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( + d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().L1(), + intercept, miniBatchFraction, i) + elif regType == RegularizerType.NONE: + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( + d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().NONE(), + intercept, 
miniBatchFraction, i) return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): From b962744906857b8096df9beb119373b4a3bf4bf9 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Wed, 30 Jul 2014 20:24:42 -0400 Subject: [PATCH 3/9] Replaced the enum classes, with strings-keywords for defining the values of 'regType' parameter. --- .../mllib/api/python/PythonMLLibAPI.scala | 16 ++-------- python/pyspark/mllib/regression.py | 32 +++++++++---------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index fbce412c773ef..b4f3b23199ac5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -43,16 +43,6 @@ class PythonMLLibAPI extends Serializable { private val DENSE_MATRIX_MAGIC: Byte = 3 private val LABELED_POINT_MAGIC: Byte = 4 - /** - * Enumeration used to define the type of Regularizer - * used for linear methods. - */ - object RegularizerType extends Serializable { - val L2 : Int = 0 - val L1 : Int = 1 - val NONE : Int = 2 - } - private[python] def deserializeDoubleVector(bytes: Array[Byte], offset: Int = 0): Vector = { require(bytes.length - offset >= 5, "Byte array too short") val magic = bytes(offset) @@ -259,7 +249,7 @@ class PythonMLLibAPI extends Serializable { numIterations: Int, stepSize: Double, regParam: Double, - regType: Int, + regType: String, intercept: Boolean, miniBatchFraction: Double, initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { @@ -269,9 +259,9 @@ class PythonMLLibAPI extends Serializable { setNumIterations(numIterations). setRegParam(regParam). 
setStepSize(stepSize) - if (regType == RegularizerType.L2) + if (regType == "SquaredUpdater") lrAlg.optimizer.setUpdater(new SquaredL2Updater) - else if (regType == RegularizerType.L1) + else if (regType == "L1Updater") lrAlg.optimizer.setUpdater(new L1Updater) trainRegressionModel( (data, initialWeights) => diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 644ae849d2033..e14952f67f5e7 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -109,33 +109,31 @@ class LinearRegressionModel(LinearRegressionModelBase): True """ -class RegularizerType(object): - L2 = 0 - L1 = 1 - NONE = 2 - class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, intercept=False, miniBatchFraction=1.0, initialWeights=None): - """Train a linear regression model on the given data.""" + """Train a linear regression model on the given data. The 'regType' parameter can take + one from the following string values: "L1Updater" for invoking the lasso regularizer, + "SquaredUpdater" for invoking the ridge regularizer or "NONE" for not using a + regularizer at all. 
The user can determine the regularizer parameter by setting the + appropriate value to variable 'regParam' (by default is set to 1.0).""" sc = data.context if regType is None: train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().NONE(), - intercept, miniBatchFraction, i) - elif regType == RegularizerType.L2: + d._jrdd, iterations, step, regParam, "NONE", intercept, miniBatchFraction, i) + elif regType == "SquaredUpdater": train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().L2(), - intercept, miniBatchFraction, i) - elif regType == RegularizerType.L1: + d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) + elif regType == "L1Updater": train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().L1(), - intercept, miniBatchFraction, i) - elif regType == RegularizerType.NONE: + d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) + elif regType == "NONE": train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, sc._jvm.PythonMLLibAPI().RegularizerType().NONE(), - intercept, miniBatchFraction, i) + d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) + else: + raise ValueError("Invalid value for 'regType' parameter. 
Can only be initialized " + + "using the following string values [L1Updater, SquaredUpdater, NONE].") return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): From ec50ee90dd02d62e31fa600d3d76dd4db01b695e Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Wed, 30 Jul 2014 20:55:10 -0400 Subject: [PATCH 4/9] Shorten the if-elif-else statement in regression.py file --- python/pyspark/mllib/regression.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index e14952f67f5e7..dbf0600f1db8b 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -122,13 +122,7 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, if regType is None: train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( d._jrdd, iterations, step, regParam, "NONE", intercept, miniBatchFraction, i) - elif regType == "SquaredUpdater": - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) - elif regType == "L1Updater": - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) - elif regType == "NONE": + elif regType == "SquaredUpdater" or regType == "L1Updater" or regType == "NONE": train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) else: From 638be479c474dbb91cdbffdf0f5e8568877a9288 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Wed, 30 Jul 2014 23:16:07 -0400 Subject: [PATCH 5/9] Modified code to comply with code standards. 
--- .../mllib/api/python/PythonMLLibAPI.scala | 12 +++---- python/pyspark/mllib/regression.py | 34 ++++++++++++++----- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index b4f3b23199ac5..0ab20777e9b7a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -255,13 +255,13 @@ class PythonMLLibAPI extends Serializable { initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { val lrAlg = new LinearRegressionWithSGD() lrAlg.setIntercept(intercept) - lrAlg.optimizer. - setNumIterations(numIterations). - setRegParam(regParam). - setStepSize(stepSize) - if (regType == "SquaredUpdater") + lrAlg.optimizer + .setNumIterations(numIterations) + .setRegParam(regParam) + .setStepSize(stepSize) + if (regType == "l2") lrAlg.optimizer.setUpdater(new SquaredL2Updater) - else if (regType == "L1Updater") + else if (regType == "l1") lrAlg.optimizer.setUpdater(new L1Updater) trainRegressionModel( (data, initialWeights) => diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index dbf0600f1db8b..a4a13f214ffe8 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -113,21 +113,39 @@ class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, intercept=False, miniBatchFraction=1.0, initialWeights=None): - """Train a linear regression model on the given data. The 'regType' parameter can take - one from the following string values: "L1Updater" for invoking the lasso regularizer, - "SquaredUpdater" for invoking the ridge regularizer or "NONE" for not using a - regularizer at all. 
The user can determine the regularizer parameter by setting the - appropriate value to variable 'regParam' (by default is set to 1.0).""" + """ + Train a linear regression model on the given data. + + @param data: The training data. + @param iterations: The number of iterations (default: 100). + @param step: The step parameter used in SGD + (default: 1.0). + @param regParam: The regularizer parameter (default: 1.0). + @param regType: The type of regularizer used for training + our model. + Allowed values: "l1" for using L1Updater, + "l2" for using + SquaredL2Updater, + "none" for no regularizer. + (default: None) + @param intercept: Boolean parameter which indicates the use + or not of the augmented representation for + training data (i.e. whether bias features + are activated or not). + @param miniBatchFraction: Fraction of data to be used for each SGD + iteration. + @param initialWeights: The initial weights (default: None). + """ sc = data.context if regType is None: train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, "NONE", intercept, miniBatchFraction, i) - elif regType == "SquaredUpdater" or regType == "L1Updater" or regType == "NONE": + d._jrdd, iterations, step, regParam, "none", intercept, miniBatchFraction, i) + elif regType == "l2" or regType == "l1" or regType == "none": train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) else: raise ValueError("Invalid value for 'regType' parameter. 
Can only be initialized " + - "using the following string values [L1Updater, SquaredUpdater, NONE].") + "using the following string values: [l1, l2, none].") return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): From 8eba9c57ba6d295e6f465ea08f862624a87afbac Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Fri, 1 Aug 2014 19:13:46 -0400 Subject: [PATCH 6/9] Change function signatures. Exception is thrown from the scala component and not from the python one. --- .../mllib/api/python/PythonMLLibAPI.scala | 9 ++++--- python/pyspark/mllib/regression.py | 24 ++++++++----------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 0ab20777e9b7a..729d639c86e7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -248,11 +248,11 @@ class PythonMLLibAPI extends Serializable { dataBytesJRDD: JavaRDD[Array[Byte]], numIterations: Int, stepSize: Double, + miniBatchFraction: Double, + initialWeightsBA: Array[Byte], regParam: Double, regType: String, - intercept: Boolean, - miniBatchFraction: Double, - initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = { + intercept: Boolean): java.util.List[java.lang.Object] = { val lrAlg = new LinearRegressionWithSGD() lrAlg.setIntercept(intercept) lrAlg.optimizer @@ -263,6 +263,9 @@ class PythonMLLibAPI extends Serializable { lrAlg.optimizer.setUpdater(new SquaredL2Updater) else if (regType == "l1") lrAlg.optimizer.setUpdater(new L1Updater) + else if (regType != "none") + throw new java.lang.IllegalArgumentException("Invalid value for 'regType' parameter." 
+ + "Can only be initialized using the following string values: [l1, l2, none].") trainRegressionModel( (data, initialWeights) => lrAlg.run(data, initialWeights), diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index a4a13f214ffe8..32a50d367a9d2 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -111,8 +111,8 @@ class LinearRegressionModel(LinearRegressionModelBase): class LinearRegressionWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, - intercept=False, miniBatchFraction=1.0, initialWeights=None): + def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, + initialWeights=None, regParam=1.0, regType=None, intercept=False): """ Train a linear regression model on the given data. @@ -120,6 +120,9 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, @param iterations: The number of iterations (default: 100). @param step: The step parameter used in SGD (default: 1.0). + @param miniBatchFraction: Fraction of data to be used for each SGD + iteration. + @param initialWeights: The initial weights (default: None). @param regParam: The regularizer parameter (default: 1.0). @param regType: The type of regularizer used for training our model. @@ -127,27 +130,20 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0, regType=None, "l2" for using SquaredL2Updater, "none" for no regularizer. - (default: None) + (default: "none") @param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias features are activated or not). - @param miniBatchFraction: Fraction of data to be used for each SGD - iteration. - @param initialWeights: The initial weights (default: None). 
""" sc = data.context if regType is None: - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, "none", intercept, miniBatchFraction, i) - elif regType == "l2" or regType == "l1" or regType == "none": - train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, regParam, regType, intercept, miniBatchFraction, i) - else: - raise ValueError("Invalid value for 'regType' parameter. Can only be initialized " + - "using the following string values: [l1, l2, none].") + regType = "none" + train_f = lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( + d._jrdd, iterations, step, miniBatchFraction, i, regParam, regType, intercept) return _regression_train_wrapper(sc, train_f, LinearRegressionModel, data, initialWeights) + class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_1 penalty term. From 44e6ff0ab39d3937a0553116acabe9ea3ad6107d Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Fri, 1 Aug 2014 19:19:04 -0400 Subject: [PATCH 7/9] Adding a blank line before python class LinearRegressionWithSGD. --- python/pyspark/mllib/regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 32a50d367a9d2..041b119269427 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -109,6 +109,7 @@ class LinearRegressionModel(LinearRegressionModelBase): True """ + class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, From fed8eaa95202e99ab075396bde1bf4cc56c474e7 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Fri, 1 Aug 2014 19:36:54 -0400 Subject: [PATCH 8/9] Adding a space in the message related to the IllegalArgumentException. 
--- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 729d639c86e7e..785ccef51d2a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -265,7 +265,7 @@ class PythonMLLibAPI extends Serializable { lrAlg.optimizer.setUpdater(new L1Updater) else if (regType != "none") throw new java.lang.IllegalArgumentException("Invalid value for 'regType' parameter." - + "Can only be initialized using the following string values: [l1, l2, none].") + + " Can only be initialized using the following string values: [l1, l2, none].") trainRegressionModel( (data, initialWeights) => lrAlg.run(data, initialWeights), From 8dcb888048d1b7879f4a253ec58fee99829c7e37 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Fri, 1 Aug 2014 21:03:45 -0400 Subject: [PATCH 9/9] Putting the if/else if statements in brackets. 
--- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 785ccef51d2a6..82a993d03119b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -259,13 +259,14 @@ class PythonMLLibAPI extends Serializable { .setNumIterations(numIterations) .setRegParam(regParam) .setStepSize(stepSize) - if (regType == "l2") + if (regType == "l2") { lrAlg.optimizer.setUpdater(new SquaredL2Updater) - else if (regType == "l1") + } else if (regType == "l1") { lrAlg.optimizer.setUpdater(new L1Updater) - else if (regType != "none") + } else if (regType != "none") { throw new java.lang.IllegalArgumentException("Invalid value for 'regType' parameter." + " Can only be initialized using the following string values: [l1, l2, none].") + } trainRegressionModel( (data, initialWeights) => lrAlg.run(data, initialWeights),