Skip to content

Commit b01b262

Browse files
yanboliang authored and mengxr committed
[SPARK-9773] [ML] [PySpark] Add Python API for MultilayerPerceptronClassifier
Add Python API for ```MultilayerPerceptronClassifier```. Author: Yanbo Liang <[email protected]> Closes #8067 from yanboliang/SPARK-9773.
1 parent b656e61 commit b01b262

File tree

2 files changed

+140
-1
lines changed

2 files changed

+140
-1
lines changed

mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.spark.ml.classification
1919

20+
import scala.collection.JavaConverters._
21+
2022
import org.apache.spark.annotation.Experimental
2123
import org.apache.spark.ml.param.shared.{HasTol, HasMaxIter, HasSeed}
2224
import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor}
@@ -181,6 +183,13 @@ class MultilayerPerceptronClassificationModel private[ml] (
181183

182184
private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights)
183185

186+
/**
 * Exposes `layers` as a `java.util.List[Int]`: the array is copied into an
 * immutable Scala list and wrapped through `JavaConverters.asJava`.
 */
private[ml] def javaLayers: java.util.List[Int] = layers.toList.asJava
192+
184193
/**
185194
* Predict label for the given features.
186195
* This internal method is used to implement [[transform()]] and output [[predictionCol]].

python/pyspark/ml/classification.py

Lines changed: 131 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
__all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier',
2727
'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel',
2828
'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes',
29-
'NaiveBayesModel']
29+
'NaiveBayesModel', 'MultilayerPerceptronClassifier',
30+
'MultilayerPerceptronClassificationModel']
3031

3132

3233
@inherit_doc
@@ -755,6 +756,135 @@ def theta(self):
755756
return self._call_java("theta")
756757

757758

759+
@inherit_doc
class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
                                     HasMaxIter, HasTol, HasSeed):
    """
    Classifier trainer based on the Multilayer Perceptron.
    Each layer has sigmoid activation function, output layer has softmax.
    Number of inputs has to be equal to the size of feature vectors.
    Number of outputs has to be equal to the total number of labels.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([
    ...     (0.0, Vectors.dense([0.0, 0.0])),
    ...     (1.0, Vectors.dense([0.0, 1.0])),
    ...     (1.0, Vectors.dense([1.0, 0.0])),
    ...     (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])
    >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11)
    >>> model = mlp.fit(df)
    >>> model.layers
    [2, 5, 2]
    >>> model.weights.size
    27
    >>> testDF = sqlContext.createDataFrame([
    ...     (Vectors.dense([1.0, 0.0]),),
    ...     (Vectors.dense([0.0, 0.0]),)], ["features"])
    >>> model.transform(testDF).show()
    +---------+----------+
    | features|prediction|
    +---------+----------+
    |[1.0,0.0]|       1.0|
    |[0.0,0.0]|       0.0|
    +---------+----------+
    ...
    """

    # a placeholder to make it appear in the generated doc
    layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                   "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
                   "neurons and output layer of 10 neurons, default is [1, 1].")
    blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
                      "matrices. Data is stacked within partitions. If block size is more than " +
                      "remaining data in a partition then it is adjusted to the size of this " +
                      "data. Recommended size is between 10 and 1000, default is 128.")

    @keyword_only
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
        """
        __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
        """
        super(MultilayerPerceptronClassifier, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
        # Re-create the params bound to this instance; the class-level ones are
        # bound to a dummy parent and only exist for the generated docs.
        self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
                            "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
                            "100 neurons and output layer of 10 neurons, default is [1, 1].")
        self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
                               "matrices. Data is stacked within partitions. If block size is " +
                               "more than remaining data in a partition then it is adjusted to " +
                               "the size of this data. Recommended size is between 10 and 1000, " +
                               "default is 128.")
        self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
        """
        setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
        Sets params for MultilayerPerceptronClassifier.

        Only the keyword arguments that are actually passed are applied. Omitting
        ``layers`` leaves any previously set value in place; passing ``layers=None``
        explicitly selects ``[1, 1]``.
        """
        # NOTE(review): relies on ``keyword_only`` recording only the keyword
        # arguments the caller supplied in ``_input_kwargs`` -- confirm against
        # the decorator. Copy so the recorded dict is not mutated.
        kwargs = dict(self.setParams._input_kwargs)
        # Do not test the ``layers`` argument itself: it is None whenever the
        # caller omitted it, and resetting to [1, 1] in that case would clobber
        # a value set by an earlier call.
        if "layers" in kwargs and kwargs["layers"] is None:
            kwargs["layers"] = [1, 1]
        return self._set(**kwargs)

    def _create_model(self, java_model):
        """
        Wraps the given Java model in a :py:class:`MultilayerPerceptronClassificationModel`.
        """
        return MultilayerPerceptronClassificationModel(java_model)

    def setLayers(self, value):
        """
        Sets the value of :py:attr:`layers`.
        """
        self._paramMap[self.layers] = value
        return self

    def getLayers(self):
        """
        Gets the value of layers or its default value.
        """
        return self.getOrDefault(self.layers)

    def setBlockSize(self, value):
        """
        Sets the value of :py:attr:`blockSize`.
        """
        self._paramMap[self.blockSize] = value
        return self

    def getBlockSize(self):
        """
        Gets the value of blockSize or its default value.
        """
        return self.getOrDefault(self.blockSize)
866+
867+
868+
class MultilayerPerceptronClassificationModel(JavaModel):
    """
    Model produced by fitting a MultilayerPerceptronClassifier.
    """

    @property
    def layers(self):
        """
        Layer sizes, input and output layers included, as returned by the
        Java model's ``javaLayers`` member.
        """
        layer_sizes = self._call_java("javaLayers")
        return layer_sizes

    @property
    def weights(self):
        """
        Vector holding the weights of the model's layers, as returned by the
        Java model's ``weights`` member.
        """
        layer_weights = self._call_java("weights")
        return layer_weights
886+
887+
758888
if __name__ == "__main__":
759889
import doctest
760890
from pyspark.context import SparkContext

0 commit comments

Comments
 (0)