[SPARK-9773] [ML] [PySpark] Add Python API for MultilayerPerceptronClassifier #8067
Changes from all commits: 70d1da9, 8510817, abec976, 8c94570, db3c676, b093862, 5ac6a70
@@ -26,7 +26,8 @@
__all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier',
           'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel',
           'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes',
-          'NaiveBayesModel']
+          'NaiveBayesModel', 'MultilayerPerceptronClassifier',
+          'MultilayerPerceptronClassificationModel']
@@ -808,6 +809,135 @@ def theta(self):
        return self._call_java("theta")

@inherit_doc
class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
                                     HasMaxIter, HasTol, HasSeed):
    """
    Classifier trainer based on the Multilayer Perceptron.
    Each layer has sigmoid activation function, output layer has softmax.
    Number of inputs has to be equal to the size of feature vectors.
    Number of outputs has to be equal to the total number of labels.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([
    ...     (0.0, Vectors.dense([0.0, 0.0])),
    ...     (1.0, Vectors.dense([0.0, 1.0])),
    ...     (1.0, Vectors.dense([1.0, 0.0])),
    ...     (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])
    >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11)
    >>> model = mlp.fit(df)
    >>> model.layers
    [2, 5, 2]
    >>> model.weights.size
    27
    >>> testDF = sqlContext.createDataFrame([
    ...     (Vectors.dense([1.0, 0.0]),),
    ...     (Vectors.dense([0.0, 0.0]),)], ["features"])
    >>> model.transform(testDF).show()
    +---------+----------+
    | features|prediction|
    +---------+----------+
    |[1.0,0.0]|       1.0|
    |[0.0,0.0]|       0.0|
    +---------+----------+
    ...
    """

    # a placeholder to make it appear in the generated doc
    layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                   "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
                   "neurons and output layer of 10 neurons, default is [1, 1].")
    blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
                      "matrices. Data is stacked within partitions. If block size is more than " +
                      "remaining data in a partition then it is adjusted to the size of this " +
                      "data. Recommended size is between 10 and 1000, default is 128.")

    @keyword_only
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
        """
        __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
        """
        super(MultilayerPerceptronClassifier, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
        self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
                            "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
                            "100 neurons and output layer of 10 neurons, default is [1, 1].")
        self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
                               "matrices. Data is stacked within partitions. If block size is " +
                               "more than remaining data in a partition then it is adjusted to " +
                               "the size of this data. Recommended size is between 10 and 1000, " +
                               "default is 128.")
        self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
        """
        setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
        Sets params for MultilayerPerceptronClassifier.
        """
        kwargs = self.setParams._input_kwargs
        if layers is None:
            return self._set(**kwargs).setLayers([1, 1])
        else:
            return self._set(**kwargs)

    def _create_model(self, java_model):
        return MultilayerPerceptronClassificationModel(java_model)

    def setLayers(self, value):
        """
        Sets the value of :py:attr:`layers`.
        """
        self._paramMap[self.layers] = value
        return self

    def getLayers(self):
        """
        Gets the value of layers or its default value.
        """
        return self.getOrDefault(self.layers)

    def setBlockSize(self, value):
        """
        Sets the value of :py:attr:`blockSize`.
        """
        self._paramMap[self.blockSize] = value
        return self

    def getBlockSize(self):
        """
        Gets the value of blockSize or its default value.
        """
        return self.getOrDefault(self.blockSize)


class MultilayerPerceptronClassificationModel(JavaModel):
    """
    Model fitted by MultilayerPerceptronClassifier.
    """
Review discussion:
- Here we also need to make …
- Shall we add a package private method to Scala's MPCM that returns a Java list of integers?
- Agree, done.
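The "Agree, done" above refers to a JVM-side helper that the Python model reaches through _call_java("javaLayers") (see the layers property further down in the diff). Below is a minimal Scala sketch of what such a package-private accessor could look like; the class name and constructor are placeholders, only the method name javaLayers comes from the Python call, and the actual signature in Spark's source may differ.

package org.apache.spark.ml.classification

import java.util.{List => JList}

import scala.collection.JavaConverters._

// Hypothetical stand-in for the Scala MultilayerPerceptronClassificationModel;
// the accessor below is the only point of the sketch.
class MlpcModelLayersSketch(val layers: Array[Int]) {

  // Package-private, Java-friendly view of the layer sizes, so the Python wrapper
  // can fetch them via _call_java("javaLayers") and receive a plain Python list.
  private[ml] def javaLayers: JList[Int] = layers.toList.asJava
}

Returning a java.util.List rather than a Scala Array is presumably what lets Py4J hand the sizes back to Python as a plain list, which would explain why the Python property calls "javaLayers" instead of reading the Scala "layers" field directly.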
    @property
    def layers(self):
        """
        array of layer sizes including input and output layers.
        """
        return self._call_java("javaLayers")

    @property
    def weights(self):
        """
        vector of initial weights for the model that consists of the weights of layers.
        """
        return self._call_java("weights")


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
Review comment on L882: we still keep layers=[1, 1] in the doc to tell users the default value.