[SPARK-7511][MLLIB] PySpark ML seed param should be random by default (or: 42 is quite funny but not very random) #6139

Status: Closed · 20 commits · Changes from all commits
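In short: PySpark's ML wrappers previously hard-coded `seed=42` (ALS used `seed=0`). This PR changes the user-facing defaults to `None` and has the shared `HasSeed` mixin install `hash(type(self).__name__)` as the default, so each class gets its own stable default seed instead of one shared magic constant. Below is a self-contained sketch of that idea in plain Python — it is not pyspark's actual `Params` machinery. Note that on Python 2 (the target at the time) `hash()` of a string is stable across runs, while Python 3's hash randomization makes it stable only within one interpreter session.

```python
# Self-contained sketch of the new default-seed behaviour; plain Python,
# not pyspark's actual Params machinery.
class HasSeed(object):
    def __init__(self):
        # Default derived from the class name: stable per class, but
        # different for different classes (no shared magic 42).
        self._defaults = {"seed": hash(type(self).__name__)}
        self._values = {}

    def setSeed(self, value):
        self._values["seed"] = value
        return self

    def getSeed(self):
        # An explicitly set value wins; otherwise fall back to the default.
        return self._values.get("seed", self._defaults["seed"])


class Word2Vec(HasSeed):
    pass


class RandomForestClassifier(HasSeed):
    pass


print(Word2Vec().getSeed() == hash("Word2Vec"))                    # True
print(Word2Vec().getSeed() == RandomForestClassifier().getSeed())  # False (with overwhelming likelihood)
print(RandomForestClassifier().setSeed(42).getSeed())              # 42
```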
12 changes: 6 additions & 6 deletions python/pyspark/ml/classification.py
@@ -292,7 +292,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
>>> si_model = stringIndexer.fit(df)
>>> td = si_model.transform(df)
- >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed")
+ >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
>>> model = rf.fit(td)
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
@@ -319,12 +319,12 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
- numTrees=20, featureSubsetStrategy="auto", seed=42):
+ numTrees=20, featureSubsetStrategy="auto", seed=None):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
- numTrees=20, featureSubsetStrategy="auto", seed=42)
+ numTrees=20, featureSubsetStrategy="auto", seed=None)
"""
super(RandomForestClassifier, self).__init__()
self._java_obj = self._new_java_obj(
@@ -347,20 +347,20 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
"The number of features to consider for splits at each tree node. Supported " +
"options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
impurity="gini", numTrees=20, featureSubsetStrategy="auto"):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
Sets params for linear classification.
"""
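Because the constructor default is no longer a fixed 42, the doctest above now pins `seed=42` explicitly so its predictions stay stable across runs. A hedged usage sketch, assuming a live SparkContext and the `td` DataFrame built in the doctest:

```python
# Hedged usage sketch; assumes a live SparkContext and the `td` DataFrame
# from the doctest above.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
model_a = rf.fit(td)
model_b = rf.fit(td)
# Same data and same explicit seed => identical trees and identical
# predictions, which is what keeps the doctest output deterministic.
```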
10 changes: 5 additions & 5 deletions python/pyspark/ml/feature.py
@@ -876,10 +876,10 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has

@keyword_only
def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
- seed=42, inputCol=None, outputCol=None):
+ seed=None, inputCol=None, outputCol=None):
"""
__init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
- seed=42, inputCol=None, outputCol=None)
+ seed=None, inputCol=None, outputCol=None)
"""
super(Word2Vec, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
@@ -891,15 +891,15 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
"the minimum number of times a token must appear to be included " +
"in the word2vec model's vocabulary")
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
- seed=42)
+ seed=None)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
- seed=42, inputCol=None, outputCol=None):
+ seed=None, inputCol=None, outputCol=None):
"""
- setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=42, \
+ setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \
inputCol=None, outputCol=None)
Sets params for this Word2Vec.
"""
2 changes: 1 addition & 1 deletion python/pyspark/ml/param/__init__.py
@@ -147,7 +147,7 @@ def hasParam(self, paramName):
def getOrDefault(self, param):
"""
Gets the value of a param in the user-supplied param map or its
- default value. Raises an error if either is set.
+ default value. Raises an error if neither is set.
"""
param = self._resolveParam(param)
if param in self._paramMap:
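The docstring fix above corrects an inverted condition: `getOrDefault` raises only when *neither* an explicit value nor a default exists. A self-contained sketch of that contract follows; the real method lives on `Params`, and `_defaultParamMap` is the name `_setDefault` suggests, used here as an assumption rather than a quote of the file.

```python
# Sketch of getOrDefault's contract, not pyspark's exact implementation.
class ParamsSketch(object):
    def __init__(self):
        self._paramMap = {}          # values the user set explicitly
        self._defaultParamMap = {}   # defaults installed via _setDefault

    def getOrDefault(self, param):
        if param in self._paramMap:          # explicit value wins
            return self._paramMap[param]
        if param in self._defaultParamMap:   # otherwise fall back to default
            return self._defaultParamMap[param]
        raise KeyError("%r is not set and has no default" % param)


p = ParamsSketch()
p._defaultParamMap["seed"] = hash("Word2Vec")
print(p.getOrDefault("seed") == hash("Word2Vec"))  # True: default applies
p._paramMap["seed"] = 42
print(p.getOrDefault("seed"))                      # 42: explicit value wins
```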
9 changes: 5 additions & 4 deletions python/pyspark/ml/param/_shared_params_code_gen.py
@@ -56,9 +56,10 @@ def _gen_param_header(name, doc, defaultValueStr):
def __init__(self):
super(Has$Name, self).__init__()
#: param for $doc
-         self.$name = Param(self, "$name", "$doc")
-         if $defaultValueStr is not None:
-             self._setDefault($name=$defaultValueStr)'''
+         self.$name = Param(self, "$name", "$doc")'''
+     if defaultValueStr is not None:
+         template += '''
+         self._setDefault($name=$defaultValueStr)'''

Name = name[0].upper() + name[1:]
return template \
@@ -118,7 +119,7 @@ def get$Name(self):
("outputCol", "output column name", None),
("numFeatures", "number of features", None),
("checkpointInterval", "checkpoint interval (>= 1)", None),
("seed", "random seed", None),
("seed", "random seed", "hash(type(self).__name__)"),
("tol", "the convergence tolerance for iterative algorithms", None),
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = []
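With this change the `_setDefault` line is appended to the template only when the shared param actually has a default, which is what removes all the constant `if None is not None:` guards from the generated `shared.py` below. It also lets the default be a code snippet such as `hash(type(self).__name__)`, evaluated when the generated `__init__` runs. A simplified, runnable sketch of the substitution — the real generator in `_shared_params_code_gen.py` emits more (docstrings, getters, setters):

```python
# Simplified sketch of what _gen_param_header now does.
template = '''class Has$Name(Params):
    def __init__(self):
        super(Has$Name, self).__init__()
        self.$name = Param(self, "$name", "$doc")'''

name, doc, defaultValueStr = "seed", "random seed", "hash(type(self).__name__)"
if defaultValueStr is not None:
    # Appended only when a default exists, so params without one no longer
    # generate a dead "if None is not None:" branch.
    template += '''
        self._setDefault($name=$defaultValueStr)'''

Name = name[0].upper() + name[1:]
print(template.replace("$name", name).replace("$Name", Name)
      .replace("$doc", doc).replace("$defaultValueStr", defaultValueStr))
```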
37 changes: 7 additions & 30 deletions python/pyspark/ml/param/shared.py
@@ -32,8 +32,6 @@ def __init__(self):
super(HasMaxIter, self).__init__()
#: param for max number of iterations (>= 0)
self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0)")
- if None is not None:
-     self._setDefault(maxIter=None)

def setMaxIter(self, value):
"""
@@ -61,8 +59,6 @@ def __init__(self):
super(HasRegParam, self).__init__()
#: param for regularization parameter (>= 0)
self.regParam = Param(self, "regParam", "regularization parameter (>= 0)")
- if None is not None:
-     self._setDefault(regParam=None)

def setRegParam(self, value):
"""
@@ -90,8 +86,7 @@ def __init__(self):
super(HasFeaturesCol, self).__init__()
#: param for features column name
self.featuresCol = Param(self, "featuresCol", "features column name")
- if 'features' is not None:
-     self._setDefault(featuresCol='features')
+ self._setDefault(featuresCol='features')

def setFeaturesCol(self, value):
"""
@@ -119,8 +114,7 @@ def __init__(self):
super(HasLabelCol, self).__init__()
#: param for label column name
self.labelCol = Param(self, "labelCol", "label column name")
- if 'label' is not None:
-     self._setDefault(labelCol='label')
+ self._setDefault(labelCol='label')

def setLabelCol(self, value):
"""
@@ -148,8 +142,7 @@ def __init__(self):
super(HasPredictionCol, self).__init__()
#: param for prediction column name
self.predictionCol = Param(self, "predictionCol", "prediction column name")
- if 'prediction' is not None:
-     self._setDefault(predictionCol='prediction')
+ self._setDefault(predictionCol='prediction')

def setPredictionCol(self, value):
"""
@@ -177,8 +170,7 @@ def __init__(self):
super(HasProbabilityCol, self).__init__()
#: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.")
- if 'probability' is not None:
-     self._setDefault(probabilityCol='probability')
+ self._setDefault(probabilityCol='probability')

def setProbabilityCol(self, value):
"""
@@ -206,8 +198,7 @@ def __init__(self):
super(HasRawPredictionCol, self).__init__()
#: param for raw prediction (a.k.a. confidence) column name
self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
- if 'rawPrediction' is not None:
-     self._setDefault(rawPredictionCol='rawPrediction')
+ self._setDefault(rawPredictionCol='rawPrediction')

def setRawPredictionCol(self, value):
"""
@@ -235,8 +226,6 @@ def __init__(self):
super(HasInputCol, self).__init__()
#: param for input column name
self.inputCol = Param(self, "inputCol", "input column name")
- if None is not None:
-     self._setDefault(inputCol=None)

def setInputCol(self, value):
"""
@@ -264,8 +253,6 @@ def __init__(self):
super(HasInputCols, self).__init__()
#: param for input column names
self.inputCols = Param(self, "inputCols", "input column names")
- if None is not None:
-     self._setDefault(inputCols=None)

def setInputCols(self, value):
"""
@@ -293,8 +280,6 @@ def __init__(self):
super(HasOutputCol, self).__init__()
#: param for output column name
self.outputCol = Param(self, "outputCol", "output column name")
- if None is not None:
-     self._setDefault(outputCol=None)

def setOutputCol(self, value):
"""
@@ -322,8 +307,6 @@ def __init__(self):
super(HasNumFeatures, self).__init__()
#: param for number of features
self.numFeatures = Param(self, "numFeatures", "number of features")
- if None is not None:
-     self._setDefault(numFeatures=None)

def setNumFeatures(self, value):
"""
@@ -351,8 +334,6 @@ def __init__(self):
super(HasCheckpointInterval, self).__init__()
#: param for checkpoint interval (>= 1)
self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1)")
- if None is not None:
-     self._setDefault(checkpointInterval=None)

def setCheckpointInterval(self, value):
"""
@@ -380,8 +361,7 @@ def __init__(self):
super(HasSeed, self).__init__()
#: param for random seed
self.seed = Param(self, "seed", "random seed")
- if None is not None:
-     self._setDefault(seed=None)
+ self._setDefault(seed=hash(type(self).__name__))

def setSeed(self, value):
"""
@@ -409,8 +389,6 @@ def __init__(self):
super(HasTol, self).__init__()
#: param for the convergence tolerance for iterative algorithms
self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms")
- if None is not None:
-     self._setDefault(tol=None)

def setTol(self, value):
"""
@@ -438,8 +416,6 @@ def __init__(self):
super(HasStepSize, self).__init__()
#: param for Step size to be used for each iteration of optimization.
self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.")
- if None is not None:
-     self._setDefault(stepSize=None)

def setStepSize(self, value):
"""
@@ -467,6 +443,7 @@ class DecisionTreeParams(Params):
minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")


def __init__(self):
super(DecisionTreeParams, self).__init__()
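All of the deletions in this file are behavior-preserving: the generated guards compared literals against `None`, so each branch was either dead code or an unconditional wrapper. A two-line check:

```python
# Why the deletions above are safe: the generated guards were constants.
none_default = None
print(none_default is not None)   # False -> the guarded _setDefault never ran
str_default = 'features'
print(str_default is not None)    # True  -> the guard always passed (a no-op)
```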
10 changes: 5 additions & 5 deletions python/pyspark/ml/recommendation.py
@@ -89,11 +89,11 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha

@keyword_only
def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
- implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+ implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
ratingCol="rating", nonnegative=False, checkpointInterval=10):
"""
__init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
- implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=0, \
+ implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=None, \
ratingCol="rating", nonnegative=false, checkpointInterval=10)
"""
super(ALS, self).__init__()
@@ -109,18 +109,18 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
self.nonnegative = Param(self, "nonnegative",
"whether to use nonnegative constraint for least squares")
self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
- implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+ implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
ratingCol="rating", nonnegative=False, checkpointInterval=10)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
- implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
+ implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
ratingCol="rating", nonnegative=False, checkpointInterval=10):
"""
setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
- implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0, \
+ implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, \
ratingCol="rating", nonnegative=False, checkpointInterval=10)
Sets params for ALS.
"""
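Note that ALS is the one estimator here whose old default was `0` rather than `42`; it now follows the same `seed=None` convention as the rest. A hedged usage sketch — `ratings` is an assumed DataFrame with user/item/rating columns, and a live SparkContext is required:

```python
# Hedged usage sketch; `ratings` is an assumed DataFrame, not defined here.
from pyspark.ml.recommendation import ALS

# Pass a seed explicitly when the factorization must be reproducible,
# since the default is no longer a silent 0.
als = ALS(rank=10, maxIter=10, regParam=0.1, seed=12345)
# model = als.fit(ratings)
```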
13 changes: 7 additions & 6 deletions python/pyspark/ml/regression.py
@@ -257,7 +257,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
>>> df = sqlContext.createDataFrame([
... (1.0, Vectors.dense(1.0)),
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
- >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2)
+ >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
>>> model = rf.fit(df)
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
@@ -284,12 +284,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
- numTrees=20, featureSubsetStrategy="auto", seed=42):
+ numTrees=20, featureSubsetStrategy="auto", seed=None):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=42)
impurity="variance", numTrees=20, \
featureSubsetStrategy="auto", seed=None)
"""
super(RandomForestRegressor, self).__init__()
self._java_obj = self._new_java_obj(
@@ -312,20 +313,20 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
"The number of features to consider for splits at each tree node. Supported " +
"options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
impurity="variance", numTrees=20, featureSubsetStrategy="auto")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
impurity="variance", numTrees=20, featureSubsetStrategy="auto"):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
impurity="variance", numTrees=20, featureSubsetStrategy="auto")
Sets params for linear regression.
"""