Skip to content

Commit acf14a3

Browse files
holdenk authored and jeanlyn committed
[SPARK-7511] [MLLIB] pyspark ml seed param should be random by default or 42 is quite funny but not very random
Author: Holden Karau <[email protected]> Closes apache#6139 from holdenk/SPARK-7511-pyspark-ml-seed-param-should-be-random-by-default-or-42-is-quite-funny-but-not-very-random and squashes the following commits: 591f8e5 [Holden Karau] specify old seed for doc tests 2470004 [Holden Karau] Fix a bunch of seeds with default values to have None as the default which will then result in using the hash of the class name cbad96d [Holden Karau] Add the setParams function that is used in the real code 423b8d7 [Holden Karau] Switch the test code to behave slightly more like production code. also don't check the param map value only check for key existence 140d25d [Holden Karau] remove extra space 926165a [Holden Karau] Add some missing newlines for pep8 style 8616751 [Holden Karau] merge in master 58532e6 [Holden Karau] its the __name__ method, also treat None values as not set 56ef24a [Holden Karau] fix test and regenerate base afdaa5c [Holden Karau] make sure different classes have different results 68eb528 [Holden Karau] switch default seed to hash of type of self 89c4611 [Holden Karau] Merge branch 'master' into SPARK-7511-pyspark-ml-seed-param-should-be-random-by-default-or-42-is-quite-funny-but-not-very-random 31cd96f [Holden Karau] specify the seed to randomforestregressor test e1b947f [Holden Karau] Style fixes ce90ec8 [Holden Karau] merge in master bcdf3c9 [Holden Karau] update docstring seeds to none and some other default seeds from 42 65eba21 [Holden Karau] pep8 fixes 0e3797e [Holden Karau] Make seed default to random in more places 213a543 [Holden Karau] Simplify the generated code to only include set default if there is a default rather than having None is note None in the generated code 1ff17c2 [Holden Karau] Make the seed random for HasSeed in python
1 parent 5f1ef29 commit acf14a3

File tree

8 files changed

+96
-64
lines changed

8 files changed

+96
-64
lines changed

python/pyspark/ml/classification.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
292292
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
293293
>>> si_model = stringIndexer.fit(df)
294294
>>> td = si_model.transform(df)
295-
>>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed")
295+
>>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
296296
>>> model = rf.fit(td)
297297
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
298298
>>> model.transform(test0).head().prediction
@@ -319,12 +319,12 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
319319
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
320320
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
321321
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
322-
numTrees=20, featureSubsetStrategy="auto", seed=42):
322+
numTrees=20, featureSubsetStrategy="auto", seed=None):
323323
"""
324324
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
325325
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
326326
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
327-
numTrees=20, featureSubsetStrategy="auto", seed=42)
327+
numTrees=20, featureSubsetStrategy="auto", seed=None)
328328
"""
329329
super(RandomForestClassifier, self).__init__()
330330
self._java_obj = self._new_java_obj(
@@ -347,20 +347,20 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
347347
"The number of features to consider for splits at each tree node. Supported " +
348348
"options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
349349
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
350-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
350+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
351351
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
352352
kwargs = self.__init__._input_kwargs
353353
self.setParams(**kwargs)
354354

355355
@keyword_only
356356
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
357357
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
358-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
358+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
359359
impurity="gini", numTrees=20, featureSubsetStrategy="auto"):
360360
"""
361361
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
362362
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
363-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
363+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
364364
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
365365
Sets params for linear classification.
366366
"""

python/pyspark/ml/feature.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -876,10 +876,10 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
876876

877877
@keyword_only
878878
def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
879-
seed=42, inputCol=None, outputCol=None):
879+
seed=None, inputCol=None, outputCol=None):
880880
"""
881881
__init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
882-
seed=42, inputCol=None, outputCol=None)
882+
seed=None, inputCol=None, outputCol=None)
883883
"""
884884
super(Word2Vec, self).__init__()
885885
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
@@ -891,15 +891,15 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
891891
"the minimum number of times a token must appear to be included " +
892892
"in the word2vec model's vocabulary")
893893
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
894-
seed=42)
894+
seed=None)
895895
kwargs = self.__init__._input_kwargs
896896
self.setParams(**kwargs)
897897

898898
@keyword_only
899899
def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
900-
seed=42, inputCol=None, outputCol=None):
900+
seed=None, inputCol=None, outputCol=None):
901901
"""
902-
setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=42, \
902+
setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \
903903
inputCol=None, outputCol=None)
904904
Sets params for this Word2Vec.
905905
"""

python/pyspark/ml/param/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def hasParam(self, paramName):
147147
def getOrDefault(self, param):
148148
"""
149149
Gets the value of a param in the user-supplied param map or its
150-
default value. Raises an error if either is set.
150+
default value. Raises an error if neither is set.
151151
"""
152152
param = self._resolveParam(param)
153153
if param in self._paramMap:

python/pyspark/ml/param/_shared_params_code_gen.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ def _gen_param_header(name, doc, defaultValueStr):
5656
def __init__(self):
5757
super(Has$Name, self).__init__()
5858
#: param for $doc
59-
self.$name = Param(self, "$name", "$doc")
60-
if $defaultValueStr is not None:
61-
self._setDefault($name=$defaultValueStr)'''
59+
self.$name = Param(self, "$name", "$doc")'''
60+
if defaultValueStr is not None:
61+
template += '''
62+
self._setDefault($name=$defaultValueStr)'''
6263

6364
Name = name[0].upper() + name[1:]
6465
return template \
@@ -118,7 +119,7 @@ def get$Name(self):
118119
("outputCol", "output column name", None),
119120
("numFeatures", "number of features", None),
120121
("checkpointInterval", "checkpoint interval (>= 1)", None),
121-
("seed", "random seed", None),
122+
("seed", "random seed", "hash(type(self).__name__)"),
122123
("tol", "the convergence tolerance for iterative algorithms", None),
123124
("stepSize", "Step size to be used for each iteration of optimization.", None)]
124125
code = []

python/pyspark/ml/param/shared.py

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def __init__(self):
3232
super(HasMaxIter, self).__init__()
3333
#: param for max number of iterations (>= 0)
3434
self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0)")
35-
if None is not None:
36-
self._setDefault(maxIter=None)
3735

3836
def setMaxIter(self, value):
3937
"""
@@ -61,8 +59,6 @@ def __init__(self):
6159
super(HasRegParam, self).__init__()
6260
#: param for regularization parameter (>= 0)
6361
self.regParam = Param(self, "regParam", "regularization parameter (>= 0)")
64-
if None is not None:
65-
self._setDefault(regParam=None)
6662

6763
def setRegParam(self, value):
6864
"""
@@ -90,8 +86,7 @@ def __init__(self):
9086
super(HasFeaturesCol, self).__init__()
9187
#: param for features column name
9288
self.featuresCol = Param(self, "featuresCol", "features column name")
93-
if 'features' is not None:
94-
self._setDefault(featuresCol='features')
89+
self._setDefault(featuresCol='features')
9590

9691
def setFeaturesCol(self, value):
9792
"""
@@ -119,8 +114,7 @@ def __init__(self):
119114
super(HasLabelCol, self).__init__()
120115
#: param for label column name
121116
self.labelCol = Param(self, "labelCol", "label column name")
122-
if 'label' is not None:
123-
self._setDefault(labelCol='label')
117+
self._setDefault(labelCol='label')
124118

125119
def setLabelCol(self, value):
126120
"""
@@ -148,8 +142,7 @@ def __init__(self):
148142
super(HasPredictionCol, self).__init__()
149143
#: param for prediction column name
150144
self.predictionCol = Param(self, "predictionCol", "prediction column name")
151-
if 'prediction' is not None:
152-
self._setDefault(predictionCol='prediction')
145+
self._setDefault(predictionCol='prediction')
153146

154147
def setPredictionCol(self, value):
155148
"""
@@ -177,8 +170,7 @@ def __init__(self):
177170
super(HasProbabilityCol, self).__init__()
178171
#: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
179172
self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.")
180-
if 'probability' is not None:
181-
self._setDefault(probabilityCol='probability')
173+
self._setDefault(probabilityCol='probability')
182174

183175
def setProbabilityCol(self, value):
184176
"""
@@ -206,8 +198,7 @@ def __init__(self):
206198
super(HasRawPredictionCol, self).__init__()
207199
#: param for raw prediction (a.k.a. confidence) column name
208200
self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
209-
if 'rawPrediction' is not None:
210-
self._setDefault(rawPredictionCol='rawPrediction')
201+
self._setDefault(rawPredictionCol='rawPrediction')
211202

212203
def setRawPredictionCol(self, value):
213204
"""
@@ -235,8 +226,6 @@ def __init__(self):
235226
super(HasInputCol, self).__init__()
236227
#: param for input column name
237228
self.inputCol = Param(self, "inputCol", "input column name")
238-
if None is not None:
239-
self._setDefault(inputCol=None)
240229

241230
def setInputCol(self, value):
242231
"""
@@ -264,8 +253,6 @@ def __init__(self):
264253
super(HasInputCols, self).__init__()
265254
#: param for input column names
266255
self.inputCols = Param(self, "inputCols", "input column names")
267-
if None is not None:
268-
self._setDefault(inputCols=None)
269256

270257
def setInputCols(self, value):
271258
"""
@@ -293,8 +280,6 @@ def __init__(self):
293280
super(HasOutputCol, self).__init__()
294281
#: param for output column name
295282
self.outputCol = Param(self, "outputCol", "output column name")
296-
if None is not None:
297-
self._setDefault(outputCol=None)
298283

299284
def setOutputCol(self, value):
300285
"""
@@ -322,8 +307,6 @@ def __init__(self):
322307
super(HasNumFeatures, self).__init__()
323308
#: param for number of features
324309
self.numFeatures = Param(self, "numFeatures", "number of features")
325-
if None is not None:
326-
self._setDefault(numFeatures=None)
327310

328311
def setNumFeatures(self, value):
329312
"""
@@ -351,8 +334,6 @@ def __init__(self):
351334
super(HasCheckpointInterval, self).__init__()
352335
#: param for checkpoint interval (>= 1)
353336
self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1)")
354-
if None is not None:
355-
self._setDefault(checkpointInterval=None)
356337

357338
def setCheckpointInterval(self, value):
358339
"""
@@ -380,8 +361,7 @@ def __init__(self):
380361
super(HasSeed, self).__init__()
381362
#: param for random seed
382363
self.seed = Param(self, "seed", "random seed")
383-
if None is not None:
384-
self._setDefault(seed=None)
364+
self._setDefault(seed=hash(type(self).__name__))
385365

386366
def setSeed(self, value):
387367
"""
@@ -409,8 +389,6 @@ def __init__(self):
409389
super(HasTol, self).__init__()
410390
#: param for the convergence tolerance for iterative algorithms
411391
self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms")
412-
if None is not None:
413-
self._setDefault(tol=None)
414392

415393
def setTol(self, value):
416394
"""
@@ -438,8 +416,6 @@ def __init__(self):
438416
super(HasStepSize, self).__init__()
439417
#: param for Step size to be used for each iteration of optimization.
440418
self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.")
441-
if None is not None:
442-
self._setDefault(stepSize=None)
443419

444420
def setStepSize(self, value):
445421
"""
@@ -467,6 +443,7 @@ class DecisionTreeParams(Params):
467443
minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
468444
maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
469445
cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
446+
470447

471448
def __init__(self):
472449
super(DecisionTreeParams, self).__init__()

python/pyspark/ml/recommendation.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,11 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
8989

9090
@keyword_only
9191
def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
92-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
92+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
9393
ratingCol="rating", nonnegative=False, checkpointInterval=10):
9494
"""
9595
__init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
96-
implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=0, \
96+
implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=None, \
9797
ratingCol="rating", nonnegative=false, checkpointInterval=10)
9898
"""
9999
super(ALS, self).__init__()
@@ -109,18 +109,18 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
109109
self.nonnegative = Param(self, "nonnegative",
110110
"whether to use nonnegative constraint for least squares")
111111
self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
112-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
112+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
113113
ratingCol="rating", nonnegative=False, checkpointInterval=10)
114114
kwargs = self.__init__._input_kwargs
115115
self.setParams(**kwargs)
116116

117117
@keyword_only
118118
def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
119-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0,
119+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
120120
ratingCol="rating", nonnegative=False, checkpointInterval=10):
121121
"""
122122
setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \
123-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=0, \
123+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, \
124124
ratingCol="rating", nonnegative=False, checkpointInterval=10)
125125
Sets params for ALS.
126126
"""

python/pyspark/ml/regression.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
257257
>>> df = sqlContext.createDataFrame([
258258
... (1.0, Vectors.dense(1.0)),
259259
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
260-
>>> rf = RandomForestRegressor(numTrees=2, maxDepth=2)
260+
>>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
261261
>>> model = rf.fit(df)
262262
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
263263
>>> model.transform(test0).head().prediction
@@ -284,12 +284,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
284284
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
285285
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
286286
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
287-
numTrees=20, featureSubsetStrategy="auto", seed=42):
287+
numTrees=20, featureSubsetStrategy="auto", seed=None):
288288
"""
289289
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
290290
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
291291
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
292-
impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=42)
292+
impurity="variance", numTrees=20, \
293+
featureSubsetStrategy="auto", seed=None)
293294
"""
294295
super(RandomForestRegressor, self).__init__()
295296
self._java_obj = self._new_java_obj(
@@ -312,20 +313,20 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
312313
"The number of features to consider for splits at each tree node. Supported " +
313314
"options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
314315
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
315-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
316+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
316317
impurity="variance", numTrees=20, featureSubsetStrategy="auto")
317318
kwargs = self.__init__._input_kwargs
318319
self.setParams(**kwargs)
319320

320321
@keyword_only
321322
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
322323
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
323-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
324+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
324325
impurity="variance", numTrees=20, featureSubsetStrategy="auto"):
325326
"""
326327
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
327328
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
328-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
329+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
329330
impurity="variance", numTrees=20, featureSubsetStrategy="auto")
330331
Sets params for linear regression.
331332
"""

0 commit comments

Comments
 (0)