
Commit 1ff17c2

Make the seed random for HasSeed in python

1 parent bec938f commit 1ff17c2

File tree

5 files changed: +44 -24 lines

python/pyspark/ml/feature.py

Lines changed: 3 additions & 3 deletions

@@ -790,7 +790,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
 
     @keyword_only
     def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
-                 seed=42, inputCol=None, outputCol=None):
+                 seed=None, inputCol=None, outputCol=None):
         """
         __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
                  seed=42, inputCol=None, outputCol=None)
@@ -810,9 +810,9 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
 
     @keyword_only
     def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
-                  seed=42, inputCol=None, outputCol=None):
+                  seed=None, inputCol=None, outputCol=None):
         """
-        setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=42,
+        setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None,
                   inputCol=None, outputCol=None)
         Sets params for this Word2Vec.
         """

python/pyspark/ml/param/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -123,7 +123,7 @@ def isDefined(self, param):
     def getOrDefault(self, param):
         """
         Gets the value of a param in the user-supplied param map or its
-        default value. Raises an error if either is set.
+        default value. Raises an error if neither is set.
         """
         if isinstance(param, Param):
             if param in self.paramMap:
@@ -135,6 +135,7 @@ def getOrDefault(self, param):
         else:
             raise KeyError("Cannot recognize %r as a param." % param)
 
+
     def extractParamMap(self, extraParamMap={}):
         """
         Extracts the embedded default param values and user-supplied
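The corrected docstring matches the lookup order the method implements: the user-supplied value wins, then the declared default, and only when neither exists does it raise. An illustrative sketch using the TestParams helper defined in the tests below:

    tp = TestParams()              # declares a default for maxIter, none for inputCol
    tp.getOrDefault(tp.maxIter)    # -> 10, falls back to the declared default
    tp.setMaxIter(100)
    tp.getOrDefault(tp.maxIter)    # -> 100, the user-supplied value wins
    tp.getOrDefault(tp.inputCol)   # raises KeyError: neither set nor defaulted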

python/pyspark/ml/param/_shared_params_code_gen.py

Lines changed: 7 additions & 4 deletions

@@ -57,8 +57,9 @@ def __init__(self):
         super(Has$Name, self).__init__()
         #: param for $doc
         self.$name = Param(self, "$name", "$doc")
-        if $defaultValueStr is not None:
-            self._setDefault($name=$defaultValueStr)'''
+        x = $defaultValueStr
+        if x is not None:
+            self._setDefault($name=x)'''
 
     Name = name[0].upper() + name[1:]
     return template \
@@ -102,7 +103,9 @@ def get$Name(self):
 if __name__ == "__main__":
     print(header)
     print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n")
-    print("from pyspark.ml.param import Param, Params\n\n")
+    print("from pyspark.ml.param import Param, Params\n")
+    print("import random\n")
+    print("import sys\n\n")
     shared = [
         ("maxIter", "max number of iterations (>= 0)", None),
         ("regParam", "regularization parameter (>= 0)", None),
@@ -115,7 +118,7 @@ def get$Name(self):
         ("outputCol", "output column name", None),
         ("numFeatures", "number of features", None),
         ("checkpointInterval", "checkpoint interval (>= 1)", None),
-        ("seed", "random seed", None),
+        ("seed", "random seed", "random.randint(0, sys.maxsize)"),
         ("tol", "the convergence tolerance for iterative algorithms", None),
         ("stepSize", "Step size to be used for each iteration of optimization.", None)]
     code = []
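Substituting the new default string for the seed entry, the template above should expand to roughly this (a sketch of the expected generator output, not text copied from the commit):

    class HasSeed(Params):
        """
        Mixin for param seed: random seed.
        """

        def __init__(self):
            super(HasSeed, self).__init__()
            #: param for random seed
            self.seed = Param(self, "seed", "random seed")
            # $defaultValueStr is evaluated once here, so the instance keeps
            # a single random default rather than re-rolling on every access.
            x = random.randint(0, sys.maxsize)
            if x is not None:
                self._setDefault(seed=x)

Note that the checked-in shared.py below still shows the default expression inlined at both sites, i.e. it was generated before the `x = $defaultValueStr` tweak to the template.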

python/pyspark/ml/param/shared.py

Lines changed: 8 additions & 9 deletions

@@ -19,6 +19,10 @@
 
 from pyspark.ml.param import Param, Params
 
+import random
+
+import sys
+
 
 class HasMaxIter(Params):
     """
@@ -351,8 +355,8 @@ def __init__(self):
         super(HasSeed, self).__init__()
         #: param for random seed
         self.seed = Param(self, "seed", "random seed")
-        if None is not None:
-            self._setDefault(seed=None)
+        if random.randint(0, sys.maxsize) is not None:
+            self._setDefault(seed=random.randint(0, sys.maxsize))
 
     def setSeed(self, value):
         """
@@ -438,22 +442,17 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
-        #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
         self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
-        #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
         self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
-        #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
         self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
-        #: param for Minimum information gain for a split to be considered at a tree node.
         self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
-        #: param for Maximum memory in MB allocated to histogram aggregation.
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
-        #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.
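As generated, the guard `random.randint(0, sys.maxsize) is not None` is always true, and the stored default comes from the second randint call; either way, each instance ends up with one random default fixed at construction time. A small sanity sketch (a hypothetical check, not from the commit):

    from pyspark.ml.param.shared import HasSeed

    a = HasSeed()
    b = HasSeed()
    # Each instance drew its own default; a collision between two draws over
    # [0, sys.maxsize] is possible but vanishingly unlikely.
    print(a.getSeed(), b.getSeed())
    print(a.getSeed() == a.getSeed())  # True: the default is stable per instance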

python/pyspark/ml/tests.py

Lines changed: 24 additions & 7 deletions

@@ -33,7 +33,7 @@
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
 from pyspark.sql import DataFrame
 from pyspark.ml.param import Param
-from pyspark.ml.param.shared import HasMaxIter, HasInputCol
+from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed
 from pyspark.ml.pipeline import Estimator, Model, Pipeline, Transformer
 
 
@@ -112,15 +112,15 @@ def test_pipeline(self):
         self.assertEqual(6, dataset.index)
 
 
-class TestParams(HasMaxIter, HasInputCol):
+class TestParams(HasMaxIter, HasInputCol, HasSeed):
     """
-    A subclass of Params mixed with HasMaxIter and HasInputCol.
+    A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
     """
 
-    def __init__(self):
+    def __init__(self, seed=None):
         super(TestParams, self).__init__()
         self._setDefault(maxIter=10)
-
+        self._set(seed=seed)
 
 class ParamTests(PySparkTestCase):
 
@@ -135,9 +135,10 @@ def test_params(self):
         testParams = TestParams()
         maxIter = testParams.maxIter
         inputCol = testParams.inputCol
+        seed = testParams.seed
 
         params = testParams.params
-        self.assertEqual(params, [inputCol, maxIter])
+        self.assertEqual(params, [inputCol, maxIter, seed])
 
         self.assertTrue(testParams.hasDefault(maxIter))
         self.assertFalse(testParams.isSet(maxIter))
@@ -153,10 +154,26 @@ def test_params(self):
         with self.assertRaises(KeyError):
             testParams.getInputCol()
 
+        # Since the default is normally random, set it to a known number for debug str
+        testParams._setDefault(seed=41)
+        testParams.setSeed(43)
+
         self.assertEquals(
             testParams.explainParams(),
             "\n".join(["inputCol: input column name (undefined)",
-                       "maxIter: max number of iterations (>= 0) (default: 10, current: 100)"]))
+                       "maxIter: max number of iterations (>= 0) (default: 10, current: 100)",
+                       "seed: random seed (default: 41, current: 43)"]))
+
+    def test_hasseed(self):
+        noSeedSpecd = TestParams()
+        withSeedSpecd = TestParams(seed=42)
+        # Check that we no longer use 42 as the magic number
+        self.assertNotEqual(noSeedSpecd.getSeed(), 42)
+        origSeed = noSeedSpecd.getSeed()
+        # Check that we only compute the seed once
+        self.assertEqual(noSeedSpecd.getSeed(), origSeed)
+        # Check that a specified seed is honored
+        self.assertEqual(withSeedSpecd.getSeed(), 42)
 
 
 if __name__ == "__main__":
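The `_setDefault(seed=41)` call in test_params pins the otherwise-random default so `explainParams()` has a deterministic string to compare against; test_hasseed then checks the new contract directly. The same pinning pattern in isolation (expected output shown as comments, assuming a fresh TestParams):

    tp = TestParams()
    tp._setDefault(seed=41)  # pin the random default to get a stable debug string
    tp.setSeed(43)
    print(tp.explainParams())
    # inputCol: input column name (undefined)
    # maxIter: max number of iterations (>= 0) (default: 10)
    # seed: random seed (default: 41, current: 43)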
