[SPARK-7678] [ml] Fix default random seed in HasSeed #6251

Closed · wants to merge 2 commits

@@ -68,7 +68,6 @@ private[feature] trait Word2VecBase extends Params

setDefault(stepSize -> 0.025)
setDefault(maxIter -> 1)
- setDefault(seed -> 42L)

/**
* Validate and transform the input schema.

@@ -53,7 +53,7 @@ private[shared] object SharedParamsCodeGen {
ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1)",
isValid = "ParamValidators.gtEq(1)"),
ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")),
- ParamDesc[Long]("seed", "random seed", Some("Utils.random.nextLong()")),
+ ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")),
ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." +
" For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
isValid = "ParamValidators.inRange(0, 1)"),

@@ -232,7 +232,7 @@ private[ml] trait HasFitIntercept extends Params {
}

/**
- * (private[ml]) Trait for shared param seed (default: Utils.random.nextLong()).
+ * (private[ml]) Trait for shared param seed (default: this.getClass.getName.hashCode.toLong).
*/
private[ml] trait HasSeed extends Params {

@@ -242,7 +242,7 @@ private[ml] trait HasSeed extends Params {
*/
final val seed: LongParam = new LongParam(this, "seed", "random seed")

- setDefault(seed, Utils.random.nextLong())
+ setDefault(seed, this.getClass.getName.hashCode.toLong)

/** @group getParam */
final def getSeed: Long = $(seed)
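
As a side note (not part of the diff): a minimal, plain-Scala sketch of the idea behind the new default. Because the expression uses `this.getClass`, every concrete class that mixes in the trait gets its own default seed, and since `String.hashCode` is fully specified, that seed is stable across runs and JVMs. The trait and class names below are hypothetical stand-ins, not the real Spark types.

```scala
// Minimal sketch (plain Scala, no Spark): a per-class deterministic default,
// unlike the previous Utils.random.nextLong(), which changed on every run.
trait HasSeedSketch {
  // Mirrors the new default expression from the diff above.
  val defaultSeed: Long = this.getClass.getName.hashCode.toLong
}

// Hypothetical stand-ins, not the real Spark estimators.
class Word2VecLike extends HasSeedSketch
class ALSLike extends HasSeedSketch

object DefaultSeedDemo extends App {
  println(new Word2VecLike().defaultSeed) // same value on every run
  println(new ALSLike().defaultSeed)      // different class, different seed
}
```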

@@ -148,7 +148,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR

setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
- ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10, seed -> 0L)
+ ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10)

/**
* Validates and transforms the input schema.

@@ -52,6 +52,7 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
.setVectorSize(3)
.setInputCol("text")
.setOutputCol("result")
+ .setSeed(42L)
.fit(docDF)

model.transform(docDF).select("result", "expected").collect().foreach {
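
A short aside on why the suite now pins the seed: with the hard-coded default of 42L removed from Word2VecBase, a test that compares results against precomputed vectors must set the seed itself. A usage sketch, assuming a DataFrame `docDF` whose "text" column holds tokenized documents (as in the suite above); `docDF` itself is not constructed here.

```scala
import org.apache.spark.ml.feature.Word2Vec

// Sketch only: `docDF` stands in for the suite's document DataFrame.
// Pinning the seed keeps the learned vectors identical from run to run,
// regardless of the per-class default seed.
val model = new Word2Vec()
  .setVectorSize(3)
  .setInputCol("text")
  .setOutputCol("result")
  .setSeed(42L)
  .fit(docDF)
```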

@@ -345,6 +345,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
.setImplicitPrefs(implicitPrefs)
.setNumUserBlocks(numUserBlocks)
.setNumItemBlocks(numItemBlocks)
+ .setSeed(0)
val alpha = als.getAlpha
val model = als.fit(training.toDF())
val predictions = model.transform(test.toDF())

@@ -425,17 +426,18 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)

val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating))
- val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4)
+ val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0)
assert(longUserFactors.first()._1.getClass === classOf[Long])

val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating))
- val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4)
+ val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0)
assert(strUserFactors.first()._1.getClass === classOf[String])
}

test("nonnegative constraint") {
val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
- val (userFactors, itemFactors) = ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true)
+ val (userFactors, itemFactors) =
+   ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0)
def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = {
factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _)
}

@@ -459,7 +461,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
test("partitioner in returned factors") {
val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
val (userFactors, itemFactors) = ALS.train(
- ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4)
+ ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0)
for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) {
assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.")
val part = userFactors.partitioner.get

@@ -476,8 +478,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {

test("als with large number of iterations") {
val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
- ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2)
- ALS.train(
-   ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, implicitPrefs = true)
+ ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0)
+ ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2,
+   implicitPrefs = true, seed = 0)
}
}
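
Finally, a hedged sketch of the same idea through the public ALS estimator rather than the suite's ALS.train helper. The SparkSession setup and the tiny ratings set are assumptions for illustration only; the column names match the ALSParams defaults shown earlier.

```scala
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.SparkSession

// Assumed local session; present-day SparkSession API used for brevity.
val spark = SparkSession.builder().master("local[2]").appName("als-seed").getOrCreate()
import spark.implicits._

// Tiny made-up ratings set; "user", "item", "rating" are the ALSParams defaults.
val ratingsDF = Seq((0, 0, 4.0f), (0, 1, 2.0f), (1, 1, 3.0f), (1, 0, 1.0f))
  .toDF("user", "item", "rating")

val model = new ALS()
  .setRank(2)
  .setMaxIter(4)
  .setSeed(0L) // explicit seed, so the factors are reproducible
  .fit(ratingsDF)

model.userFactors.show()
```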