add unit test for pipeline

mengxr · mengxr · commit 9fd493310973 · 2014-11-10T02:22:49.000-08:00
diff --git a/mllib/pom.xml b/mllib/pom.xml
@@ -101,6 +101,12 @@
       <scope>test</scope>
     </dependency>
     <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-all</artifactId>
+      <version>1.9.0</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.api.java.JavaSchemaRDD
 /**
  * Abstract class for estimators that fit models to data.
  */
-abstract class Estimator[M <: Model] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
 
   /**
    * Fits a single model to the input data with optional parameters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
@@ -22,11 +22,11 @@ import org.apache.spark.ml.param.ParamMap
 /**
  * A fitted model.
  */
-abstract class Model extends Transformer {
+abstract class Model[M <: Model[M]] extends Transformer {
   /**
    * The parent estimator that produced this model.
    */
-  val parent: Estimator[_]
+  val parent: Estimator[M]
 
   /**
    * Fitting parameters, such that parent.fit(..., trainingParamMap) could reproduce the model.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -105,15 +105,15 @@ class Pipeline extends Estimator[PipelineModel] {
 class PipelineModel(
     override val parent: Pipeline,
     override val fittingParamMap: ParamMap,
-    val transformers: Array[Transformer]) extends Model with Logging {
+    val transformers: Array[Transformer]) extends Model[PipelineModel] with Logging {
 
   /**
    * Gets the model produced by the input estimator. Throws an NoSuchElementException is the input
    * estimator does not exist in the pipeline.
    */
-  def getModel[M <: Model](estimator: Estimator[M]): M = {
+  def getModel[M <: Model[M]](estimator: Estimator[M]): M = {
     val matched = transformers.filter {
-      case m: Model => m.parent.eq(estimator)
+      case m: Model[_] => m.parent.eq(estimator)
       case _ => false
     }
     if (matched.isEmpty) {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -71,11 +71,11 @@ abstract class Transformer extends PipelineStage with Params {
  * Abstract class for transformers that take one input column, apply transformation, and output the
  * result as a new column.
  */
-abstract class UnaryTransformer[IN, OUT: TypeTag, SELF <: UnaryTransformer[IN, OUT, SELF]]
+abstract class UnaryTransformer[IN, OUT: TypeTag, T <: UnaryTransformer[IN, OUT, T]]
     extends Transformer with HasInputCol with HasOutputCol with Logging {
 
-  def setInputCol(value: String): SELF = { set(inputCol, value); this.asInstanceOf[SELF] }
-  def setOutputCol(value: String): SELF = { set(outputCol, value); this.asInstanceOf[SELF] }
+  def setInputCol(value: String): T = { set(inputCol, value); this.asInstanceOf[T] }
+  def setOutputCol(value: String): T = { set(outputCol, value); this.asInstanceOf[T] }
 
   /**
    * Creates the transform function using the given param map. The input param map already takes
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -96,7 +96,7 @@ class LogisticRegression extends Estimator[LogisticRegressionModel] with Logisti
 class LogisticRegressionModel private[ml] (
     override val parent: LogisticRegression,
     override val fittingParamMap: ParamMap,
-    val weights: Vector) extends Model with LogisticRegressionParams {
+    val weights: Vector) extends Model[LogisticRegressionModel] with LogisticRegressionParams {
 
   def setThreshold(value: Double): this.type = { set(threshold, value); this }
   def setFeaturesCol(value: String): this.type = { set(featuresCol, value); this }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -71,7 +71,8 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP
 class StandardScalerModel private[ml] (
     override val parent: StandardScaler,
     override val fittingParamMap: ParamMap,
-    scaler: feature.StandardScalerModel) extends Model with StandardScalerParams {
+    scaler: feature.StandardScalerModel) extends Model[StandardScalerModel]
+  with StandardScalerParams {
 
   def setInputCol(value: String): this.type = { set(inputCol, value); this }
   def setOutputCol(value: String): this.type = { set(outputCol, value); this }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -79,7 +79,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
       val validationDataset = sqlCtx.applySchema(validation, schema).cache()
       // multi-model training
       logDebug(s"Train split $splitIndex with multiple sets of parameters.")
-      val models = est.fit(trainingDataset, epm).asInstanceOf[Seq[Model]]
+      val models = est.fit(trainingDataset, epm).asInstanceOf[Seq[Model[_]]]
       var i = 0
       while (i < numModels) {
         val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)), map)
@@ -93,7 +93,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
     val (bestMetric, bestIndex) = metrics.zipWithIndex.maxBy(_._1)
     logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
     logInfo(s"Best cross-validation metric: $bestMetric.")
-    val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model]
+    val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
     val cvModel = new CrossValidatorModel(this, map, bestModel)
     Params.copyValues(this, cvModel)
     cvModel
@@ -111,7 +111,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
 class CrossValidatorModel private[ml] (
     override val parent: CrossValidator,
     override val fittingParamMap: ParamMap,
-    val bestModel: Model) extends Model with CrossValidatorParams {
+    val bestModel: Model[_]) extends Model[CrossValidatorModel] with CrossValidatorParams {
 
   override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
     bestModel.transform(dataset, paramMap)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml
+
+import org.mockito.Matchers.{any, eq => meq}
+import org.mockito.Mockito.when
+import org.scalatest.FunSuite
+import org.scalatest.mock.MockitoSugar.mock
+
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.sql.SchemaRDD
+
+class PipelineSuite extends FunSuite { // Pipeline fit/getModel/transform over mocked stages
+
+  abstract class MyModel extends Model[MyModel] // self-typed Model used as the mock type below
+
+  test("pipeline") {
+    val estimator0 = mock[Estimator[MyModel]]
+    val model0 = mock[MyModel] // stubbed below as the result of estimator0.fit
+    val transformer1 = mock[Transformer]
+    val estimator2 = mock[Estimator[MyModel]]
+    val model2 = mock[MyModel] // stubbed below as the result of estimator2.fit
+    val transformer3 = mock[Transformer]
+    val dataset0 = mock[SchemaRDD] // input handed to pipeline.fit and pipelineModel.transform
+    val dataset1 = mock[SchemaRDD] // stubbed output of model0.transform
+    val dataset2 = mock[SchemaRDD] // stubbed output of transformer1.transform
+    val dataset3 = mock[SchemaRDD] // stubbed output of model2.transform
+    val dataset4 = mock[SchemaRDD] // stubbed output of transformer3.transform
+
+    when(estimator0.fit(meq(dataset0), any[ParamMap]())).thenReturn(model0)
+    when(model0.transform(meq(dataset0), any[ParamMap]())).thenReturn(dataset1)
+    when(model0.parent).thenReturn(estimator0) // needed by the getModel(estimator0) check below
+    when(transformer1.transform(meq(dataset1), any[ParamMap])).thenReturn(dataset2)
+    when(estimator2.fit(meq(dataset2), any[ParamMap]())).thenReturn(model2)
+    when(model2.transform(meq(dataset2), any[ParamMap]())).thenReturn(dataset3)
+    when(model2.parent).thenReturn(estimator2) // needed by the getModel(estimator2) check below
+    when(transformer3.transform(meq(dataset3), any[ParamMap]())).thenReturn(dataset4)
+
+    val pipeline = new Pipeline() // stage order: estimator, transformer, estimator, transformer
+      .setStages(Array(estimator0, transformer1, estimator2, transformer3))
+    val pipelineModel = pipeline.fit(dataset0)
+
+    assert(pipelineModel.transformers(0).eq(model0)) // estimator slot holds its stubbed model
+    assert(pipelineModel.transformers(1).eq(transformer1)) // transformer kept by reference
+    assert(pipelineModel.transformers(2).eq(model2))
+    assert(pipelineModel.transformers(3).eq(transformer3))
+
+    assert(pipelineModel.getModel(estimator0).eq(model0))
+    assert(pipelineModel.getModel(estimator2).eq(model2))
+    intercept[NoSuchElementException] { // a fresh mock estimator is not a stage of this pipeline
+      pipelineModel.getModel(mock[Estimator[MyModel]])
+    }
+    val output = pipelineModel.transform(dataset0) // expected to follow the stubbed chain
+    assert(output.eq(dataset4))
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -70,34 +70,4 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
       .select('label, 'probability, 'prediction)
       .foreach(println)
   }
-
-  test("logistic regression with cross validation") {
-    val lr = new LogisticRegression
-    val lrParamMaps = new ParamGridBuilder()
-      .addGrid(lr.regParam, Array(0.1, 100.0))
-      .addGrid(lr.maxIter, Array(0, 5))
-      .build()
-    val eval = new BinaryClassificationEvaluator
-    val cv = new CrossValidator()
-      .setEstimator(lr)
-      .setEstimatorParamMaps(lrParamMaps)
-      .setEvaluator(eval)
-      .setNumFolds(3)
-    val bestModel = cv.fit(dataset)
-  }
-
-  test("logistic regression with pipeline") {
-    val scaler = new StandardScaler()
-      .setInputCol("features")
-      .setOutputCol("scaledFeatures")
-    val lr = new LogisticRegression()
-      .setFeaturesCol("scaledFeatures")
-    val pipeline = new Pipeline()
-      .setStages(Array(scaler, lr))
-    val model = pipeline.fit(dataset)
-    val predictions = model.transform(dataset)
-      .select('label, 'score, 'prediction)
-      .collect()
-      .foreach(println)
-  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tuning
+
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
+import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.test.TestSQLContext._
+
+class CrossValidatorSuite extends FunSuite with BeforeAndAfterAll { // CrossValidator over LR
+
+  var dataset: SchemaRDD = _ // assigned in beforeAll
+
+  override def beforeAll(): Unit = { // assumes TestSQLContext._ implicitly converts the RDD
+    super.beforeAll()
+    dataset = sparkContext.makeRDD(generateLogisticInput(1.0, 1.0, 1000, 42), 2)
+  }
+
+  test("cross validation with logistic regression") {
+    val lr = new LogisticRegression
+    val lrParamMaps = new ParamGridBuilder() // two candidate values each for regParam and maxIter
+      .addGrid(lr.regParam, Array(0.1, 100.0))
+      .addGrid(lr.maxIter, Array(2, 10))
+      .build()
+    val eval = new BinaryClassificationEvaluator
+    val cv = new CrossValidator()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(lrParamMaps)
+      .setEvaluator(eval)
+      .setNumFolds(3)
+    val cvModel = cv.fit(dataset)
+    val bestParamMap = cvModel.bestModel.fittingParamMap // params the best model was fit with
+    assert(bestParamMap(lr.regParam) === 0.1) // expects the smaller regParam to be selected
+    assert(bestParamMap(lr.maxIter) === 10) // expects the larger maxIter to be selected
+  }
+}