Skip to content

Commit 0d14bd3

Browse files
SPARK-3278: changed the Java API to match the Scala API's (Double, Double, Double)
1 parent 3c2954b commit 0d14bd3

File tree

4 files changed

+25
-25
lines changed

4 files changed

+25
-25
lines changed

mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ class PoolAdjacentViolators private [mllib]
9898

9999
override def run(
100100
input: RDD[(Double, Double, Double)],
101-
isotonic: Boolean): IsotonicRegressionModel = {
101+
isotonic: Boolean = true): IsotonicRegressionModel = {
102102
createModel(
103103
parallelPoolAdjacentViolators(input, isotonic),
104104
isotonic)
@@ -217,18 +217,20 @@ object IsotonicRegression {
217217
}
218218

219219
/**
220-
* Train a monotone regression model given an RDD of (label, feature).
220+
* Train a monotone regression model given an RDD of (label, feature, weight).
221221
* Label is the dependent y value
222-
* Weight defaults to 1
222+
* Weight of the data point is the number of measurements. Default is 1
223223
*
224-
* @param input RDD of (label, feature).
224+
* @param input RDD of (label, feature, weight).
225225
* @param isotonic isotonic (increasing) or antitonic (decreasing) sequence
226226
* @return
227227
*/
228228
def train(
229-
input: JavaPairRDD[java.lang.Double, java.lang.Double],
229+
input: JavaRDD[(java.lang.Double, java.lang.Double, java.lang.Double)],
230230
isotonic: Boolean): IsotonicRegressionModel = {
231231
new PoolAdjacentViolators()
232-
.run(input.rdd.map(x => (x._1.doubleValue(), x._2.doubleValue(), 1d)), isotonic)
232+
.run(
233+
input.rdd.map(x => (x._1.doubleValue(), x._2.doubleValue(), x._3.doubleValue())),
234+
isotonic)
233235
}
234236
}

mllib/src/main/scala/org/apache/spark/mllib/util/IsotonicDataGenerator.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,11 @@ object IsotonicDataGenerator {
3434
* @param labels list of labels for the data points
3535
* @return Java List of input.
3636
*/
37-
def generateIsotonicInputAsList(labels: Array[Double]): java.util.List[(JDouble, JDouble)] = {
37+
def generateIsotonicInputAsList(
38+
labels: Array[Double]):java.util.List[(JDouble, JDouble, JDouble)] = {
3839
seqAsJavaList(
39-
generateIsotonicInput(
40-
wrapDoubleArray(labels):_*).map(x => (new JDouble(x._1), new JDouble(x._2))))
40+
generateIsotonicInput(wrapDoubleArray(labels):_*)
41+
.map(x => (new JDouble(x._1), new JDouble(x._2), new JDouble(1))))
4142
}
4243

4344
/**

mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
package org.apache.spark.mllib.regression;
1919

20-
import org.apache.spark.api.java.JavaPairRDD;
2120
import org.apache.spark.api.java.JavaRDD;
2221
import org.apache.spark.api.java.JavaSparkContext;
2322
import org.apache.spark.api.java.function.Function;
@@ -26,7 +25,7 @@
2625
import org.junit.Assert;
2726
import org.junit.Before;
2827
import org.junit.Test;
29-
import scala.Tuple2;
28+
import scala.Tuple3;
3029

3130
import java.io.Serializable;
3231
import java.util.List;
@@ -45,11 +44,11 @@ public void tearDown() {
4544
sc = null;
4645
}
4746

48-
double difference(List<Tuple2<Double, Double>> expected, IsotonicRegressionModel model) {
47+
double difference(List<Tuple3<Double, Double, Double>> expected, IsotonicRegressionModel model) {
4948
double diff = 0;
5049

5150
for(int i = 0; i < model.predictions().length(); i++) {
52-
Tuple2<Double, Double> exp = expected.get(i);
51+
Tuple3<Double, Double, Double> exp = expected.get(i);
5352
diff += Math.abs(model.predict(exp._2()) - exp._1());
5453
}
5554

@@ -58,13 +57,13 @@ public void tearDown() {
5857

5958
@Test
6059
public void runIsotonicRegressionUsingStaticMethod() {
61-
JavaPairRDD<Double, Double> trainRDD = sc.parallelizePairs(
60+
JavaRDD<Tuple3<Double, Double, Double>> trainRDD = sc.parallelize(
6261
IsotonicDataGenerator.generateIsotonicInputAsList(
6362
new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12})).cache();
6463

6564
IsotonicRegressionModel model = IsotonicRegression.train(trainRDD, true);
6665

67-
List<Tuple2<Double, Double>> expected = IsotonicDataGenerator
66+
List<Tuple3<Double, Double, Double>> expected = IsotonicDataGenerator
6867
.generateIsotonicInputAsList(
6968
new double[] {1, 2, 7d/3, 7d/3, 7d/3, 6, 7, 8, 10, 10, 10, 12});
7069

@@ -73,15 +72,15 @@ public void runIsotonicRegressionUsingStaticMethod() {
7372

7473
@Test
7574
public void testPredictJavaRDD() {
76-
JavaPairRDD<Double, Double> trainRDD = sc.parallelizePairs(
75+
JavaRDD<Tuple3<Double, Double, Double>> trainRDD = sc.parallelize(
7776
IsotonicDataGenerator.generateIsotonicInputAsList(
7877
new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12})).cache();
7978

8079
IsotonicRegressionModel model = IsotonicRegression.train(trainRDD, true);
8180

82-
JavaRDD<Double> testRDD = trainRDD.map(new Function<Tuple2<Double, Double>, Double>() {
81+
JavaRDD<Double> testRDD = trainRDD.map(new Function<Tuple3<Double, Double, Double>, Double>() {
8382
@Override
84-
public Double call(Tuple2<Double, Double> v) throws Exception {
83+
public Double call(Tuple3<Double, Double, Double> v) throws Exception {
8584
return v._2();
8685
}
8786
});
@@ -91,5 +90,4 @@ public Double call(Tuple2<Double, Double> v) throws Exception {
9190
Assert.assertTrue(predictions.get(0) == 1d);
9291
Assert.assertTrue(predictions.get(11) == 12d);
9392
}
94-
}
95-
93+
}

mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,16 +197,15 @@ class IsotonicRegressionClusterSuite
197197
extends FunSuite
198198
with LocalClusterSparkContext {
199199

200-
//TODO: FIX
201200
test("task size should be small in both training and prediction") {
202-
val n = 135000
201+
val n = 1000
203202

204203
val trainData = (0 to n).map(i => (i.toDouble, i.toDouble, 1d))
205-
val points = sc.parallelize(trainData, 1)
204+
val points = sc.parallelize(trainData, 2)
206205

207206
// If we serialize data directly in the task closure, the size of the serialized task would be
208207
// greater than 1MB and hence Spark would throw an error.
209-
val model = IsotonicRegression.train(points, true)
208+
val model = IsotonicRegression.train(points)
210209
val predictions = model.predict(points.map(_._2))
211210
}
212-
}
211+
}

0 commit comments

Comments (0)