Updated docs. Added LabeledPointSuite to spark.ml

jkbradley · jkbradley · commit e433872d2777 · 2015-02-05T13:07:18.000-08:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -34,7 +34,8 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
    * Fits a single model to the input data with optional parameters.
    *
    * @param dataset input dataset
-   * @param paramPairs optional list of param pairs (overwrite embedded params)
+   * @param paramPairs Optional list of param pairs.
+   *                   These values override any specified in this Estimator's embedded ParamMap.
    * @return fitted model
    */
   @varargs
@@ -47,7 +48,8 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
    * Fits a single model to the input data with provided parameter map.
    *
    * @param dataset input dataset
-   * @param paramMap parameter map
+   * @param paramMap Parameter map.
+   *                 These values override any specified in this Estimator's embedded ParamMap.
    * @return fitted model
    */
   def fit(dataset: DataFrame, paramMap: ParamMap): M
@@ -58,7 +60,8 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
    * Subclasses could overwrite this to optimize multi-model training.
    *
    * @param dataset input dataset
-   * @param paramMaps an array of parameter maps
+   * @param paramMaps An array of parameter maps.
+   *                  Each map overrides any values specified in this Estimator's embedded ParamMap.
    * @return fitted models, matching the input parameter maps
    */
   def fit(dataset: DataFrame, paramMaps: Array[ParamMap]): Seq[M] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -68,6 +68,13 @@ class LogisticRegression extends Classifier[LogisticRegression, LogisticRegressi
   def setThreshold(value: Double): this.type = set(threshold, value)
   def setScoreCol(value: String): this.type = set(scoreCol, value)
 
+  /**
+   * Same as `fit()`, but using strong types.
+   *
+   * @param dataset  Training data.  WARNING: This does not yet handle instance weights.
+   * @param paramMap  Parameters for training.
+   *                  These values override any specified in this Estimator's embedded ParamMap.
+   */
   def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): LogisticRegressionModel = {
     val oldDataset = dataset.map { case LabeledPoint(label: Double, features: Vector, weight) =>
       org.apache.spark.mllib.regression.LabeledPoint(label, features)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
@@ -94,12 +94,11 @@ private[ml] abstract class Predictor[Learner <: Predictor[Learner, M], M <: Pred
   }
 
   /**
-   * Notes to developers:
-   *  - Unlike [[fit()]], this method takes [[paramMap]] which has already been
-   *    combined with the internal paramMap.
-   *  - This should handle caching the dataset if needed.
+   * Same as `fit()`, but using strong types.
+   *
    * @param dataset  Training data
    * @param paramMap  Parameters for training.
+   *                  These values override any specified in this Estimator's embedded ParamMap.
    */
   def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): M
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -45,6 +45,13 @@ class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel
   def setRegParam(value: Double): this.type = set(regParam, value)
   def setMaxIter(value: Int): this.type = set(maxIter, value)
 
+  /**
+   * Same as `fit()`, but using strong types.
+   *
+   * @param dataset  Training data.  WARNING: This does not yet handle instance weights.
+   * @param paramMap  Parameters for training.
+   *                  These values override any specified in this Estimator's embedded ParamMap.
+   */
   def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): LinearRegressionModel = {
     val oldDataset = dataset.map { case LabeledPoint(label: Double, features: Vector, weight) =>
       org.apache.spark.mllib.regression.LabeledPoint(label, features)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/LabeledPointSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.{SQLContext, SchemaRDD}
+
+class LabeledPointSuite extends FunSuite with MLlibTestSparkContext {
+  // Tests LabeledPoint's default weight and building a SchemaRDD from an RDD of LabeledPoints.
+  @transient var sqlContext: SQLContext = _
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    sqlContext = new SQLContext(sc)  // created only after super.beforeAll() has run
+  }
+
+  test("LabeledPoint default weight 1.0") {
+    val label = 1.0
+    val features = Vectors.dense(1.0, 2.0, 3.0)
+    val lp1 = LabeledPoint(label, features)  // weight argument omitted
+    val lp2 = LabeledPoint(label, features, weight = 1.0)
+    assert(lp1 === lp2)  // omitting the weight must equal an explicit weight of 1.0
+  }
+
+  test("Create SchemaRDD from RDD[LabeledPoint]") {
+    val sqlContext = this.sqlContext  // local val: a var cannot be the prefix of an import
+    import sqlContext._
+    val arr = Seq(
+      LabeledPoint(0.0, Vectors.dense(1.0, 2.0, 3.0)),
+      LabeledPoint(1.0, Vectors.dense(1.1, 2.1, 3.1)),
+      LabeledPoint(0.0, Vectors.dense(1.2, 2.2, 3.2)),
+      LabeledPoint(1.0, Vectors.dense(1.3, 2.3, 3.3)))
+    val rdd = sc.parallelize(arr)
+    val schemaRDD = rdd.select('label, 'features)  // NOTE(review): likely an imported implicit
+    val points = schemaRDD.collect()
+    assert(points.size === arr.size)  // only the row count is checked, not the row contents
+  }
+}