
Commit cc12f47

Merge remote-tracking branch 'apache/master' into range-part
2 parents: 06ac2ec + a4d6020

File tree: 12 files changed, +163 -24 lines


examples/src/main/python/als.py
Lines changed: 9 additions & 0 deletions

@@ -16,6 +16,9 @@
 #
 
 """
+This is an example implementation of ALS for learning how to use Spark. Please refer to
+ALS in pyspark.mllib.recommendation for more conventional use.
+
 This example requires numpy (http://www.numpy.org/)
 """
 from os.path import realpath
@@ -49,9 +52,15 @@ def update(i, vec, mat, ratings):
 
 
 if __name__ == "__main__":
+
     """
     Usage: als [M] [U] [F] [iterations] [slices]"
     """
+
+    print >> sys.stderr, """WARN: This is a naive implementation of ALS and is given as an
+      example. Please use the ALS method found in pyspark.mllib.recommendation for more
+      conventional use."""
+
     sc = SparkContext(appName="PythonALS")
     M = int(sys.argv[1]) if len(sys.argv) > 1 else 100
     U = int(sys.argv[2]) if len(sys.argv) > 2 else 500

examples/src/main/python/kmeans.py
Lines changed: 6 additions & 0 deletions

@@ -45,9 +45,15 @@ def closestPoint(p, centers):
 
 
 if __name__ == "__main__":
+
     if len(sys.argv) != 4:
         print >> sys.stderr, "Usage: kmeans <file> <k> <convergeDist>"
         exit(-1)
+
+    print >> sys.stderr, """WARN: This is a naive implementation of KMeans Clustering and is given
+      as an example! Please refer to examples/src/main/python/mllib/kmeans.py for an example on
+      how to use MLlib's KMeans implementation."""
+
     sc = SparkContext(appName="PythonKMeans")
     lines = sc.textFile(sys.argv[1])
     data = lines.map(parseVector).cache()

examples/src/main/python/logistic_regression.py
Lines changed: 6 additions & 0 deletions

@@ -47,9 +47,15 @@ def readPointBatch(iterator):
     return [matrix]
 
 if __name__ == "__main__":
+
     if len(sys.argv) != 3:
         print >> sys.stderr, "Usage: logistic_regression <file> <iterations>"
         exit(-1)
+
+    print >> sys.stderr, """WARN: This is a naive implementation of Logistic Regression and is
+      given as an example! Please refer to examples/src/main/python/mllib/logistic_regression.py
+      to see how MLlib's implementation is used."""
+
    sc = SparkContext(appName="PythonLR")
     points = sc.textFile(sys.argv[1]).mapPartitions(readPointBatch).cache()
     iterations = int(sys.argv[2])

examples/src/main/scala/org/apache/spark/examples/LocalALS.scala
Lines changed: 15 additions & 0 deletions

@@ -25,6 +25,9 @@ import cern.jet.math._
 
 /**
  * Alternating least squares matrix factorization.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.recommendation.ALS
  */
 object LocalALS {
   // Parameters set through command line arguments
@@ -107,7 +110,16 @@ object LocalALS {
     solved2D.viewColumn(0)
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of ALS and is given as an example!
+        |Please use the ALS method found in org.apache.spark.mllib.recommendation
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
     args match {
       case Array(m, u, f, iters) => {
         M = m.toInt
@@ -120,6 +132,9 @@ object LocalALS {
         System.exit(1)
       }
     }
+
+    showWarning()
+
     printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)
 
     val R = generateR()
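
For readers following the pointer in these warnings, a minimal sketch of the more conventional usage via org.apache.spark.mllib.recommendation.ALS might look like the following. The object name, toy ratings, and parameter values are illustrative assumptions, not part of this commit:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

// Hypothetical standalone example; not part of this commit.
object MLlibALSExample {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibALSExample"))

    // Toy ratings: Rating(user, product, rating)
    val ratings = sc.parallelize(Seq(
      Rating(0, 0, 5.0), Rating(0, 1, 1.0),
      Rating(1, 0, 4.0), Rating(1, 1, 2.0)))

    // Factorize the ratings matrix; rank, iterations, and lambda are illustrative
    val model = ALS.train(ratings, rank = 10, iterations = 10, lambda = 0.01)

    // Predict user 1's rating of product 1 from the learned factors
    println(model.predict(1, 1))
    sc.stop()
  }
}

ALS.train returns a MatrixFactorizationModel, so predictions come straight from the learned factors rather than from hand-rolled normal-equation updates like those in the naive examples.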

examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala
Lines changed: 17 additions & 0 deletions

@@ -21,6 +21,12 @@ import java.util.Random
 
 import breeze.linalg.{Vector, DenseVector}
 
+/**
+ * Logistic regression based classification.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.classification.LogisticRegression
+ */
 object LocalFileLR {
   val D = 10 // Numer of dimensions
   val rand = new Random(42)
@@ -32,7 +38,18 @@ object LocalFileLR {
     DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
+        |Please use the LogisticRegression method found in org.apache.spark.mllib.classification
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
+    showWarning()
+
     val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
     val points = lines.map(parsePoint _)
     val ITERATIONS = args(1).toInt
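
A matching sketch for the logistic regression examples. The warning text only names the package org.apache.spark.mllib.classification; the concrete entry point used below, LogisticRegressionWithSGD, is an assumption about the MLlib API of this era, and the data and parameters are illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Hypothetical standalone example; not part of this commit.
object MLlibLRExample {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibLRExample"))

    // Toy training data: LabeledPoint(label, features)
    val points = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0)),
      LabeledPoint(0.0, Vectors.dense(-1.0, -2.0))))

    // Train with SGD; the iteration count is illustrative
    val model = LogisticRegressionWithSGD.train(points, numIterations = 100)

    // Classify a new point (returns 0.0 or 1.0)
    println(model.predict(Vectors.dense(0.5, 1.0)))
    sc.stop()
  }
}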

examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala
Lines changed: 14 additions & 0 deletions

@@ -28,6 +28,9 @@ import org.apache.spark.SparkContext._
 
 /**
  * K-means clustering.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.clustering.KMeans
  */
 object LocalKMeans {
   val N = 1000
@@ -61,7 +64,18 @@ object LocalKMeans {
     bestIndex
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
+        |Please use the KMeans method found in org.apache.spark.mllib.clustering
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
+    showWarning()
+
     val data = generateData
     var points = new HashSet[Vector[Double]]
     var kPoints = new HashMap[Int, Vector[Double]]
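
And a sketch for the clustering examples, assuming the org.apache.spark.mllib.clustering.KMeans API the warning points to (object name, toy points, and parameters are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Hypothetical standalone example; not part of this commit.
object MLlibKMeansExample {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibKMeansExample"))

    // Toy 2-D points forming two loose clusters
    val data = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
      Vectors.dense(9.0, 8.0), Vectors.dense(8.0, 9.0)))

    // k and maxIterations are illustrative
    val model = KMeans.train(data, k = 2, maxIterations = 10)

    // Assign a new point to the nearest learned centroid
    println(model.predict(Vectors.dense(0.5, 0.5)))
    sc.stop()
  }
}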

examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
Lines changed: 14 additions & 1 deletion

@@ -23,6 +23,9 @@ import breeze.linalg.{Vector, DenseVector}
 
 /**
  * Logistic regression based classification.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.classification.LogisticRegression
  */
 object LocalLR {
   val N = 10000 // Number of data points
@@ -42,9 +45,19 @@ object LocalLR {
     Array.tabulate(N)(generatePoint)
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
+        |Please use the LogisticRegression method found in org.apache.spark.mllib.classification
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
-    val data = generateData
 
+    showWarning()
+
+    val data = generateData
     // Initialize w to a random value
     var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
     println("Initial w: " + w)

examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
Lines changed: 16 additions & 0 deletions

@@ -27,6 +27,9 @@ import org.apache.spark._
 
 /**
  * Alternating least squares matrix factorization.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.recommendation.ALS
  */
 object SparkALS {
   // Parameters set through command line arguments
@@ -87,7 +90,16 @@ object SparkALS {
     solved2D.viewColumn(0)
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of ALS and is given as an example!
+        |Please use the ALS method found in org.apache.spark.mllib.recommendation
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
     var slices = 0
 
     val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None)
@@ -103,7 +115,11 @@ object SparkALS {
       System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]")
       System.exit(1)
     }
+
+    showWarning()
+
     printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)
+
     val sparkConf = new SparkConf().setAppName("SparkALS")
     val sc = new SparkContext(sparkConf)
 

examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala
Lines changed: 14 additions & 0 deletions

@@ -30,6 +30,9 @@ import org.apache.spark.scheduler.InputFormatInfo
 
 /**
  * Logistic regression based classification.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.classification.LogisticRegression
  */
 object SparkHdfsLR {
   val D = 10 // Numer of dimensions
@@ -48,12 +51,23 @@ object SparkHdfsLR {
     DataPoint(new DenseVector(x), y)
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
+        |Please use the LogisticRegression method found in org.apache.spark.mllib.classification
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
     if (args.length < 2) {
       System.err.println("Usage: SparkHdfsLR <file> <iters>")
       System.exit(1)
     }
 
+    showWarning()
+
     val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
     val inputPath = args(0)
     val conf = SparkHadoopUtil.get.newConfiguration()

examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala
Lines changed: 15 additions & 0 deletions

@@ -24,6 +24,9 @@ import org.apache.spark.SparkContext._
 
 /**
  * K-means clustering.
+ *
+ * This is an example implementation for learning how to use Spark. For more conventional use,
+ * please refer to org.apache.spark.mllib.clustering.KMeans
  */
 object SparkKMeans {
 
@@ -46,11 +49,23 @@ object SparkKMeans {
     bestIndex
   }
 
+  def showWarning() {
+    System.err.println(
+      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
+        |Please use the KMeans method found in org.apache.spark.mllib.clustering
+        |for more conventional use.
+      """.stripMargin)
+  }
+
   def main(args: Array[String]) {
+
     if (args.length < 3) {
       System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
       System.exit(1)
     }
+
+    showWarning()
+
     val sparkConf = new SparkConf().setAppName("SparkKMeans")
     val sc = new SparkContext(sparkConf)
     val lines = sc.textFile(args(0))
