
Commit 9cde7c9

WIP added test case
1 parent bd3ba53 commit 9cde7c9

File tree

8 files changed: +57 −13 lines

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 0 additions & 2 deletions
@@ -350,8 +350,6 @@ private[spark] object PythonRDD extends Logging {
     } catch {
       case eof: EOFException => {}
     }
-    println("RDDDD ==================")
-    println(objs)
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }

examples/src/main/python/streaming/test_oprations.py

Lines changed: 16 additions & 9 deletions
@@ -9,15 +9,22 @@
 conf = SparkConf()
 conf.setAppName("PythonStreamingNetworkWordCount")
 ssc = StreamingContext(conf=conf, duration=Seconds(1))
-ssc.checkpoint("/tmp/spark_ckp")
 
-test_input = ssc._testInputStream([[1],[1],[1]])
-# ssc.checkpoint("/tmp/spark_ckp")
-fm_test = test_input.flatMap(lambda x: x.split(" "))
-mapped_test = fm_test.map(lambda x: (x, 1))
+test_input = ssc._testInputStream([1,2,3])
+class buff:
+    pass
+
+fm_test = test_input.map(lambda x: (x, 1))
+fm_test.test_output(buff)
 
-
-mapped_test.print_()
 ssc.start()
-# ssc.awaitTermination()
-# ssc.stop()
+while True:
+    ssc.awaitTermination(50)
+    try:
+        buff.result
+        break
+    except AttributeError:
+        pass
+
+ssc.stop()
+print buff.result
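
Note: the rewritten test script drives the stream without a real sink. It collects output into the plain `buff` class via the experimental `test_output` hook, then polls until `buff.result` exists. The snippet below is a minimal sketch of that polling pattern with Spark stubbed out by a background thread; `fake_output_operator` is a hypothetical stand-in for whatever `fm_test.test_output(buff)` eventually writes, not part of this commit.

# Sketch only: Spark replaced by a thread; fake_output_operator is hypothetical.
import threading
import time

class buff(object):
    pass

def fake_output_operator(holder, delay=0.2):
    # stands in for fm_test.test_output(buff) delivering its result
    time.sleep(delay)
    holder.result = [(1, 1), (2, 1), (3, 1)]

threading.Thread(target=fake_output_operator, args=(buff,)).start()

while True:
    time.sleep(0.05)      # plays the role of ssc.awaitTermination(50)
    try:
        buff.result       # raises AttributeError until the output arrives
        break
    except AttributeError:
        pass

print(buff.result)        # [(1, 1), (2, 1), (3, 1)]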

python/pyspark/streaming/dstream.py

Lines changed: 1 addition & 1 deletion
@@ -236,6 +236,7 @@ def pyprint(self):
         operator, so this DStream will be registered as an output stream and there materialized.
         """
         def takeAndPrint(rdd, time):
+            print "take and print ==================="
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
@@ -420,7 +421,6 @@ def saveAsTextFile(rdd, time):
 # TODO: implemtnt rightOuterJoin
 
 
-
 class PipelinedDStream(DStream):
     def __init__(self, prev, func, preservesPartitioning=False):
         if not isinstance(prev, PipelinedDStream) or not prev._is_pipelinable():

python/pyspark/streaming_tests.py

Lines changed: 11 additions & 0 deletions
@@ -444,6 +444,17 @@ def tearDown(self):
     def tearDownClass(cls):
         PySparkStreamingTestCase.tearDownClass()
 
+        start_time = time.time()
+        while True:
+            current_time = time.time()
+            # check time out
+            if (current_time - start_time) > self.timeout:
+                self.ssc.stop()
+                break
+            self.ssc.awaitTermination(50)
+            if buff.result is not None:
+                break
+        return buff.result
 
 if __name__ == "__main__":
     unittest.main()
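
Note: the loop added to streaming_tests.py hardens the same wait with a timeout so a hung stream cannot block the suite forever; it keeps calling `awaitTermination(50)` and gives up (stopping the context) once `self.timeout` seconds have elapsed. A hedged sketch of that logic as a standalone helper, with the StreamingContext stubbed out, is shown below; `wait_for_result` and `StubContext` are illustrative names, not part of this commit.

# Sketch only: StubContext and wait_for_result are illustrative, not from this commit.
import time

class StubContext(object):
    def awaitTermination(self, timeout_millis):
        time.sleep(timeout_millis / 1000.0)

    def stop(self):
        pass

def wait_for_result(ssc, holder, timeout_secs=3.0):
    start_time = time.time()
    while True:
        # check time out
        if (time.time() - start_time) > timeout_secs:
            ssc.stop()
            return None
        ssc.awaitTermination(50)
        if getattr(holder, "result", None) is not None:
            return holder.result

class buff(object):
    result = None

buff.result = [("hello", 1)]
print(wait_for_result(StubContext(), buff))   # [('hello', 1)]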

python/pyspark/worker.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def main(infile, outfile):
     SparkFiles._is_running_on_worker = True
 
     # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
-    sys.path.append(spark_files_dir) # *.py files that were added will be copied here
+    sys.path.append(spark_files_dir) # *.py files that were added will be copied here
     num_python_includes = read_int(infile)
     for _ in range(num_python_includes):
         filename = utf8_deserializer.loads(infile)

streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala

Lines changed: 9 additions & 0 deletions
@@ -54,6 +54,15 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T
     dstream.print()
   }
 
+  def print(label: String = null): Unit = {
+    dstream.print(label)
+  }
+
+  def outputToFile(): Unit = {
+    dstream.outputToFile()
+  }
+
+
   /**
    * Return a new DStream in which each RDD has a single element generated by counting each RDD
    * of this DStream.

streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,8 @@ class PythonDStream[T: ClassTag](
   override def compute(validTime: Time): Option[RDD[Array[Byte]]] = {
     parent.getOrCompute(validTime) match{
       case Some(rdd) =>
+        logInfo("RDD ID in python DStream ===========")
+        logInfo("RDD id " + rdd.id)
         val pythonRDD = new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitoning, pythonExec, broadcastVars, accumulator)
         Some(pythonRDD.asJavaRDD.rdd)
       case None => None

streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala

Lines changed: 17 additions & 0 deletions
@@ -620,6 +620,23 @@ abstract class DStream[T: ClassTag] (
     new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
   }
 
+
+  def print(label: String = null) {
+    def foreachFunc = (rdd: RDD[T], time: Time) => {
+      val first11 = rdd.take(11)
+      println ("-------------------------------------------")
+      println ("Time: " + time)
+      println ("-------------------------------------------")
+      if(label != null){
+        println (label)
+      }
+      first11.take(10).foreach(println)
+      if (first11.size > 10) println("...")
+      println()
+    }
+    new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
+  }
+
   /**
    * Return a new DStream in which each RDD contains all the elements in seen in a
    * sliding window of time over this DStream. The new DStream generates RDDs with
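
Note: both `takeAndPrint` in dstream.py and the new `DStream.print(label)` above follow the same display convention: take 11 records, print at most 10, and emit "..." when the extra record shows the batch was larger. A plain-Python sketch of that convention follows; a list stands in for the RDD and the helper name is illustrative, not part of this commit.

# Sketch only: a list stands in for the RDD; take_and_print is illustrative.
def take_and_print(records, time_label, label=None):
    taken = records[:11]         # take one extra record to detect overflow
    print("-------------------------------------------")
    print("Time: %s" % time_label)
    print("-------------------------------------------")
    if label is not None:
        print(label)
    for record in taken[:10]:    # show at most the first 10 records
        print(record)
    if len(taken) > 10:          # the 11th record means there were more
        print("...")
    print("")

take_and_print(list(range(15)), "2014-07-20 12:00:01", label="word counts")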
