34 | 34 | if __name__ == "__main__":
35 | 35 |     sc = SparkContext(appName="SimpleTextClassificationPipeline")
36 | 36 |     sqlCtx = SQLContext(sc)
   | 37 | +
   | 38 | +    # Prepare training documents, which are labeled.
37 | 39 |     LabeledDocument = Row('id', 'text', 'label')
38 | 40 |     training = sqlCtx.inferSchema(
39 | 41 |         sc.parallelize([(0L, "a b c d e spark", 1.0),
40 | 42 |                         (1L, "b d", 0.0),
41 | 43 |                         (2L, "spark f g h", 1.0),
42 | 44 |                         (3L, "hadoop mapreduce", 0.0)])
43 | 45 |         .map(lambda x: LabeledDocument(*x)))
44 | 46 |
   | 47 | +    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
45 | 48 |     tokenizer = Tokenizer() \
46 | 49 |         .setInputCol("text") \
47 | 50 |         .setOutputCol("words")
48 | 51 |     hashingTF = HashingTF() \
49 | 52 |         .setInputCol(tokenizer.getOutputCol()) \
50 | 53 |         .setOutputCol("features")
51 | 54 |     lr = LogisticRegression() \
52 | 55 |         .setMaxIter(10) \
53 | 56 |         .setRegParam(0.01)
54 | 57 |     pipeline = Pipeline() \
55 | 58 |         .setStages([tokenizer, hashingTF, lr])
56 | 59 |
   | 60 | +    # Fit the pipeline to training documents.
57 | 61 |     model = pipeline.fit(training)
58 | 62 |
   | 63 | +    # Prepare test documents, which are unlabeled.
59 | 64 |     Document = Row('id', 'text')
60 | 65 |     test = sqlCtx.inferSchema(
61 | 66 |         sc.parallelize([(4L, "spark i j k"),
62 | 67 |                         (5L, "l m n"),
63 | 68 |                         (6L, "mapreduce spark"),
64 | 69 |                         (7L, "apache hadoop")])
65 | 70 |         .map(lambda x: Document(*x)))
66 | 71 |
   | 72 | +    # Make predictions on test documents and print columns of interest.
67 | 73 |     prediction = model.transform(test)
68 |    | -
69 | 74 |     prediction.registerTempTable("prediction")
70 | 75 |     selected = sqlCtx.sql("SELECT id, text, prediction from prediction")
71 | 76 |     for row in selected.collect():
72 | 77 |         print row
   | 78 | +
   | 79 | +    sc.stop()
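For context, the file in this diff targets the Spark 1.x / Python 2 API: the 0L long literals, the print statement, SQLContext.inferSchema, and registerTempTable have all since been superseded. Below is a minimal sketch of the same pipeline against the modern DataFrame-based pyspark.ml API; it assumes Spark 2.x or later with Python 3 (SparkSession and createDataFrame stand in for the SQLContext/inferSchema calls above, and none of this is part of the commit itself):

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.classification import LogisticRegression

# SparkSession replaces the SparkContext + SQLContext pair used in the diff.
spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate()

# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline with three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to the training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions and print the columns of interest (no temp table / SQL needed).
prediction = model.transform(test)
for row in prediction.select("id", "text", "prediction").collect():
    print(row)

spark.stop()

Either version exercises the same three-stage flow (Tokenizer, then HashingTF's hashed term-frequency features, then LogisticRegression); with this training set one would expect the test documents containing "spark" (ids 4 and 6) to come out with prediction 1.0 and the others 0.0, though the exact output depends on the fitted model.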