Skip to content

Commit 0586c7b

Browse files
committed
add more comments to the example
1 parent 5153cff commit 0586c7b

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

examples/src/main/python/ml/simple_text_classification_pipeline.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
if __name__ == "__main__":
3535
sc = SparkContext(appName="SimpleTextClassificationPipeline")
3636
sqlCtx = SQLContext(sc)
37+
38+
# Prepare training documents, which are labeled.
3739
LabeledDocument = Row('id', 'text', 'label')
3840
training = sqlCtx.inferSchema(
3941
sc.parallelize([(0L, "a b c d e spark", 1.0),
@@ -42,6 +44,7 @@
4244
(3L, "hadoop mapreduce", 0.0)])
4345
.map(lambda x: LabeledDocument(*x)))
4446

47+
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
4548
tokenizer = Tokenizer() \
4649
.setInputCol("text") \
4750
.setOutputCol("words")
@@ -54,8 +57,10 @@
5457
pipeline = Pipeline() \
5558
.setStages([tokenizer, hashingTF, lr])
5659

60+
# Fit the pipeline to training documents.
5761
model = pipeline.fit(training)
5862

63+
# Prepare test documents, which are unlabeled.
5964
Document = Row('id', 'text')
6065
test = sqlCtx.inferSchema(
6166
sc.parallelize([(4L, "spark i j k"),
@@ -64,9 +69,11 @@
6469
(7L, "apache hadoop")])
6570
.map(lambda x: Document(*x)))
6671

72+
# Make predictions on test documents and print columns of interest.
6773
prediction = model.transform(test)
68-
6974
prediction.registerTempTable("prediction")
7075
selected = sqlCtx.sql("SELECT id, text, prediction from prediction")
7176
for row in selected.collect():
7277
print row
78+
79+
sc.stop()

0 commit comments

Comments
 (0)