
Commit 2bd33ce

liancheng authored and rxin committed
[Minor] [SQL] Cleans up DataFrame variable names and toDF() calls

Although we've migrated to the DataFrame API, lots of code still uses `rdd` or `srdd` as local variable names. This PR addresses these naming inconsistencies and some other minor DataFrame-related style issues.

Author: Cheng Lian <[email protected]>

Closes #4670 from liancheng/df-cleanup and squashes the following commits:

3e14448 [Cheng Lian] Cleans up DataFrame variable names and toDF() calls

(cherry picked from commit 61ab085)
Signed-off-by: Reynold Xin <[email protected]>
1 parent f8f9a64 commit 2bd33ce

37 files changed, +250 -259 lines
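For context, here is a minimal sketch of the two call styles this commit normalizes. The setup below (the `Record` case class, the SparkContext wiring) is illustrative, borrowed from the RDDRelation example further down, and is not part of the commit itself:

// Minimal sketch of the toDF vs. toDF() call styles (Spark 1.3-era API).
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

case class Record(key: Int, value: String)

object ToDFStyleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ToDFStyleSketch"))
    val sqlContext = new SQLContext(sc)
    // Brings the implicit RDD-to-DataFrame conversion into scope.
    import sqlContext.implicits._

    val rdd = sc.parallelize((1 to 10).map(i => Record(i, s"val_$i")))

    val before = rdd.toDF    // compiles, but is the style this commit removes
    val after  = rdd.toDF()  // explicit empty parens, the style this commit adopts

    after.registerTempTable("records")
    sqlContext.sql("SELECT key, value FROM records").collect().foreach(println)
    sc.stop()
  }
}

One reading of the preference for explicit parentheses: `toDF` is also overloaded to take column names (e.g. `rdd.toDF("key", "value")`), so the empty `()` makes the zero-argument conversion unambiguous at a glance. The commit message itself only cites style consistency.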

examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ object CrossValidatorExample {
     crossval.setNumFolds(2) // Use 3+ in practice

     // Run cross-validation, and choose the best set of parameters.
-    val cvModel = crossval.fit(training.toDF)
+    val cvModel = crossval.fit(training.toDF())

     // Prepare test documents, which are unlabeled.
     val test = sc.parallelize(Seq(

examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala

Lines changed: 2 additions & 2 deletions
@@ -58,7 +58,7 @@ object DeveloperApiExample {
     lr.setMaxIter(10)

     // Learn a LogisticRegression model.  This uses the parameters stored in lr.
-    val model = lr.fit(training.toDF)
+    val model = lr.fit(training.toDF())

     // Prepare test data.
     val test = sc.parallelize(Seq(
@@ -67,7 +67,7 @@ object DeveloperApiExample {
       LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))

     // Make predictions on test data.
-    val sumPredictions: Double = model.transform(test.toDF)
+    val sumPredictions: Double = model.transform(test.toDF())
       .select("features", "label", "prediction")
       .collect()
       .map { case Row(features: Vector, label: Double, prediction: Double) =>

examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala

Lines changed: 3 additions & 3 deletions
@@ -137,9 +137,9 @@ object MovieLensALS {
       .setRegParam(params.regParam)
       .setNumBlocks(params.numBlocks)

-    val model = als.fit(training.toDF)
+    val model = als.fit(training.toDF())

-    val predictions = model.transform(test.toDF).cache()
+    val predictions = model.transform(test.toDF()).cache()

     // Evaluate the model.
     // TODO: Create an evaluator to compute RMSE.
@@ -158,7 +158,7 @@ object MovieLensALS {

     // Inspect false positives.
     predictions.registerTempTable("prediction")
-    sc.textFile(params.movies).map(Movie.parseMovie).toDF.registerTempTable("movie")
+    sc.textFile(params.movies).map(Movie.parseMovie).toDF().registerTempTable("movie")
     sqlContext.sql(
       """
         |SELECT userId, prediction.movieId, title, rating, prediction

examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala

Lines changed: 3 additions & 3 deletions
@@ -58,7 +58,7 @@ object SimpleParamsExample {
       .setRegParam(0.01)

     // Learn a LogisticRegression model.  This uses the parameters stored in lr.
-    val model1 = lr.fit(training.toDF)
+    val model1 = lr.fit(training.toDF())
     // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
     // we can view the parameters it used during fit().
     // This prints the parameter (name: value) pairs, where names are unique IDs for this
@@ -77,7 +77,7 @@ object SimpleParamsExample {

     // Now learn a new model using the paramMapCombined parameters.
     // paramMapCombined overrides all parameters set earlier via lr.set* methods.
-    val model2 = lr.fit(training.toDF, paramMapCombined)
+    val model2 = lr.fit(training.toDF(), paramMapCombined)
     println("Model 2 was fit using parameters: " + model2.fittingParamMap)

     // Prepare test data.
@@ -90,7 +90,7 @@ object SimpleParamsExample {
     // LogisticRegression.transform will only use the 'features' column.
     // Note that model2.transform() outputs a 'myProbability' column instead of the usual
     // 'probability' column since we renamed the lr.probabilityCol parameter previously.
-    model2.transform(test.toDF)
+    model2.transform(test.toDF())
       .select("features", "label", "myProbability", "prediction")
       .collect()
       .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>

examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala

Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,7 @@ object SimpleTextClassificationPipeline {
       .setStages(Array(tokenizer, hashingTF, lr))

     // Fit the pipeline to training documents.
-    val model = pipeline.fit(training.toDF)
+    val model = pipeline.fit(training.toDF())

     // Prepare test documents, which are unlabeled.
     val test = sc.parallelize(Seq(
@@ -79,7 +79,7 @@ object SimpleTextClassificationPipeline {
       Document(7L, "apache hadoop")))

     // Make predictions on test documents.
-    model.transform(test.toDF)
+    model.transform(test.toDF())
       .select("id", "text", "probability", "prediction")
       .collect()
       .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>

examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ object DatasetExample {
     println(s"Loaded ${origData.count()} instances from file: ${params.input}")

     // Convert input data to DataFrame explicitly.
-    val df: DataFrame = origData.toDF
+    val df: DataFrame = origData.toDF()
     println(s"Inferred schema:\n${df.schema.prettyJson}")
     println(s"Converted to DataFrame with ${df.count()} records")

examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ object RDDRelation {
     // Importing the SQL context gives access to all the SQL functions and implicit conversions.
     import sqlContext.implicits._

-    val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF
+    val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF()
     // Any RDD containing case classes can be registered as a table.  The schema of the table is
     // automatically inferred using scala reflection.
     df.registerTempTable("records")

examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ object HiveFromSpark {

     // You can also register RDDs as temporary tables within a HiveContext.
     val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
-    rdd.toDF.registerTempTable("records")
+    rdd.toDF().registerTempTable("records")

     // Queries can then join RDD data with data stored in Hive.
     println("Result of SELECT *:")

mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))

       // Create Parquet data.
-      val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF
+      val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF()
       dataRDD.saveAsParquetFile(dataPath(path))
     }

mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ private[classification] object GLMClassificationModel {

       // Create Parquet data.
       val data = Data(weights, intercept, threshold)
-      sc.parallelize(Seq(data), 1).toDF.saveAsParquetFile(Loader.dataPath(path))
+      sc.parallelize(Seq(data), 1).toDF().saveAsParquetFile(Loader.dataPath(path))
     }

     /**
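The two MLlib hunks above share one persistence idiom: wrap the model's fields in a case class, lift the single instance into a one-row, single-partition DataFrame, and write it out as Parquet. A minimal sketch of that idiom follows; the `Data` class and output layout here are illustrative stand-ins, not the actual MLlib internals:

import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

// Illustrative stand-in for the model's fields; the real MLlib Data classes
// differ (e.g. GLMClassificationModel stores a Vector of weights).
case class Data(weights: Array[Double], intercept: Double, threshold: Option[Double])

object ModelSaveSketch {
  def save(sc: SparkContext, path: String, data: Data): Unit = {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    // One instance -> one-row DataFrame in a single partition -> Parquet file.
    // The diff's real call site is Loader.dataPath(path); s"$path/data" is assumed.
    sc.parallelize(Seq(data), 1).toDF().saveAsParquetFile(s"$path/data")
  }
}

Storing the row in a single partition keeps the saved model as one Parquet part file, which is why both hunks pass numSlices = 1 to parallelize.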
