@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _}
@@ -121,7 +122,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       projections: Seq[NamedExpression],
       filters: Seq[Expression],
       partitionColumns: StructType,
-      partitions: Array[Partition]) = {
+      partitions: Array[Partition]): SparkPlan = {
     val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation]
 
     // Because we are creating one RDD per partition, we need to have a shared HadoopConf.
@@ -130,49 +131,51 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
     val confBroadcast =
       relation.sqlContext.sparkContext.broadcast(new SerializableConfiguration(sharedHadoopConf))
 
-    // Builds RDD[Row]s for each selected partition.
-    val perPartitionRows = partitions.map { case Partition(partitionValues, dir) =>
-      // The table scan operator (PhysicalRDD) which retrieves required columns from data files.
-      // Notice that the schema of data files, represented by `relation.dataSchema`, may contain
-      // some partition column(s).
-      val scan =
-        pruneFilterProject(
-          logicalRelation,
-          projections,
-          filters,
-          (columns: Seq[Attribute], filters) => {
-            val partitionColNames = partitionColumns.fieldNames
-
-            // Don't scan any partition columns to save I/O. Here we are being optimistic and
-            // assuming partition columns data stored in data files are always consistent with those
-            // partition values encoded in partition directory paths.
-            val needed = columns.filterNot(a => partitionColNames.contains(a.name))
-            val dataRows =
-              relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast)
-
-            // Merges data values with partition values.
-            mergeWithPartitionValues(
-              relation.schema,
-              columns.map(_.name).toArray,
-              partitionColNames,
-              partitionValues,
-              toCatalystRDD(logicalRelation, needed, dataRows))
-          })
-
-      scan.execute()
-    }
+    // Now, we create a scan builder, which will be used by pruneFilterProject. This scan builder
+    // will union all partitions and attach partition values if needed.
+    val scanBuilder = {
+      (columns: Seq[Attribute], filters: Array[Filter]) => {
+        // Builds RDD[Row]s for each selected partition.
+        val perPartitionRows = partitions.map { case Partition(partitionValues, dir) =>
+          val partitionColNames = partitionColumns.fieldNames
+
+          // Don't scan any partition columns to save I/O. Here we are being optimistic and
+          // assuming partition columns data stored in data files are always consistent with those
+          // partition values encoded in partition directory paths.
+          val needed = columns.filterNot(a => partitionColNames.contains(a.name))
+          val dataRows =
+            relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast)
+
+          // Merges data values with partition values.
+          mergeWithPartitionValues(
+            relation.schema,
+            columns.map(_.name).toArray,
+            partitionColNames,
+            partitionValues,
+            toCatalystRDD(logicalRelation, needed, dataRows))
+        }
+
+        val unionedRows =
+          if (perPartitionRows.length == 0) {
+            relation.sqlContext.emptyResult
+          } else {
+            new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows)
+          }
 
-    val unionedRows =
-      if (perPartitionRows.length == 0) {
-        relation.sqlContext.emptyResult
-      } else {
-        new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows)
+        unionedRows
       }
+    }
+
+    // Create the scan operator. If needed, add Filter and/or Project on top of the scan.
+    // The added Filter/Project is on top of the unioned RDD. We do not want to create
+    // one Filter/Project for every partition.
+    val sparkPlan = pruneFilterProject(
+      logicalRelation,
+      projections,
+      filters,
+      scanBuilder)
 
-    execution.PhysicalRDD.createFromDataSource(
-      projections.map(_.toAttribute),
-      unionedRows,
-      logicalRelation.relation)
+    sparkPlan
   }
 
   // TODO: refactor this thing. It is very complicated because it does projection internally.
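The shape of this change is easier to see outside Spark's planner: instead of planning one scan (and one Filter/Project) per partition and unioning the physical plans, the refactored code hands pruneFilterProject a single scan builder that unions the per-partition rows itself, so pruning and projection are applied once on top of the union. Below is a minimal, self-contained Scala sketch of that pattern; plain collections stand in for RDDs, and Partition, buildScan, and planScan here are simplified illustrative stand-ins, not Spark's actual APIs.

// Minimal sketch (not Spark code): why the scan builder is one closure over all
// partitions rather than one physical scan per partition.
object ScanBuilderSketch {
  // Simplified stand-in for Spark's Partition: values decoded from a directory path.
  case class Partition(values: Map[String, String], dir: String)

  // Pretend each partition directory yields one row; a real scan would read files.
  def buildScan(neededColumns: Seq[String], dir: String): Seq[Map[String, Any]] = {
    val fileRow: Map[String, Any] = Map("id" -> dir.hashCode, "dir" -> dir)
    Seq(fileRow.filter { case (k, _) => neededColumns.contains(k) })
  }

  // Analogue of pruneFilterProject: given a scan builder, it projects the required
  // columns exactly once, on top of whatever the builder returns.
  def planScan(
      requiredColumns: Seq[String],
      scanBuilder: Seq[String] => Seq[Map[String, Any]]): Seq[Map[String, Any]] =
    scanBuilder(requiredColumns).map(_.filter { case (k, _) => requiredColumns.contains(k) })

  def main(args: Array[String]): Unit = {
    val partitions = Seq(
      Partition(Map("p" -> "1"), "/data/p=1"),
      Partition(Map("p" -> "2"), "/data/p=2"))

    // The scan builder unions all partitions and merges partition values back in,
    // mirroring the refactored buildPartitionedTableScan.
    val scanBuilder = (columns: Seq[String]) => {
      val perPartitionRows = partitions.map { case Partition(values, dir) =>
        // Skip partition columns when "reading" files; recover them from the path instead.
        val needed = columns.filterNot(values.contains)
        buildScan(needed, dir).map(_ ++ values.filter { case (k, _) => columns.contains(k) })
      }
      perPartitionRows.flatten // stands in for UnionRDD over per-partition RDDs
    }

    // Projection happens once over the unioned rows, not once per partition.
    planScan(Seq("id", "p"), scanBuilder).foreach(println)
  }
}

Running main prints one row per partition directory, with the partition column ("p") merged back from the directory path rather than read from the data files, mirroring the I/O-saving behavior described in the diff above.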