Adding SparkConf setting to disable filter predicate pushdown

AndreSchumacher · AndreSchumacher · commit 85fea2df5a89 · 2014-05-16T20:37:18.000+03:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala
@@ -31,6 +31,10 @@ import org.apache.spark.sql.execution.SparkSqlSerializer
 
 object ParquetFilters {
   val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter"
+  // Set this to false in SparkConf if filter predicate pushdown should be disabled.
+  // Note: the key has no "spark.hadoop." prefix; ParquetTableScan reads it
+  // directly from SparkConf rather than from the Hadoop configuration.
+  val PARQUET_FILTER_PUSHDOWN_ENABLED = "org.apache.spark.sql.parquet.filter.pushdown"
 
   def createFilter(filterExpressions: Seq[Expression]): UnboundRecordFilter = {
     def createEqualityFilter(name: String, literal: Literal) = literal.dataType match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -71,10 +71,13 @@ case class ParquetTableScan(
         ParquetTypesConverter.convertFromAttributes(output).toString)
 
     // Store record filtering predicate in `Configuration`
-    // Note: the input format ignores all predicates that cannot be expressed
+    // Note 1: the input format ignores all predicates that cannot be expressed
     // as simple column predicate filters in Parquet. Here we just record
     // the whole pruning predicate.
-    if (columnPruningPred.isDefined) {
+    // Note 2: you can disable filter predicate pushdown by setting
+    // "org.apache.spark.sql.parquet.filter.pushdown" to false inside SparkConf.
+    if (columnPruningPred.isDefined &&
+      sc.conf.getBoolean(ParquetFilters.PARQUET_FILTER_PUSHDOWN_ENABLED, true)) {
       ParquetFilters.serializeFilterExpressions(columnPruningPred.get, conf)
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTestData.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTestData.scala
@@ -130,14 +130,15 @@ private[sql] object ParquetTestData {
     writer.close()
   }
 
-  def writeFilterFile() = {
+  def writeFilterFile(records: Int = 200) = {
+    // for microbenchmark use: records = 300000000 (range is inclusive: records + 1 rows)
     testFilterDir.delete
     val path: Path = new Path(new Path(testFilterDir.toURI), new Path("part-r-0.parquet"))
     val schema: MessageType = MessageTypeParser.parseMessageType(testFilterSchema)
     val writeSupport = new TestGroupWriteSupport(schema)
     val writer = new ParquetWriter[Group](path, writeSupport)
 
-    for(i <- 0 to 200) {
+    for(i <- 0 to records) {
       val record = new SimpleGroup(schema)
       if (i % 4 == 0) {
         record.add(0, true)