Commit ded7fee

Fix for ineligible filters, use compressed block size (apache#70)
1 parent 9c256a4

File tree

2 files changed: +35 -8 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileSplitter.scala

Lines changed: 9 additions & 8 deletions
@@ -78,9 +78,9 @@ class ParquetMetadataFileSplitter(
       (applied, unapplied, filteredBlocks)
     }
 
-    val eligible = parquetFilter(unapplied, filteredBlocks).map { bmd =>
+    val eligible = applyParquetFilter(unapplied, filteredBlocks).map { bmd =>
       val blockPath = new Path(root, bmd.getPath)
-      new FileSplit(blockPath, bmd.getStartingPos, bmd.getTotalByteSize, Array.empty)
+      new FileSplit(blockPath, bmd.getStartingPos, bmd.getCompressedSize, Array.empty)
     }
 
     val statFilter: (FileStatus => Seq[FileSplit]) = { stat =>
@@ -95,19 +95,20 @@ class ParquetMetadataFileSplitter(
     statFilter
   }
 
-  private def parquetFilter(
+  private def applyParquetFilter(
       filters: Seq[Filter],
       blocks: Seq[BlockMetaData]): Seq[BlockMetaData] = {
-    if (filters.nonEmpty) {
+    val predicates = filters.flatMap {
+      ParquetFilters.createFilter(schema, _)
+    }
+    if (predicates.nonEmpty) {
      // Asynchronously build bitmaps
      Future {
        buildFilterBitMaps(filters)
      }(ParquetMetadataFileSplitter.executionContext)
 
-      val predicate = filters.flatMap {
-        ParquetFilters.createFilter(schema, _)
-      }.reduce(FilterApi.and)
-      blocks.filter(bmd => !StatisticsFilter.canDrop(predicate, bmd.getColumns))
+      val predicate = predicates.reduce(FilterApi.and)
+      blocks.filterNot(bmd => StatisticsFilter.canDrop(predicate, bmd.getColumns))
     } else {
       blocks
     }
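
For readers outside the diff context: ParquetFilters.createFilter returns an Option and silently drops filters it cannot translate into Parquet predicates. Guarding on filters.nonEmpty was therefore not enough, because a non-empty filter list can still produce an empty predicate list, and reduce(FilterApi.and) on an empty collection throws. A minimal sketch of the failure mode, using a stand-in convert function in place of ParquetFilters.createFilter (treating StringStartsWith as unconvertible is an assumption about this codebase, not something the diff states):

import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.spark.sql.sources.{Filter, StringStartsWith}

// Stand-in for ParquetFilters.createFilter(schema, _): filters with no
// Parquet-predicate equivalent convert to None and drop out of flatMap.
def convert(f: Filter): Option[FilterPredicate] = f match {
  case _: StringStartsWith => None // assumed unconvertible here
  case _                   => None // other conversions elided in this sketch
}

val filters: Seq[Filter] = Seq(StringStartsWith("x", "1000"))
val predicates = filters.flatMap(convert) // empty, although filters is non-empty

// Old guard: filters.nonEmpty is true, so the old code reached
// predicates.reduce(FilterApi.and) and threw UnsupportedOperationException.
// New guard: predicates.nonEmpty; a safe fold expresses the same idea:
val combined: Option[FilterPredicate] = predicates.reduceOption(FilterApi.and(_, _))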

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 26 additions & 0 deletions
@@ -769,6 +769,32 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       }
     }
   }
+
+  test("Ensure file with multiple blocks splits properly with filters") {
+    withSQLConf(ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL",
+      SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1024",
+      ParquetOutputFormat.BLOCK_SIZE -> "1") {
+      withTempPath { path =>
+        spark.sparkContext.parallelize((1 to 1000).map(x => x.toString), 1)
+          .toDF("x").write.parquet(path.getCanonicalPath)
+        val df = spark.read.parquet(path.getCanonicalPath)
+        val column: Column = df.col("x").isNotNull
+        assert(df.filter(column).count == df.count)
+      }
+    }
+  }
+
+  test("Ensure unconvertable filters don't break splitting") {
+    withSQLConf(ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL") {
+      withTempPath { path =>
+        spark.sparkContext.parallelize((1 to 1000).map(x => x.toString), 1)
+          .toDF("x").write.parquet(path.getCanonicalPath)
+        val df = spark.read.parquet(path.getCanonicalPath)
+        val column: Column = df.col("x").startsWith("1000")
+        assert(df.filter(column).count == 1)
+      }
+    }
+  }
 }
 
 class CountingFileSystem extends RawLocalFileSystem {
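
The two tests line up with the two fixes: the first forces many tiny row groups (BLOCK_SIZE = 1) and small partitions (FILES_MAX_PARTITION_BYTES = 1024) so per-row-group splits are actually generated, and the second filters with startsWith, exercising a predicate the splitter cannot convert. For the compressed-size change, a worked example with hypothetical sizes (the accessors are the real BlockMetaData methods from the patch; the numbers are made up):

// A FileSplit describes a byte range in the on-disk file, so its length must
// be on-disk (compressed) bytes. Hypothetical row-group metadata values:
val startingPos    = 4L     // bmd.getStartingPos: row group begins after "PAR1"
val totalByteSize  = 10000L // bmd.getTotalByteSize: uncompressed data size
val compressedSize = 2500L  // bmd.getCompressedSize: bytes occupied on disk

// Old split: claims bytes [4, 10004), overshooting a roughly 2600-byte file
// and distorting how FILES_MAX_PARTITION_BYTES packs splits into partitions.
val oldEnd = startingPos + totalByteSize
// Fixed split: claims bytes [4, 2504), the row group's actual extent on disk.
val newEnd = startingPos + compressedSize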
