[CARMEL-6324] Support bucket skew detection (#1130)

xingchaozh · GitHub Enterprise · commit d0a000ee91cd · 2022-11-11T10:24:07.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.plans.logical.TableParallelInfo
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.execution.adaptive.SkewHandlingUtil
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource}
 import org.apache.spark.sql.execution.index.IndexMetaLookup
@@ -421,6 +422,18 @@ case class FileSourceScanExec(
         tableParallelInfo.get.partitionNumber.isEmpty))
   }
 
+  /** Whether this bucketed scan's largest bucket is skewed relative to the median bucket size. */
+  lazy val isBucketSkew: Boolean = bucketedScan && {
+    // Total bytes per bucket id, summed over every selected file that maps to that bucket.
+    val bucketSizes = selectedPartitions.flatMap(_.files)
+      .groupBy(file => BucketingUtils.getBucketId(file.getPath.getName))
+      .map { case (_, bucketFiles) => bucketFiles.map(_.getLen).sum }.toArray
+    // No files selected (e.g. every partition pruned): `max` throws on an empty array,
+    // so report "not skewed" instead of failing while the plan metadata is being built.
+    bucketSizes.nonEmpty &&
+      SkewHandlingUtil.isSkewed(bucketSizes.max, Utils.median(bucketSizes, false), conf)
+  }
+
   override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = {
     if (bucketedScan) {
       // For bucketed columns:
@@ -522,7 +535,11 @@ case class FileSourceScanExec(
     val withSelectedBucketsCount = relation.bucketSpec.map { spec =>
       val bucketedKey = "Bucketed"
       val withBucketedScanStatus = if (bucketedScan) {
-        metadata + (bucketedKey -> "true")
+        if (isBucketSkew) {
+          metadata + (bucketedKey -> s"true skewed")
+        } else {
+          metadata + (bucketedKey -> s"true")
+        }
       } else if (!relation.sparkSession.sessionState.conf.bucketingEnabled) {
         metadata + (bucketedKey -> "false (disabled by configuration)")
       } else if (disableBucketedScan) {