Skip to content

[SPARK-52777][SQL] Enable shuffle cleanup mode configuration in Spark SQL #51458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,8 @@ import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_
import org.apache.spark.sql.connect.planner.SparkConnectPlanner
import org.apache.spark.sql.connect.service.ExecuteHolder
import org.apache.spark.sql.connect.utils.MetricGenerator
import org.apache.spark.sql.execution.{DoNotCleanup, LocalTableScanExec, RemoveShuffleFiles, SkipMigration, SQLExecution}
import org.apache.spark.sql.execution.{LocalTableScanExec, QueryExecution, SQLExecution}
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.ThreadUtils

Expand All @@ -60,14 +59,7 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder)
val planner = new SparkConnectPlanner(executeHolder)
val tracker = executeHolder.eventsManager.createQueryPlanningTracker()
val conf = session.sessionState.conf
val shuffleCleanupMode =
if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED)) {
RemoveShuffleFiles
} else if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED)) {
SkipMigration
} else {
DoNotCleanup
}
val shuffleCleanupMode = QueryExecution.determineShuffleCleanupMode(conf)
val dataframe =
Dataset.ofRows(
sessionHolder.session,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule}
import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.classic.SparkSession
import org.apache.spark.sql.execution.QueryExecution.determineShuffleCleanupMode
import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
import org.apache.spark.sql.execution.bucketing.{CoalesceBucketsInJoin, DisableUnnecessaryBucketedScan}
import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
Expand All @@ -63,7 +64,8 @@ class QueryExecution(
val logical: LogicalPlan,
val tracker: QueryPlanningTracker = new QueryPlanningTracker,
val mode: CommandExecutionMode.Value = CommandExecutionMode.ALL,
val shuffleCleanupMode: ShuffleCleanupMode = DoNotCleanup) extends Logging {
val shuffleCleanupMode: ShuffleCleanupMode =
determineShuffleCleanupMode(SQLConf.get)) extends Logging {

val id: Long = QueryExecution.nextExecutionId

Expand Down Expand Up @@ -683,4 +685,14 @@ object QueryExecution {
normalized
}
}

/**
 * Picks the [[ShuffleCleanupMode]] implied by the given SQL configuration.
 *
 * The file-cleanup flag is consulted first, so it wins when both flags are enabled; the
 * skip-migration flag is only read when file cleanup is disabled.
 *
 * @param conf the SQL configuration holding the two shuffle-dependency flags
 * @return [[RemoveShuffleFiles]] if `SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED` is set,
 *         otherwise [[SkipMigration]] if `SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED` is set,
 *         otherwise [[DoNotCleanup]]
 */
def determineShuffleCleanupMode(conf: SQLConf): ShuffleCleanupMode = {
  conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED) match {
    case true =>
      RemoveShuffleFiles
    // The guard keeps the second lookup conditional on the first flag being off.
    case false if conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED) =>
      SkipMigration
    case false =>
      DoNotCleanup
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,27 @@ class QueryExecutionSuite extends SharedSparkSession {
}
}

test("determineShuffleCleanupMode should return correct mode based on SQL configuration") {
  // Exercises QueryExecution.determineShuffleCleanupMode against every combination of the
  // two shuffle-dependency flags, using a standalone SQLConf so no session state is touched.
  val conf = new SQLConf()

  // Both flags disabled: falls through to DoNotCleanup.
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED, false)
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED, false)
  assert(QueryExecution.determineShuffleCleanupMode(conf) === DoNotCleanup)

  // Only file cleanup enabled: RemoveShuffleFiles.
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED, true)
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED, false)
  assert(QueryExecution.determineShuffleCleanupMode(conf) === RemoveShuffleFiles)

  // Only skip-migration enabled: SkipMigration.
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED, false)
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED, true)
  assert(QueryExecution.determineShuffleCleanupMode(conf) === SkipMigration)

  // Both flags enabled: the file-cleanup flag is checked first, so RemoveShuffleFiles
  // takes precedence over SkipMigration.
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED, true)
  conf.setConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED, true)
  assert(QueryExecution.determineShuffleCleanupMode(conf) === RemoveShuffleFiles)

  // TODO: consider logging a warning when both flags are enabled.
}

case class MockCallbackEagerCommand(
var trackerAnalyzed: QueryPlanningTracker = null,
var trackerReadyForExecution: QueryPlanningTracker = null)
Expand Down