
Commit 0eb17f8

[SC-5559] Support logical delete and gc of commit markers in DatabricksAtomicCommitProtocol
## What changes were proposed in this pull request?

This implements the newly added `deleteWithJob()` hook in `DatabricksAtomicCommitProtocol`. It can be flag-enabled independently of the commit protocol choice. I also implemented support for eventual GC of the commit markers, using the mechanism we discussed of marking them as deleted in another commit marker.

### Config flags:

`com.databricks.sql.enableLogicalDelete` -- whether to enable atomic overwrites (default true)

## How was this patch tested?

Existing unit tests.

Author: Eric Liang <[email protected]>
Author: Eric Liang <[email protected]>

Closes apache#167 from ericl/overwrite.
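As a usage sketch (not part of the commit), the flag described above can be toggled per session before triggering an overwrite; the app name and output path below are illustrative:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().appName("logical-delete-demo").getOrCreate()

// Logical delete is enabled by default; disabling it falls back to immediate physical deletes.
spark.conf.set("com.databricks.sql.enableLogicalDelete", "true")

// Overwriting an existing directory is the operation that exercises deleteWithJob().
// "/tmp/demo_table" is an illustrative path.
spark.range(10).write.mode(SaveMode.Overwrite).parquet("/tmp/demo_table")
```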
1 parent e40daf8 · commit 0eb17f8

File tree: 6 files changed, +313 -37 lines changed


core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala

Lines changed: 10 additions & 0 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
 import org.apache.spark.util.Utils
@@ -112,6 +113,15 @@ abstract class FileCommitProtocol {
    * just crashes (or killed) before it can call abort.
    */
   def abortTask(taskContext: TaskAttemptContext): Unit
+
+  /**
+   * Specifies that a file should be deleted with the commit of this job. The default
+   * implementation deletes the file immediately, but this may be overridden to delay the physical
+   * deletion of the file until commit time.
+   */
+  def deleteWithJob(fs: FileSystem, path: Path, recursive: Boolean): Boolean = {
+    fs.delete(path, recursive)
+  }
 }
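For intuition about the new hook, here is a minimal, self-contained sketch (class and member names are illustrative, not the Databricks implementation) of how an overriding committer could stage deletions and perform the physical deletes only at job commit, which is the behavior this hook enables:

```scala
import java.io.IOException
import scala.collection.mutable
import org.apache.hadoop.fs.{FileSystem, Path}

// Illustrative sketch only: a helper that mirrors the deleteWithJob() contract by
// recording requested deletions and deferring the physical fs.delete() to commit time.
class DeferredDeleter {
  private val staged = mutable.ListBuffer[(FileSystem, Path, Boolean)]()

  // Record the deletion instead of performing it now; return true to signal it was handled.
  def deleteWithJob(fs: FileSystem, path: Path, recursive: Boolean): Boolean = {
    if (!recursive && fs.getFileStatus(path).isDirectory) {
      throw new IOException(s"Cannot delete directory $path unless recursive=true.")
    }
    staged += ((fs, path, recursive))
    true
  }

  // Physical deletion only happens once the "job" commits.
  def commitJob(): Unit = staged.foreach { case (fs, path, recursive) =>
    fs.delete(path, recursive)
  }
}
```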
sql/core/src/main/scala/com/databricks/sql/transaction/DatabricksAtomicCommitProtocol.scala

Lines changed: 74 additions & 2 deletions
@@ -48,7 +48,7 @@ class DatabricksAtomicCommitProtocol(jobId: String, path: String)
   import DatabricksAtomicCommitProtocol._
 
   // Globally unique alphanumeric string. We decouple this from jobId for possible future use.
-  private val txnId: TxnId = math.abs(scala.util.Random.nextLong).toString
+  private val txnId: TxnId = newTxnId()
 
   // The list of files staged by this committer. These are collected to the driver on task commit.
   private val stagedFiles = mutable.Set[String]()
@@ -79,6 +79,42 @@
     finalPath.toString
   }
 
+  override def deleteWithJob(_fs: FileSystem, path: Path, recursive: Boolean): Boolean = {
+    val fs = testingFs.getOrElse(_fs)
+    val sparkSession = SparkSession.getActiveSession.get
+    if (!sparkSession.sqlContext.getConf(
+        "com.databricks.sql.enableLogicalDelete", "true").toBoolean) {
+      return super.deleteWithJob(fs, path, recursive)
+    }
+    if (recursive && fs.getFileStatus(path).isFile) {
+      // In this case Spark is attempting to delete a file to make room for a directory.
+      // We cannot stage this sort of deletion, so just perform it immediately.
+      logWarning(s"Deleting $path immediately since it is a file, not a directory.")
+      return super.deleteWithJob(fs, path, true)
+    }
+    if (recursive) {
+      val (dirs, initialFiles) = fs.listStatus(path).partition(_.isDirectory)
+      val resolvedFiles = filterDirectoryListing(fs, path, initialFiles)
+      stagedDeletions ++= resolvedFiles.map(_.getPath).filter { path =>
+        path.getName match {
+          // Don't allow our metadata markers to be deleted with this API. That can result in
+          // unexpected results if e.g. a start marker is deleted in the middle of a job.
+          case STARTED_MARKER(_) | COMMITTED_MARKER(_) => false
+          case _ => true
+        }
+      }.toList
+      dirs.foreach { dir =>
+        deleteWithJob(fs, dir.getPath, true)
+      }
+    } else {
+      if (fs.getFileStatus(path).isDirectory) {
+        throw new IOException(s"Cannot delete directory $path unless recursive=true.")
+      }
+      stagedDeletions += path
+    }
+    true
+  }
+
   private def getFilename(taskContext: TaskAttemptContext, ext: String): String = {
     // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
     // the file name is fine and won't overflow.
@@ -172,7 +208,7 @@ object DatabricksAtomicCommitProtocol extends Logging {
    * @return the list of deleted files
    */
   def vacuum(path: Path, horizon: Long): List[Path] = {
-    val fs = path.getFileSystem(sparkSession.sparkContext.hadoopConfiguration)
+    val fs = testingFs.getOrElse(path.getFileSystem(sparkSession.sparkContext.hadoopConfiguration))
     val (dirs, initialFiles) = fs.listStatus(path).partition(_.isDirectory)
 
     def checkPositive(time: Long): Long = { assert(time > 0); time }
@@ -203,6 +239,7 @@ object DatabricksAtomicCommitProtocol extends Logging {
             s"(${state.getStartTime(txnId)} < $horizon).")
           delete(file.getPath)
 
+        // always safe to delete since the commit marker is present
         case STARTED_MARKER(txnId) if state.isCommitted(txnId) &&
             checkPositive(file.getModificationTime) < horizon =>
           logInfo(s"Garbage collecting start marker ${file.getPath} of committed job.")
@@ -212,6 +249,39 @@ object DatabricksAtomicCommitProtocol extends Logging {
       }
     }
 
+    // Queue up stale markers for deletion. We do this by writing out a _committed file that
+    // will cause them to be garbage collected in the next cycle.
+    var deleteLater: List[Path] = Nil
+    for (file <- resolvedFiles) {
+      file.getPath.getName match {
+        case name @ COMMITTED_MARKER(txnId) if state.getDeletionTime(name) == 0 &&
+            checkPositive(file.getModificationTime) < horizon =>
+          val startMarker = new Path(file.getPath.getParent, s"_started_$txnId")
+          if (fs.exists(startMarker)) {
+            delete(startMarker)  // make sure we delete it just in case
+          }
+          deleteLater ::= file.getPath
+
+        // the data files were deleted above, but we need to delay marker deletion
+        case STARTED_MARKER(txnId) if !state.isCommitted(txnId) &&
+            checkPositive(file.getModificationTime) < horizon =>
+          deleteLater ::= file.getPath
+
+        case _ =>
+      }
+    }
+
+    if (deleteLater.nonEmpty) {
+      val vacuumCommitMarker = new Path(path, "_committed_vacuum" + newTxnId())
+      val output = fs.create(vacuumCommitMarker)
+      deleteLater ::= vacuumCommitMarker  // it's self-deleting!
+      try {
+        serializeFileChanges(Nil, deleteLater.map(_.getName), output)
+      } finally {
+        output.close()
+      }
+    }
+
     // recurse
     for (d <- dirs) {
       deletedPaths :::= vacuum(d.getPath, horizon)
@@ -223,4 +293,6 @@
 
     deletedPaths
   }
+
+  private def newTxnId(): String = math.abs(scala.util.Random.nextLong).toString
 }

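The vacuum logic above queues stale markers into a `_committed_vacuum<txnId>` marker that lists itself among the removed files, so the whole set is garbage collected on the next cycle. Below is a hedged, standalone sketch of that idea: the real code serializes deletions with the protocol's internal `serializeFileChanges()`, whereas this sketch writes a plain newline-separated list, and `VacuumMarkerSketch`/`queueForNextCycle` are illustrative names only.

```scala
import java.nio.charset.StandardCharsets
import org.apache.hadoop.fs.{FileSystem, Path}

object VacuumMarkerSketch {
  // Reproduced from the diff above for illustration.
  private def newTxnId(): String = math.abs(scala.util.Random.nextLong).toString

  // Write a _committed_vacuum<txnId> marker that records the stale markers as removed,
  // including the vacuum marker itself, so the next vacuum cycle can GC all of them.
  def queueForNextCycle(fs: FileSystem, dir: Path, staleMarkers: List[Path]): Path = {
    val vacuumMarker = new Path(dir, "_committed_vacuum" + newTxnId())
    val removed = vacuumMarker :: staleMarkers // the marker is self-deleting
    val out = fs.create(vacuumMarker)
    try {
      out.write(removed.map(_.getName).mkString("\n").getBytes(StandardCharsets.UTF_8))
    } finally {
      out.close()
    }
    vacuumMarker
  }
}
```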
sql/core/src/main/scala/com/databricks/sql/transaction/DatabricksAtomicReadProtocol.scala

Lines changed: 12 additions & 5 deletions
@@ -8,7 +8,7 @@
 
 package org.apache.spark.sql.transaction
 
-import java.io.{File, InputStream, InputStreamReader, IOException, OutputStream}
+import java.io.{File, FileNotFoundException, InputStream, InputStreamReader, IOException, OutputStream}
 import java.nio.charset.StandardCharsets
 
 import scala.collection.mutable
@@ -60,10 +60,10 @@ object DatabricksAtomicReadProtocol extends Logging {
       val name = f.getPath.getName
       name match {
         case _ if state.getDeletionTime(name) > 0 =>
-          logInfo(s"Ignoring ${f.getPath} since it is marked as deleted.")
+          logDebug(s"Ignoring ${f.getPath} since it is marked as deleted.")
           false
         case FILE_WITH_TXN_ID(txnId) if !state.isFileCommitted(txnId, name) =>
-          logInfo(s"Ignoring ${f.getPath} since it is not marked as committed.")
+          logDebug(s"Ignoring ${f.getPath} since it is not marked as committed.")
           false
         case _ =>
           true
@@ -155,7 +155,7 @@ object DatabricksAtomicReadProtocol extends Logging {
 
     if ((state.missingMarkers.nonEmpty || state.missingDataFiles.nonEmpty) &&
         state.lastModified > clock.getTimeMillis - horizonMillis) {
-      logDebug("Repeating list request since some files are suspected to be missing.")
+      logInfo("Repeating list request since some files are suspected to be missing.")
       val newlyCommitted = mutable.Set[TxnId]()
       val extraStatuses = fs.listStatus(dir).filter { f =>
         f.isFile && (f.getPath.getName match {
@@ -199,6 +199,9 @@ object DatabricksAtomicReadProtocol extends Logging {
         (state, initialFiles)
       }
     } else {
+      logDebug("List request was not repeated since " + state.missingMarkers.nonEmpty + " " +
+        state.missingDataFiles.nonEmpty + " " + state.lastModified + " " + clock.getTimeMillis +
+        " " + horizonMillis)
       (state, initialFiles)
     }
   }
@@ -232,12 +235,16 @@ object DatabricksAtomicReadProtocol extends Logging {
         }
         commitMarkers(txnId) = filesAdded.toSet
       } catch {
+        case e: FileNotFoundException =>
+          logWarning("Job commit marker disappeared before we could read it: " + stat)
+          corruptCommitMarkers.add(txnId)
+
         case NonFatal(e) =>
           // we use SparkEnv for this escape-hatch flag since this may be called on executors
           if (SparkEnv.get.conf.getBoolean(
               "spark.databricks.sql.ignoreCorruptCommitMarkers", false)) {
             logWarning("Failed to read job commit marker: " + stat, e)
-            corruptCommitMarkers += txnId
+            corruptCommitMarkers.add(txnId)
          } else {
            throw new IOException("Failed to read job commit marker: " + stat, e)
          }

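The `spark.databricks.sql.ignoreCorruptCommitMarkers` escape hatch above is read from the Spark conf via `SparkEnv`, so it applies on executors as well; a hedged sketch of setting it when building the session (app name is illustrative):

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: with the flag left at its default of false, an unreadable commit marker
// raises an IOException; setting it to true downgrades that failure to a warning.
val spark = SparkSession.builder()
  .appName("read-protocol-demo")
  .config("spark.databricks.sql.ignoreCorruptCommitMarkers", "true")
  .getOrCreate()
```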
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala

Lines changed: 14 additions & 11 deletions
@@ -75,11 +75,20 @@ case class InsertIntoHadoopFsRelationCommand(
     val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
 
     val pathExists = fs.exists(qualifiedOutputPath)
+    // If we are appending data to an existing dir.
+    val isAppend = pathExists && (mode == SaveMode.Append)
+
+    val committer = FileCommitProtocol.instantiate(
+      sparkSession.sessionState.conf.fileCommitProtocolClass,
+      jobId = java.util.UUID.randomUUID().toString,
+      outputPath = outputPath.toString,
+      isAppend = isAppend)
+
     val doInsertion = (mode, pathExists) match {
       case (SaveMode.ErrorIfExists, true) =>
         throw new AnalysisException(s"path $qualifiedOutputPath already exists.")
       case (SaveMode.Overwrite, true) =>
-        deleteMatchingPartitions(fs, qualifiedOutputPath)
+        deleteMatchingPartitions(fs, qualifiedOutputPath, committer)
         true
       case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
         true
@@ -88,15 +97,8 @@
       case (s, exists) =>
         throw new IllegalStateException(s"unsupported save mode $s ($exists)")
     }
-    // If we are appending data to an existing dir.
-    val isAppend = pathExists && (mode == SaveMode.Append)
 
     if (doInsertion) {
-      val committer = FileCommitProtocol.instantiate(
-        sparkSession.sessionState.conf.fileCommitProtocolClass,
-        jobId = java.util.UUID.randomUUID().toString,
-        outputPath = outputPath.toString,
-        isAppend = isAppend)
 
       FileFormatWriter.write(
         sparkSession = sparkSession,
@@ -121,7 +123,8 @@
    * Deletes all partition files that match the specified static prefix. Partitions with custom
    * locations are also cleared based on the custom locations map given to this class.
    */
-  private def deleteMatchingPartitions(fs: FileSystem, qualifiedOutputPath: Path): Unit = {
+  private def deleteMatchingPartitions(
+      fs: FileSystem, qualifiedOutputPath: Path, committer: FileCommitProtocol): Unit = {
     val staticPartitionPrefix = if (staticPartitionKeys.nonEmpty) {
       "/" + partitionColumns.flatMap { p =>
         staticPartitionKeys.get(p.name) match {
@@ -136,7 +139,7 @@
     }
     // first clear the path determined by the static partition keys (e.g. /table/foo=1)
     val staticPrefixPath = qualifiedOutputPath.suffix(staticPartitionPrefix)
-    if (fs.exists(staticPrefixPath) && !fs.delete(staticPrefixPath, true /* recursively */)) {
+    if (fs.exists(staticPrefixPath) && !committer.deleteWithJob(fs, staticPrefixPath, true)) {
       throw new IOException(s"Unable to clear output " +
         s"directory $staticPrefixPath prior to writing to it")
     }
@@ -146,7 +149,7 @@
       (staticPartitionKeys.toSet -- spec).isEmpty,
       "Custom partition location did not match static partitioning keys")
     val path = new Path(customLoc)
-    if (fs.exists(path) && !fs.delete(path, true)) {
+    if (fs.exists(path) && !committer.deleteWithJob(fs, path, true)) {
       throw new IOException(s"Unable to clear partition " +
         s"directory $path prior to writing to it")
     }

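To see the rewritten overwrite path end to end, a hedged usage sketch: overwriting an existing partitioned output directory is what routes through `deleteMatchingPartitions` and, with this patch, through `committer.deleteWithJob()` rather than `fs.delete()`. The column names and output path are illustrative:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().appName("overwrite-demo").getOrCreate()
import spark.implicits._

// Illustrative data; "part" is a hypothetical partition column.
val df = Seq((1, "a"), (2, "b")).toDF("id", "part")

df.write
  .mode(SaveMode.Overwrite)       // clears existing data under the static prefix
  .partitionBy("part")            // via deleteMatchingPartitions -> deleteWithJob
  .parquet("/tmp/overwrite_demo") // illustrative output path
```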