Skip to content

Commit f71a09d

Browse files
committed
[SPARK-8037] [SQL] Ignores files whose name starts with dot in HadoopFsRelation
Author: Cheng Lian <[email protected]> Closes #6581 from liancheng/spark-8037 and squashes the following commits: d08e97b [Cheng Lian] Ignores files whose name starts with dot in HadoopFsRelation (cherry picked from commit 1bb5d71) Signed-off-by: Cheng Lian <[email protected]>
1 parent 8c3fc3a commit f71a09d

File tree

3 files changed

+26
-6
lines changed

3 files changed

+26
-6
lines changed

sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ private[sql] object PartitioningUtils {
187187
Seq.empty
188188
} else {
189189
assert(distinctPartitionsColNames.size == 1, {
190-
val list = distinctPartitionsColNames.mkString("\t", "\n", "")
190+
val list = distinctPartitionsColNames.mkString("\t", "\n\t", "")
191191
s"Conflicting partition column names detected:\n$list"
192192
})
193193

sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -379,10 +379,10 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
379379
var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
380380

381381
def refresh(): Unit = {
382-
// We don't filter files/directories whose name start with "_" or "." here, as specific data
383-
// sources may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
384-
// But "_temporary" directories are explicitly ignored since failed tasks/jobs may leave
385-
// partial/corrupted data files there.
382+
// We don't filter files/directories whose names start with "_" except "_temporary" here, as
383+
// specific data sources may take advantage of them (e.g. Parquet _metadata and
384+
// _common_metadata files). "_temporary" directories are explicitly ignored since failed
385+
// tasks/jobs may leave partial/corrupted data files there.
386386
def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
387387
if (status.getPath.getName.toLowerCase == "_temporary") {
388388
Set.empty
@@ -400,6 +400,9 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
400400
val fs = hdfsPath.getFileSystem(hadoopConf)
401401
val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
402402
Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
403+
}.filterNot { status =>
404+
// SPARK-8037: Ignores files like ".DS_Store" and other hidden files/directories
405+
status.getPath.getName.startsWith(".")
403406
}
404407

405408
val files = statuses.filterNot(_.isDir)

sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ package org.apache.spark.sql.parquet
1818

1919
import java.io.File
2020
import java.math.BigInteger
21-
import java.sql.{Timestamp, Date}
21+
import java.sql.Timestamp
2222

2323
import scala.collection.mutable.ArrayBuffer
2424

25+
import com.google.common.io.Files
2526
import org.apache.hadoop.fs.Path
2627

2728
import org.apache.spark.sql.catalyst.expressions.Literal
@@ -432,4 +433,20 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
432433
checkAnswer(read.load(dir.toString).select(fields: _*), row)
433434
}
434435
}
436+
437+
test("SPARK-8037: Ignores files whose name starts with dot") {
438+
withTempPath { dir =>
439+
val df = (1 to 3).map(i => (i, i, i, i)).toDF("a", "b", "c", "d")
440+
441+
df.write
442+
.format("parquet")
443+
.partitionBy("b", "c", "d")
444+
.save(dir.getCanonicalPath)
445+
446+
Files.touch(new File(s"${dir.getCanonicalPath}/b=1", ".DS_Store"))
447+
Files.createParentDirs(new File(s"${dir.getCanonicalPath}/b=1/c=1/.foo/bar"))
448+
449+
checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df)
450+
}
451+
}
435452
}

0 commit comments

Comments
 (0)