@@ -23,6 +23,7 @@ import org.apache.spark.sql.{Column, DataFrame, DataFrameWriter, QueryTest, SQLC
23
23
import org .apache .spark .sql .catalyst .expressions .UnsafeProjection
24
24
import org .apache .spark .sql .catalyst .plans .physical .HashPartitioning
25
25
import org .apache .spark .sql .execution .Exchange
26
+ import org .apache .spark .sql .execution .datasources .BucketSpec
26
27
import org .apache .spark .sql .execution .joins .SortMergeJoin
27
28
import org .apache .spark .sql .functions ._
28
29
import org .apache .spark .sql .hive .test .TestHiveSingleton
@@ -61,15 +62,30 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet
61
62
private val df1 = (0 until 50 ).map(i => (i % 5 , i % 13 , i.toString)).toDF(" i" , " j" , " k" ).as(" df1" )
62
63
private val df2 = (0 until 50 ).map(i => (i % 7 , i % 11 , i.toString)).toDF(" i" , " j" , " k" ).as(" df2" )
63
64
65
+ /**
66
+ * A helper method to test the bucket read functionality using join. It will save `df1` and `df2`
67
+ * to hive tables, bucketed or not, according to the given bucket specs. Next we will join
68
+ * these 2 tables, first make sure the answer is correct, and then check if the shuffle
69
+ * exists on each side as expected, according to `shuffleLeft` and `shuffleRight`.
70
+ */
64
71
private def testBucketing (
65
- bucketing1 : DataFrameWriter => DataFrameWriter ,
66
- bucketing2 : DataFrameWriter => DataFrameWriter ,
72
+ bucketSpecLeft : Option [ BucketSpec ] ,
73
+ bucketSpecRight : Option [ BucketSpec ] ,
67
74
joinColumns : Seq [String ],
68
75
shuffleLeft : Boolean ,
69
76
shuffleRight : Boolean ): Unit = {
70
77
withTable(" bucketed_table1" , " bucketed_table2" ) {
71
- bucketing1(df1.write.format(" parquet" )).saveAsTable(" bucketed_table1" )
72
- bucketing2(df2.write.format(" parquet" )).saveAsTable(" bucketed_table2" )
78
+ def withBucket (writer : DataFrameWriter , bucketSpec : Option [BucketSpec ]): DataFrameWriter = {
79
+ bucketSpec.map { spec =>
80
+ writer.bucketBy(
81
+ spec.numBuckets,
82
+ spec.bucketColumnNames.head,
83
+ spec.bucketColumnNames.tail: _* )
84
+ }.getOrElse(writer)
85
+ }
86
+
87
+ withBucket(df1.write.format(" parquet" ), bucketSpecLeft).saveAsTable(" bucketed_table1" )
88
+ withBucket(df2.write.format(" parquet" ), bucketSpecRight).saveAsTable(" bucketed_table2" )
73
89
74
90
withSQLConf(SQLConf .AUTO_BROADCASTJOIN_THRESHOLD .key -> " 0" ) {
75
91
val t1 = hiveContext.table(" bucketed_table1" )
@@ -95,42 +111,42 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet
95
111
}
96
112
97
113
test(" avoid shuffle when join 2 bucketed tables" ) {
98
- val bucketing = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" , " j" )
99
- testBucketing(bucketing, bucketing , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = false )
114
+ val bucketSpec = Some ( BucketSpec (8 , Seq ( " i" , " j" ), Nil ) )
115
+ testBucketing(bucketSpec, bucketSpec , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = false )
100
116
}
101
117
102
118
// Enable it after fixing https://issues.apache.org/jira/browse/SPARK-12704
103
119
ignore(" avoid shuffle when join keys are a super-set of bucket keys" ) {
104
- val bucketing = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" )
105
- testBucketing(bucketing, bucketing , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = false )
120
+ val bucketSpec = Some ( BucketSpec (8 , Seq ( " i" ), Nil ) )
121
+ testBucketing(bucketSpec, bucketSpec , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = false )
106
122
}
107
123
108
124
test(" only shuffle one side when join bucketed table and non-bucketed table" ) {
109
- val bucketing = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" , " j" )
110
- testBucketing(bucketing, identity , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = true )
125
+ val bucketSpec = Some ( BucketSpec (8 , Seq ( " i" , " j" ), Nil ) )
126
+ testBucketing(bucketSpec, None , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = true )
111
127
}
112
128
113
129
test(" only shuffle one side when 2 bucketed tables have different bucket number" ) {
114
- val bucketing1 = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" , " j" )
115
- val bucketing2 = ( writer : DataFrameWriter ) => writer.bucketBy (5 , " i" , " j" )
116
- testBucketing(bucketing1, bucketing2 , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = true )
130
+ val bucketSpec1 = Some ( BucketSpec (8 , Seq ( " i" , " j" ), Nil ) )
131
+ val bucketSpec2 = Some ( BucketSpec (5 , Seq ( " i" , " j" ), Nil ) )
132
+ testBucketing(bucketSpec1, bucketSpec2 , Seq (" i" , " j" ), shuffleLeft = false , shuffleRight = true )
117
133
}
118
134
119
135
test(" only shuffle one side when 2 bucketed tables have different bucket keys" ) {
120
- val bucketing1 = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" )
121
- val bucketing2 = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " j" )
122
- testBucketing(bucketing1, bucketing2 , Seq (" i" ), shuffleLeft = false , shuffleRight = true )
136
+ val bucketSpec1 = Some ( BucketSpec (8 , Seq ( " i" ), Nil ) )
137
+ val bucketSpec2 = Some ( BucketSpec (8 , Seq ( " j" ), Nil ) )
138
+ testBucketing(bucketSpec1, bucketSpec2 , Seq (" i" ), shuffleLeft = false , shuffleRight = true )
123
139
}
124
140
125
141
test(" shuffle when join keys are not equal to bucket keys" ) {
126
- val bucketing = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" )
127
- testBucketing(bucketing, bucketing , Seq (" j" ), shuffleLeft = true , shuffleRight = true )
142
+ val bucketSpec = Some ( BucketSpec (8 , Seq ( " i" ), Nil ) )
143
+ testBucketing(bucketSpec, bucketSpec , Seq (" j" ), shuffleLeft = true , shuffleRight = true )
128
144
}
129
145
130
146
test(" shuffle when join 2 bucketed tables with bucketing disabled" ) {
131
- val bucketing = ( writer : DataFrameWriter ) => writer.bucketBy (8 , " i" , " j" )
147
+ val bucketSpec = Some ( BucketSpec (8 , Seq ( " i" , " j" ), Nil ) )
132
148
withSQLConf(SQLConf .BUCKETING_ENABLED .key -> " false" ) {
133
- testBucketing(bucketing, bucketing , Seq (" i" , " j" ), shuffleLeft = true , shuffleRight = true )
149
+ testBucketing(bucketSpec, bucketSpec , Seq (" i" , " j" ), shuffleLeft = true , shuffleRight = true )
134
150
}
135
151
}
136
152
0 commit comments