@@ -22,7 +22,8 @@ import org.scalatest.BeforeAndAfterAll
22
22
import scala .reflect .ClassTag
23
23
24
24
import org .apache .spark .sql .{SQLConf , QueryTest }
25
- import org .apache .spark .sql .execution .joins .{BroadcastHashJoin , ShuffledHashJoin }
25
+ import org .apache .spark .sql .catalyst .plans .logical .NativeCommand
26
+ import org .apache .spark .sql .execution .joins ._
26
27
import org .apache .spark .sql .hive .test .TestHive
27
28
import org .apache .spark .sql .hive .test .TestHive ._
28
29
import org .apache .spark .sql .hive .execution ._
@@ -193,4 +194,70 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
193
194
)
194
195
}
195
196
197
test("auto converts to broadcast left semi join, by size estimate of a relation") {
  /**
   * Plans `query` twice and checks which left-semi-join operator is chosen.
   *
   * First, with the current broadcast threshold, the physical plan must contain exactly one
   * `BroadcastLeftSemiJoinHash` and the query must return `expectedAnswer`. Then the threshold
   * is set to -1 and the plan must contain no `BroadcastLeftSemiJoinHash` and exactly one
   * `LeftSemiJoinHash`.
   *
   * @param before         set-up hook, invoked before anything else
   * @param after          tear-down hook, invoked even when an assertion fails
   * @param query          SQL text of the left semi join under test
   * @param expectedAnswer rows the query is expected to return
   * @param ct             class of the logical relations whose size estimate is inspected
   */
  def mkTest(
      before: () => Unit,
      after: () => Unit,
      query: String,
      expectedAnswer: Seq[Any],
      ct: ClassTag[_]): Unit = {
    before()
    try {
      val rdd = sql(query)

      // Both relations of the requested class must be estimated below the broadcast threshold.
      val sizes = rdd.queryExecution.analyzed.collect {
        case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.statistics.sizeInBytes
      }
      assert(sizes.size === 2 && sizes.forall(_ <= autoBroadcastJoinThreshold),
        "query should contain two relations, each of which has size smaller than autoConvertSize")

      // Using `sparkPlan` because for relevant patterns in HashJoin to be
      // matched, other strategies need to be applied.
      val broadcastJoins = rdd.queryExecution.sparkPlan.collect {
        case j: BroadcastLeftSemiJoinHash => j
      }
      assert(broadcastJoins.size === 1,
        s"actual query plans do not contain broadcast join: ${rdd.queryExecution}")

      checkAnswer(rdd, expectedAnswer) // check correctness of output

      TestHive.settings.synchronized {
        val tmp = autoBroadcastJoinThreshold

        sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1""")
        try {
          // Re-plan the same query now that the threshold has been changed.
          val fallbackPlan = sql(query).queryExecution.sparkPlan

          val stillBroadcast = fallbackPlan.collect {
            case j: BroadcastLeftSemiJoinHash => j
          }
          assert(stillBroadcast.isEmpty,
            "BroadcastLeftSemiJoinHash still planned even though it is switched off")

          val shuffledJoins = fallbackPlan.collect {
            case j: LeftSemiJoinHash => j
          }
          assert(shuffledJoins.size === 1,
            "LeftSemiJoinHash should be planned when BroadcastLeftSemiJoinHash is turned off")
        } finally {
          // Always put the original threshold back so that a failed assertion above
          // does not leave the shared TestHive configuration modified for later tests.
          sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=$tmp""")
        }
      }
    } finally {
      after()
    }
  }

  /** Tests for MetastoreRelation */
  val leftSemiJoinQuery =
    """SELECT * FROM src a
      |left semi JOIN src b ON a.key=86 and a.key = b.key""".stripMargin
  val answer = (86, "val_86") :: Nil

  mkTest(
    () => (),
    () => (),
    leftSemiJoinQuery,
    answer,
    implicitly[ClassTag[MetastoreRelation]]
  )
}
196
263
}
0 commit comments