@@ -85,17 +85,28 @@ case class Exchange(
85
85
keySchema : Array [DataType ],
86
86
valueSchema : Array [DataType ],
87
87
numPartitions : Int ): Serializer = {
88
+ // In ExternalSorter's spillToMergeableFile function, key-value pairs are written out
89
+ // through write(key) and then write(value) instead of write((key, value)). Because
90
+ // SparkSqlSerializer2 assumes that objects passed in are Product2, we cannot safely use
91
+ // it when spillToMergeableFile in ExternalSorter will be used.
92
+ // So, we will not use SparkSqlSerializer2 when
93
+ // - Sort-based shuffle is enabled and the number of reducers (numPartitions) is greater
94
+ // than the bypassMergeThreshold; or
95
+ // - newOrdering is defined.
96
+ val cannotUseSqlSerializer2 =
97
+ (sortBasedShuffleOn && numPartitions > bypassMergeThreshold) || newOrdering.nonEmpty
98
+
88
99
val useSqlSerializer2 =
89
- ! (sortBasedShuffleOn && numPartitions > bypassMergeThreshold) &&
90
- child.sqlContext.conf.useSqlSerializer2 &&
91
- SparkSqlSerializer2 .support(keySchema) &&
92
- SparkSqlSerializer2 .support(valueSchema)
100
+ child.sqlContext.conf.useSqlSerializer2 && // SparkSqlSerializer2 is enabled.
101
+ ! cannotUseSqlSerializer2 && // Safe to use Serializer2.
102
+ SparkSqlSerializer2 .support(keySchema) && // The schema of key is supported.
103
+ SparkSqlSerializer2 .support(valueSchema) // The schema of value is supported.
93
104
94
105
val serializer = if (useSqlSerializer2) {
95
- logInfo(" Use SparkSqlSerializer2." )
106
+ logInfo(" Using SparkSqlSerializer2." )
96
107
new SparkSqlSerializer2 (keySchema, valueSchema)
97
108
} else {
98
- logInfo(" Use SparkSqlSerializer." )
109
+ logInfo(" Using SparkSqlSerializer." )
99
110
new SparkSqlSerializer (sparkConf)
100
111
}
101
112
@@ -160,7 +171,7 @@ case class Exchange(
160
171
} else {
161
172
new ShuffledRDD [Row , Null , Null ](rdd, part)
162
173
}
163
- val keySchema = sortingExpressions .map(_.dataType).toArray
174
+ val keySchema = child.output .map(_.dataType).toArray
164
175
shuffled.setSerializer(serializer(keySchema, null , numPartitions))
165
176
166
177
shuffled.map(_._1)
0 commit comments