apache · maropu · Mar 9, 2021 · Jul 16, 2021 · maropu · May 17, 2021
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java
@@ -30,6 +30,8 @@
 
 import org.apache.spark.sql.catalyst.util.MapData;
 import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.hash.Murmur3_x86_32;
 
 import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
 
@@ -112,6 +114,22 @@ public UnsafeArrayData valueArray() {
     return values;
   }
 
+  @Override
+  public int hashCode() { // Murmur3 hash of this map's backing bytes, with a fixed seed of 42.
+    return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    // Equal only to another UnsafeMapData of the same size with byte-identical backing memory.
+    if (!(other instanceof UnsafeMapData)) {
+      return false;
+    }
+    UnsafeMapData that = (UnsafeMapData) other;
+    return sizeInBytes == that.sizeInBytes && ByteArrayMethods.arrayEquals(
+      baseObject, baseOffset, that.baseObject, that.baseOffset, sizeInBytes);
+  }
+
   public void writeToMemory(Object target, long targetOffset) {
     Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes);
   }

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -61,14 +61,6 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog {
     dt.existsRecursively(_.isInstanceOf[MapType])
   }
 
-  protected def mapColumnInSetOperation(plan: LogicalPlan): Option[Attribute] = plan match {
-    case _: Intersect | _: Except | _: Distinct =>
-      plan.output.find(a => hasMapType(a.dataType))
-    case d: Deduplicate =>
-      d.keys.find(a => hasMapType(a.dataType))
-    case _ => None
-  }
-
   private def checkLimitLikeClause(name: String, limitExpr: Expression): Unit = {
     limitExpr match {
       case e if !e.foldable => failAnalysis(
@@ -588,14 +580,6 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog {
                  |Conflicting attributes: ${conflictingAttributes.mkString(",")}
                """.stripMargin)
 
-          // TODO: although map type is not orderable, technically map type should be able to be
-          // used in equality comparison, remove this type check once we support it.
-          case o if mapColumnInSetOperation(o).isDefined =>
-            val mapCol = mapColumnInSetOperation(o).get
-            failAnalysis("Cannot have map type columns in DataFrame which calls " +
-              s"set operations(intersect, except, etc.), but the type of column ${mapCol.name} " +
-              "is " + mapCol.dataType.catalogString)
-
           case o if o.expressions.exists(!_.deterministic) &&
             !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] &&
             !o.isInstanceOf[Aggregate] && !o.isInstanceOf[Window] =>

diff --git a/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -625,6 +625,7 @@ class CodegenContext extends Logging {
     case dt: DataType if dt.isInstanceOf[AtomicType] => s"$c1.equals($c2)"
     case array: ArrayType => genComp(array, c1, c2) + " == 0"
     case struct: StructType => genComp(struct, c1, c2) + " == 0"
+    case map: MapType => genComp(map, c1, c2) + " == 0"
     case udt: UserDefinedType[_] => genEqual(udt.sqlType, c1, c2)
     case NullType => "false"
     case _ =>
@@ -700,6 +701,32 @@ class CodegenContext extends Logging {
           }
         """
       s"${addNewFunction(compareFunc, funcCode)}($c1, $c2)"
+
+    // Maps are compared positionally: first the key arrays and then, only if all keys are
+    // equal, the value arrays. Two maps holding the same entries in a different order therefore
+    // compare as different, so callers are expected to normalize the entry order beforehand
+    // (see `NormalizeMaps` and `SortMapKeys`).
+    case MapType(keyType, valueType, _) =>
+      val keyArrayType = ArrayType(keyType)
+      val valueArrayType = ArrayType(valueType)
+      val compareFunc = freshName("compareMap")
+      val funcCode: String =
+        s"""
+          public int $compareFunc(MapData a, MapData b) {
+            ArrayData aKeys = a.keyArray();
+            ArrayData bKeys = b.keyArray();
+            int keyComp = ${genComp(keyArrayType, "aKeys", "bKeys")};
+            if (keyComp != 0) {
+              return keyComp;
+            }
+
+            ArrayData aValues = a.valueArray();
+            ArrayData bValues = b.valueArray();
+            return ${genComp(valueArrayType, "aValues", "bValues")};
+          }
+        """
+      s"${addNewFunction(compareFunc, funcCode)}($c1, $c2)"
+
     case schema: StructType =>
       val comparisons = GenerateOrdering.genComparisons(this, schema)
       val compareFunc = freshName("compareStruct")

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala
@@ -65,6 +65,10 @@ class InterpretedOrdering(ordering: Seq[SortOrder]) extends BaseOrdering {
             a.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
           case a: ArrayType if order.direction == Descending =>
             - a.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
+          case a: MapType if order.direction == Ascending =>
+            a.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
+          case a: MapType if order.direction == Descending =>
+            - a.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
           case s: StructType if order.direction == Ascending =>
             s.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
           case s: StructType if order.direction == Descending =>
@@ -104,6 +108,7 @@ object RowOrdering extends CodeGeneratorWithInterpretedFallback[Seq[SortOrder],
     case dt: AtomicType => true
     case struct: StructType => struct.fields.forall(f => isOrderable(f.dataType))
     case array: ArrayType => isOrderable(array.elementType)
+    case map: MapType => isOrderable(map.keyType) && isOrderable(map.valueType)
     case udt: UserDefinedType[_] => isOrderable(udt.sqlType)
     case _ => false
   }

diff --git a/...yst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala b/...yst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.{Alias, And, ArrayTransform, CaseWhen, Coalesce, CreateArray, CreateMap, CreateNamedStruct, EqualTo, ExpectsInputTypes, Expression, GetStructField, If, IsNull, KnownFloatingPointNormalized, LambdaFunction, Literal, NamedLambdaVariable, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.{Alias, And, ArrayTransform, CaseWhen, Coalesce, CreateArray, CreateMap, CreateNamedStruct, EqualTo, ExpectsInputTypes, Expression, GetStructField, If, IsNull, KnownFloatingPointNormalized, LambdaFunction, Literal, NamedLambdaVariable, TransformKeys, TransformValues, UnaryExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
 import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
@@ -96,9 +96,7 @@ object NormalizeFloatingNumbers extends Rule[LogicalPlan] {
     case FloatType | DoubleType => true
     case StructType(fields) => fields.exists(f => needNormalize(f.dataType))
     case ArrayType(et, _) => needNormalize(et)
-    // Currently MapType is not comparable and analyzer should fail earlier if this case happens.
-    case _: MapType =>
-      throw new IllegalStateException("grouping/join/window partition keys cannot be map type.")
+    case MapType(kt, vt, _) => needNormalize(kt) || needNormalize(vt)
     case _ => false
   }
 
@@ -142,6 +140,26 @@ object NormalizeFloatingNumbers extends Rule[LogicalPlan] {
       val function = normalize(lv)
       KnownFloatingPointNormalized(ArrayTransform(expr, LambdaFunction(function, Seq(lv))))
 
+    // Keys are normalized through `TransformKeys` and values through `TransformValues`, each only
+    // when its type needs it. `lambdaVars` is a `def` so that each lambda gets fresh variables.
+    case _ if expr.dataType.isInstanceOf[MapType] =>
+      val MapType(kt, vt, valueContainsNull) = expr.dataType
+      def lambdaVars = (NamedLambdaVariable("arg1", kt, nullable = false),
+        NamedLambdaVariable("arg2", vt, valueContainsNull))
+      val keysNormalized = if (needNormalize(kt)) {
+        val (k, v) = lambdaVars
+        TransformKeys(expr, LambdaFunction(normalize(k), Seq(k, v)))
+      } else {
+        expr
+      }
+      val keysAndValuesNormalized = if (needNormalize(vt)) {
+        val (k, v) = lambdaVars
+        TransformValues(keysNormalized, LambdaFunction(normalize(v), Seq(k, v)))
+      } else {
+        keysNormalized
+      }
+      KnownFloatingPointNormalized(keysAndValuesNormalized)
+
     case _ => throw new IllegalStateException(s"fail to normalize $expr")
   }
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMaps.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMaps.scala
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator._
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, GenericArrayData, MapData, TypeUtils}
+import org.apache.spark.sql.types._
+
+/**
+ * When comparing two maps, we have to make sure that two maps having the same key-value pairs
+ * but a different key ordering are equal (e.g., Map('a' -> 1, 'b' -> 2) should be equal to
+ * Map('b' -> 2, 'a' -> 1)). To make sure this assumption holds,
+ * this rule inserts a [[SortMapKeys]] expression to sort map entries by keys.
+ *
+ * NOTE: this rule must be executed at the end of the optimizer because it may create
+ * new joins (the subquery rewrite) and new join conditions (the join reorder).
+ */
+object NormalizeMaps extends Rule[LogicalPlan] {
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions {
+    // The analyzer guarantees both operands share a type, so checking the left one suffices.
+    case cmp @ BinaryComparison(lhs, rhs) if needNormalize(lhs) =>
+      cmp.withNewChildren(Seq(SortMapKeys(lhs), SortMapKeys(rhs)))
+
+    case In(value, candidates) if needNormalize(value) =>
+      In(SortMapKeys(value), candidates.map(SortMapKeys))
+
+    // The members of an `InSet` are plain values, so each one is sorted eagerly via `eval()`.
+    case inSet @ InSet(value, members) if needNormalize(value) =>
+      val memberType = inSet.child.dataType
+      InSet(SortMapKeys(value), members.map(m => SortMapKeys(Literal(m, memberType)).eval()))
+
+    case order: SortOrder if needNormalize(order.child) =>
+      order.copy(child = SortMapKeys(order.child))
+  }.transform {
+    case window: Window if window.partitionSpec.exists(spec => needNormalize(spec)) =>
+      window.copy(partitionSpec = window.partitionSpec.map(normalize))
+
+    // TODO: `NormalizeMaps` has the same restriction as `NormalizeFloatingNumbers`;
+    // ideally Aggregate should also be handled here, but its grouping expressions are
+    // mixed in its aggregate expressions. It's unreliable to change the grouping expressions
+    // here. For now we normalize grouping expressions in `AggUtils` during planning.
+  }
+
+  // An expression that is already wrapped in `SortMapKeys` never needs another wrapper.
+  private def needNormalize(expr: Expression): Boolean = expr match {
+    case _: SortMapKeys => false
+    case other => needNormalize(other.dataType)
+  }
+
+  private def needNormalize(dt: DataType): Boolean = dt match {
+    case _: MapType => true
+    case ArrayType(elementType, _) => needNormalize(elementType)
+    case StructType(fields) => fields.exists(field => needNormalize(field.dataType))
+    case _ => false
+  }
+
+  // Wraps `expr` in `SortMapKeys` when needed and returns it unchanged otherwise.
+  private[sql] def normalize(expr: Expression): Expression =
+    if (needNormalize(expr)) SortMapKeys(expr) else expr
+}
+
+/**
+ * This expression sorts all maps in an expression's result. This expression enables the use of
+ * maps in comparisons and equality operations.
+ */
+case class SortMapKeys(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  /** Accepts array, map and struct inputs only. */
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(ArrayType, MapType, StructType))
+
+  /** The result has the same type as the child. */
+  override def dataType: DataType = child.dataType
+
+  /** Returns a copy of this expression with `newChild` as its child. */
+  override protected def withNewChildInternal(newChild: Expression): Expression =
+    copy(child = newChild)
+
+  /**
+   * Builds a function that rebuilds a value of type `dt` so that the entries of every map
+   * nested in it are sorted by key. Map values, array elements and struct fields are visited
+   * recursively; values of any other type are returned unchanged.
+   *
+   * Maps, arrays and structs are always rebuilt as new `ArrayBasedMapData`, `GenericArrayData`
+   * and `GenericInternalRow` instances; the function only reads from its input.
+   *
+   * @param dt the type of the values the returned function will be applied to
+   * @return a function from a value of type `dt` to its normalized form
+   */
+  private def createFuncToSortRecursively(dt: DataType): Any => Any = dt match {
+    // Map: recursively normalize the values, then order the entries by key.
+    case MapType(keyType, valueType, _) =>
+      val sortValue = createFuncToSortRecursively(valueType)
+      val entryOrdering = new Ordering[(Any, Any)] {
+        private val keyOrdering = TypeUtils.getInterpretedOrdering(keyType)
+        override def compare(x: (Any, Any), y: (Any, Any)): Int =
+          keyOrdering.compare(x._1, y._1)
+      }
+      (data: Any) => {
+        val map = data.asInstanceOf[MapData]
+        val numEntries = map.numElements()
+        val keys = map.keyArray()
+        val values = map.valueArray()
+        // Map keys cannot contain map types (see `TypeUtils.checkForMapKeyType`), so only
+        // the values are sorted recursively. Null values are kept as null.
+        val entries = Array.tabulate[(Any, Any)](numEntries) { i =>
+          val key = keys.get(i, keyType)
+          val value = if (values.isNullAt(i)) null else sortValue(values.get(i, valueType))
+          key -> value
+        }
+
+        java.util.Arrays.sort(entries, entryOrdering)
+
+        ArrayBasedMapData(entries.iterator, numEntries, identity, identity)
+      }
+
+    // Array: element order is preserved; only the elements themselves are normalized.
+    case ArrayType(elementType, _) =>
+      val sortElement = createFuncToSortRecursively(elementType)
+      (data: Any) => {
+        val array = data.asInstanceOf[ArrayData]
+        val sorted = Array.tabulate[Any](array.numElements()) { i =>
+          if (array.isNullAt(i)) null else sortElement(array.get(i, elementType))
+        }
+        new GenericArrayData(sorted)
+      }
+
+    // Struct: every non-null field is normalized with the function built for its type.
+    case StructType(fields) =>
+      val numFields = fields.length
+      val fieldTypes = fields.map(_.dataType)
+      val sortField = fieldTypes.map(fieldType => createFuncToSortRecursively(fieldType))
+      (data: Any) => {
+        val row = data.asInstanceOf[InternalRow]
+        val sorted = Array.tabulate[Any](numFields) { i =>
+          if (row.isNullAt(i)) null else sortField(i)(row.get(i, fieldTypes(i)))
+        }
+        new GenericInternalRow(sorted)
+      }
+
+    // Other types are not traversed; their values are returned as-is.
+    case _ =>
+      identity
+  }
+
+  /**
+   * The sort function for this expression's data type. It is built lazily and marked
+   * `@transient`, so it is recreated on first use rather than serialized with the expression.
+   */
+  @transient private[this] lazy val sortFunc: Any => Any =
+    createFuncToSortRecursively(dataType)
+
+  /** Interpreted evaluation: applies the sort function to `input`. */
+  override def nullSafeEval(input: Any): Any = sortFunc(input)
+
+  /**
+   * Generates code that calls the interpreted sort function, which is registered as a
+   * reference object, and casts the result to the Java type of `dataType`.
+   */
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    // TODO: we should code generate this
+    // Until then, the function is passed in as a reference and invoked through `apply`.
+    val tf = ctx.addReferenceObj("sortFunc", sortFunc, classOf[Any => Any].getCanonicalName)
+    nullSafeCodeGen(ctx, ev, eval => {
+      s"${ev.value} = (${javaType(dataType)})$tf.apply($eval);"
+    })
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -167,6 +167,10 @@ abstract class Optimizer(catalogManager: CatalogManager)
       RemoveNoopUnion) ::
     Batch("OptimizeLimitZero", Once,
       OptimizeLimitZero) ::
+    // After applying ConvertToLocalRelation, we cannot normalize maps in Filter/Project.
+    // So, we need to apply NormalizeMaps just before ConvertToLocalRelation.
+    Batch("Normalize Maps Before Converting LocalRelation", Once,
+      NormalizeMaps) ::
     // Run this once earlier. This might simplify the plan and reduce cost of optimizer.
     // For example, a query such as Filter(LocalRelation) would go through all the heavy
     // optimizer rules that are triggered when there is a filter
@@ -234,8 +238,9 @@ abstract class Optimizer(catalogManager: CatalogManager)
       ColumnPruning,
       CollapseProject,
       RemoveNoopOperators) :+
-    // This batch must be executed after the `RewriteSubquery` batch, which creates joins.
+    // The batches below must be executed after the `RewriteSubquery` batch, which creates joins.
     Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers) :+
+    Batch("NormalizeMaps", Once, NormalizeMaps) :+
     Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression)
 
     // remove any batches with no rules. this may happen when subclasses do not add optional rules.
@@ -271,7 +276,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
       RewritePredicateSubquery.ruleName ::
       NormalizeFloatingNumbers.ruleName ::
       ReplaceUpdateFieldsExpression.ruleName ::
-      PullOutGroupingExpressions.ruleName :: Nil
+      PullOutGroupingExpressions.ruleName ::
+      NormalizeMaps.ruleName :: Nil
 
   /**
    * Optimize all the subqueries inside expression.