[SPARK-53311][SQL][PYTHON][CORE] Make PullOutNonDeterministic use canonicalized expressions

benhurdelhey · mzhang · commit 15f7ff0e37e8 · 2025-08-20T18:26:29.000-07:00
### What changes were proposed in this pull request?

Make PullOutNondeterministic use canonicalized expressions to dedup group and aggregate expressions. This affects pyspark udfs in particular. Example:

```
from pyspark.sql.functions import col, avg, udf

pythonUDF = udf(lambda x: x).asNondeterministic()

spark.range(10)\
    .selectExpr("id", "id % 3 as value")\
    .groupBy(pythonUDF(col("value")))\
    .agg(avg("id"), pythonUDF(col("value")))\
    .explain(extended=True)
```

Currently results in a plan like this:

```
Aggregate [_nondeterministic#15], [_nondeterministic#15 AS dummyNondeterministicUDF(value)#12, avg(id#0L) AS avg(id)#13, dummyNondeterministicUDF(value#6L)#8 AS dummyNondeterministicUDF(value)#14]
+- Project [id#0L, value#6L, dummyNondeterministicUDF(value#6L)#7 AS _nondeterministic#15]
   +- Project [id#0L, (id#0L % cast(3 as bigint)) AS value#6L]
      +- Range (0, 10, step=1, splits=Some(2))
```

and then it throws:

```
[MISSING_AGGREGATION] The non-aggregating expression "value" is based on columns which are not participating in the GROUP BY clause. Add the columns or the expression to the GROUP BY, aggregate the expression, or use "any_value(value)" if you do not care which of the values within a group is returned. SQLSTATE: 42803
```

How canonicalization fixes this:
- Nondeterministic PythonUDF expressions always have distinct resultIds per UDF, so the UDF in the grouping expressions (`#7` above) and the same UDF in the aggregate expressions (`#8` above) do not compare equal, and the latter is not replaced by the pulled-out attribute.
- The fix is to canonicalize the expressions when matching. Canonicalizing sets the resultIds to -1, which allows us to dedup the PythonUDF expressions.
- For deterministic UDFs, this rule does not apply, and the "Post Analysis" batch extracts and deduplicates the expressions, as expected.

### Why are the changes needed?

- The output of the query with the fix applied still makes sense.
- The nondeterministic UDF is invoked only once, in the project.

### Does this PR introduce _any_ user-facing change?

Yes, it's additive: it enables queries to run that previously threw errors.

### How was this patch tested?

- Added a unit test.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#52061 from benrobby/adhoc-fix-pull-out-nondeterministic.

Authored-by: Ben Hurdelhey <ben.hurdelhey@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NondeterministicExpressionCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NondeterministicExpressionCollection.scala
@@ -38,7 +38,7 @@ object NondeterministicExpressionCollection {
             case namedExpression: NamedExpression => namedExpression
             case _ => Alias(nondeterministicExpr, "_nondeterministic")()
           }
-          nonDeterministicToAttributes.put(nondeterministicExpr, namedExpression)
+          nonDeterministicToAttributes.put(nondeterministicExpr.canonicalized, namedExpression)
         }
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/PullOutNondeterministic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/PullOutNondeterministic.scala
@@ -42,7 +42,7 @@ object PullOutNondeterministic extends Rule[LogicalPlan] {
         NondeterministicExpressionCollection.getNondeterministicToAttributes(a.groupingExpressions)
       val newChild = Project(a.child.output ++ nondeterToAttr.values.asScala.toSeq, a.child)
       val deterministicAggregate = a.transformExpressions { case e =>
-        Option(nondeterToAttr.get(e)).map(_.toAttribute).getOrElse(e)
+        Option(nondeterToAttr.get(e.canonicalized)).map(_.toAttribute).getOrElse(e)
       }.copy(child = newChild)
 
       deterministicAggregate.groupingExpressions.foreach(expr => if (!expr.deterministic) {
@@ -69,7 +69,7 @@ object PullOutNondeterministic extends Rule[LogicalPlan] {
       val nondeterToAttr =
         NondeterministicExpressionCollection.getNondeterministicToAttributes(p.expressions)
       val newPlan = p.transformExpressions { case e =>
-        Option(nondeterToAttr.get(e)).map(_.toAttribute).getOrElse(e)
+        Option(nondeterToAttr.get(e.canonicalized)).map(_.toAttribute).getOrElse(e)
       }
       val newChild = Project(p.child.output ++ nondeterToAttr.values.asScala.toSeq, p.child)
       Project(p.output, newPlan.withNewChildren(newChild :: Nil))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala
@@ -474,7 +474,8 @@ object IntegratedUDFTestUtils extends SQLHelper {
    *   casted_col.cast(df.schema["col"].dataType)
    * }}}
    */
-  case class TestPythonUDF(name: String, returnType: Option[DataType] = None) extends TestUDF {
+  case class TestPythonUDF(name: String, returnType: Option[DataType] = None,
+      deterministic: Boolean = true) extends TestUDF {
     private[IntegratedUDFTestUtils] lazy val udf = new UserDefinedPythonFunction(
       name = name,
       func = SimplePythonFunction(
@@ -487,7 +488,7 @@ object IntegratedUDFTestUtils extends SQLHelper {
         accumulator = null),
       dataType = StringType,
       pythonEvalType = PythonEvalType.SQL_BATCHED_UDF,
-      udfDeterministic = true) {
+      udfDeterministic = deterministic) {
 
       override def builder(e: Seq[Expression]): Expression = {
         assert(e.length == 1, "Defined UDF only has one column")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.python
 
 import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row}
-import org.apache.spark.sql.functions.{array, col, count, transform}
+import org.apache.spark.sql.functions.{array, avg, col, count, transform}
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.LongType
 
@@ -139,4 +139,21 @@ class PythonUDFSuite extends QueryTest with SharedSparkSession {
       checkAnswer(df, Row(0, 1, 1, 0, 1, 1))
     }
   }
+
+  test("SPARK-53311: Nondeterministic Python UDF pull out in aggregate with grouping") {
+    assume(shouldTestPythonUDFs)
+
+    // nondeterministic UDF
+    val pythonUDF = TestPythonUDF(name = "foo", Some(LongType), deterministic = false)
+
+    // This query should work without throwing an analysis exception
+    // The UDF foo(value) appears in both grouping expressions and aggregate expressions
+    // The fix ensures that both instances are properly mapped to the same attribute
+    val df = spark.range(1)
+      .selectExpr("id", "id % 3 as value")
+      .groupBy(pythonUDF(col("value")))
+      .agg(avg("id"), pythonUDF(col("value")))
+
+    checkAnswer(df, Row(0, 0.0, 0))
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ object NondeterministicExpressionCollection {`
`38`	`38`	`case namedExpression: NamedExpression => namedExpression`
`39`	`39`	`case _ => Alias(nondeterministicExpr, "_nondeterministic")()`
`40`	`40`	`}`
`41`		`- nonDeterministicToAttributes.put(nondeterministicExpr, namedExpression)`
	`41`	`+ nonDeterministicToAttributes.put(nondeterministicExpr.canonicalized, namedExpression)`
`42`	`42`	`}`
`43`	`43`	`}`
`44`	`44`	`}`