Skip to content

Commit 19bcce1

Browse files
committed
[SPARK-28270][SQL][FOLLOW-UP] Explicitly cast into int/long/decimal in udf-aggregates_part1.sql to avoid Python float limitation
## What changes were proposed in this pull request? The tests added at #25069 seem flaky in some environments. See #25069 (comment) Python's string representation of floats can make the tests flaky. See https://docs.python.org/3/tutorial/floatingpoint.html. I think it's just better to explicitly cast everywhere udf returns a float (or a double) to stay safe. (note that we're not targeting the Python <> Scala value conversions - there are inevitable differences between Python and Scala; therefore, other languages' UDFs cannot guarantee the same results between Python and Scala). This PR proposes to cast cases to long, integer and decimal explicitly to make the test cases robust. <details><summary>Diff comparing to 'pgSQL/aggregates_part1.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out index 51ca1d5..734634b7388 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out -3,23 +3,23 -- !query 0 -SELECT avg(four) AS avg_1 FROM onek +SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek -- !query 0 schema -struct<avg_1:double> +struct<avg_1:decimal(10,3)> -- !query 0 output 1.5 -- !query 1 -SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100 +SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100 -- !query 1 schema -struct<avg_32:double> +struct<avg_32:decimal(10,3)> -- !query 1 output -32.666666666666664 +32.667 -- !query 2 -select CAST(avg(b) AS Decimal(10,3)) AS avg_107_943 FROM aggtest +select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest -- !query 2 schema struct<avg_107_943:decimal(10,3)> -- !query 2 output -27,39 +27,39 struct<avg_107_943:decimal(10,3)> -- !query 3 -SELECT sum(four) AS sum_1500 FROM onek +SELECT CAST(sum(udf(four)) AS int) 
AS sum_1500 FROM onek -- !query 3 schema -struct<sum_1500:bigint> +struct<sum_1500:int> -- !query 3 output 1500 -- !query 4 -SELECT sum(a) AS sum_198 FROM aggtest +SELECT udf(sum(a)) AS sum_198 FROM aggtest -- !query 4 schema -struct<sum_198:bigint> +struct<sum_198:string> -- !query 4 output 198 -- !query 5 -SELECT sum(b) AS avg_431_773 FROM aggtest +SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest -- !query 5 schema -struct<avg_431_773:double> +struct<avg_431_773:decimal(10,3)> -- !query 5 output -431.77260909229517 +431.773 -- !query 6 -SELECT max(four) AS max_3 FROM onek +SELECT udf(max(four)) AS max_3 FROM onek -- !query 6 schema -struct<max_3:int> +struct<max_3:string> -- !query 6 output 3 -- !query 7 -SELECT max(a) AS max_100 FROM aggtest +SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest -- !query 7 schema struct<max_100:int> -- !query 7 output -67,245 +67,246 struct<max_100:int> -- !query 8 -SELECT max(aggtest.b) AS max_324_78 FROM aggtest +SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest -- !query 8 schema -struct<max_324_78:float> +struct<max_324_78:decimal(10,3)> -- !query 8 output 324.78 -- !query 9 -SELECT stddev_pop(b) FROM aggtest +SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest -- !query 9 schema -struct<stddev_pop(CAST(b AS DOUBLE)):double> +struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 9 output -131.10703231895047 +131.107 -- !query 10 -SELECT stddev_samp(b) FROM aggtest +SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest -- !query 10 schema -struct<stddev_samp(CAST(b AS DOUBLE)):double> +struct<CAST(udf(stddev_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 10 output -151.38936080399804 +151.389 -- !query 11 -SELECT var_pop(b) FROM aggtest +SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest -- !query 11 schema -struct<var_pop(CAST(b AS DOUBLE)):double> +struct<CAST(var_pop(CAST(udf(b) 
AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 11 output -17189.053923482323 +17189.054 -- !query 12 -SELECT var_samp(b) FROM aggtest +SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest -- !query 12 schema -struct<var_samp(CAST(b AS DOUBLE)):double> +struct<CAST(udf(var_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 12 output -22918.738564643096 +22918.739 -- !query 13 -SELECT stddev_pop(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest -- !query 13 schema -struct<stddev_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<CAST(udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 13 output -131.18117242958306 +131.181 -- !query 14 -SELECT stddev_samp(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest -- !query 14 schema -struct<stddev_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<CAST(stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 14 output -151.47497042966097 +151.475 -- !query 15 -SELECT var_pop(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest -- !query 15 schema -struct<var_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<CAST(udf(var_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 15 output 17208.5 -- !query 16 -SELECT var_samp(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest -- !query 16 schema -struct<var_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<CAST(var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 16 output -22944.666666666668 +22944.667 -- !query 17 -SELECT var_pop(1.0), 
var_samp(2.0) +SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0)) -- !query 17 schema -struct<var_pop(CAST(1.0 AS DOUBLE)):double,var_samp(CAST(2.0 AS DOUBLE)):double> +struct<CAST(udf(var_pop(cast(1.0 as double))) AS INT):int,var_samp(CAST(udf(2.0) AS DOUBLE)):double> -- !query 17 output -0.0 NaN +0 NaN -- !query 18 -SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0))) +SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0))) -- !query 18 schema -struct<stddev_pop(CAST(CAST(3.0 AS DECIMAL(38,0)) AS DOUBLE)):double,stddev_samp(CAST(CAST(4.0 AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<CAST(stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)) AS INT):int,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double> -- !query 18 output -0.0 NaN +0 NaN -- !query 19 -select sum(CAST(null AS int)) from range(1,4) +select sum(udf(CAST(null AS int))) from range(1,4) -- !query 19 schema -struct<sum(CAST(NULL AS INT)):bigint> +struct<sum(CAST(udf(cast(null as int)) AS DOUBLE)):double> -- !query 19 output NULL -- !query 20 -select sum(CAST(null AS long)) from range(1,4) +select sum(udf(CAST(null AS long))) from range(1,4) -- !query 20 schema -struct<sum(CAST(NULL AS BIGINT)):bigint> +struct<sum(CAST(udf(cast(null as bigint)) AS DOUBLE)):double> -- !query 20 output NULL -- !query 21 -select sum(CAST(null AS Decimal(38,0))) from range(1,4) +select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query 21 schema -struct<sum(CAST(NULL AS DECIMAL(38,0))):decimal(38,0)> +struct<sum(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double> -- !query 21 output NULL -- !query 22 -select sum(CAST(null AS DOUBLE)) from range(1,4) +select sum(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query 22 schema -struct<sum(CAST(NULL AS DOUBLE)):double> +struct<sum(CAST(udf(cast(null as double)) AS DOUBLE)):double> -- !query 22 output NULL -- !query 23 -select avg(CAST(null AS 
int)) from range(1,4) +select avg(udf(CAST(null AS int))) from range(1,4) -- !query 23 schema -struct<avg(CAST(NULL AS INT)):double> +struct<avg(CAST(udf(cast(null as int)) AS DOUBLE)):double> -- !query 23 output NULL -- !query 24 -select avg(CAST(null AS long)) from range(1,4) +select avg(udf(CAST(null AS long))) from range(1,4) -- !query 24 schema -struct<avg(CAST(NULL AS BIGINT)):double> +struct<avg(CAST(udf(cast(null as bigint)) AS DOUBLE)):double> -- !query 24 output NULL -- !query 25 -select avg(CAST(null AS Decimal(38,0))) from range(1,4) +select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query 25 schema -struct<avg(CAST(NULL AS DECIMAL(38,0))):decimal(38,4)> +struct<avg(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double> -- !query 25 output NULL -- !query 26 -select avg(CAST(null AS DOUBLE)) from range(1,4) +select avg(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query 26 schema -struct<avg(CAST(NULL AS DOUBLE)):double> +struct<avg(CAST(udf(cast(null as double)) AS DOUBLE)):double> -- !query 26 output NULL -- !query 27 -select sum(CAST('NaN' AS DOUBLE)) from range(1,4) +select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query 27 schema -struct<sum(CAST(NaN AS DOUBLE)):double> +struct<sum(CAST(udf(NaN) AS DOUBLE)):double> -- !query 27 output NaN -- !query 28 -select avg(CAST('NaN' AS DOUBLE)) from range(1,4) +select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query 28 schema -struct<avg(CAST(NaN AS DOUBLE)):double> +struct<avg(CAST(udf(NaN) AS DOUBLE)):double> -- !query 28 output NaN -- !query 30 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('1')) v(x) -- !query 30 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 30 output Infinity NaN -- !query 31 -SELECT avg(CAST(x AS DOUBLE)), 
var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('Infinity')) v(x) -- !query 31 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 31 output Infinity NaN -- !query 32 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('-Infinity'), ('Infinity')) v(x) -- !query 32 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 32 output NaN NaN -- !query 33 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3)) FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) -- !query 33 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS INT):int,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 33 output -1.00000005E8 2.5 +100000005 2.5 -- !query 34 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3)) FROM (VALUES (7000000000005), (7000000000007)) v(x) -- !query 34 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS BIGINT):bigint,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)> -- !query 34 output -7.000000000006E12 1.0 +7000000000006 1 -- !query 35 -SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest +SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest -- !query 35 
schema -struct<covar_pop(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double,covar_samp(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double> +struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS DECIMAL(10,3)):decimal(10,3),CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 35 output -653.6289553875104 871.5052738500139 +653.629 871.505 -- !query 36 -SELECT corr(b, a) FROM aggtest +SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest -- !query 36 schema -struct<corr(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double> +struct<CAST(corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)> -- !query 36 output -0.1396345165178734 +0.14 -- !query 37 -SELECT count(four) AS cnt_1000 FROM onek +SELECT count(udf(four)) AS cnt_1000 FROM onek -- !query 37 schema struct<cnt_1000:bigint> -- !query 37 output -313,18 +314,18 struct<cnt_1000:bigint> -- !query 38 -SELECT count(DISTINCT four) AS cnt_4 FROM onek +SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek -- !query 38 schema -struct<cnt_4:bigint> +struct<cnt_4:string> -- !query 38 output 4 -- !query 39 -select ten, count(*), sum(four) from onek +select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek group by ten order by ten -- !query 39 schema -struct<ten:int,count(1):bigint,sum(four):bigint> +struct<ten:int,udf(count(1)):string,CAST(sum(CAST(udf(four) AS DOUBLE)) AS INT):int> -- !query 39 output 0 100 100 1 100 200 -339,10 +340,10 struct<ten:int,count(1):bigint,sum(four):bigint> -- !query 40 -select ten, count(four), sum(DISTINCT four) from onek +select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek group by ten order by ten -- !query 40 schema -struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint> +struct<ten:int,count(udf(four)):bigint,udf(sum(distinct cast(four as bigint))):string> -- !query 40 output 0 100 2 1 100 4 -357,11 +358,11 struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint> -- !query 41 -select 
ten, sum(distinct four) from onek a +select ten, udf(sum(distinct four)) from onek a group by ten -having exists (select 1 from onek b where sum(distinct a.four) = b.four) +having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four) -- !query 41 schema -struct<ten:int,sum(DISTINCT four):bigint> +struct<ten:int,udf(sum(distinct cast(four as bigint))):string> -- !query 41 output 0 2 2 2 -374,23 +375,23 struct<ten:int,sum(DISTINCT four):bigint> select ten, sum(distinct four) from onek a group by ten having exists (select 1 from onek b - where sum(distinct a.four + b.four) = b.four) + where sum(distinct a.four + b.four) = udf(b.four)) -- !query 42 schema struct<> -- !query 42 output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. -Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(b.`four` AS BIGINT))] +Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(udf(four) AS BIGINT))] Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; -- !query 43 select - (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))) + (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))) from tenk1 o -- !query 43 schema struct<> -- !query 43 output org.apache.spark.sql.AnalysisException -cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 63 +cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 67 ``` </p> </details> ## How was this patch tested? Manually tested in local. 
Also, with JDK 11:

```
Using /.../jdk-11.0.3.jdk/Contents/Home as default JAVA_HOME.
Note, this will be overridden by -java-home if it is set.
[info] Loading project definition from /.../spark/project
[info] Updating {file:/.../spark/project/}spark-build...
...
[info] SQLQueryTestSuite:
...
[info] - udf/pgSQL/udf-aggregates_part1.sql - Scala UDF (17 seconds, 228 milliseconds)
[info] - udf/pgSQL/udf-aggregates_part1.sql - Regular Python UDF (36 seconds, 170 milliseconds)
[info] - udf/pgSQL/udf-aggregates_part1.sql - Scalar Pandas UDF (41 seconds, 132 milliseconds)
...
```

Closes #25110 from HyukjinKwon/SPARK-28270-1.

Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
1 parent 7021588 commit 19bcce1

File tree

2 files changed

+91
-91
lines changed

2 files changed

+91
-91
lines changed

sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,9 @@
1212
-- Note that currently registered UDF returns a string. So there are some differences, for instance
1313
-- in string cast within UDF in Scala and Python.
1414

15-
SELECT avg(udf(four)) AS avg_1 FROM onek;
15+
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek;
1616

17-
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100;
17+
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100;
1818

1919
-- In 7.1, avg(float4) is computed using float8 arithmetic.
2020
-- Round the result to 3 digits to avoid platform-specific results.
@@ -23,32 +23,32 @@ select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest;
2323
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
2424
-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student;
2525

26-
SELECT sum(udf(four)) AS sum_1500 FROM onek;
26+
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek;
2727
SELECT udf(sum(a)) AS sum_198 FROM aggtest;
28-
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest;
28+
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest;
2929
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
3030
-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student;
3131

3232
SELECT udf(max(four)) AS max_3 FROM onek;
33-
SELECT max(udf(a)) AS max_100 FROM aggtest;
34-
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest;
33+
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest;
34+
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest;
3535
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
3636
-- SELECT max(student.gpa) AS max_3_7 FROM student;
3737

38-
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest;
39-
SELECT udf(stddev_samp(b)) FROM aggtest;
40-
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest;
41-
SELECT udf(var_samp(b)) FROM aggtest;
38+
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
39+
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest;
40+
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
41+
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest;
4242

43-
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
44-
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest;
45-
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
46-
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest;
43+
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
44+
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest;
45+
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
46+
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
4747

4848
-- population variance is defined for a single tuple, sample variance
4949
-- is not
50-
SELECT udf(var_pop(1.0)), var_samp(udf(2.0));
51-
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
50+
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0));
51+
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
5252

5353

5454
-- verify correct results for null and NaN inputs
@@ -76,9 +76,9 @@ FROM (VALUES ('-Infinity'), ('Infinity')) v(x);
7676

7777

7878
-- test accuracy with a large input offset
79-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
79+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
8080
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
81-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
81+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
8282
FROM (VALUES (7000000000005), (7000000000007)) v(x);
8383

8484
-- SQL2003 binary aggregates [SPARK-23907]
@@ -89,8 +89,8 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
8989
-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
9090
-- SELECT regr_r2(b, a) FROM aggtest;
9191
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
92-
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest;
93-
SELECT corr(b, udf(a)) FROM aggtest;
92+
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest;
93+
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest;
9494

9595

9696
-- test accum and combine functions directly [SPARK-23907]
@@ -122,7 +122,7 @@ SELECT corr(b, udf(a)) FROM aggtest;
122122
SELECT count(udf(four)) AS cnt_1000 FROM onek;
123123
SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek;
124124

125-
select ten, udf(count(*)), sum(udf(four)) from onek
125+
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
126126
group by ten order by ten;
127127

128128
select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek

sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out

Lines changed: 70 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -3,19 +3,19 @@
33

44

55
-- !query 0
6-
SELECT avg(udf(four)) AS avg_1 FROM onek
6+
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek
77
-- !query 0 schema
8-
struct<avg_1:double>
8+
struct<avg_1:decimal(10,3)>
99
-- !query 0 output
1010
1.5
1111

1212

1313
-- !query 1
14-
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100
14+
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100
1515
-- !query 1 schema
16-
struct<avg_32:string>
16+
struct<avg_32:decimal(10,3)>
1717
-- !query 1 output
18-
32.666666666666664
18+
32.667
1919

2020

2121
-- !query 2
@@ -27,11 +27,11 @@ struct<avg_107_943:decimal(10,3)>
2727

2828

2929
-- !query 3
30-
SELECT sum(udf(four)) AS sum_1500 FROM onek
30+
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek
3131
-- !query 3 schema
32-
struct<sum_1500:double>
32+
struct<sum_1500:int>
3333
-- !query 3 output
34-
1500.0
34+
1500
3535

3636

3737
-- !query 4
@@ -43,11 +43,11 @@ struct<sum_198:string>
4343

4444

4545
-- !query 5
46-
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest
46+
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest
4747
-- !query 5 schema
48-
struct<avg_431_773:string>
48+
struct<avg_431_773:decimal(10,3)>
4949
-- !query 5 output
50-
431.77260909229517
50+
431.773
5151

5252

5353
-- !query 6
@@ -59,99 +59,99 @@ struct<max_3:string>
5959

6060

6161
-- !query 7
62-
SELECT max(udf(a)) AS max_100 FROM aggtest
62+
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest
6363
-- !query 7 schema
64-
struct<max_100:string>
64+
struct<max_100:int>
6565
-- !query 7 output
66-
56
66+
100
6767

6868

6969
-- !query 8
70-
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest
70+
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest
7171
-- !query 8 schema
72-
struct<max_324_78:int>
72+
struct<max_324_78:decimal(10,3)>
7373
-- !query 8 output
74-
324
74+
324.78
7575

7676

7777
-- !query 9
78-
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest
78+
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest
7979
-- !query 9 schema
80-
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
80+
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
8181
-- !query 9 output
82-
131
82+
131.107
8383

8484

8585
-- !query 10
86-
SELECT udf(stddev_samp(b)) FROM aggtest
86+
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest
8787
-- !query 10 schema
88-
struct<udf(stddev_samp(cast(b as double))):string>
88+
struct<CAST(udf(stddev_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
8989
-- !query 10 output
90-
151.38936080399804
90+
151.389
9191

9292

9393
-- !query 11
94-
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest
94+
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest
9595
-- !query 11 schema
96-
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
96+
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
9797
-- !query 11 output
98-
17189
98+
17189.054
9999

100100

101101
-- !query 12
102-
SELECT udf(var_samp(b)) FROM aggtest
102+
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest
103103
-- !query 12 schema
104-
struct<udf(var_samp(cast(b as double))):string>
104+
struct<CAST(udf(var_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
105105
-- !query 12 output
106-
22918.738564643096
106+
22918.739
107107

108108

109109
-- !query 13
110-
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
110+
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
111111
-- !query 13 schema
112-
struct<udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))):string>
112+
struct<CAST(udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
113113
-- !query 13 output
114-
131.18117242958306
114+
131.181
115115

116116

117117
-- !query 14
118-
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest
118+
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest
119119
-- !query 14 schema
120-
struct<stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)):double>
120+
struct<CAST(stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
121121
-- !query 14 output
122-
151.47497042966097
122+
151.475
123123

124124

125125
-- !query 15
126-
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
126+
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
127127
-- !query 15 schema
128-
struct<udf(var_pop(cast(cast(b as decimal(38,0)) as double))):string>
128+
struct<CAST(udf(var_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
129129
-- !query 15 output
130130
17208.5
131131

132132

133133
-- !query 16
134-
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest
134+
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
135135
-- !query 16 schema
136-
struct<var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)):double>
136+
struct<CAST(var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
137137
-- !query 16 output
138-
22944.666666666668
138+
22944.667
139139

140140

141141
-- !query 17
142-
SELECT udf(var_pop(1.0)), var_samp(udf(2.0))
142+
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0))
143143
-- !query 17 schema
144-
struct<udf(var_pop(cast(1.0 as double))):string,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
144+
struct<CAST(udf(var_pop(cast(1.0 as double))) AS INT):int,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
145145
-- !query 17 output
146-
0.0 NaN
146+
0 NaN
147147

148148

149149
-- !query 18
150-
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
150+
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
151151
-- !query 18 schema
152-
struct<stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)):double,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
152+
struct<CAST(stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)) AS INT):int,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
153153
-- !query 18 output
154-
0.0 NaN
154+
0 NaN
155155

156156

157157
-- !query 19
@@ -262,37 +262,37 @@ NaN NaN
262262

263263

264264
-- !query 32
265-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
265+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
266266
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x)
267267
-- !query 32 schema
268-
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
268+
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS INT):int,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
269269
-- !query 32 output
270-
1.00000005E8 2.5
270+
100000005 2.5
271271

272272

273273
-- !query 33
274-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
274+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
275275
FROM (VALUES (7000000000005), (7000000000007)) v(x)
276276
-- !query 33 schema
277-
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
277+
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS BIGINT):bigint,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
278278
-- !query 33 output
279-
7.000000000006E12 1.0
279+
7000000000006 1
280280

281281

282282
-- !query 34
283-
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest
283+
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest
284284
-- !query 34 schema
285-
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS INT):int,CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS INT):int>
285+
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS DECIMAL(10,3)):decimal(10,3),CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
286286
-- !query 34 output
287-
653 871
287+
653.629 871.505
288288

289289

290290
-- !query 35
291-
SELECT corr(b, udf(a)) FROM aggtest
291+
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest
292292
-- !query 35 schema
293-
struct<corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)):double>
293+
struct<CAST(corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
294294
-- !query 35 output
295-
0.1396345165178734
295+
0.14
296296

297297

298298
-- !query 36
@@ -312,21 +312,21 @@ struct<cnt_4:string>
312312

313313

314314
-- !query 38
315-
select ten, udf(count(*)), sum(udf(four)) from onek
315+
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
316316
group by ten order by ten
317317
-- !query 38 schema
318-
struct<ten:int,udf(count(1)):string,sum(CAST(udf(four) AS DOUBLE)):double>
318+
struct<ten:int,udf(count(1)):string,CAST(sum(CAST(udf(four) AS DOUBLE)) AS INT):int>
319319
-- !query 38 output
320-
0 100 100.0
321-
1 100 200.0
322-
2 100 100.0
323-
3 100 200.0
324-
4 100 100.0
325-
5 100 200.0
326-
6 100 100.0
327-
7 100 200.0
328-
8 100 100.0
329-
9 100 200.0
320+
0 100 100
321+
1 100 200
322+
2 100 100
323+
3 100 200
324+
4 100 100
325+
5 100 200
326+
6 100 100
327+
7 100 200
328+
8 100 100
329+
9 100 200
330330

331331

332332
-- !query 39

0 commit comments

Comments
 (0)