Skip to content

Commit 21870c6

Browse files
committed
Explicitly cast results to integer or decimal types in UDF integrated tests to avoid Python float limitation
1 parent 7021588 commit 21870c6

File tree

2 files changed

+91
-91
lines changed

2 files changed

+91
-91
lines changed

sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
-- Note that the currently registered UDF returns a string. So there are some differences, for instance
1313
-- in string cast within UDF in Scala and Python.
1414

15-
SELECT avg(udf(four)) AS avg_1 FROM onek;
15+
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek;
1616

17-
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100;
17+
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100;
1818

1919
-- In 7.1, avg(float4) is computed using float8 arithmetic.
2020
-- Round the result to 3 digits to avoid platform-specific results.
@@ -23,32 +23,32 @@ select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest;
2323
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
2424
-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student;
2525

26-
SELECT sum(udf(four)) AS sum_1500 FROM onek;
26+
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek;
2727
SELECT udf(sum(a)) AS sum_198 FROM aggtest;
28-
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest;
28+
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest;
2929
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
3030
-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student;
3131

3232
SELECT udf(max(four)) AS max_3 FROM onek;
33-
SELECT max(udf(a)) AS max_100 FROM aggtest;
34-
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest;
33+
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest;
34+
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest;
3535
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
3636
-- SELECT max(student.gpa) AS max_3_7 FROM student;
3737

38-
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest;
39-
SELECT udf(stddev_samp(b)) FROM aggtest;
40-
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest;
41-
SELECT udf(var_samp(b)) FROM aggtest;
38+
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
39+
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest;
40+
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
41+
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest;
4242

43-
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
44-
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest;
45-
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
46-
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest;
43+
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
44+
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest;
45+
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
46+
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
4747

4848
-- population variance is defined for a single tuple, sample variance
4949
-- is not
50-
SELECT udf(var_pop(1.0)), var_samp(udf(2.0));
51-
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
50+
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0));
51+
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
5252

5353

5454
-- verify correct results for null and NaN inputs
@@ -76,9 +76,9 @@ FROM (VALUES ('-Infinity'), ('Infinity')) v(x);
7676

7777

7878
-- test accuracy with a large input offset
79-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
79+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
8080
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
81-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
81+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
8282
FROM (VALUES (7000000000005), (7000000000007)) v(x);
8383

8484
-- SQL2003 binary aggregates [SPARK-23907]
@@ -89,8 +89,8 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
8989
-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
9090
-- SELECT regr_r2(b, a) FROM aggtest;
9191
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
92-
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest;
93-
SELECT corr(b, udf(a)) FROM aggtest;
92+
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest;
93+
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest;
9494

9595

9696
-- test accum and combine functions directly [SPARK-23907]
@@ -122,7 +122,7 @@ SELECT corr(b, udf(a)) FROM aggtest;
122122
SELECT count(udf(four)) AS cnt_1000 FROM onek;
123123
SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek;
124124

125-
select ten, udf(count(*)), sum(udf(four)) from onek
125+
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
126126
group by ten order by ten;
127127

128128
select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek

sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out

Lines changed: 70 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,19 @@
33

44

55
-- !query 0
6-
SELECT avg(udf(four)) AS avg_1 FROM onek
6+
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek
77
-- !query 0 schema
8-
struct<avg_1:double>
8+
struct<avg_1:decimal(10,3)>
99
-- !query 0 output
1010
1.5
1111

1212

1313
-- !query 1
14-
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100
14+
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100
1515
-- !query 1 schema
16-
struct<avg_32:string>
16+
struct<avg_32:decimal(10,3)>
1717
-- !query 1 output
18-
32.666666666666664
18+
32.667
1919

2020

2121
-- !query 2
@@ -27,11 +27,11 @@ struct<avg_107_943:decimal(10,3)>
2727

2828

2929
-- !query 3
30-
SELECT sum(udf(four)) AS sum_1500 FROM onek
30+
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek
3131
-- !query 3 schema
32-
struct<sum_1500:double>
32+
struct<sum_1500:int>
3333
-- !query 3 output
34-
1500.0
34+
1500
3535

3636

3737
-- !query 4
@@ -43,11 +43,11 @@ struct<sum_198:string>
4343

4444

4545
-- !query 5
46-
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest
46+
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest
4747
-- !query 5 schema
48-
struct<avg_431_773:string>
48+
struct<avg_431_773:decimal(10,3)>
4949
-- !query 5 output
50-
431.77260909229517
50+
431.773
5151

5252

5353
-- !query 6
@@ -59,99 +59,99 @@ struct<max_3:string>
5959

6060

6161
-- !query 7
62-
SELECT max(udf(a)) AS max_100 FROM aggtest
62+
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest
6363
-- !query 7 schema
64-
struct<max_100:string>
64+
struct<max_100:int>
6565
-- !query 7 output
66-
56
66+
100
6767

6868

6969
-- !query 8
70-
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest
70+
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest
7171
-- !query 8 schema
72-
struct<max_324_78:int>
72+
struct<max_324_78:decimal(10,3)>
7373
-- !query 8 output
74-
324
74+
324.78
7575

7676

7777
-- !query 9
78-
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest
78+
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest
7979
-- !query 9 schema
80-
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
80+
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
8181
-- !query 9 output
82-
131
82+
131.107
8383

8484

8585
-- !query 10
86-
SELECT udf(stddev_samp(b)) FROM aggtest
86+
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest
8787
-- !query 10 schema
88-
struct<udf(stddev_samp(cast(b as double))):string>
88+
struct<CAST(udf(stddev_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
8989
-- !query 10 output
90-
151.38936080399804
90+
151.389
9191

9292

9393
-- !query 11
94-
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest
94+
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest
9595
-- !query 11 schema
96-
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
96+
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
9797
-- !query 11 output
98-
17189
98+
17189.054
9999

100100

101101
-- !query 12
102-
SELECT udf(var_samp(b)) FROM aggtest
102+
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest
103103
-- !query 12 schema
104-
struct<udf(var_samp(cast(b as double))):string>
104+
struct<CAST(udf(var_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
105105
-- !query 12 output
106-
22918.738564643096
106+
22918.739
107107

108108

109109
-- !query 13
110-
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
110+
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
111111
-- !query 13 schema
112-
struct<udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))):string>
112+
struct<CAST(udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
113113
-- !query 13 output
114-
131.18117242958306
114+
131.181
115115

116116

117117
-- !query 14
118-
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest
118+
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest
119119
-- !query 14 schema
120-
struct<stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)):double>
120+
struct<CAST(stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
121121
-- !query 14 output
122-
151.47497042966097
122+
151.475
123123

124124

125125
-- !query 15
126-
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
126+
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
127127
-- !query 15 schema
128-
struct<udf(var_pop(cast(cast(b as decimal(38,0)) as double))):string>
128+
struct<CAST(udf(var_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
129129
-- !query 15 output
130130
17208.5
131131

132132

133133
-- !query 16
134-
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest
134+
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
135135
-- !query 16 schema
136-
struct<var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)):double>
136+
struct<CAST(var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
137137
-- !query 16 output
138-
22944.666666666668
138+
22944.667
139139

140140

141141
-- !query 17
142-
SELECT udf(var_pop(1.0)), var_samp(udf(2.0))
142+
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0))
143143
-- !query 17 schema
144-
struct<udf(var_pop(cast(1.0 as double))):string,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
144+
struct<CAST(udf(var_pop(cast(1.0 as double))) AS INT):int,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
145145
-- !query 17 output
146-
0.0 NaN
146+
0 NaN
147147

148148

149149
-- !query 18
150-
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
150+
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
151151
-- !query 18 schema
152-
struct<stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)):double,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
152+
struct<CAST(stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)) AS INT):int,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
153153
-- !query 18 output
154-
0.0 NaN
154+
0 NaN
155155

156156

157157
-- !query 19
@@ -262,37 +262,37 @@ NaN NaN
262262

263263

264264
-- !query 32
265-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
265+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
266266
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x)
267267
-- !query 32 schema
268-
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
268+
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS INT):int,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
269269
-- !query 32 output
270-
1.00000005E8 2.5
270+
100000005 2.5
271271

272272

273273
-- !query 33
274-
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
274+
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
275275
FROM (VALUES (7000000000005), (7000000000007)) v(x)
276276
-- !query 33 schema
277-
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
277+
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS BIGINT):bigint,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
278278
-- !query 33 output
279-
7.000000000006E12 1.0
279+
7000000000006 1
280280

281281

282282
-- !query 34
283-
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest
283+
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest
284284
-- !query 34 schema
285-
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS INT):int,CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS INT):int>
285+
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS DECIMAL(10,3)):decimal(10,3),CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
286286
-- !query 34 output
287-
653 871
287+
653.629 871.505
288288

289289

290290
-- !query 35
291-
SELECT corr(b, udf(a)) FROM aggtest
291+
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest
292292
-- !query 35 schema
293-
struct<corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)):double>
293+
struct<CAST(corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
294294
-- !query 35 output
295-
0.1396345165178734
295+
0.14
296296

297297

298298
-- !query 36
@@ -312,21 +312,21 @@ struct<cnt_4:string>
312312

313313

314314
-- !query 38
315-
select ten, udf(count(*)), sum(udf(four)) from onek
315+
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
316316
group by ten order by ten
317317
-- !query 38 schema
318-
struct<ten:int,udf(count(1)):string,sum(CAST(udf(four) AS DOUBLE)):double>
318+
struct<ten:int,udf(count(1)):string,CAST(sum(CAST(udf(four) AS DOUBLE)) AS INT):int>
319319
-- !query 38 output
320-
0 100 100.0
321-
1 100 200.0
322-
2 100 100.0
323-
3 100 200.0
324-
4 100 100.0
325-
5 100 200.0
326-
6 100 100.0
327-
7 100 200.0
328-
8 100 100.0
329-
9 100 200.0
320+
0 100 100
321+
1 100 200
322+
2 100 100
323+
3 100 200
324+
4 100 100
325+
5 100 200
326+
6 100 100
327+
7 100 200
328+
8 100 100
329+
9 100 200
330330

331331

332332
-- !query 39

0 commit comments

Comments
 (0)