Skip to content

Commit 5837b38

Browse files
committed
added some python unit tests
added more conversion tests; short type should have a bit-width of 16; closes apache#17
1 parent 5dbad22 commit 5837b38

17 files changed

+1117
-18
lines changed

python/pyspark/sql/tests.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2354,27 +2354,43 @@ class ArrowTests(ReusedPySparkTestCase):
23542354
def setUpClass(cls):
23552355
ReusedPySparkTestCase.setUpClass()
23562356
cls.spark = SparkSession(cls.sc)
2357+
cls.schema = StructType([
2358+
StructField("str_t", StringType(), True),
2359+
StructField("int_t", IntegerType(), True),
2360+
StructField("long_t", LongType(), True),
2361+
StructField("float_t", FloatType(), True),
2362+
StructField("double_t", DoubleType(), True)])
2363+
cls.data = [("a", 1, 10, 0.2, 2.0),
2364+
("b", 2, 20, 0.4, 4.0),
2365+
("c", 3, 30, 0.8, 6.0)]
23572366

23582367
def assertFramesEqual(self, df_with_arrow, df_without):
23592368
msg = ("DataFrame from Arrow is not equal" +
23602369
("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow, df_with_arrow.dtypes)) +
23612370
("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes)))
23622371
self.assertTrue(df_without.equals(df_with_arrow), msg=msg)
23632372

2364-
def test_arrow_toPandas(self):
2365-
schema = StructType([
2366-
StructField("str_t", StringType(), True), # Fails in conversion
2367-
StructField("int_t", IntegerType(), True), # Fails, without is converted to int64
2368-
StructField("long_t", LongType(), True), # Fails if nullable=False
2369-
StructField("double_t", DoubleType(), True)])
2370-
data = [("a", 1, 10, 2.0),
2371-
("b", 2, 20, 4.0),
2372-
("c", 3, 30, 6.0)]
2373+
def test_null_conversion(self):
2374+
df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] +
2375+
self.data)
2376+
pdf = df_null.toPandas(useArrow=True)
2377+
null_counts = pdf.isnull().sum().tolist()
2378+
self.assertTrue(all([c == 1 for c in null_counts]))
2379+
2380+
def test_toPandas_arrow_toggle(self):
2381+
df = self.spark.createDataFrame(self.data, schema=self.schema)
2382+
# NOTE: toPandas(useArrow=False) infers default pandas dtypes (e.g. int_t becomes int64), so only str_t, long_t and double_t are selected for comparison
2383+
df_sel = df.select("str_t", "long_t", "double_t")
2384+
pdf = df_sel.toPandas(useArrow=False)
2385+
pdf_arrow = df_sel.toPandas(useArrow=True)
2386+
self.assertFramesEqual(pdf_arrow, pdf)
23732387

2374-
df = self.spark.createDataFrame(data, schema=schema)
2375-
df = df.select("long_t", "double_t")
2376-
pdf = df.toPandas(useArrow=False)
2377-
pdf_arrow = df.toPandas(useArrow=True)
2388+
def test_pandas_round_trip(self):
2389+
import pandas as pd
2390+
data_dict = {name: [self.data[i][j] for i in range(len(self.data))]
2391+
for j, name in enumerate(self.schema.names)}
2392+
pdf = pd.DataFrame(data=data_dict)
2393+
pdf_arrow = self.spark.createDataFrame(pdf).toPandas(useArrow=True)
23782394
self.assertFramesEqual(pdf_arrow, pdf)
23792395

23802396

sql/core/src/main/scala/org/apache/spark/sql/Arrow.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ object Arrow {
5353
buf.writeBoolean(row.getBoolean(ordinal)))
5454
case ShortType =>
5555
TypeFuncs(
56-
() => new ArrowType.Int(4 * ShortType.defaultSize, true), // TODO - check on this
56+
() => new ArrowType.Int(8 * ShortType.defaultSize, true),
5757
(buf: ArrowBuf) => buf.writeShort(0),
5858
(row: InternalRow, ordinal: Int, buf: ArrowBuf) => buf.writeShort(row.getShort(ordinal)))
5959
case IntegerType =>
@@ -127,7 +127,7 @@ object Arrow {
127127
val numOfRows = rows.length
128128

129129
field.dataType match {
130-
case IntegerType | LongType | DoubleType | FloatType | BooleanType | ByteType =>
130+
case ShortType | IntegerType | LongType | DoubleType | FloatType | BooleanType | ByteType =>
131131
val validityVector = new BitVector("validity", allocator)
132132
val validityMutator = validityVector.getMutator
133133
validityVector.allocateNew(numOfRows)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"schema": {
3+
"fields": [
4+
{
5+
"name": "a",
6+
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
7+
"nullable": false,
8+
"children": [],
9+
"typeLayout": {
10+
"vectors": [
11+
{"type": "VALIDITY", "typeBitWidth": 1},
12+
{"type": "DATA", "typeBitWidth": 32}
13+
]
14+
}
15+
},
16+
{
17+
"name": "b",
18+
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
19+
"nullable": false,
20+
"children": [],
21+
"typeLayout": {
22+
"vectors": [
23+
{"type": "VALIDITY", "typeBitWidth": 1},
24+
{"type": "DATA", "typeBitWidth": 32}
25+
]
26+
}
27+
}
28+
]
29+
},
30+
31+
"batches": [
32+
{
33+
"count": 6,
34+
"columns": [
35+
{
36+
"name": "a",
37+
"count": 6,
38+
"VALIDITY": [1, 1, 1, 1, 1, 1],
39+
"DATA": [1, 1, 2, 2, 3, 3]
40+
},
41+
{
42+
"name": "b",
43+
"count": 6,
44+
"VALIDITY": [1, 1, 1, 1, 1, 1],
45+
"DATA": [1, 2, 1, 2, 1, 2]
46+
}
47+
]
48+
}
49+
]
50+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"schema": {
3+
"fields": [
4+
{
5+
"name": "i",
6+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
7+
"nullable": false,
8+
"children": [],
9+
"typeLayout": {
10+
"vectors": [
11+
{"type": "VALIDITY", "typeBitWidth": 1},
12+
{"type": "DATA", "typeBitWidth": 8}
13+
]
14+
}
15+
},
16+
{
17+
"name": "a_d",
18+
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
19+
"nullable": false,
20+
"children": [],
21+
"typeLayout": {
22+
"vectors": [
23+
{"type": "VALIDITY", "typeBitWidth": 1},
24+
{"type": "DATA", "typeBitWidth": 32}
25+
]
26+
}
27+
},
28+
{
29+
"name": "b_d",
30+
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
31+
"nullable": true,
32+
"children": [],
33+
"typeLayout": {
34+
"vectors": [
35+
{"type": "VALIDITY", "typeBitWidth": 1},
36+
{"type": "DATA", "typeBitWidth": 32}
37+
]
38+
}
39+
}
40+
]
41+
},
42+
43+
"batches": [
44+
{
45+
"count": 6,
46+
"columns": [
47+
{
48+
"name": "i",
49+
"count": 6,
50+
"VALIDITY": [1, 1, 1, 1, 1, 1],
51+
"DATA": [1, 2, 3, 4, 5, 6]
52+
},
53+
{
54+
"name": "a_d",
55+
"count": 6,
56+
"VALIDITY": [1, 1, 1, 1, 1, 1],
57+
"DATA": [1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0]
58+
},
59+
{
60+
"name": "b_d",
61+
"count": 6,
62+
"VALIDITY": [1, 0, 0, 1, 0, 1],
63+
"DATA": [1.1, 0, 0, 2.2, 0, 3.3]
64+
}
65+
]
66+
}
67+
]
68+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"schema": {
3+
"fields": [
4+
{
5+
"name": "i",
6+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
7+
"nullable": false,
8+
"children": [],
9+
"typeLayout": {
10+
"vectors": [
11+
{"type": "VALIDITY", "typeBitWidth": 1},
12+
{"type": "DATA", "typeBitWidth": 8}
13+
]
14+
}
15+
},
16+
{
17+
"name": "a_f",
18+
"type": {"name": "floatingpoint", "precision": "SINGLE"},
19+
"nullable": false,
20+
"children": [],
21+
"typeLayout": {
22+
"vectors": [
23+
{"type": "VALIDITY", "typeBitWidth": 1},
24+
{"type": "DATA", "typeBitWidth": 32}
25+
]
26+
}
27+
},
28+
{
29+
"name": "b_f",
30+
"type": {"name": "floatingpoint", "precision": "SINGLE"},
31+
"nullable": true,
32+
"children": [],
33+
"typeLayout": {
34+
"vectors": [
35+
{"type": "VALIDITY", "typeBitWidth": 1},
36+
{"type": "DATA", "typeBitWidth": 32}
37+
]
38+
}
39+
}
40+
]
41+
},
42+
43+
"batches": [
44+
{
45+
"count": 6,
46+
"columns": [
47+
{
48+
"name": "i",
49+
"count": 6,
50+
"VALIDITY": [1, 1, 1, 1, 1, 1],
51+
"DATA": [1, 2, 3, 4, 5, 6]
52+
},
53+
{
54+
"name": "a_f",
55+
"count": 6,
56+
"VALIDITY": [1, 1, 1, 1, 1, 1],
57+
"DATA": [1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0]
58+
},
59+
{
60+
"name": "b_f",
61+
"count": 6,
62+
"VALIDITY": [1, 0, 0, 1, 0, 1],
63+
"DATA": [1.1, 0, 0, 2.2, 0, 3.3]
64+
}
65+
]
66+
}
67+
]
68+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"schema": {
3+
"fields": [
4+
{
5+
"name": "i",
6+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
7+
"nullable": false,
8+
"children": [],
9+
"typeLayout": {
10+
"vectors": [
11+
{"type": "VALIDITY", "typeBitWidth": 1},
12+
{"type": "DATA", "typeBitWidth": 8}
13+
]
14+
}
15+
}
16+
]
17+
},
18+
19+
"batches": [
20+
{
21+
"count": 6,
22+
"columns": [
23+
{
24+
"name": "i",
25+
"count": 6,
26+
"VALIDITY": [1, 1, 1, 1, 1, 1],
27+
"DATA": [1, 2, 3, 4, 5, 6]
28+
}
29+
]
30+
}
31+
]
32+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"schema": {
3+
"fields": [
4+
{
5+
"name": "i",
6+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
7+
"nullable": false,
8+
"children": [],
9+
"typeLayout": {
10+
"vectors": [
11+
{"type": "VALIDITY", "typeBitWidth": 1},
12+
{"type": "DATA", "typeBitWidth": 8}
13+
]
14+
}
15+
},
16+
{
17+
"name": "a_i",
18+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
19+
"nullable": false,
20+
"children": [],
21+
"typeLayout": {
22+
"vectors": [
23+
{"type": "VALIDITY", "typeBitWidth": 1},
24+
{"type": "DATA", "typeBitWidth": 32}
25+
]
26+
}
27+
},
28+
{
29+
"name": "b_i",
30+
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
31+
"nullable": true,
32+
"children": [],
33+
"typeLayout": {
34+
"vectors": [
35+
{"type": "VALIDITY", "typeBitWidth": 1},
36+
{"type": "DATA", "typeBitWidth": 32}
37+
]
38+
}
39+
}
40+
]
41+
},
42+
43+
"batches": [
44+
{
45+
"count": 6,
46+
"columns": [
47+
{
48+
"name": "i",
49+
"count": 6,
50+
"VALIDITY": [1, 1, 1, 1, 1, 1],
51+
"DATA": [1, 2, 3, 4, 5, 6]
52+
},
53+
{
54+
"name": "a_i",
55+
"count": 6,
56+
"VALIDITY": [1, 1, 1, 1, 1, 1],
57+
"DATA": [1, -1, 2, -2, 2147483647, -2147483648]
58+
},
59+
{
60+
"name": "b_i",
61+
"count": 6,
62+
"VALIDITY": [1, 0, 0, 1, 0, 1],
63+
"DATA": [1, -1, 2, -2, 2147483647, -2147483648]
64+
}
65+
]
66+
}
67+
]
68+
}

0 commit comments

Comments
 (0)