Skip to content

Commit 640000b

Browse files
committed
[SPARK-10434] [SQL] Fixes Parquet schema of arrays that may contain null
To keep the Parquet write path fully compatible with Spark 1.4, we should rename the innermost field of arrays that may contain null from "array_element" to "array". Please refer to [SPARK-10434] [1] for more details. [1]: https://issues.apache.org/jira/browse/SPARK-10434 Author: Cheng Lian <[email protected]> Closes apache#8586 from liancheng/spark-10434/fix-parquet-array-type. (cherry picked from commit bca8c07) Signed-off-by: Cheng Lian <[email protected]>
1 parent ec750a7 commit 640000b

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -432,13 +432,14 @@ private[parquet] class CatalystSchemaConverter(
432432
// ArrayType and MapType (for Spark versions <= 1.4.x)
433433
// ===================================================
434434

435-
// Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level
436-
// LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is
437-
// covered by the backwards-compatibility rules implemented in `isElementType()`.
435+
// Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
436+
// `LIST` structure. This behavior is a hybrid of parquet-hive and parquet-avro
437+
// (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element
438+
// field name "array" is borrowed from parquet-avro.
438439
case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec =>
439440
// <list-repetition> group <name> (LIST) {
440441
// repeated group bag {
441-
// repeated <element-type> element;
442+
// optional <element-type> array;
442443
// }
443444
// }
444445
ConversionPatterns.listType(
@@ -447,8 +448,8 @@ private[parquet] class CatalystSchemaConverter(
447448
Types
448449
.buildGroup(REPEATED)
449450
// "array" is the name borrowed from parquet-avro (1.6.0rc3); parquet-hive (1.7.0 and prior version) uses "array_element"
450-
.addField(convertField(StructField("array_element", elementType, nullable)))
451-
.named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
451+
.addField(convertField(StructField("array", elementType, nullable)))
452+
.named("bag"))
452453

453454
// Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
454455
// LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
196196
|message root {
197197
| optional group _1 (LIST) {
198198
| repeated group bag {
199-
| optional int32 array_element;
199+
| optional int32 array;
200200
| }
201201
| }
202202
|}
@@ -265,7 +265,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
265265
| optional binary _1 (UTF8);
266266
| optional group _2 (LIST) {
267267
| repeated group bag {
268-
| optional group array_element {
268+
| optional group array {
269269
| required int32 _1;
270270
| required double _2;
271271
| }
@@ -644,7 +644,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
644644
"""message root {
645645
| optional group f1 (LIST) {
646646
| repeated group bag {
647-
| optional int32 array_element;
647+
| optional int32 array;
648648
| }
649649
| }
650650
|}

0 commit comments

Comments
 (0)