
Commit 892519d

[SPARK-4386] Improve performance when writing Parquet files
Convert the type of RowWriteSupport.attributes to Array. Profiling writes of very wide tables shows that time is spent predominantly in the apply method on the attributes var. attributes was previously backed by a LinearSeqOptimized, whose apply is O(N), which made each row write O(N^2) in the number of columns. Measurements on a 575-column table showed a 6x improvement in write times from this change.
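
To see why the apply cost dominates, here is a minimal standalone Scala sketch (not part of the commit; the benchmark harness and row count are illustrative) comparing indexed access on a List, a typical LinearSeqOptimized, against an Array:

object SeqVsArrayIndexing {
  def main(args: Array[String]): Unit = {
    val numColumns = 575   // table width from the commit message
    val numRows    = 10000 // illustrative row count, not from the commit

    val asList: List[Int]   = List.tabulate(numColumns)(identity)
    val asArray: Array[Int] = asList.toArray

    def time(label: String)(body: => Unit): Unit = {
      val start = System.nanoTime()
      body
      println(f"$label: ${(System.nanoTime() - start) / 1e6}%.1f ms")
    }

    // "Write" each row by visiting every column through apply(i).
    time("List apply  (O(N) per access, O(N^2) per row)") {
      var sink = 0L
      for (_ <- 0 until numRows; i <- 0 until numColumns) sink += asList(i)
    }
    time("Array apply (O(1) per access, O(N) per row)") {
      var sink = 0L
      for (_ <- 0 until numRows; i <- 0 until numColumns) sink += asArray(i)
    }
  }
}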
1 parent 0e532cc commit 892519d

1 file changed (+2 -2 lines)


sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala

Lines changed: 2 additions & 2 deletions
@@ -130,15 +130,15 @@ private[parquet] object RowReadSupport {
 private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 
   private[parquet] var writer: RecordConsumer = null
-  private[parquet] var attributes: Seq[Attribute] = null
+  private[parquet] var attributes: Array[Attribute] = null
 
   override def init(configuration: Configuration): WriteSupport.WriteContext = {
     val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
     val metadata = new JHashMap[String, String]()
     metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
 
     if (attributes == null) {
-      attributes = ParquetTypesConverter.convertFromString(origAttributesStr)
+      attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray
     }
 
     log.debug(s"write support initialized for requested schema $attributes")
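
The .toArray in the second hunk pays a one-time O(N) copy during init so that every later attributes(i) lookup in the per-row write path is O(1). A minimal sketch of that trade-off, with hypothetical stand-ins (parseSchema and the simplified Attribute are not the Spark definitions):

case class Attribute(name: String) // simplified stand-in for Catalyst's Attribute

object InitOnceSketch {
  // Hypothetical stand-in for ParquetTypesConverter.convertFromString.
  def parseSchema(s: String): Seq[Attribute] =
    s.split(",").toSeq.map(Attribute(_))

  var attributes: Array[Attribute] = null

  def init(schemaString: String): Unit = {
    if (attributes == null) {
      // One O(N) copy here; attributes(i) is O(1) in the per-row hot path.
      attributes = parseSchema(schemaString).toArray
    }
  }
}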
