@@ -20,15 +20,15 @@ package org.apache.spark.sql.parquet
 
 import java.io.File
 
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.{SQLConf, QueryTest}
+import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.execution.PhysicalRDD
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-
+import org.apache.spark.sql.sources.LogicalRelation
 
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
@@ -121,13 +121,123 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
   override def beforeAll(): Unit = {
     super.beforeAll()
+
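+    // Source data for the insert and CTAS tests below: ten JSON rows of the
+    // form {"a": <i>, "b": "str<i>"}, registered as the temporary table "jt".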
+    val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a": $i, "b":"str${i}"}"""))
+    jsonRDD(rdd).registerTempTable("jt")
+
+    sql(
+      """
+        |create table test_parquet
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
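+    // Run every test in this suite with the data source API Parquet path
+    // (ParquetRelation2) enabled, per the suite's "DataSourceOn" name.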
     conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
   }
 
   override def afterAll(): Unit = {
     super.afterAll()
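+    // Drop the tables registered in beforeAll before restoring the flag.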
+ sql(" DROP TABLE IF EXISTS jt" )
147
+ sql(" DROP TABLE IF EXISTS test_parquet" )
148
+
129
149
setConf(SQLConf .PARQUET_USE_DATA_SOURCE_API , originalConf.toString)
130
150
}
151
+
152
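+  // test_parquet is created in beforeAll but never loaded, so scanning it
+  // should return a zero count rather than fail.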
+ test(" scan an empty parquet table" ) {
153
+ checkAnswer(sql(" SELECT count(*) FROM test_parquet" ), Row (0 ))
154
+ }
155
+
156
+ test(" scan an empty parquet table with upper case" ) {
157
+ checkAnswer(sql(" SELECT count(INTFIELD) FROM TEST_parquet" ), Row (0 ))
158
+ }
159
+
160
+ test(" insert into an empty parquet table" ) {
161
+ sql(
162
+ """
163
+ |create table test_insert_parquet
164
+ |(
165
+ | intField INT,
166
+ | stringField STRING
167
+ |)
168
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
169
+ |STORED AS
170
+ | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
171
+ | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
172
+ """ .stripMargin)
173
+
174
+    // Insert into an empty table.
+    sql("insert into table test_insert_parquet select a, b from jt where jt.a > 5")
+    checkAnswer(
+      sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField < 8"),
+      Row(6, "str6") :: Row(7, "str7") :: Nil
+    )
+    // Insert overwrite.
+    sql("insert overwrite table test_insert_parquet select a, b from jt where jt.a < 5")
+    checkAnswer(
+      sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField > 2"),
+      Row(3, "str3") :: Row(4, "str4") :: Nil
+    )
+    sql("DROP TABLE IF EXISTS test_insert_parquet")
+
+    // Create it again.
+    sql(
+      """
+        |create table test_insert_parquet
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+    // Insert overwrite an empty table.
+    sql("insert overwrite table test_insert_parquet select a, b from jt where jt.a < 5")
+    checkAnswer(
+      sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField > 2"),
+      Row(3, "str3") :: Row(4, "str4") :: Nil
+    )
+    // Insert into the table.
+    sql("insert into table test_insert_parquet select a, b from jt")
+    checkAnswer(
+      sql(s"SELECT intField, stringField FROM test_insert_parquet"),
+      (1 to 10).map(i => Row(i, s"str$i")) ++ (1 to 4).map(i => Row(i, s"str$i"))
+    )
+    sql("DROP TABLE IF EXISTS test_insert_parquet")
+  }
+
+  test("scan a parquet table created through a CTAS statement") {
+    sql(
+      """
+        |create table test_parquet_ctas ROW FORMAT
+        |SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+        |AS select * from jt
+      """.stripMargin)
+
+    checkAnswer(
+      sql(s"SELECT a, b FROM test_parquet_ctas WHERE a = 1"),
+      Seq(Row(1, "str1"))
+    )
+
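+    // The metastore Parquet table should have been converted to the data
+    // source API's ParquetRelation2 during analysis.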
+ table(" test_parquet_ctas" ).queryExecution.analyzed match {
233
+ case LogicalRelation (p : ParquetRelation2 ) => // OK
234
+ case _ =>
235
+ fail(
236
+ s " test_parquet_ctas should be converted to ${classOf [ParquetRelation2 ].getCanonicalName}" )
237
+ }
238
+
239
+ sql(" DROP TABLE IF EXISTS test_parquet_ctas" )
240
+ }
131
241
}
132
242
133
243
class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase {