Skip to content

Commit c8803c0

Browse files
Eric Wasserman authored and cloud-fan committed
[SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL functions
## What changes were proposed in this pull request?

Change to using Jackson's `com.fasterxml.jackson.core.JsonFactory` method `public JsonParser createParser(String content)`.

## How was this patch tested?

Existing unit tests.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Eric Wasserman <[email protected]>

Closes #17693 from ewasserman/SPARK-20314.

(cherry picked from commit 57e1da3)
Signed-off-by: Wenchen Fan <[email protected]>
1 parent f0de600 commit c8803c0

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
package org.apache.spark.sql.catalyst.expressions
1919

20-
import java.io.{ByteArrayOutputStream, CharArrayWriter, StringWriter}
20+
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, CharArrayWriter, InputStreamReader, StringWriter}
2121

2222
import scala.util.parsing.combinator.RegexParsers
2323

@@ -149,7 +149,10 @@ case class GetJsonObject(json: Expression, path: Expression)
149149

150150
if (parsed.isDefined) {
151151
try {
152-
Utils.tryWithResource(jsonFactory.createParser(jsonStr.getBytes)) { parser =>
152+
/* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson
153+
detect character encoding which could fail for some malformed strings */
154+
Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
155+
new ByteArrayInputStream(jsonStr.getBytes), "UTF-8"))) { parser =>
153156
val output = new ByteArrayOutputStream()
154157
val matched = Utils.tryWithResource(
155158
jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { generator =>
@@ -393,7 +396,10 @@ case class JsonTuple(children: Seq[Expression])
393396
}
394397

395398
try {
396-
Utils.tryWithResource(jsonFactory.createParser(json.getBytes)) {
399+
/* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson
400+
detect character encoding which could fail for some malformed strings */
401+
Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
402+
new ByteArrayInputStream(json.getBytes), "UTF-8"))) {
397403
parser => parseRow(parser, input)
398404
}
399405
} catch {

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
3939
|"fb:testid":"1234"}
4040
|""".stripMargin
4141

42+
/* invalid json with leading nulls would trigger java.io.CharConversionException
43+
in Jackson's JsonFactory.createParser(byte[]) due to RFC-4627 encoding detection */
44+
val badJson = "\0\0\0A\1AAA"
45+
4246
test("$.store.bicycle") {
4347
checkEvaluation(
4448
GetJsonObject(Literal(json), Literal("$.store.bicycle")),
@@ -224,6 +228,13 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
224228
null)
225229
}
226230

231+
test("SPARK-16548: character conversion") {
232+
checkEvaluation(
233+
GetJsonObject(Literal(badJson), Literal("$.a")),
234+
null
235+
)
236+
}
237+
227238
test("non foldable literal") {
228239
checkEvaluation(
229240
GetJsonObject(NonFoldableLiteral(json), NonFoldableLiteral("$.fb:testid")),
@@ -340,6 +351,12 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
340351
InternalRow(null, null, null, null, null))
341352
}
342353

354+
test("SPARK-16548: json_tuple - invalid json with leading nulls") {
355+
checkJsonTuple(
356+
JsonTuple(Literal(badJson) :: jsonTupleQuery),
357+
InternalRow(null, null, null, null, null))
358+
}
359+
343360
test("json_tuple - preserve newlines") {
344361
checkJsonTuple(
345362
JsonTuple(Literal("{\"a\":\"b\nc\"}") :: Literal("a") :: Nil),

0 commit comments

Comments
 (0)