Commit e32dab2

Jacky Li authored and jeanlyn committed
[SPARK-4699] [SQL] Make caseSensitive configurable in spark sql analyzer
based on apache#3558

Author: Jacky Li <[email protected]>
Author: wangfei <[email protected]>
Author: scwf <[email protected]>

Closes apache#5806 from scwf/case and squashes the following commits:

cd51712 [wangfei] fix compile
d4b724f [wangfei] address michael's comment
af512c7 [wangfei] fix conflicts
4ef1be7 [wangfei] fix conflicts
269cf21 [scwf] fix conflicts
b73df6c [scwf] style issue
9e11752 [scwf] improve SimpleCatalystConf
b35529e [scwf] minor style
a3f7659 [scwf] remove unsed imports
2a56515 [scwf] fix conflicts
6db4bf5 [scwf] also fix for HiveContext
7fc4a98 [scwf] fix test case
d5a9933 [wangfei] fix style
eee75ba [wangfei] fix EmptyConf
6ef31cf [wangfei] revert pom changes
5d7c456 [wangfei] set CASE_SENSITIVE false in TestHive
966e719 [wangfei] set CASE_SENSITIVE false in hivecontext
fd30e25 [wangfei] added override
69b3b70 [wangfei] fix AnalysisSuite
5472b08 [wangfei] fix compile issue
56034ca [wangfei] fix conflicts and improve for catalystconf
664d1e9 [Jacky Li] Merge branch 'master' of https://github.com/apache/spark into case
12eca9a [Jacky Li] solve conflict with master
39e369c [Jacky Li] fix confilct after DataFrame PR
dee56e9 [Jacky Li] fix test case failure
05b09a3 [Jacky Li] fix conflict base on the latest master branch
73c16b1 [Jacky Li] fix bug in sql/hive
9bf4cc7 [Jacky Li] fix bug in catalyst
005c56d [Jacky Li] make SQLContext caseSensitivity configurable
6332e0f [Jacky Li] fix bug
fcbf0d9 [Jacky Li] fix scalastyle check
e7bca31 [Jacky Li] make caseSensitive configuration in Analyzer and Catalog
91b1b96 [Jacky Li] make caseSensitive configurable in Analyzer
f57f15c [Jacky Li] add testcase
578d167 [Jacky Li] make caseSensitive configurable
1 parent efd0894 commit e32dab2

File tree: 15 files changed, +127 -70 lines changed
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+private[spark] trait CatalystConf {
+  def caseSensitiveAnalysis: Boolean
+}
+
+/**
+ * A trivial conf that is empty. Used for testing when all
+ * relations are already filled in and the analyser needs only to resolve attribute references.
+ */
+object EmptyConf extends CatalystConf {
+  override def caseSensitiveAnalysis: Boolean = {
+    throw new UnsupportedOperationException
+  }
+}
+
+/** A CatalystConf that can be used for local testing. */
+case class SimpleCatalystConf(caseSensitiveAnalysis: Boolean) extends CatalystConf
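
The new trait is deliberately small: everything downstream consults a single caseSensitiveAnalysis flag. A REPL-style sketch of how the case class above behaves (all names come from this file; the snippet itself is not part of the commit, and since CatalystConf is private[spark] it only compiles inside Spark's own packages):

import org.apache.spark.sql.catalyst.{CatalystConf, SimpleCatalystConf}

// SimpleCatalystConf is an immutable case class, so tests can build one
// per desired behavior instead of threading raw booleans around.
val sensitive: CatalystConf = SimpleCatalystConf(caseSensitiveAnalysis = true)
val insensitive: CatalystConf = SimpleCatalystConf(caseSensitiveAnalysis = false)
assert(sensitive.caseSensitiveAnalysis && !insensitive.caseSensitiveAnalysis)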

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 12 additions & 4 deletions
@@ -19,19 +19,21 @@ package org.apache.spark.sql.catalyst.analysis
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.util.collection.OpenHashSet
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.{SimpleCatalystConf, CatalystConf}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.types._
+import org.apache.spark.util.collection.OpenHashSet
 
 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
  * when all relations are already filled in and the analyzer needs only to resolve attribute
  * references.
  */
-object SimpleAnalyzer extends Analyzer(EmptyCatalog, EmptyFunctionRegistry, true)
+object SimpleAnalyzer
+  extends Analyzer(EmptyCatalog, EmptyFunctionRegistry, new SimpleCatalystConf(true))
 
 /**
  * Provides a logical query plan analyzer, which translates [[UnresolvedAttribute]]s and
@@ -41,11 +43,17 @@ object SimpleAnalyzer extends Analyzer(EmptyCatalog, EmptyFunctionRegistry, true
 class Analyzer(
     catalog: Catalog,
     registry: FunctionRegistry,
-    caseSensitive: Boolean,
+    conf: CatalystConf,
     maxIterations: Int = 100)
   extends RuleExecutor[LogicalPlan] with HiveTypeCoercion with CheckAnalysis {
 
-  val resolver = if (caseSensitive) caseSensitiveResolution else caseInsensitiveResolution
+  def resolver: Resolver = {
+    if (conf.caseSensitiveAnalysis) {
+      caseSensitiveResolution
+    } else {
+      caseInsensitiveResolution
+    }
+  }
 
   val fixedPoint = FixedPoint(maxIterations)
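
Worth noting: resolver changes from a val to a def, so each call re-reads the conf, and a later change to spark.sql.caseSensitive is picked up without rebuilding the analyzer. In Catalyst, Resolver is an alias for (String, String) => Boolean; the sketch below mirrors the selection logic with standalone definitions (an illustration, not the package object's actual code):

object ResolverSketch {
  // Standalone mirror of Catalyst's resolver machinery, for illustration only.
  type Resolver = (String, String) => Boolean
  val caseSensitiveResolution: Resolver = (a, b) => a == b
  val caseInsensitiveResolution: Resolver = (a, b) => a.equalsIgnoreCase(b)

  // A def rather than a val: each call reflects the current flag value.
  def resolver(caseSensitiveAnalysis: Boolean): Resolver =
    if (caseSensitiveAnalysis) caseSensitiveResolution else caseInsensitiveResolution
}

// ResolverSketch.resolver(false)("VALUE", "value") == true   (matches)
// ResolverSketch.resolver(true)("VALUE", "value")  == false  (rejected)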

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala

Lines changed: 11 additions & 9 deletions
@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.analysis
 
 import scala.collection.mutable
 
+import org.apache.spark.sql.catalyst.CatalystConf
+import org.apache.spark.sql.catalyst.EmptyConf
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery}
 
 /**
@@ -34,7 +36,7 @@ class NoSuchDatabaseException extends Exception
  */
 trait Catalog {
 
-  def caseSensitive: Boolean
+  val conf: CatalystConf
 
   def tableExists(tableIdentifier: Seq[String]): Boolean
 
@@ -57,10 +59,10 @@ trait Catalog {
   def unregisterAllTables(): Unit
 
   protected def processTableIdentifier(tableIdentifier: Seq[String]): Seq[String] = {
-    if (!caseSensitive) {
-      tableIdentifier.map(_.toLowerCase)
-    } else {
+    if (conf.caseSensitiveAnalysis) {
       tableIdentifier
+    } else {
+      tableIdentifier.map(_.toLowerCase)
     }
   }
 
@@ -78,7 +80,7 @@ trait Catalog {
   }
 }
 
-class SimpleCatalog(val caseSensitive: Boolean) extends Catalog {
+class SimpleCatalog(val conf: CatalystConf) extends Catalog {
   val tables = new mutable.HashMap[String, LogicalPlan]()
 
   override def registerTable(
@@ -164,10 +166,10 @@ trait OverrideCatalog extends Catalog {
   }
 
   abstract override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = {
-    val dbName = if (!caseSensitive) {
-      if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None
-    } else {
+    val dbName = if (conf.caseSensitiveAnalysis) {
       databaseName
+    } else {
+      if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None
    }
 
     val temporaryTables = overrides.filter {
@@ -207,7 +209,7 @@ trait OverrideCatalog extends Catalog {
  */
 object EmptyCatalog extends Catalog {
 
-  override val caseSensitive: Boolean = true
+  override val conf: CatalystConf = EmptyConf
 
   override def tableExists(tableIdentifier: Seq[String]): Boolean = {
     throw new UnsupportedOperationException
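
The inverted if/else branches preserve the old behavior exactly; only the source of truth moves from a constructor boolean to the conf. A REPL-style sketch of the normalization with a simplified signature, purely illustrative:

// With case-insensitive analysis, table identifiers are lower-cased
// before lookup, so TESTTABLE1 and testTable1 resolve to the same entry.
def processTableIdentifier(id: Seq[String], caseSensitiveAnalysis: Boolean): Seq[String] =
  if (caseSensitiveAnalysis) id else id.map(_.toLowerCase)

assert(processTableIdentifier(Seq("TESTTABLE1"), caseSensitiveAnalysis = false)
  == Seq("testtable1"))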

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala

Lines changed: 8 additions & 6 deletions
@@ -23,24 +23,26 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
-
+import org.apache.spark.sql.catalyst.SimpleCatalystConf
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 
 class AnalysisSuite extends FunSuite with BeforeAndAfter {
-  val caseSensitiveCatalog = new SimpleCatalog(true)
-  val caseInsensitiveCatalog = new SimpleCatalog(false)
+  val caseSensitiveConf = new SimpleCatalystConf(true)
+  val caseInsensitiveConf = new SimpleCatalystConf(false)
+
+  val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf)
+  val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf)
 
   val caseSensitiveAnalyzer =
-    new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitive = true) {
+    new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) {
       override val extendedResolutionRules = EliminateSubQueries :: Nil
     }
   val caseInsensitiveAnalyzer =
-    new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseSensitive = false) {
+    new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseInsensitiveConf) {
       override val extendedResolutionRules = EliminateSubQueries :: Nil
     }
 
-
   def caseSensitiveAnalyze(plan: LogicalPlan): Unit =
     caseSensitiveAnalyzer.checkAnalysis(caseSensitiveAnalyzer.execute(plan))

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala

Lines changed: 6 additions & 3 deletions
@@ -17,14 +17,17 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.scalatest.{BeforeAndAfter, FunSuite}
+
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Union, Project, LocalRelation}
 import org.apache.spark.sql.types._
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.apache.spark.sql.catalyst.SimpleCatalystConf
 
 class DecimalPrecisionSuite extends FunSuite with BeforeAndAfter {
-  val catalog = new SimpleCatalog(false)
-  val analyzer = new Analyzer(catalog, EmptyFunctionRegistry, caseSensitive = false)
+  val conf = new SimpleCatalystConf(true)
+  val catalog = new SimpleCatalog(conf)
+  val analyzer = new Analyzer(catalog, EmptyFunctionRegistry, conf)
 
   val relation = LocalRelation(
     AttributeReference("i", IntegerType)(),

sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala

Lines changed: 11 additions & 2 deletions
@@ -17,10 +17,12 @@
 
 package org.apache.spark.sql
 
+import java.util.Properties
+
 import scala.collection.immutable
 import scala.collection.JavaConversions._
 
-import java.util.Properties
+import org.apache.spark.sql.catalyst.CatalystConf
 
 private[spark] object SQLConf {
   val COMPRESS_CACHED = "spark.sql.inMemoryColumnarStorage.compressed"
@@ -32,6 +34,7 @@ private[spark] object SQLConf {
   val CODEGEN_ENABLED = "spark.sql.codegen"
   val UNSAFE_ENABLED = "spark.sql.unsafe.enabled"
   val DIALECT = "spark.sql.dialect"
+  val CASE_SENSITIVE = "spark.sql.caseSensitive"
 
   val PARQUET_BINARY_AS_STRING = "spark.sql.parquet.binaryAsString"
   val PARQUET_INT96_AS_TIMESTAMP = "spark.sql.parquet.int96AsTimestamp"
@@ -90,7 +93,8 @@ private[spark] object SQLConf {
  *
  * SQLConf is thread-safe (internally synchronized, so safe to be used in multiple threads).
  */
-private[sql] class SQLConf extends Serializable {
+
+private[sql] class SQLConf extends Serializable with CatalystConf {
   import SQLConf._
 
   /** Only low degree of contention is expected for conf, thus NOT using ConcurrentHashMap. */
@@ -164,6 +168,11 @@ private[sql] class SQLConf extends Serializable {
    */
   private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, "false").toBoolean
 
+  /**
+   * caseSensitive analysis true by default
+   */
+  def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, "true").toBoolean
+
   /**
    * When set to true, Spark SQL will use managed memory for certain operations. This option only
    * takes effect if codegen is enabled.
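
Note the defaults: SQLConf falls back to "true" when spark.sql.caseSensitive is unset, while the Hive session further down overrides the fallback to "false". Toggling at runtime goes through the ordinary conf API; a hedged sketch, assuming a SQLContext named sqlContext and a registered table testTable1 with columns key and value:

// Illustrative only: make analysis case-insensitive for a block of queries.
sqlContext.setConf(SQLConf.CASE_SENSITIVE, "false")
sqlContext.sql("SELECT VALUE FROM TESTTABLE1 WHERE KEY = 1")  // resolves despite casing
sqlContext.setConf(SQLConf.CASE_SENSITIVE, "true")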

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 2 additions & 2 deletions
@@ -148,15 +148,15 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp table per user session?
   @transient
-  protected[sql] lazy val catalog: Catalog = new SimpleCatalog(true)
+  protected[sql] lazy val catalog: Catalog = new SimpleCatalog(conf)
 
   // TODO how to handle the temp function per user session?
   @transient
   protected[sql] lazy val functionRegistry: FunctionRegistry = new SimpleFunctionRegistry(true)
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
-    new Analyzer(catalog, functionRegistry, caseSensitive = true) {
+    new Analyzer(catalog, functionRegistry, conf) {
       override val extendedResolutionRules =
         ExtractPythonUdfs ::
         sources.PreInsertCastAndRename ::

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 10 additions & 0 deletions
@@ -22,6 +22,7 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.execution.GeneratedAggregate
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.{udf => _, _}
@@ -1277,6 +1278,15 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
     checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2))
   }
 
+  test("SPARK-4699 case sensitivity SQL query") {
+    setConf(SQLConf.CASE_SENSITIVE, "false")
+    val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil
+    val rdd = sparkContext.parallelize((0 to 1).map(i => data(i)))
+    rdd.toDF().registerTempTable("testTable1")
+    checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1"))
+    setConf(SQLConf.CASE_SENSITIVE, "true")
+  }
+
   test("SPARK-6145: ORDER BY test for nested fields") {
     jsonRDD(sparkContext.makeRDD("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil))
       .registerTempTable("nestedOrder")

sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala

Lines changed: 4 additions & 16 deletions
@@ -18,25 +18,13 @@
 package org.apache.spark.sql.sources
 
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.analysis.Analyzer
+import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.test.TestSQLContext
 import org.scalatest.BeforeAndAfter
 
 abstract class DataSourceTest extends QueryTest with BeforeAndAfter {
-  // Case sensitivity is not configurable yet, but we want to test some edge cases.
-  // TODO: Remove when it is configurable
-  implicit val caseInsensisitiveContext = new SQLContext(TestSQLContext.sparkContext) {
-    @transient
-    override protected[sql] lazy val analyzer: Analyzer =
-      new Analyzer(catalog, functionRegistry, caseSensitive = false) {
-        override val extendedResolutionRules =
-          PreInsertCastAndRename ::
-          Nil
+  // We want to test some edge cases.
+  implicit val caseInsensisitiveContext = new SQLContext(TestSQLContext.sparkContext)
 
-        override val extendedCheckRules = Seq(
-          sources.PreWriteCheck(catalog)
-        )
-      }
-  }
+  caseInsensisitiveContext.setConf(SQLConf.CASE_SENSITIVE, "false")
 }
-

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 4 additions & 1 deletion
@@ -46,6 +46,7 @@ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, Query
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand}
 import org.apache.spark.sql.sources.{DDLParser, DataSourceStrategy}
+import org.apache.spark.sql.catalyst.CatalystConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -329,7 +330,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   /* An analyzer that uses the Hive metastore. */
   @transient
   override protected[sql] lazy val analyzer =
-    new Analyzer(catalog, functionRegistry, caseSensitive = false) {
+    new Analyzer(catalog, functionRegistry, conf) {
       override val extendedResolutionRules =
         catalog.ParquetConversions ::
         catalog.CreateTables ::
@@ -350,6 +351,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   protected[hive] class SQLSession extends super.SQLSession {
     protected[sql] override lazy val conf: SQLConf = new SQLConf {
       override def dialect: String = getConf(SQLConf.DIALECT, "hiveql")
+      override def caseSensitiveAnalysis: Boolean =
+        getConf(SQLConf.CASE_SENSITIVE, "false").toBoolean
     }
 
     /**
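
Net effect: a plain SQLContext defaults to case-sensitive analysis, while a HiveContext session defaults to case-insensitive, matching Hive's traditional semantics. Either default can be overridden the same way; a sketch assuming a HiveContext named hiveContext:

// Opt a HiveContext back into case-sensitive analysis
// (setConf is inherited from SQLContext).
hiveContext.setConf(SQLConf.CASE_SENSITIVE, "true")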
